susumu.yata
null+****@clear*****
Fri Feb 13 10:00:43 JST 2015
susumu.yata 2015-02-13 10:00:43 +0900 (Fri, 13 Feb 2015) New Revision: 163cce8c7a57fbe52b17e11e16db0eae2c692f7f https://github.com/groonga/grnxx/commit/163cce8c7a57fbe52b17e11e16db0eae2c692f7f Message: Add a parser for Expression. (#143) Modified files: include/grnxx/expression.hpp include/grnxx/string.hpp lib/grnxx/expression.cpp lib/grnxx/impl/expression.cpp lib/grnxx/impl/expression.hpp test/test_expression.cpp Modified: include/grnxx/expression.hpp (+2 -1) =================================================================== --- include/grnxx/expression.hpp 2015-02-09 18:40:48 +0900 (a9185a2) +++ include/grnxx/expression.hpp 2015-02-13 10:00:43 +0900 (1bad27a) @@ -145,7 +145,8 @@ class Expression { // // On success, returns the expression. // On failure, throws an exception. - static std::unique_ptr<Expression> parse(const String &query); + static std::unique_ptr<Expression> parse(const Table *table, + const String &query); }; class ExpressionBuilder { Modified: include/grnxx/string.hpp (+39 -0) =================================================================== --- include/grnxx/string.hpp 2015-02-09 18:40:48 +0900 (7ae2f26) +++ include/grnxx/string.hpp 2015-02-13 10:00:43 +0900 (e1cddc3) @@ -419,6 +419,45 @@ class String { (std::memcmp(data_, rhs, size_) == 0) : false; } + // Find "byte" in "this", except the first "offset" bytes. + // + // If found, return the first position. + // If not found, return "npos". + size_t find_first_of(char byte, size_t offset = 0) const { + for (size_t i = offset; i < size_; ++i) { + if (data_[i] == byte) { + return i; + } + } + return npos; + } + // Find a member of "bytes" in "this", except the first "offset" bytes. + // + // If found, return the first position. + // If not found, return "npos". + size_t find_first_of(const String &bytes, size_t offset = 0) const { + for (size_t i = offset; i < size_; ++i) { + if (bytes.find_first_of(data_[i]) != npos) { + return i; + } + } + return npos; + } + // Find a non-member of "bytes" in "this", except the first "offset" bytes. + // + // If found, return the first position. + // If not found, return "npos". + size_t find_first_not_of(const String &bytes, size_t offset = 0) const { + for (size_t i = offset; i < size_; ++i) { + if (bytes.find_first_of(data_[i]) == npos) { + return i; + } + } + return npos; + } + + static constexpr size_t npos = -1; + private: union { char *buffer_; Modified: lib/grnxx/expression.cpp (+3 -2) =================================================================== --- lib/grnxx/expression.cpp 2015-02-09 18:40:48 +0900 (5763b58) +++ lib/grnxx/expression.cpp 2015-02-13 10:00:43 +0900 (3cb3452) @@ -6,8 +6,9 @@ namespace grnxx { -std::unique_ptr<Expression> Expression::parse(const String &query) { - return impl::ExpressionParser::parse(query); +std::unique_ptr<Expression> Expression::parse(const Table *table, + const String &query) { + return impl::ExpressionParser::parse(table, query); } std::unique_ptr<ExpressionBuilder> ExpressionBuilder::create( Modified: lib/grnxx/impl/expression.cpp (+330 -6) =================================================================== --- lib/grnxx/impl/expression.cpp 2015-02-09 18:40:48 +0900 (033d4a8) +++ lib/grnxx/impl/expression.cpp 2015-02-13 10:00:43 +0900 (9c54123) @@ -2577,21 +2577,345 @@ int ExpressionToken::get_operator_priority(OperatorType operator_type) { } std::unique_ptr<ExpressionInterface> ExpressionParser::parse( + const TableInterface *table, const String &query) try { - std::unique_ptr<ExpressionParser> parser(new ExpressionParser); + std::unique_ptr<ExpressionParser> parser(new ExpressionParser(table)); parser->tokenize(query); - return parser->build(); + parser->analyze(); + return parser->builder_->release(); } catch (const std::bad_alloc &) { throw "Memory allocation failed"; } +ExpressionParser::ExpressionParser(const TableInterface *table) + : table_(table), + tokens_(), + stack_(), + builder_() {} + void ExpressionParser::tokenize(const String &query) { - // TODO + String rest = query; + while (rest.size() != 0) { + // Ignore white-space characters. + auto delim_pos = rest.find_first_not_of(" \t\r\n"); + if (delim_pos == rest.npos) { + break; + } + rest = rest.substring(delim_pos); + switch (rest[0]) { + case '!': { + if ((rest.size() >= 2) && (rest[1] == '=')) { + tokens_.push_back(ExpressionToken("!=", GRNXX_NOT_EQUAL)); + rest = rest.substring(2); + } else { + tokens_.push_back(ExpressionToken("!", GRNXX_LOGICAL_NOT)); + rest = rest.substring(1); + } + break; + } + case '~': { + tokens_.push_back(ExpressionToken("~", GRNXX_BITWISE_NOT)); + rest = rest.substring(1); + break; + } + case '=': { + if ((rest.size() >= 2) && (rest[1] == '=')) { + tokens_.push_back(ExpressionToken("==", GRNXX_EQUAL)); + rest = rest.substring(2); + } else { + throw "Invalid query"; + } + break; + } + case '<': { + if ((rest.size() >= 2) && (rest[1] == '=')) { + tokens_.push_back(ExpressionToken("<=", GRNXX_LESS_EQUAL)); + rest = rest.substring(2); + } else { + tokens_.push_back(ExpressionToken("<", GRNXX_LESS)); + rest = rest.substring(1); + } + break; + } + case '>': { + if ((rest.size() >= 2) && (rest[1] == '=')) { + tokens_.push_back(ExpressionToken(">=", GRNXX_GREATER_EQUAL)); + rest = rest.substring(2); + } else { + tokens_.push_back(ExpressionToken(">", GRNXX_GREATER)); + rest = rest.substring(1); + } + break; + } + case '&': { + if ((rest.size() >= 2) && (rest[1] == '&')) { + tokens_.push_back(ExpressionToken("&&", GRNXX_LOGICAL_AND)); + rest = rest.substring(2); + } else { + tokens_.push_back(ExpressionToken("&", GRNXX_BITWISE_AND)); + rest = rest.substring(1); + } + break; + } + case '|': { + if ((rest.size() >= 2) && (rest[1] == '|')) { + tokens_.push_back(ExpressionToken("||", GRNXX_LOGICAL_OR)); + rest = rest.substring(2); + } else { + tokens_.push_back(ExpressionToken("|", GRNXX_BITWISE_OR)); + rest = rest.substring(1); + } + break; + } + case '^': { + tokens_.push_back(ExpressionToken("^", GRNXX_BITWISE_XOR)); + rest = rest.substring(1); + break; + } + case '+': { + tokens_.push_back(ExpressionToken("+", GRNXX_PLUS)); + rest = rest.substring(1); + break; + } + case '-': { + tokens_.push_back(ExpressionToken("-", GRNXX_MINUS)); + rest = rest.substring(1); + break; + } + case '*': { + tokens_.push_back(ExpressionToken("*", GRNXX_MULTIPLICATION)); + rest = rest.substring(1); + break; + } + case '/': { + tokens_.push_back(ExpressionToken("/", GRNXX_DIVISION)); + rest = rest.substring(1); + break; + } + case '%': { + tokens_.push_back(ExpressionToken("%", GRNXX_MODULUS)); + rest = rest.substring(1); + break; + } + case '@': { + if ((rest.size() >= 2) && (rest[1] == '^')) { + tokens_.push_back(ExpressionToken("@^", GRNXX_STARTS_WITH)); + rest = rest.substring(2); + } else if ((rest.size() >= 2) && (rest[1] == '$')) { + tokens_.push_back(ExpressionToken("@$", GRNXX_ENDS_WITH)); + rest = rest.substring(2); + } else { + tokens_.push_back(ExpressionToken("@", GRNXX_CONTAINS)); + rest = rest.substring(1); + } + break; + } + case '.': { + tokens_.push_back(ExpressionToken(".", DEREFERENCE_TOKEN)); + rest = rest.substring(1); + break; + } + case '(': { + tokens_.push_back(ExpressionToken("(", LEFT_ROUND_BRACKET)); + rest = rest.substring(1); + break; + } + case ')': { + tokens_.push_back(ExpressionToken(")", RIGHT_ROUND_BRACKET)); + rest = rest.substring(1); + break; + } + case '[': { + tokens_.push_back(ExpressionToken("[", LEFT_SQUARE_BRACKET)); + rest = rest.substring(1); + break; + } + case ']': { + tokens_.push_back(ExpressionToken("]", RIGHT_SQUARE_BRACKET)); + rest = rest.substring(1); + break; + } + case '"': { + auto end = rest.find_first_of('"', 1); + if (end == rest.npos) { + throw "Invalid query"; + } + tokens_.push_back( + ExpressionToken(rest.substring(0, end + 1), CONSTANT_TOKEN)); + rest = rest.substring(end + 1); + break; + } + case '0' ... '9': { + // TODO: Improve this. + auto end = rest.find_first_not_of("0123456789", 1); + if (end == rest.npos) { + end = rest.size(); + } else if (rest[end] == '.') { + end = rest.find_first_not_of("0123456789", end + 1); + if (end == rest.npos) { + end = rest.size(); + } + } + String token = rest.substring(0, end); + tokens_.push_back(ExpressionToken(token, CONSTANT_TOKEN)); + rest = rest.substring(end); + break; + } + case 'A' ... 'Z': + case 'a' ... 'z': { + // TODO: Improve this. + auto end = rest.find_first_not_of("0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ_" + "abcdefghijklmnopqrstuvwxyz", 1); + if (end == rest.npos) { + end = rest.size(); + } + String token = rest.substring(0, end); + if ((token == "TRUE") || (token == "FALSE")) { + tokens_.push_back(ExpressionToken(token, CONSTANT_TOKEN)); + } else { + tokens_.push_back(ExpressionToken(token, NAME_TOKEN)); + } + rest = rest.substring(end); + break; + } + default: { + throw "Invalid query"; + } + } + } +} + +void ExpressionParser::analyze() { + if (tokens_.size() == 0) { + throw "Empty query"; + } + builder_ = ExpressionBuilderInterface::create(table_); + push_token(ExpressionToken("(", LEFT_ROUND_BRACKET)); + for (size_t i = 0; i < tokens_.size(); ++i) { + push_token(tokens_[i]); + } + push_token(ExpressionToken(")", RIGHT_ROUND_BRACKET)); } -std::unique_ptr<ExpressionInterface> ExpressionParser::build() { - // TODO - return builder_->release(); +void ExpressionParser::push_token(const ExpressionToken &token) { + switch (token.type()) { + case DUMMY_TOKEN: { + if ((stack_.size() != 0) && (stack_.back().type() == DUMMY_TOKEN)) { + throw "Invalid query"; + } + stack_.push_back(token); + break; + } + case CONSTANT_TOKEN: { + Datum datum; + std::string string(token.string().data(), token.string().size()); + if (std::isdigit(static_cast<uint8_t>(string[0]))) { + if (string.find_first_of('.') == string.npos) { + datum = grnxx::Int(std::stoll(string)); + } else { + datum = grnxx::Float(std::stod(string)); + } + } else { + const String &string = token.string(); + datum = grnxx::Text(string.substring(1, string.size() - 2)); + } + push_token(ExpressionToken(token.string(), DUMMY_TOKEN)); + builder_->push_constant(datum); + break; + } + case NAME_TOKEN: { + push_token(ExpressionToken(token.string(), DUMMY_TOKEN)); + builder_->push_column(token.string()); + break; + } + case UNARY_OPERATOR_TOKEN: { + if ((stack_.size() != 0) && (stack_.back().type() == DUMMY_TOKEN)) { + // 単項演算子の直前にオペランドがあるのはおかしい. + throw "Invalid query"; + } + stack_.push_back(token); + break; + } + case BINARY_OPERATOR_TOKEN: { + if ((stack_.size() == 0) || (stack_.back().type() != DUMMY_TOKEN)) { + // 二項演算子の前はダミーでなければならない. + throw "Invalid query"; + } + // 前方にある優先度の高い演算子を適用する. + while (stack_.size() >= 2) { + ExpressionToken operator_token = stack_[stack_.size() - 2]; + if (operator_token.type() == UNARY_OPERATOR_TOKEN) { + builder_->push_operator(operator_token.operator_type()); + stack_.pop_back(); + stack_.pop_back(); + push_token(ExpressionToken("", DUMMY_TOKEN)); + } else if ((operator_token.type() == BINARY_OPERATOR_TOKEN) && + (operator_token.priority() <= token.priority())) { + builder_->push_operator(operator_token.operator_type()); + stack_.pop_back(); + stack_.pop_back(); + stack_.pop_back(); + push_token(ExpressionToken("", DUMMY_TOKEN)); + } else { + break; + } + } + stack_.push_back(token); + break; + } + case DEREFERENCE_TOKEN: { + // TODO: Not supported yet. + throw "Not supported yet"; + } + case BRACKET_TOKEN: { + if (token.bracket_type() == LEFT_ROUND_BRACKET) { + // 開き括弧の直前にダミーがあるのはおかしい. + if ((stack_.size() != 0) && (stack_.back().type() == DUMMY_TOKEN)) { + throw "Invalid query"; + } + stack_.push_back(token); + } else if (token.bracket_type() == RIGHT_ROUND_BRACKET) { + // 閉じ括弧の直前にはダミーが必要であり,それより前に開き括弧が必要である. + if ((stack_.size() < 2) || (stack_.back().type() != DUMMY_TOKEN)) { + throw "Invalid query 1"; + } + // 括弧内にある演算子をすべて解決する. + while (stack_.size() >= 2) { + ExpressionToken operator_token = stack_[stack_.size() - 2]; + if (operator_token.type() == UNARY_OPERATOR_TOKEN) { + builder_->push_operator(operator_token.operator_type()); + stack_.pop_back(); + stack_.pop_back(); + push_token(ExpressionToken("", DUMMY_TOKEN)); + } else if (operator_token.type() == BINARY_OPERATOR_TOKEN) { + builder_->push_operator(operator_token.operator_type()); + stack_.pop_back(); + stack_.pop_back(); + stack_.pop_back(); + push_token(ExpressionToken("", DUMMY_TOKEN)); + } else { + break; + } + } + if ((stack_.size() < 2) || + (stack_[stack_.size() - 2].type() != BRACKET_TOKEN) || + (stack_[stack_.size() - 2].bracket_type() != LEFT_ROUND_BRACKET)) { + throw "Invalid query"; + } + stack_[stack_.size() - 2] = stack_.back(); + stack_.pop_back(); + break; + } else { + // TODO: Not supported yet ([]). + throw "Not supported yet"; + } + break; + } + default: { + throw "Invalid query"; + } + } } } // namespace impl Modified: lib/grnxx/impl/expression.hpp (+28 -12) =================================================================== --- lib/grnxx/impl/expression.hpp 2015-02-09 18:40:48 +0900 (1264847) +++ lib/grnxx/impl/expression.hpp 2015-02-13 10:00:43 +0900 (2b67cac) @@ -206,7 +206,9 @@ class ExpressionBuilder : public ExpressionBuilderInterface { }; enum ExpressionTokenType { - NODE_TOKEN, + DUMMY_TOKEN, + CONSTANT_TOKEN, + NAME_TOKEN, UNARY_OPERATOR_TOKEN, BINARY_OPERATOR_TOKEN, DEREFERENCE_TOKEN, @@ -222,20 +224,29 @@ enum ExpressionBracketType { class ExpressionToken { public: - ExpressionToken() : type_(NODE_TOKEN), dummy_(0), priority_(0) {} - explicit ExpressionToken(ExpressionTokenType token_type) - : type_(token_type), + ExpressionToken() : string_(), type_(DUMMY_TOKEN), dummy_(0), priority_(0) {} + explicit ExpressionToken(const String &string, + ExpressionTokenType token_type) + : string_(string), + type_(token_type), dummy_(0), priority_(0) {} - explicit ExpressionToken(ExpressionBracketType bracket_type) - : type_(BRACKET_TOKEN), + explicit ExpressionToken(const String &string, + ExpressionBracketType bracket_type) + : string_(string), + type_(BRACKET_TOKEN), bracket_type_(bracket_type), priority_(0) {} - explicit ExpressionToken(OperatorType operator_type) - : type_(get_operator_token_type(operator_type)), + explicit ExpressionToken(const String &string, + OperatorType operator_type) + : string_(string), + type_(get_operator_token_type(operator_type)), operator_type_(operator_type), priority_(get_operator_priority(operator_type)) {} + const String &string() const { + return string_; + } ExpressionTokenType type() const { return type_; } @@ -250,6 +261,7 @@ class ExpressionToken { } private: + String string_; ExpressionTokenType type_; union { int dummy_; @@ -265,19 +277,23 @@ class ExpressionToken { class ExpressionParser { public: - static std::unique_ptr<ExpressionInterface> parse(const String &query); + static std::unique_ptr<ExpressionInterface> parse( + const TableInterface *table, + const String &query); ~ExpressionParser() = default; private: + const TableInterface *table_; Array<ExpressionToken> tokens_; Array<ExpressionToken> stack_; - std::unique_ptr<ExpressionBuilder> builder_; + std::unique_ptr<ExpressionBuilderInterface> builder_; - ExpressionParser() = default; + explicit ExpressionParser(const TableInterface *table); void tokenize(const String &query); - std::unique_ptr<ExpressionInterface> build(); + void analyze(); + void push_token(const ExpressionToken &token); }; } // namespace impl Modified: test/test_expression.cpp (+23 -0) =================================================================== --- test/test_expression.cpp 2015-02-09 18:40:48 +0900 (e2d3ed1) +++ test/test_expression.cpp 2015-02-13 10:00:43 +0900 (f407fac) @@ -2748,6 +2748,26 @@ void test_subexpression() { } } +void test_parser() try { + // Create an expression. + auto expression = grnxx::Expression::parse( + test.table, "Int < Int2 && Float >= Float2"); + + auto records = create_input_records(); + + grnxx::Array<grnxx::Bool> bool_results; + expression->evaluate(records, &bool_results); + assert(bool_results.size() == test.table->num_rows()); + for (size_t i = 0; i < bool_results.size(); ++i) { + size_t row_id = records[i].row_id.raw(); + assert(bool_results[i].match( + (test.int_values[row_id] < test.int2_values[row_id]) & + (test.float_values[row_id] >= test.float2_values[row_id]))); + } +} catch (const char *msg) { + std::cerr << "msg: " << msg << std::endl; +} + void test_sequential_filter() { // Create an object for building expressions. auto builder = grnxx::ExpressionBuilder::create(test.table); @@ -2956,6 +2976,9 @@ int main() { // Subexpression. test_subexpression(); + // Parser. + test_parser(); + // Test sequential operations. test_sequential_filter(); test_sequential_adjust(); -------------- next part -------------- HTML����������������������������... 다운로드