[Groonga-commit] groonga/grnxx at 163cce8 [master] Add a parser for Expression. (#143)

Back to archive index

susumu.yata null+****@clear*****
Fri Feb 13 10:00:43 JST 2015


susumu.yata	2015-02-13 10:00:43 +0900 (Fri, 13 Feb 2015)

  New Revision: 163cce8c7a57fbe52b17e11e16db0eae2c692f7f
  https://github.com/groonga/grnxx/commit/163cce8c7a57fbe52b17e11e16db0eae2c692f7f

  Message:
    Add a parser for Expression. (#143)

  Modified files:
    include/grnxx/expression.hpp
    include/grnxx/string.hpp
    lib/grnxx/expression.cpp
    lib/grnxx/impl/expression.cpp
    lib/grnxx/impl/expression.hpp
    test/test_expression.cpp

  Modified: include/grnxx/expression.hpp (+2 -1)
===================================================================
--- include/grnxx/expression.hpp    2015-02-09 18:40:48 +0900 (a9185a2)
+++ include/grnxx/expression.hpp    2015-02-13 10:00:43 +0900 (1bad27a)
@@ -145,7 +145,8 @@ class Expression {
   //
   // On success, returns the expression.
   // On failure, throws an exception.
-  static std::unique_ptr<Expression> parse(const String &query);
+  static std::unique_ptr<Expression> parse(const Table *table,
+                                           const String &query);
 };
 
 class ExpressionBuilder {

  Modified: include/grnxx/string.hpp (+39 -0)
===================================================================
--- include/grnxx/string.hpp    2015-02-09 18:40:48 +0900 (7ae2f26)
+++ include/grnxx/string.hpp    2015-02-13 10:00:43 +0900 (e1cddc3)
@@ -419,6 +419,45 @@ class String {
            (std::memcmp(data_, rhs, size_) == 0) : false;
   }
 
+  // Find "byte" in "this", except the first "offset" bytes.
+  //
+  // If found, return the first position.
+  // If not found, return "npos".
+  size_t find_first_of(char byte, size_t offset = 0) const {
+    for (size_t i = offset; i < size_; ++i) {
+      if (data_[i] == byte) {
+        return i;
+      }
+    }
+    return npos;
+  }
+  // Find a member of "bytes" in "this", except the first "offset" bytes.
+  //
+  // If found, return the first position.
+  // If not found, return "npos".
+  size_t find_first_of(const String &bytes, size_t offset = 0) const {
+    for (size_t i = offset; i < size_; ++i) {
+      if (bytes.find_first_of(data_[i]) != npos) {
+        return i;
+      }
+    }
+    return npos;
+  }
+  // Find a non-member of "bytes" in "this", except the first "offset" bytes.
+  //
+  // If found, return the first position.
+  // If not found, return "npos".
+  size_t find_first_not_of(const String &bytes, size_t offset = 0) const {
+    for (size_t i = offset; i < size_; ++i) {
+      if (bytes.find_first_of(data_[i]) == npos) {
+        return i;
+      }
+    }
+    return npos;
+  }
+
+  static constexpr size_t npos = -1;
+
  private:
   union {
     char *buffer_;

  Modified: lib/grnxx/expression.cpp (+3 -2)
===================================================================
--- lib/grnxx/expression.cpp    2015-02-09 18:40:48 +0900 (5763b58)
+++ lib/grnxx/expression.cpp    2015-02-13 10:00:43 +0900 (3cb3452)
@@ -6,8 +6,9 @@
 
 namespace grnxx {
 
-std::unique_ptr<Expression> Expression::parse(const String &query) {
-  return impl::ExpressionParser::parse(query);
+std::unique_ptr<Expression> Expression::parse(const Table *table,
+                                              const String &query) {
+  return impl::ExpressionParser::parse(table, query);
 }
 
 std::unique_ptr<ExpressionBuilder> ExpressionBuilder::create(

  Modified: lib/grnxx/impl/expression.cpp (+330 -6)
===================================================================
--- lib/grnxx/impl/expression.cpp    2015-02-09 18:40:48 +0900 (033d4a8)
+++ lib/grnxx/impl/expression.cpp    2015-02-13 10:00:43 +0900 (9c54123)
@@ -2577,21 +2577,345 @@ int ExpressionToken::get_operator_priority(OperatorType operator_type) {
 }
 
 std::unique_ptr<ExpressionInterface> ExpressionParser::parse(
+    const TableInterface *table,
     const String &query) try {
-  std::unique_ptr<ExpressionParser> parser(new ExpressionParser);
+  std::unique_ptr<ExpressionParser> parser(new ExpressionParser(table));
   parser->tokenize(query);
-  return parser->build();
+  parser->analyze();
+  return parser->builder_->release();
 } catch (const std::bad_alloc &) {
 	throw "Memory allocation failed";
 }
 
+ExpressionParser::ExpressionParser(const TableInterface *table)
+    : table_(table),
+      tokens_(),
+      stack_(),
+      builder_() {}
+
 void ExpressionParser::tokenize(const String &query) {
-  // TODO
+  String rest = query;
+  while (rest.size() != 0) {
+    // Ignore white-space characters.
+    auto delim_pos = rest.find_first_not_of(" \t\r\n");
+    if (delim_pos == rest.npos) {
+      break;
+    }
+    rest = rest.substring(delim_pos);
+    switch (rest[0]) {
+      case '!': {
+        if ((rest.size() >= 2) && (rest[1] == '=')) {
+          tokens_.push_back(ExpressionToken("!=", GRNXX_NOT_EQUAL));
+          rest = rest.substring(2);
+        } else {
+          tokens_.push_back(ExpressionToken("!", GRNXX_LOGICAL_NOT));
+          rest = rest.substring(1);
+        }
+        break;
+      }
+      case '~': {
+        tokens_.push_back(ExpressionToken("~", GRNXX_BITWISE_NOT));
+        rest = rest.substring(1);
+        break;
+      }
+      case '=': {
+        if ((rest.size() >= 2) && (rest[1] == '=')) {
+          tokens_.push_back(ExpressionToken("==", GRNXX_EQUAL));
+          rest = rest.substring(2);
+        } else {
+          throw "Invalid query";
+        }
+        break;
+      }
+      case '<': {
+        if ((rest.size() >= 2) && (rest[1] == '=')) {
+          tokens_.push_back(ExpressionToken("<=", GRNXX_LESS_EQUAL));
+          rest = rest.substring(2);
+        } else {
+          tokens_.push_back(ExpressionToken("<", GRNXX_LESS));
+          rest = rest.substring(1);
+        }
+        break;
+      }
+      case '>': {
+        if ((rest.size() >= 2) && (rest[1] == '=')) {
+          tokens_.push_back(ExpressionToken(">=", GRNXX_GREATER_EQUAL));
+          rest = rest.substring(2);
+        } else {
+          tokens_.push_back(ExpressionToken(">", GRNXX_GREATER));
+          rest = rest.substring(1);
+        }
+        break;
+      }
+      case '&': {
+        if ((rest.size() >= 2) && (rest[1] == '&')) {
+          tokens_.push_back(ExpressionToken("&&", GRNXX_LOGICAL_AND));
+          rest = rest.substring(2);
+        } else {
+          tokens_.push_back(ExpressionToken("&", GRNXX_BITWISE_AND));
+          rest = rest.substring(1);
+        }
+        break;
+      }
+      case '|': {
+        if ((rest.size() >= 2) && (rest[1] == '|')) {
+          tokens_.push_back(ExpressionToken("||", GRNXX_LOGICAL_OR));
+          rest = rest.substring(2);
+        } else {
+          tokens_.push_back(ExpressionToken("|", GRNXX_BITWISE_OR));
+          rest = rest.substring(1);
+        }
+        break;
+      }
+      case '^': {
+        tokens_.push_back(ExpressionToken("^", GRNXX_BITWISE_XOR));
+        rest = rest.substring(1);
+        break;
+      }
+      case '+': {
+        tokens_.push_back(ExpressionToken("+", GRNXX_PLUS));
+        rest = rest.substring(1);
+        break;
+      }
+      case '-': {
+        tokens_.push_back(ExpressionToken("-", GRNXX_MINUS));
+        rest = rest.substring(1);
+        break;
+      }
+      case '*': {
+        tokens_.push_back(ExpressionToken("*", GRNXX_MULTIPLICATION));
+        rest = rest.substring(1);
+        break;
+      }
+      case '/': {
+        tokens_.push_back(ExpressionToken("/", GRNXX_DIVISION));
+        rest = rest.substring(1);
+        break;
+      }
+      case '%': {
+        tokens_.push_back(ExpressionToken("%", GRNXX_MODULUS));
+        rest = rest.substring(1);
+        break;
+      }
+      case '@': {
+        if ((rest.size() >= 2) && (rest[1] == '^')) {
+          tokens_.push_back(ExpressionToken("@^", GRNXX_STARTS_WITH));
+          rest = rest.substring(2);
+        } else if ((rest.size() >= 2) && (rest[1] == '$')) {
+          tokens_.push_back(ExpressionToken("@$", GRNXX_ENDS_WITH));
+          rest = rest.substring(2);
+        } else {
+          tokens_.push_back(ExpressionToken("@", GRNXX_CONTAINS));
+          rest = rest.substring(1);
+        }
+        break;
+      }
+      case '.': {
+        tokens_.push_back(ExpressionToken(".", DEREFERENCE_TOKEN));
+        rest = rest.substring(1);
+        break;
+      }
+      case '(': {
+        tokens_.push_back(ExpressionToken("(", LEFT_ROUND_BRACKET));
+        rest = rest.substring(1);
+        break;
+      }
+      case ')': {
+        tokens_.push_back(ExpressionToken(")", RIGHT_ROUND_BRACKET));
+        rest = rest.substring(1);
+        break;
+      }
+      case '[': {
+        tokens_.push_back(ExpressionToken("[", LEFT_SQUARE_BRACKET));
+        rest = rest.substring(1);
+        break;
+      }
+      case ']': {
+        tokens_.push_back(ExpressionToken("]", RIGHT_SQUARE_BRACKET));
+        rest = rest.substring(1);
+        break;
+      }
+      case '"': {
+        auto end = rest.find_first_of('"', 1);
+        if (end == rest.npos) {
+          throw "Invalid query";
+        }
+        tokens_.push_back(
+            ExpressionToken(rest.substring(0, end + 1), CONSTANT_TOKEN));
+        rest = rest.substring(end + 1);
+        break;
+      }
+      case '0' ... '9': {
+        // TODO: Improve this.
+        auto end = rest.find_first_not_of("0123456789", 1);
+        if (end == rest.npos) {
+          end = rest.size();
+        } else if (rest[end] == '.') {
+          end = rest.find_first_not_of("0123456789", end + 1);
+          if (end == rest.npos) {
+            end = rest.size();
+          }
+        }
+        String token = rest.substring(0, end);
+        tokens_.push_back(ExpressionToken(token, CONSTANT_TOKEN));
+        rest = rest.substring(end);
+        break;
+      }
+      case 'A' ... 'Z':
+      case 'a' ... 'z': {
+        // TODO: Improve this.
+        auto end = rest.find_first_not_of("0123456789"
+                                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ_"
+                                          "abcdefghijklmnopqrstuvwxyz", 1);
+        if (end == rest.npos) {
+          end = rest.size();
+        }
+        String token = rest.substring(0, end);
+        if ((token == "TRUE") || (token == "FALSE")) {
+          tokens_.push_back(ExpressionToken(token, CONSTANT_TOKEN));
+        } else {
+          tokens_.push_back(ExpressionToken(token, NAME_TOKEN));
+        }
+        rest = rest.substring(end);
+        break;
+      }
+      default: {
+        throw "Invalid query";
+      }
+    }
+  }
+}
+
+void ExpressionParser::analyze() {
+  if (tokens_.size() == 0) {
+    throw "Empty query";
+  }
+  builder_ = ExpressionBuilderInterface::create(table_);
+  push_token(ExpressionToken("(", LEFT_ROUND_BRACKET));
+  for (size_t i = 0; i < tokens_.size(); ++i) {
+    push_token(tokens_[i]);
+  }
+  push_token(ExpressionToken(")", RIGHT_ROUND_BRACKET));
 }
 
-std::unique_ptr<ExpressionInterface> ExpressionParser::build() {
-  // TODO
-  return builder_->release();
+void ExpressionParser::push_token(const ExpressionToken &token) {
+  switch (token.type()) {
+    case DUMMY_TOKEN: {
+      if ((stack_.size() != 0) && (stack_.back().type() == DUMMY_TOKEN)) {
+        throw "Invalid query";
+      }
+      stack_.push_back(token);
+      break;
+    }
+    case CONSTANT_TOKEN: {
+      Datum datum;
+      std::string string(token.string().data(), token.string().size());
+      if (std::isdigit(static_cast<uint8_t>(string[0]))) {
+        if (string.find_first_of('.') == string.npos) {
+          datum = grnxx::Int(std::stoll(string));
+        } else {
+          datum = grnxx::Float(std::stod(string));
+        }
+      } else {
+        const String &string = token.string();
+        datum = grnxx::Text(string.substring(1, string.size() - 2));
+      }
+      push_token(ExpressionToken(token.string(), DUMMY_TOKEN));
+      builder_->push_constant(datum);
+      break;
+    }
+    case NAME_TOKEN: {
+      push_token(ExpressionToken(token.string(), DUMMY_TOKEN));
+      builder_->push_column(token.string());
+      break;
+    }
+    case UNARY_OPERATOR_TOKEN: {
+      if ((stack_.size() != 0) && (stack_.back().type() == DUMMY_TOKEN)) {
+        // 単項演算子の直前にオペランドがあるのはおかしい.
+        throw "Invalid query";
+      }
+      stack_.push_back(token);
+      break;
+    }
+    case BINARY_OPERATOR_TOKEN: {
+      if ((stack_.size() == 0) || (stack_.back().type() != DUMMY_TOKEN)) {
+        // 二項演算子の前はダミーでなければならない.
+        throw "Invalid query";
+      }
+      // 前方にある優先度の高い演算子を適用する.
+      while (stack_.size() >= 2) {
+        ExpressionToken operator_token = stack_[stack_.size() - 2];
+        if (operator_token.type() == UNARY_OPERATOR_TOKEN) {
+          builder_->push_operator(operator_token.operator_type());
+          stack_.pop_back();
+          stack_.pop_back();
+          push_token(ExpressionToken("", DUMMY_TOKEN));
+        } else if ((operator_token.type() == BINARY_OPERATOR_TOKEN) &&
+                   (operator_token.priority() <= token.priority())) {
+          builder_->push_operator(operator_token.operator_type());
+          stack_.pop_back();
+          stack_.pop_back();
+          stack_.pop_back();
+          push_token(ExpressionToken("", DUMMY_TOKEN));
+        } else {
+          break;
+        }
+      }
+      stack_.push_back(token);
+      break;
+    }
+    case DEREFERENCE_TOKEN: {
+      // TODO: Not supported yet.
+      throw "Not supported yet";
+    }
+    case BRACKET_TOKEN: {
+      if (token.bracket_type() == LEFT_ROUND_BRACKET) {
+        // 開き括弧の直前にダミーがあるのはおかしい.
+        if ((stack_.size() != 0) && (stack_.back().type() == DUMMY_TOKEN)) {
+          throw "Invalid query";
+        }
+        stack_.push_back(token);
+      } else if (token.bracket_type() == RIGHT_ROUND_BRACKET) {
+        // 閉じ括弧の直前にはダミーが必要であり,それより前に開き括弧が必要である.
+        if ((stack_.size() < 2) || (stack_.back().type() != DUMMY_TOKEN)) {
+          throw "Invalid query 1";
+        }
+        // 括弧内にある演算子をすべて解決する.
+        while (stack_.size() >= 2) {
+          ExpressionToken operator_token = stack_[stack_.size() - 2];
+          if (operator_token.type() == UNARY_OPERATOR_TOKEN) {
+            builder_->push_operator(operator_token.operator_type());
+            stack_.pop_back();
+            stack_.pop_back();
+            push_token(ExpressionToken("", DUMMY_TOKEN));
+          } else if (operator_token.type() == BINARY_OPERATOR_TOKEN) {
+            builder_->push_operator(operator_token.operator_type());
+            stack_.pop_back();
+            stack_.pop_back();
+            stack_.pop_back();
+            push_token(ExpressionToken("", DUMMY_TOKEN));
+          } else {
+            break;
+          }
+        }
+        if ((stack_.size() < 2) ||
+            (stack_[stack_.size() - 2].type() != BRACKET_TOKEN) ||
+            (stack_[stack_.size() - 2].bracket_type() != LEFT_ROUND_BRACKET)) {
+          throw "Invalid query";
+        }
+        stack_[stack_.size() - 2] = stack_.back();
+        stack_.pop_back();
+        break;
+      } else {
+        // TODO: Not supported yet ([]).
+        throw "Not supported yet";
+      }
+      break;
+    }
+    default: {
+      throw "Invalid query";
+    }
+  }
 }
 
 }  // namespace impl

  Modified: lib/grnxx/impl/expression.hpp (+28 -12)
===================================================================
--- lib/grnxx/impl/expression.hpp    2015-02-09 18:40:48 +0900 (1264847)
+++ lib/grnxx/impl/expression.hpp    2015-02-13 10:00:43 +0900 (2b67cac)
@@ -206,7 +206,9 @@ class ExpressionBuilder : public ExpressionBuilderInterface {
 };
 
 enum ExpressionTokenType {
-  NODE_TOKEN,
+  DUMMY_TOKEN,
+  CONSTANT_TOKEN,
+  NAME_TOKEN,
   UNARY_OPERATOR_TOKEN,
   BINARY_OPERATOR_TOKEN,
   DEREFERENCE_TOKEN,
@@ -222,20 +224,29 @@ enum ExpressionBracketType {
 
 class ExpressionToken {
  public:
-  ExpressionToken() : type_(NODE_TOKEN), dummy_(0), priority_(0) {}
-  explicit ExpressionToken(ExpressionTokenType token_type)
-      : type_(token_type),
+  ExpressionToken() : string_(), type_(DUMMY_TOKEN), dummy_(0), priority_(0) {}
+  explicit ExpressionToken(const String &string,
+                           ExpressionTokenType token_type)
+      : string_(string),
+        type_(token_type),
         dummy_(0),
         priority_(0) {}
-  explicit ExpressionToken(ExpressionBracketType bracket_type)
-      : type_(BRACKET_TOKEN),
+  explicit ExpressionToken(const String &string,
+                           ExpressionBracketType bracket_type)
+      : string_(string),
+        type_(BRACKET_TOKEN),
         bracket_type_(bracket_type),
         priority_(0) {}
-  explicit ExpressionToken(OperatorType operator_type)
-      : type_(get_operator_token_type(operator_type)),
+  explicit ExpressionToken(const String &string,
+                           OperatorType operator_type)
+      : string_(string),
+        type_(get_operator_token_type(operator_type)),
         operator_type_(operator_type),
         priority_(get_operator_priority(operator_type)) {}
 
+  const String &string() const {
+    return string_;
+  }
   ExpressionTokenType type() const {
     return type_;
   }
@@ -250,6 +261,7 @@ class ExpressionToken {
   }
 
  private:
+  String string_;
   ExpressionTokenType type_;
   union {
     int dummy_;
@@ -265,19 +277,23 @@ class ExpressionToken {
 
 class ExpressionParser {
  public:
-  static std::unique_ptr<ExpressionInterface> parse(const String &query);
+  static std::unique_ptr<ExpressionInterface> parse(
+      const TableInterface *table,
+      const String &query);
 
   ~ExpressionParser() = default;
 
  private:
+  const TableInterface *table_;
   Array<ExpressionToken> tokens_;
   Array<ExpressionToken> stack_;
-  std::unique_ptr<ExpressionBuilder> builder_;
+  std::unique_ptr<ExpressionBuilderInterface> builder_;
 
-  ExpressionParser() = default;
+  explicit ExpressionParser(const TableInterface *table);
 
   void tokenize(const String &query);
-  std::unique_ptr<ExpressionInterface> build();
+  void analyze();
+  void push_token(const ExpressionToken &token);
 };
 
 }  // namespace impl

  Modified: test/test_expression.cpp (+23 -0)
===================================================================
--- test/test_expression.cpp    2015-02-09 18:40:48 +0900 (e2d3ed1)
+++ test/test_expression.cpp    2015-02-13 10:00:43 +0900 (f407fac)
@@ -2748,6 +2748,26 @@ void test_subexpression() {
   }
 }
 
+void test_parser() try {
+  // Create an expression.
+  auto expression = grnxx::Expression::parse(
+      test.table, "Int < Int2 && Float >= Float2");
+
+  auto records = create_input_records();
+
+  grnxx::Array<grnxx::Bool> bool_results;
+  expression->evaluate(records, &bool_results);
+  assert(bool_results.size() == test.table->num_rows());
+  for (size_t i = 0; i < bool_results.size(); ++i) {
+    size_t row_id = records[i].row_id.raw();
+    assert(bool_results[i].match(
+        (test.int_values[row_id] < test.int2_values[row_id]) &
+        (test.float_values[row_id] >= test.float2_values[row_id])));
+  }
+} catch (const char *msg) {
+  std::cerr << "msg: " << msg << std::endl;
+}
+
 void test_sequential_filter() {
   // Create an object for building expressions.
   auto builder = grnxx::ExpressionBuilder::create(test.table);
@@ -2956,6 +2976,9 @@ int main() {
   // Subexpression.
   test_subexpression();
 
+  // Parser.
+  test_parser();
+
   // Test sequential operations.
   test_sequential_filter();
   test_sequential_adjust();
-------------- next part --------------
HTML����������������������������...
다운로드 



More information about the Groonga-commit mailing list
Back to archive index