[Groonga-commit] groonga/groonga at 6d994a6 [master] TokenRegexp: don't search overlapped tokens

Kouhei Sutou null+****@clear*****
Tue Apr 7 23:47:15 JST 2015


Kouhei Sutou	2015-04-07 23:47:15 +0900 (Tue, 07 Apr 2015)

  New Revision: 6d994a6f3413bef6800d3f3e5b8a40aa326b473c
  https://github.com/groonga/groonga/commit/6d994a6f3413bef6800d3f3e5b8a40aa326b473c

  Message:
    TokenRegexp: don't search overlapped tokens
    
    They are needless.

  Added files:
    test/command/suite/select/filter/index/regexp/long.expected
    test/command/suite/select/filter/index/regexp/long.test
    test/command/suite/tokenizers/regexp/get/long.expected
    test/command/suite/tokenizers/regexp/get/long.test
  Modified files:
    lib/tokenizers.c
    test/command/suite/tokenizers/regexp/get/end/four.expected

  Modified: lib/tokenizers.c (+9 -0)
===================================================================
--- lib/tokenizers.c    2015-04-07 22:55:41 +0900 (ea85cc6)
+++ lib/tokenizers.c    2015-04-07 23:47:15 +0900 (8ed0b8c)
@@ -475,6 +475,7 @@ typedef struct {
   struct {
     grn_bool have_begin;
     grn_bool have_end;
+    int32_t n_skip_tokens;
   } get;
   grn_bool is_begin;
   grn_bool is_end;
@@ -513,6 +514,7 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   tokenizer->get.have_begin = GRN_FALSE;
   tokenizer->get.have_end   = GRN_FALSE;
+  tokenizer->get.n_skip_tokens = 0;
 
   tokenizer->is_begin = GRN_TRUE;
   tokenizer->is_end   = GRN_FALSE;
@@ -681,6 +683,13 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
           status |= GRN_TOKEN_FORCE_PREFIX;
         }
       }
+    } else {
+      if (tokenizer->get.n_skip_tokens > 0) {
+        tokenizer->get.n_skip_tokens--;
+        status |= GRN_TOKEN_SKIP;
+      } else {
+        tokenizer->get.n_skip_tokens = ngram_unit - 1;
+      }
     }
   } else {
     if (tokenizer->next == end) {
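
The new else branch implements a simple counter cycle for GET mode: after
one token is searched, the next ngram_unit - 1 tokens are flagged with
GRN_TOKEN_SKIP instead of being looked up. Because the lexicon index
stores positions (WITH_POSITION), matching only the non-overlapping
bigrams at the right offsets already pins down the query string, so the
overlapped bigrams in between add no information. A minimal standalone
sketch of the cycle (plain C; NGRAM_UNIT and the fixed token list are
hypothetical stand-ins, not Groonga's tokenizer API):

    #include <stdio.h>

    /* NGRAM_UNIT stands in for the tokenizer's ngram_unit (2 for bigrams). */
    #define NGRAM_UNIT 2

    int
    main(void)
    {
      /* Bigrams of "abcdefghijk"; the real tokenizer produces these lazily. */
      const char *tokens[] = {
        "ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk"
      };
      int n_tokens = sizeof(tokens) / sizeof(tokens[0]);
      int n_skip_tokens = 0;  /* mirrors tokenizer->get.n_skip_tokens */
      int i;

      for (i = 0; i < n_tokens; i++) {
        if (n_skip_tokens > 0) {
          n_skip_tokens--;
          printf("%s: skip\n", tokens[i]);  /* would get GRN_TOKEN_SKIP */
        } else {
          n_skip_tokens = NGRAM_UNIT - 1;
          printf("%s: search (position %d)\n", tokens[i], i);
        }
      }
      return 0;
    }

Note that this sketch would also skip the final "jk"; the real tokenizer
reaches the last token through the end-of-input branch (the outer else in
the hunk above), which is apparently why "jk" still appears at position 9
in the GET output of the new long.expected test below.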

  Added: test/command/suite/select/filter/index/regexp/long.expected (+52 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/long.expected    2015-04-07 23:47:15 +0900 (e3a3a5e)
@@ -0,0 +1,52 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Mroonga"},
+{"content": "Rroonga and Ruby"}
+]
+[[0,0.0,0.0],3]
+select Memos --filter 'content @~ "roonga"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        3
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "Text"
+        ]
+      ],
+      [
+        1,
+        "Groonga"
+      ],
+      [
+        2,
+        "Mroonga"
+      ],
+      [
+        3,
+        "Rroonga and Ruby"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/filter/index/regexp/long.test (+16 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/long.test    2015-04-07 23:47:15 +0900 (758139d)
@@ -0,0 +1,16 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Memos content
+
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Mroonga"},
+{"content": "Rroonga and Ruby"}
+]
+
+select Memos --filter 'content @~ "roonga"'

  Modified: test/command/suite/tokenizers/regexp/get/end/four.expected (+0 -4)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/four.expected    2015-04-07 22:55:41 +0900 (b621183)
+++ test/command/suite/tokenizers/regexp/get/end/four.expected    2015-04-07 23:47:15 +0900 (ad58a34)
@@ -47,10 +47,6 @@ table_tokenize Lexicon "abcd\\z" --mode GET
       "position": 0
     },
     {
-      "value": "bc",
-      "position": 1
-    },
-    {
       "value": "cd",
       "position": 2
     },

  Added: test/command/suite/tokenizers/regexp/get/long.expected (+98 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/long.expected    2015-04-07 23:47:15 +0900 (d9023b0)
@@ -0,0 +1,98 @@
+table_create Lexicon TABLE_PAT_KEY ShortText   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "abcdefghijk" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "￯",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 1
+    },
+    {
+      "value": "bc",
+      "position": 2
+    },
+    {
+      "value": "cd",
+      "position": 3
+    },
+    {
+      "value": "de",
+      "position": 4
+    },
+    {
+      "value": "ef",
+      "position": 5
+    },
+    {
+      "value": "fg",
+      "position": 6
+    },
+    {
+      "value": "gh",
+      "position": 7
+    },
+    {
+      "value": "hi",
+      "position": 8
+    },
+    {
+      "value": "ij",
+      "position": 9
+    },
+    {
+      "value": "jk",
+      "position": 10
+    },
+    {
+      "value": "k",
+      "position": 11
+    },
+    {
+      "value": "￰",
+      "position": 12
+    }
+  ]
+]
+table_tokenize Lexicon "abcdefghijk" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0
+    },
+    {
+      "value": "cd",
+      "position": 2
+    },
+    {
+      "value": "ef",
+      "position": 4
+    },
+    {
+      "value": "gh",
+      "position": 6
+    },
+    {
+      "value": "ij",
+      "position": 8
+    },
+    {
+      "value": "jk",
+      "position": 9
+    }
+  ]
+]
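
The GET result above is exactly what a position-aware search needs: if
"ab" occurs at some document position p, "cd" at p + 2, "ef" at p + 4,
and so on through "jk" at p + 9, the document necessarily contains
"abcdefghijk"; the overlapped bigrams "bc", "de", ... at the odd offsets
would add nothing. A hedged sketch of such a check over a toy posting
list (plain C; the posting type and the sample positions are made up for
illustration, not Groonga's matching code):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy posting entry: one searched query token, its offset in the
       query, and the (sorted) positions where it occurs in a document.
       All of this is made up for illustration. */
    typedef struct {
      const char *token;
      int query_offset;  /* the "position" field in the GET output */
      const int *doc_positions;
      int n_doc_positions;
    } posting;

    static bool
    occurs_at(const posting *p, int pos)
    {
      int i;
      for (i = 0; i < p->n_doc_positions; i++) {
        if (p->doc_positions[i] == pos) {
          return true;
        }
      }
      return false;
    }

    /* True if every searched token occurs at base + its query offset. */
    static bool
    phrase_match(const posting *ps, int n, int base)
    {
      int i;
      for (i = 0; i < n; i++) {
        if (!occurs_at(&ps[i], base + ps[i].query_offset)) {
          return false;
        }
      }
      return true;
    }

    int
    main(void)
    {
      /* Document "xxabcdefghijk": every bigram starts two positions
         later than in the query itself. */
      static const int p_ab[] = {2}, p_cd[] = {4}, p_ef[] = {6},
                       p_gh[] = {8}, p_ij[] = {10}, p_jk[] = {11};
      posting ps[] = {
        {"ab", 0, p_ab, 1}, {"cd", 2, p_cd, 1}, {"ef", 4, p_ef, 1},
        {"gh", 6, p_gh, 1}, {"ij", 8, p_ij, 1}, {"jk", 9, p_jk, 1},
      };
      printf("match at 2: %d\n", phrase_match(ps, 6, 2));  /* prints 1 */
      printf("match at 0: %d\n", phrase_match(ps, 6, 0));  /* prints 0 */
      return 0;
    }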

  Added: test/command/suite/tokenizers/regexp/get/long.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/long.test    2015-04-07 23:47:15 +0900 (b3b2f0d)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "abcdefghijk" --mode ADD
+
+table_tokenize Lexicon "abcdefghijk" --mode GET