[Groonga-commit] groonga/groonga at 2fde282 [master] Add functions/index_column plugin

Back to archive index

Kouhei Sutou null+****@clear*****
Mon May 22 22:24:41 JST 2017


Kouhei Sutou	2017-05-22 22:24:41 +0900 (Mon, 22 May 2017)

  New Revision: 2fde28248c238df1d983a9585cb77f53eaf58562
  https://github.com/groonga/groonga/commit/2fde28248c238df1d983a9585cb77f53eaf58562

  Message:
    Add functions/index_column plugin
    
    New selector:
    
      * index_column_df_ratio_between()
    
    New function:
    
      * index_column_df_ratio()

  Added files:
    plugins/functions/index_column.c
    plugins/functions/index_column_sources.am
    test/command/suite/select/function/index_column/index_column_df_ratio_between/and.expected
    test/command/suite/select/function/index_column/index_column_df_ratio_between/and.test
    test/command/suite/select/function/index_column/index_column_df_ratio_between/or.expected
    test/command/suite/select/function/index_column/index_column_df_ratio_between/or.test
  Modified files:
    plugins/functions/CMakeLists.txt
    plugins/functions/Makefile.am

  Modified: plugins/functions/CMakeLists.txt (+22 -1)
===================================================================
--- plugins/functions/CMakeLists.txt    2017-05-22 22:24:07 +0900 (d221576)
+++ plugins/functions/CMakeLists.txt    2017-05-22 22:24:41 +0900 (42314bd)
@@ -1,4 +1,4 @@
-# Copyright(C) 2015-2016 Brazil
+# Copyright(C) 2015-2017 Brazil
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -98,3 +98,24 @@ else()
   install(TARGETS time_functions DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}")
 endif()
 target_link_libraries(time_functions libgroonga)
+
+# functions/index_column plugin.
+# The source list is read from index_column_sources.am into
+# INDEX_COLUMN_SOURCES and compiled with the common C flags.
+read_file_list(
+  ${CMAKE_CURRENT_SOURCE_DIR}/index_column_sources.am
+  INDEX_COLUMN_SOURCES)
+set_source_files_properties(
+  ${INDEX_COLUMN_SOURCES}
+  PROPERTIES COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
+if(GRN_EMBED)
+  # Embedded build: static library built as position independent code.
+  # Nothing is installed for it here.
+  add_library(index_column_functions STATIC ${INDEX_COLUMN_SOURCES})
+  set_target_properties(index_column_functions PROPERTIES
+    POSITION_INDEPENDENT_CODE ON)
+else()
+  # Standalone build: loadable module whose file name is "index_column"
+  # with an empty prefix, installed into the functions plugin directory.
+  add_library(index_column_functions MODULE ${INDEX_COLUMN_SOURCES})
+  set_target_properties(
+    index_column_functions
+    PROPERTIES
+    PREFIX ""
+    OUTPUT_NAME "index_column")
+  install(
+    TARGETS index_column_functions
+    DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}")
+endif()
+target_link_libraries(index_column_functions libgroonga)

  Modified: plugins/functions/Makefile.am (+2 -0)
===================================================================
--- plugins/functions/Makefile.am    2017-05-22 22:24:07 +0900 (179f574)
+++ plugins/functions/Makefile.am    2017-05-22 22:24:41 +0900 (b24947b)
@@ -19,10 +19,12 @@ function_plugins_LTLIBRARIES += vector.la
 function_plugins_LTLIBRARIES += string.la
 function_plugins_LTLIBRARIES += number.la
 function_plugins_LTLIBRARIES += time.la
+function_plugins_LTLIBRARIES += index_column.la
 
 include vector_sources.am
 include string_sources.am
 include number_sources.am
 include time_sources.am
+include index_column_sources.am
 
 number_la_LIBADD = -lm

  Added: plugins/functions/index_column.c (+266 -0) 100644
===================================================================
--- /dev/null
+++ plugins/functions/index_column.c    2017-05-22 22:24:41 +0900 (acb7355)
@@ -0,0 +1,266 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2017 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#ifdef GRN_EMBEDDED
+/* Per-plugin tag for the embedded build. It must name this plugin
+   (functions/index_column); reusing another plugin's tag such as
+   functions_time would make both plugins define the same tagged symbols. */
+#  define GRN_PLUGIN_FUNCTION_TAG functions_index_column
+#endif
+
+#include <groonga/plugin.h>
+
+/*
+ * Computes the document frequency ratio of `term_id` in `ii`:
+ * grn_ii_estimate_size() clamped to `n_documents`, divided by
+ * `n_documents`. The clamp keeps the ratio from exceeding 1.0.
+ *
+ * When `n_documents` is 0 the result is 0.0 / 0.0 (NaN on IEEE 754
+ * platforms), so a range comparison against it never matches.
+ */
+static double
+index_column_compute_df_ratio(grn_ctx *ctx,
+                              grn_ii *ii,
+                              grn_id term_id,
+                              unsigned int n_documents)
+{
+  uint32_t n_match_documents;
+
+  n_match_documents = grn_ii_estimate_size(ctx, ii, term_id);
+  if (n_match_documents > n_documents) {
+    n_match_documents = n_documents;
+  }
+  return (double)n_match_documents / (double)n_documents;
+}
+
+/*
+ * Selector for index_column_df_ratio_between(index_column, min, max).
+ *
+ * Adds to `res` the terms whose document frequency ratio in
+ * `index_column` satisfies min <= ratio <= max.
+ *
+ * args[0] is not read here. args[1] must be an index column. args[2]
+ * and args[3] are read with GRN_FLOAT_VALUE().
+ * NOTE(review): args[2]/args[3] are not type checked; presumably the
+ * caller always passes Float values -- confirm.
+ *
+ * With GRN_OP_AND, each key of `res` is read as a term ID, only those
+ * terms are tested, and grn_ii_resolve_sel_and() is called afterwards.
+ * With any other operator, every record of `table` is tested.
+ *
+ * Returns GRN_SUCCESS, or ctx->rc after reporting GRN_INVALID_ARGUMENT
+ * for a wrong argument count or a non index column argument.
+ */
+static grn_rc
+selector_index_column_df_ratio_between(grn_ctx *ctx,
+                                       grn_obj *table,
+                                       grn_obj *index,
+                                       int n_args,
+                                       grn_obj **args,
+                                       grn_obj *res,
+                                       grn_operator op)
+{
+  grn_rc rc = GRN_SUCCESS;
+  grn_obj *index_column;
+  grn_ii *ii;
+  double min;
+  double max;
+  grn_obj *source_table;
+  unsigned int n_documents;
+  grn_posting posting;
+
+  if ((n_args - 1) != 3) {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "index_column_df_ratio_between(): "
+                     "wrong number of arguments (%d for 3)", n_args - 1);
+    rc = ctx->rc;
+    goto exit;
+  }
+
+  index_column = args[1];
+  /* The object is cast to grn_ii below, so reject anything that is not
+     an index column instead of reading it as one. */
+  if (!grn_obj_is_index_column(ctx, index_column)) {
+    grn_obj inspected;
+    GRN_TEXT_INIT(&inspected, 0);
+    grn_inspect(ctx, &inspected, index_column);
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "index_column_df_ratio_between(): "
+                     "the first argument must be index column: %.*s",
+                     (int)GRN_TEXT_LEN(&inspected),
+                     GRN_TEXT_VALUE(&inspected));
+    GRN_OBJ_FIN(ctx, &inspected);
+    rc = ctx->rc;
+    goto exit;
+  }
+  ii = (grn_ii *)index_column;
+  min = GRN_FLOAT_VALUE(args[2]);
+  max = GRN_FLOAT_VALUE(args[3]);
+
+  /* The number of records in the index column's range table is the
+     denominator of every ratio. */
+  source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column));
+  n_documents = grn_table_size(ctx, source_table);
+  memset(&posting, 0, sizeof(grn_posting));
+  posting.sid = 1;
+
+  if (op == GRN_OP_AND) {
+    GRN_TABLE_EACH_BEGIN(ctx, res, cursor, record_id) {
+      void *key;
+      grn_id term_id;
+      double df_ratio;
+
+      grn_table_cursor_get_key(ctx, cursor, &key);
+      term_id = *(grn_id *)key;
+      df_ratio = index_column_compute_df_ratio(ctx, ii, term_id, n_documents);
+      if (min <= df_ratio && df_ratio <= max) {
+        posting.rid = term_id;
+        grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op);
+      }
+    } GRN_TABLE_EACH_END(ctx, cursor);
+    grn_ii_resolve_sel_and(ctx, (grn_hash *)res, op);
+  } else {
+    GRN_TABLE_EACH_BEGIN(ctx, table, cursor, term_id) {
+      double df_ratio;
+
+      df_ratio = index_column_compute_df_ratio(ctx, ii, term_id, n_documents);
+      if (min <= df_ratio && df_ratio <= max) {
+        posting.rid = term_id;
+        grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op);
+      }
+    } GRN_TABLE_EACH_END(ctx, cursor);
+  }
+
+exit :
+  return rc;
+}
+
+/*
+ * Implements index_column_df_ratio(index_column_name).
+ *
+ * The target term is taken from the first variable of the caller
+ * expression. `index_column_name` (args[0], a text family bulk) is
+ * resolved as a column of the term's table and must be an index column.
+ *
+ * Returns a Float bulk allocated with grn_plugin_proc_alloc() that holds
+ * grn_ii_estimate_size() for the term, clamped to the number of records
+ * in the index column's range table, divided by that number.
+ *
+ * Returns NULL after reporting GRN_INVALID_ARGUMENT when the argument
+ * count is not 1, when there is no caller expression or target record
+ * variable, when args[0] is not text, or when the name does not resolve
+ * to an index column. Also returns NULL when the result bulk cannot be
+ * allocated.
+ */
+static grn_obj *
+func_index_column_df_ratio(grn_ctx *ctx,
+                           int n_args,
+                           grn_obj **args,
+                           grn_user_data *user_data)
+{
+  grn_obj *term_table;
+  grn_obj *index_column_name;
+  grn_obj *index_column;
+  grn_ii *ii;
+  grn_id term_id;
+
+  if (n_args != 1) {
+    /* args[] holds only the user arguments here (args[0] is the name),
+       so the count to report is n_args itself. */
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "index_column_df_ratio(): "
+                     "wrong number of arguments (%d for 1)", n_args);
+    return NULL;
+  }
+
+  {
+    grn_obj *expr;
+    grn_obj *variable;
+
+    expr = grn_plugin_proc_get_caller(ctx, user_data);
+    if (!expr) {
+      GRN_PLUGIN_ERROR(ctx,
+                       GRN_INVALID_ARGUMENT,
+                       "index_column_df_ratio(): "
+                       "called directly");
+      return NULL;
+    }
+
+    variable = grn_expr_get_var_by_offset(ctx, expr, 0);
+    if (!variable) {
+      GRN_PLUGIN_ERROR(ctx,
+                       GRN_INVALID_ARGUMENT,
+                       "index_column_df_ratio(): "
+                       "caller expression must have target record information");
+      return NULL;
+    }
+
+    term_table = grn_ctx_at(ctx, variable->header.domain);
+    term_id = GRN_RECORD_VALUE(variable);
+    /* While the table's key type is itself a table, replace the record
+       ID by its key (an ID in the key table) and continue with the key
+       table. Presumably this unwraps result set tables -- confirm. */
+    while (GRN_TRUE) {
+      grn_obj *key_type;
+
+      key_type = grn_ctx_at(ctx, term_table->header.domain);
+      if (!grn_obj_is_table(ctx, key_type)) {
+        break;
+      }
+
+      grn_table_get_key(ctx, term_table, term_id, &term_id, sizeof(grn_id));
+      term_table = key_type;
+    }
+  }
+
+  index_column_name = args[0];
+  if (!grn_obj_is_text_family_bulk(ctx, index_column_name)) {
+    grn_obj inspected;
+    GRN_TEXT_INIT(&inspected, 0);
+    grn_inspect(ctx, &inspected, index_column_name);
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "index_column_df_ratio(): "
+                     "the first argument must be index column name: %.*s",
+                     (int)GRN_TEXT_LEN(&inspected),
+                     GRN_TEXT_VALUE(&inspected));
+    GRN_OBJ_FIN(ctx, &inspected);
+    return NULL;
+  }
+
+  index_column = grn_obj_column(ctx,
+                                term_table,
+                                GRN_TEXT_VALUE(index_column_name),
+                                GRN_TEXT_LEN(index_column_name));
+  if (!index_column) {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "index_column_df_ratio(): "
+                     "nonexistent object: <%.*s>",
+                     (int)GRN_TEXT_LEN(index_column_name),
+                     GRN_TEXT_VALUE(index_column_name));
+    return NULL;
+  }
+
+  if (!grn_obj_is_index_column(ctx, index_column)) {
+    grn_obj inspected;
+    GRN_TEXT_INIT(&inspected, 0);
+    grn_inspect(ctx, &inspected, index_column);
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "index_column_df_ratio(): "
+                     "the first argument must be index column: %.*s",
+                     (int)GRN_TEXT_LEN(&inspected),
+                     GRN_TEXT_VALUE(&inspected));
+    GRN_OBJ_FIN(ctx, &inspected);
+    /* The name may have resolved to an accessor; release it before
+       returning. */
+    if (grn_obj_is_accessor(ctx, index_column)) {
+      grn_obj_unlink(ctx, index_column);
+    }
+    return NULL;
+  }
+
+  ii = (grn_ii *)index_column;
+
+  {
+    grn_obj *source_table;
+    unsigned int n_documents;
+    uint32_t n_match_documents;
+    double df_ratio;
+    grn_obj *df_ratio_value;
+
+    source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column));
+    n_documents = grn_table_size(ctx, source_table);
+    n_match_documents = grn_ii_estimate_size(ctx, ii, term_id);
+    /* Clamp so that the ratio never exceeds 1.0. */
+    if (n_match_documents > n_documents) {
+      n_match_documents = n_documents;
+    }
+    df_ratio = (double)n_match_documents / (double)n_documents;
+
+    df_ratio_value = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_FLOAT, 0);
+    if (!df_ratio_value) {
+      return NULL;
+    }
+    GRN_FLOAT_SET(ctx, df_ratio_value, df_ratio);
+    return df_ratio_value;
+  }
+}
+
+/* Plugin initialization entry point: performs no setup and returns the
+   context's current return code. */
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+  return ctx->rc;
+}
+
+/*
+ * Plugin registration entry point. Creates two procs:
+ *
+ *   * index_column_df_ratio_between: created without a function body;
+ *     selector_index_column_df_ratio_between is attached as its selector
+ *     and its selector operator is set to GRN_OP_NOP.
+ *   * index_column_df_ratio: backed by func_index_column_df_ratio.
+ *
+ * Returns the context's return code after both registrations.
+ */
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+  const char *selector_name = "index_column_df_ratio_between";
+  const char *function_name = "index_column_df_ratio";
+  grn_obj *selector;
+
+  selector = grn_proc_create(ctx, selector_name, -1, GRN_PROC_FUNCTION,
+                             NULL, NULL, NULL, 0, NULL);
+  grn_proc_set_selector(ctx, selector, selector_index_column_df_ratio_between);
+  grn_proc_set_selector_operator(ctx, selector, GRN_OP_NOP);
+
+  grn_proc_create(ctx, function_name, -1, GRN_PROC_FUNCTION,
+                  func_index_column_df_ratio, NULL, NULL, 0, NULL);
+
+  return ctx->rc;
+}
+
+/* Plugin finalization entry point: nothing is released here, so it
+   always returns GRN_SUCCESS. */
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+  return GRN_SUCCESS;
+}

  Added: plugins/functions/index_column_sources.am (+2 -0) 100644
===================================================================
--- /dev/null
+++ plugins/functions/index_column_sources.am    2017-05-22 22:24:41 +0900 (261907b)
@@ -0,0 +1,2 @@
+index_column_la_SOURCES =			\
+	index_column.c

  Added: test/command/suite/select/function/index_column/index_column_df_ratio_between/and.expected (+96 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/index_column/index_column_df_ratio_between/and.expected    2017-05-22 22:24:41 +0900 (3738cfd)
@@ -0,0 +1,96 @@
+plugin_register functions/index_column
+[[0,0.0,0.0],true]
+table_create Memos TABLE_HASH_KEY ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+column_create Terms index COLUMN_INDEX|WITH_POSITION Memos _key
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"_key": "Groonga is a fast full text search engine."},
+{"_key": "Mroonga is a MySQL storage engine based on Groonga."},
+{"_key": "Mroonga provides fast full text search feature to MySQL."},
+{"_key": "Rroonga is a Ruby bindings for Groonga."},
+{"_key": "Ruby"}
+]
+[[0,0.0,0.0],5]
+select Terms   --filter 'true && index_column_df_ratio_between(index, 0.1, 0.9)'   --limit -1   --sort_keys _id   --columns[df_ratio].stage output   --columns[df_ratio].type Float   --columns[df_ratio].value 'index_column_df_ratio("index")'   --output_columns '_id, _key, df_ratio'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        10
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "_key",
+          "ShortText"
+        ],
+        [
+          "df_ratio",
+          "Float"
+        ]
+      ],
+      [
+        10,
+        "mroonga",
+        0.8
+      ],
+      [
+        12,
+        "storage",
+        0.2
+      ],
+      [
+        13,
+        "based",
+        0.2
+      ],
+      [
+        14,
+        "on",
+        0.2
+      ],
+      [
+        15,
+        "provides",
+        0.2
+      ],
+      [
+        16,
+        "feature",
+        0.2
+      ],
+      [
+        17,
+        "to",
+        0.2
+      ],
+      [
+        18,
+        "rroonga",
+        0.2
+      ],
+      [
+        20,
+        "bindings",
+        0.2
+      ],
+      [
+        21,
+        "for",
+        0.2
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/function/index_column/index_column_df_ratio_between/and.test (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/index_column/index_column_df_ratio_between/and.test    2017-05-22 22:24:41 +0900 (e799bbf)
@@ -0,0 +1,26 @@
+plugin_register functions/index_column
+
+table_create Memos TABLE_HASH_KEY ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto
+column_create Terms index COLUMN_INDEX|WITH_POSITION Memos _key
+
+load --table Memos
+[
+{"_key": "Groonga is a fast full text search engine."},
+{"_key": "Mroonga is a MySQL storage engine based on Groonga."},
+{"_key": "Mroonga provides fast full text search feature to MySQL."},
+{"_key": "Rroonga is a Ruby bindings for Groonga."},
+{"_key": "Ruby"}
+]
+
+select Terms \
+  --filter 'true && index_column_df_ratio_between(index, 0.1, 0.9)' \
+  --limit -1 \
+  --sort_keys _id \
+  --columns[df_ratio].stage output \
+  --columns[df_ratio].type Float \
+  --columns[df_ratio].value 'index_column_df_ratio("index")' \
+  --output_columns '_id, _key, df_ratio'

  Added: test/command/suite/select/function/index_column/index_column_df_ratio_between/or.expected (+101 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/index_column/index_column_df_ratio_between/or.expected    2017-05-22 22:24:41 +0900 (839b7f5)
@@ -0,0 +1,101 @@
+plugin_register functions/index_column
+[[0,0.0,0.0],true]
+table_create Memos TABLE_HASH_KEY ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+column_create Terms index COLUMN_INDEX|WITH_POSITION Memos _key
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"_key": "Groonga is a fast full text search engine."},
+{"_key": "Mroonga is a MySQL storage engine based on Groonga."},
+{"_key": "Mroonga provides fast full text search feature to MySQL."},
+{"_key": "Rroonga is a Ruby bindings for Groonga."},
+{"_key": "Ruby"}
+]
+[[0,0.0,0.0],5]
+select Terms   --filter '_key == "Ruby" || index_column_df_ratio_between(index, 0.1, 0.9)'   --limit -1   --sort_keys _id   --columns[df_ratio].stage output   --columns[df_ratio].type Float   --columns[df_ratio].value 'index_column_df_ratio("index")'   --output_columns '_id, _key, df_ratio'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        11
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "_key",
+          "ShortText"
+        ],
+        [
+          "df_ratio",
+          "Float"
+        ]
+      ],
+      [
+        10,
+        "mroonga",
+        0.8
+      ],
+      [
+        12,
+        "storage",
+        0.2
+      ],
+      [
+        13,
+        "based",
+        0.2
+      ],
+      [
+        14,
+        "on",
+        0.2
+      ],
+      [
+        15,
+        "provides",
+        0.2
+      ],
+      [
+        16,
+        "feature",
+        0.2
+      ],
+      [
+        17,
+        "to",
+        0.2
+      ],
+      [
+        18,
+        "rroonga",
+        0.2
+      ],
+      [
+        19,
+        "ruby",
+        1.0
+      ],
+      [
+        20,
+        "bindings",
+        0.2
+      ],
+      [
+        21,
+        "for",
+        0.2
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/function/index_column/index_column_df_ratio_between/or.test (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/index_column/index_column_df_ratio_between/or.test    2017-05-22 22:24:41 +0900 (5f82cab)
@@ -0,0 +1,26 @@
+plugin_register functions/index_column
+
+table_create Memos TABLE_HASH_KEY ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto
+column_create Terms index COLUMN_INDEX|WITH_POSITION Memos _key
+
+load --table Memos
+[
+{"_key": "Groonga is a fast full text search engine."},
+{"_key": "Mroonga is a MySQL storage engine based on Groonga."},
+{"_key": "Mroonga provides fast full text search feature to MySQL."},
+{"_key": "Rroonga is a Ruby bindings for Groonga."},
+{"_key": "Ruby"}
+]
+
+select Terms \
+  --filter '_key == "Ruby" || index_column_df_ratio_between(index, 0.1, 0.9)' \
+  --limit -1 \
+  --sort_keys _id \
+  --columns[df_ratio].stage output \
+  --columns[df_ratio].type Float \
+  --columns[df_ratio].value 'index_column_df_ratio("index")' \
+  --output_columns '_id, _key, df_ratio'
-------------- next part --------------
HTMLの添付ファイルを保管しました...
다운로드 



More information about the Groonga-commit mailing list
Back to archive index