Kouhei Sutou
null+****@clear*****
Mon May 22 22:24:41 JST 2017
Kouhei Sutou 2017-05-22 22:24:41 +0900 (Mon, 22 May 2017) New Revision: 2fde28248c238df1d983a9585cb77f53eaf58562 https://github.com/groonga/groonga/commit/2fde28248c238df1d983a9585cb77f53eaf58562 Message: Add functions/index_column plugin New selector: * index_column_df_ratio_between() New function: * index_column_df_ratio() Added files: plugins/functions/index_column.c plugins/functions/index_column_sources.am test/command/suite/select/function/index_column/index_column_df_ratio_between/and.expected test/command/suite/select/function/index_column/index_column_df_ratio_between/and.test test/command/suite/select/function/index_column/index_column_df_ratio_between/or.expected test/command/suite/select/function/index_column/index_column_df_ratio_between/or.test Modified files: plugins/functions/CMakeLists.txt plugins/functions/Makefile.am Modified: plugins/functions/CMakeLists.txt (+22 -1) =================================================================== --- plugins/functions/CMakeLists.txt 2017-05-22 22:24:07 +0900 (d221576) +++ plugins/functions/CMakeLists.txt 2017-05-22 22:24:41 +0900 (42314bd) @@ -1,4 +1,4 @@ -# Copyright(C) 2015-2016 Brazil +# Copyright(C) 2015-2017 Brazil # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -98,3 +98,24 @@ else() install(TARGETS time_functions DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}") endif() target_link_libraries(time_functions libgroonga) + +read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/index_column_sources.am + INDEX_COLUMN_SOURCES) +set_source_files_properties(${INDEX_COLUMN_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") +if(GRN_EMBED) + add_library(index_column_functions STATIC ${INDEX_COLUMN_SOURCES}) + set_target_properties( + index_column_functions + PROPERTIES + POSITION_INDEPENDENT_CODE ON) +else() + add_library(index_column_functions MODULE ${INDEX_COLUMN_SOURCES}) + set_target_properties(index_column_functions PROPERTIES + 
PREFIX "" + OUTPUT_NAME "index_column") + install(TARGETS index_column_functions + DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}") +endif() +target_link_libraries(index_column_functions libgroonga) Modified: plugins/functions/Makefile.am (+2 -0) =================================================================== --- plugins/functions/Makefile.am 2017-05-22 22:24:07 +0900 (179f574) +++ plugins/functions/Makefile.am 2017-05-22 22:24:41 +0900 (b24947b) @@ -19,10 +19,12 @@ function_plugins_LTLIBRARIES += vector.la function_plugins_LTLIBRARIES += string.la function_plugins_LTLIBRARIES += number.la function_plugins_LTLIBRARIES += time.la +function_plugins_LTLIBRARIES += index_column.la include vector_sources.am include string_sources.am include number_sources.am include time_sources.am +include index_column_sources.am number_la_LIBADD = -lm Added: plugins/functions/index_column.c (+266 -0) 100644 =================================================================== --- /dev/null +++ plugins/functions/index_column.c 2017-05-22 22:24:41 +0900 (acb7355) @@ -0,0 +1,266 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2017 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG functions_time +#endif + +#include <groonga/plugin.h> + +static grn_rc +selector_index_column_df_ratio_between(grn_ctx *ctx, + grn_obj *table, + grn_obj *index, + int n_args, + grn_obj **args, + grn_obj *res, + grn_operator op) +{ + grn_rc rc = GRN_SUCCESS; + grn_obj *index_column; + grn_ii *ii; + double min; + double max; + grn_obj *source_table; + unsigned int n_documents; + grn_posting posting; + + if ((n_args - 1) != 3) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio_between(): " + "wrong number of arguments (%d for 3)", n_args - 1); + rc = ctx->rc; + goto exit; + } + + index_column = args[1]; + ii = (grn_ii *)index_column; + min = GRN_FLOAT_VALUE(args[2]); + max = GRN_FLOAT_VALUE(args[3]); + + source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column)); + n_documents = grn_table_size(ctx, source_table); + memset(&posting, 0, sizeof(grn_posting)); + posting.sid = 1; + + if (op == GRN_OP_AND) { + GRN_TABLE_EACH_BEGIN(ctx, res, cursor, record_id) { + void *key; + grn_id term_id; + uint32_t n_match_documents; + double df_ratio; + + grn_table_cursor_get_key(ctx, cursor, &key); + term_id = *(grn_id *)key; + n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); + if (n_match_documents > n_documents) { + n_match_documents = n_documents; + } + df_ratio = (double)n_match_documents / (double)n_documents; + if (min <= df_ratio && df_ratio <= max) { + posting.rid = term_id; + grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op); + } + } GRN_TABLE_EACH_END(ctx, cursor); + grn_ii_resolve_sel_and(ctx, (grn_hash *)res, op); + } else { + GRN_TABLE_EACH_BEGIN(ctx, table, cursor, term_id) { + uint32_t n_match_documents; + double df_ratio; + + n_match_documents = 
grn_ii_estimate_size(ctx, ii, term_id); + if (n_match_documents > n_documents) { + n_match_documents = n_documents; + } + df_ratio = (double)n_match_documents / (double)n_documents; + { + void *key; + int key_size; + key_size = grn_table_cursor_get_key(ctx, cursor, &key); + } + if (min <= df_ratio && df_ratio <= max) { + posting.rid = term_id; + grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op); + } + } GRN_TABLE_EACH_END(ctx, cursor); + } + +exit : + return rc; +} + +static grn_obj * +func_index_column_df_ratio(grn_ctx *ctx, + int n_args, + grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *term_table; + grn_obj *index_column_name; + grn_obj *index_column; + grn_ii *ii; + grn_id term_id; + + if (n_args != 1) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "wrong number of arguments (%d for 1)", n_args - 1); + return NULL; + } + + { + grn_obj *expr; + grn_obj *variable; + + expr = grn_plugin_proc_get_caller(ctx, user_data); + if (!expr) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "called directly"); + return NULL; + } + + variable = grn_expr_get_var_by_offset(ctx, expr, 0); + if (!variable) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "caller expression must have target record information"); + return NULL; + } + + term_table = grn_ctx_at(ctx, variable->header.domain); + term_id = GRN_RECORD_VALUE(variable); + while (GRN_TRUE) { + grn_obj *key_type; + + key_type = grn_ctx_at(ctx, term_table->header.domain); + if (!grn_obj_is_table(ctx, key_type)) { + break; + } + + grn_table_get_key(ctx, term_table, term_id, &term_id, sizeof(grn_id)); + term_table = key_type; + } + } + + index_column_name = args[0]; + if (!grn_obj_is_text_family_bulk(ctx, index_column_name)) { + grn_obj inspected; + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, index_column_name); + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "the 
first argument must be index column name: %.*s", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + + index_column = grn_obj_column(ctx, + term_table, + GRN_TEXT_VALUE(index_column_name), + GRN_TEXT_LEN(index_column_name)); + if (!index_column) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "nonexistent object: <%.*s>", + (int)GRN_TEXT_LEN(index_column_name), + GRN_TEXT_VALUE(index_column_name)); + return NULL; + } + + if (!grn_obj_is_index_column(ctx, index_column)) { + grn_obj inspected; + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, index_column); + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "the first argument must be index column: %.*s", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + if (grn_obj_is_accessor(ctx, index_column)) { + grn_obj_unlink(ctx, index_column); + } + return NULL; + } + + ii = (grn_ii *)index_column; + + { + grn_obj *source_table; + unsigned int n_documents; + uint32_t n_match_documents; + double df_ratio; + grn_obj *df_ratio_value; + + source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column)); + n_documents = grn_table_size(ctx, source_table); + n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); + if (n_match_documents > n_documents) { + n_match_documents = n_documents; + } + df_ratio = (double)n_match_documents / (double)n_documents; + + df_ratio_value = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_FLOAT, 0); + if (!df_ratio_value) { + return NULL; + } + GRN_FLOAT_SET(ctx, df_ratio_value, df_ratio); + return df_ratio_value; + } +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_obj *selector_proc; + + selector_proc = grn_proc_create(ctx, "index_column_df_ratio_between", -1, + GRN_PROC_FUNCTION, + NULL, NULL, NULL, 0, NULL); + 
grn_proc_set_selector(ctx, selector_proc, + selector_index_column_df_ratio_between); + grn_proc_set_selector_operator(ctx, selector_proc, GRN_OP_NOP); + + grn_proc_create(ctx, "index_column_df_ratio", -1, + GRN_PROC_FUNCTION, + func_index_column_df_ratio, NULL, NULL, 0, NULL); + + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} Added: plugins/functions/index_column_sources.am (+2 -0) 100644 =================================================================== --- /dev/null +++ plugins/functions/index_column_sources.am 2017-05-22 22:24:41 +0900 (261907b) @@ -0,0 +1,2 @@ +index_column_la_SOURCES = \ + index_column.c Added: test/command/suite/select/function/index_column/index_column_df_ratio_between/and.expected (+96 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/index_column/index_column_df_ratio_between/and.expected 2017-05-22 22:24:41 +0900 (3738cfd) @@ -0,0 +1,96 @@ +plugin_register functions/index_column +[[0,0.0,0.0],true] +table_create Memos TABLE_HASH_KEY ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto +[[0,0.0,0.0],true] +column_create Terms index COLUMN_INDEX|WITH_POSITION Memos _key +[[0,0.0,0.0],true] +load --table Memos +[ +{"_key": "Groonga is a fast full text search engine."}, +{"_key": "Mroonga is a MySQL storage engine based on Groonga."}, +{"_key": "Mroonga provides fast full text search feature to MySQL."}, +{"_key": "Rroonga is a Ruby bindings for Groonga."}, +{"_key": "Ruby"} +] +[[0,0.0,0.0],5] +select Terms --filter 'true && index_column_df_ratio_between(index, 0.1, 0.9)' --limit -1 --sort_keys _id --columns[df_ratio].stage output --columns[df_ratio].type Float --columns[df_ratio].value 'index_column_df_ratio("index")' --output_columns '_id, _key, df_ratio' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 10 + ], + [ + [ + "_id", + "UInt32" + ], 
+ [ + "_key", + "ShortText" + ], + [ + "df_ratio", + "Float" + ] + ], + [ + 10, + "mroonga", + 0.8 + ], + [ + 12, + "storage", + 0.2 + ], + [ + 13, + "based", + 0.2 + ], + [ + 14, + "on", + 0.2 + ], + [ + 15, + "provides", + 0.2 + ], + [ + 16, + "feature", + 0.2 + ], + [ + 17, + "to", + 0.2 + ], + [ + 18, + "rroonga", + 0.2 + ], + [ + 20, + "bindings", + 0.2 + ], + [ + 21, + "for", + 0.2 + ] + ] + ] +] Added: test/command/suite/select/function/index_column/index_column_df_ratio_between/and.test (+26 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/index_column/index_column_df_ratio_between/and.test 2017-05-22 22:24:41 +0900 (e799bbf) @@ -0,0 +1,26 @@ +plugin_register functions/index_column + +table_create Memos TABLE_HASH_KEY ShortText + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigram \ + --normalizer NormalizerAuto +column_create Terms index COLUMN_INDEX|WITH_POSITION Memos _key + +load --table Memos +[ +{"_key": "Groonga is a fast full text search engine."}, +{"_key": "Mroonga is a MySQL storage engine based on Groonga."}, +{"_key": "Mroonga provides fast full text search feature to MySQL."}, +{"_key": "Rroonga is a Ruby bindings for Groonga."}, +{"_key": "Ruby"} +] + +select Terms \ + --filter 'true && index_column_df_ratio_between(index, 0.1, 0.9)' \ + --limit -1 \ + --sort_keys _id \ + --columns[df_ratio].stage output \ + --columns[df_ratio].type Float \ + --columns[df_ratio].value 'index_column_df_ratio("index")' \ + --output_columns '_id, _key, df_ratio' Added: test/command/suite/select/function/index_column/index_column_df_ratio_between/or.expected (+101 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/index_column/index_column_df_ratio_between/or.expected 2017-05-22 22:24:41 +0900 (839b7f5) @@ -0,0 +1,101 @@ +plugin_register functions/index_column 
+[[0,0.0,0.0],true] +table_create Memos TABLE_HASH_KEY ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto +[[0,0.0,0.0],true] +column_create Terms index COLUMN_INDEX|WITH_POSITION Memos _key +[[0,0.0,0.0],true] +load --table Memos +[ +{"_key": "Groonga is a fast full text search engine."}, +{"_key": "Mroonga is a MySQL storage engine based on Groonga."}, +{"_key": "Mroonga provides fast full text search feature to MySQL."}, +{"_key": "Rroonga is a Ruby bindings for Groonga."}, +{"_key": "Ruby"} +] +[[0,0.0,0.0],5] +select Terms --filter '_key == "Ruby" || index_column_df_ratio_between(index, 0.1, 0.9)' --limit -1 --sort_keys _id --columns[df_ratio].stage output --columns[df_ratio].type Float --columns[df_ratio].value 'index_column_df_ratio("index")' --output_columns '_id, _key, df_ratio' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 11 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "_key", + "ShortText" + ], + [ + "df_ratio", + "Float" + ] + ], + [ + 10, + "mroonga", + 0.8 + ], + [ + 12, + "storage", + 0.2 + ], + [ + 13, + "based", + 0.2 + ], + [ + 14, + "on", + 0.2 + ], + [ + 15, + "provides", + 0.2 + ], + [ + 16, + "feature", + 0.2 + ], + [ + 17, + "to", + 0.2 + ], + [ + 18, + "rroonga", + 0.2 + ], + [ + 19, + "ruby", + 1.0 + ], + [ + 20, + "bindings", + 0.2 + ], + [ + 21, + "for", + 0.2 + ] + ] + ] +] Added: test/command/suite/select/function/index_column/index_column_df_ratio_between/or.test (+26 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/index_column/index_column_df_ratio_between/or.test 2017-05-22 22:24:41 +0900 (5f82cab) @@ -0,0 +1,26 @@ +plugin_register functions/index_column + +table_create Memos TABLE_HASH_KEY ShortText + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigram \ + --normalizer NormalizerAuto +column_create Terms index COLUMN_INDEX|WITH_POSITION Memos 
_key + +load --table Memos +[ +{"_key": "Groonga is a fast full text search engine."}, +{"_key": "Mroonga is a MySQL storage engine based on Groonga."}, +{"_key": "Mroonga provides fast full text search feature to MySQL."}, +{"_key": "Rroonga is a Ruby bindings for Groonga."}, +{"_key": "Ruby"} +] + +select Terms \ + --filter '_key == "Ruby" || index_column_df_ratio_between(index, 0.1, 0.9)' \ + --limit -1 \ + --sort_keys _id \ + --columns[df_ratio].stage output \ + --columns[df_ratio].type Float \ + --columns[df_ratio].value 'index_column_df_ratio("index")' \ + --output_columns '_id, _key, df_ratio' -------------- next part -------------- HTMLの添付ファイルを保管しました...다운로드