[Groonga-commit] groonga/groonga at 308ff2f [master] ii: fix a bug where offline indexing generates small errors in indexes

Back to archive index

susumu.yata null+****@clear*****
Mon Aug 10 17:38:51 JST 2015


susumu.yata	2015-08-10 17:38:51 +0900 (Mon, 10 Aug 2015)

  New Revision: 308ff2fc9ea0e93aa5cf87a162b133203b7b715e
  https://github.com/groonga/groonga/commit/308ff2fc9ea0e93aa5cf87a162b133203b7b715e

  Message:
    ii: fix a bug where offline indexing generates small errors in indexes
    
    GitHub: fix #371

  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+127 -61)
===================================================================
--- lib/ii.c    2015-08-09 22:33:49 +0900 (391d3c3)
+++ lib/ii.c    2015-08-10 17:38:51 +0900 (8faa373)
@@ -7113,6 +7113,13 @@ const size_t II_BUFFER_BLOCK_SIZE = 0x1000000;
 const uint32_t II_BUFFER_BLOCK_READ_UNIT_SIZE = 0x200000;
 
 typedef struct {
+  unsigned int sid;
+  unsigned int weight;
+  const char *p;
+  uint32_t len;
+} ii_buffer_value;
+
+typedef struct {
   uint32_t nrecs;
   uint32_t nposts;
   grn_id last_rid;
@@ -7478,14 +7485,7 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid,
     grn_obj *tmp_lexicon;
     uint32_t est_len = value_len * 2 + 2;
     if (ii_buffer->block_buf_size < ii_buffer->block_pos + est_len) {
-      grn_ii_buffer_flush(ctx, ii_buffer);
-    }
-    if (ii_buffer->block_buf_size < est_len) {
-      grn_id *block_buf = (grn_id *)GRN_REALLOC(ii_buffer->block_buf,
-                                                est_len * sizeof(grn_id));
-      if (!block_buf) { return; }
-      ii_buffer->block_buf = block_buf;
-      ii_buffer->block_buf_size = est_len;
+      return;
     }
     if ((tmp_lexicon = get_tmp_lexicon(ctx, ii_buffer))) {
       unsigned int token_flags = 0;
@@ -8048,70 +8048,136 @@ grn_ii_buffer_close(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
 }
 
 static void
+ii_buffer_values_append(grn_ctx *ctx, ii_buffer_value **values,
+                        unsigned *nvalues, unsigned *max_nvalues,
+                        unsigned int sid, unsigned weight,
+                        const char *p, uint32_t len) {
+  if (*nvalues == *max_nvalues) {
+    unsigned new_max_nvalues = *max_nvalues ? (*max_nvalues * 2) : 1;
+    unsigned new_size = new_max_nvalues * sizeof(ii_buffer_value);
+    ii_buffer_value *new_values;
+    new_values = (ii_buffer_value *)GRN_REALLOC(*values, new_size);
+    if (new_values) {
+      *values = new_values;
+      *max_nvalues = new_max_nvalues;
+    }
+  }
+  if (*values) {
+    (*values)[*nvalues].sid = sid;
+    (*values)[*nvalues].weight = weight;
+    (*values)[*nvalues].p = p;
+    (*values)[*nvalues].len = len;
+    ++*nvalues;
+  }
+}
+
+static void
 grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
                     grn_obj *target, int ncols, grn_obj **cols)
 {
   grn_table_cursor  *tc;
-  if ((tc = grn_table_cursor_open(ctx, target,
-                                  NULL, 0, NULL, 0, 0, -1,
-                                  GRN_CURSOR_BY_ID))) {
-    grn_id rid;
-    grn_obj rv;
-    GRN_TEXT_INIT(&rv, 0);
-    while ((rid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
-      int sid;
-      grn_obj **col;
-      for (sid = 1, col = cols; sid <= ncols; sid++, col++) {
-        grn_obj_reinit_for(ctx, &rv, *col);
-        if (GRN_OBJ_TABLEP(*col)) {
-          grn_table_get_key2(ctx, *col, rid, &rv);
-        } else {
-          grn_obj_get_value(ctx, *col, rid, &rv);
-        }
-        switch (rv.header.type) {
-        case GRN_BULK :
-          grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0,
-                                 GRN_TEXT_VALUE(&rv), GRN_TEXT_LEN(&rv));
-          break;
-        case GRN_UVECTOR :
-          {
-            unsigned int i, size;
-            unsigned int element_size;
-
-            size = grn_uvector_size(ctx, &rv);
-            element_size = grn_uvector_element_size(ctx, &rv);
-            for (i = 0; i < size; i++) {
-              grn_ii_buffer_tokenize(ctx, ii_buffer, rid, sid, 0,
-                                     GRN_BULK_HEAD(&rv) + (element_size * i),
-                                     element_size);
-            }
+  grn_obj *vobjs;
+  if ((vobjs = GRN_MALLOCN(grn_obj, ncols))) {
+    int i;
+    for (i = 0; i < ncols; i++) {
+      GRN_TEXT_INIT(&vobjs[i], 0);
+    }
+    if ((tc = grn_table_cursor_open(ctx, target,
+                                    NULL, 0, NULL, 0, 0, -1,
+                                    GRN_CURSOR_BY_ID))) {
+      ii_buffer_value *values = NULL;
+      unsigned int nvalues = 0;
+      unsigned int max_nvalues = 0;
+      uint32_t est_len;
+      grn_id rid;
+      while ((rid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
+        unsigned int j;
+        int sid;
+        grn_obj **col;
+        for (sid = 1, col = cols; sid <= ncols; sid++, col++) {
+          grn_obj *rv = &vobjs[sid - 1];
+          grn_obj_reinit_for(ctx, rv, *col);
+          if (GRN_OBJ_TABLEP(*col)) {
+            grn_table_get_key2(ctx, *col, rid, rv);
+          } else {
+            grn_obj_get_value(ctx, *col, rid, rv);
           }
-          break;
-        case GRN_VECTOR :
-          if (rv.u.v.body) {
-            int i;
-            int n_sections = rv.u.v.n_sections;
-            grn_section *sections = rv.u.v.sections;
-            const char *head = GRN_BULK_HEAD(rv.u.v.body);
-            for (i = 0; i < n_sections; i++) {
-              grn_section *section = sections + i;
-              if (section->length == 0) {
-                continue;
+          switch (rv->header.type) {
+          case GRN_BULK :
+            ii_buffer_values_append(ctx, &values, &nvalues, &max_nvalues, sid,
+                                    0, GRN_TEXT_VALUE(rv), GRN_TEXT_LEN(rv));
+            break;
+          case GRN_UVECTOR :
+            {
+              unsigned int size;
+              unsigned int elem_size;
+              size = grn_uvector_size(ctx, rv);
+              elem_size = grn_uvector_element_size(ctx, rv);
+              for (j = 0; j < size; j++) {
+                ii_buffer_values_append(ctx, &values, &nvalues, &max_nvalues,
+                                        sid, 0,
+                                        GRN_BULK_HEAD(rv) + (elem_size * j),
+                                        elem_size);
+              }
+            }
+            break;
+          case GRN_VECTOR :
+            if (rv->u.v.body) {
+              int j;
+              int n_sections = rv->u.v.n_sections;
+              grn_section *sections = rv->u.v.sections;
+              const char *head = GRN_BULK_HEAD(rv->u.v.body);
+              for (j = 0; j < n_sections; j++) {
+                grn_section *section = sections + j;
+                if (section->length == 0) {
+                  continue;
+                }
+                ii_buffer_values_append(ctx, &values, &nvalues, &max_nvalues,
+                                        sid, section->weight,
+                                        head + section->offset,
+                                        section->length);
               }
-              grn_ii_buffer_tokenize(ctx, ii_buffer, rid,
-                                     sid, section->weight,
-                                     head + section->offset, section->length);
             }
+            break;
+          default :
+            ERR(GRN_INVALID_ARGUMENT,
+                "[index] invalid object assigned as value");
+            break;
           }
-          break;
-        default :
-          ERR(GRN_INVALID_ARGUMENT, "[index] invalid object assigned as value");
-          break;
         }
+
+        est_len = 0;
+        for (j = 0; j < nvalues; j++) {
+          est_len += values[j].len * 2 + 2;
+        }
+        if (ii_buffer->block_buf_size < ii_buffer->block_pos + est_len) {
+          grn_ii_buffer_flush(ctx, ii_buffer);
+        }
+        if (ii_buffer->block_buf_size < est_len) {
+          grn_id *block_buf = (grn_id *)GRN_REALLOC(ii_buffer->block_buf,
+                                                    est_len * sizeof(grn_id));
+          if (block_buf) {
+            ii_buffer->block_buf = block_buf;
+            ii_buffer->block_buf_size = est_len;
+          }
+        }
+
+        for (j = 0; j < nvalues; j++) {
+          grn_ii_buffer_tokenize(ctx, ii_buffer, rid, values[j].sid,
+                                 values[j].weight, values[j].p,
+                                 values[j].len);
+        }
+        nvalues = 0;
+      }
+      if (values) {
+        GRN_FREE(values);
       }
+      grn_table_cursor_close(ctx, tc);
     }
-    GRN_OBJ_FIN(ctx, &rv);
-    grn_table_cursor_close(ctx, tc);
+    for (i = 0; i < ncols; i++) {
+      GRN_OBJ_FIN(ctx, &vobjs[i]);
+    }
+    GRN_FREE(vobjs);
   }
 }
 
-------------- next part --------------
HTMLの添付ファイルを保管しました...
다운로드 



More information about the Groonga-commit mailing list
Back to archive index