[Groonga-commit] groonga/groonga at fe9a32d [master] nfkc: add table based implementation

Back to archive index

Kouhei Sutou null+****@clear*****
Sun Dec 6 12:12:48 JST 2015


Kouhei Sutou	2015-12-06 12:12:48 +0900 (Sun, 06 Dec 2015)

  New Revision: fe9a32d6db7cf267e4373bf6bba48061cc7221f7
  https://github.com/groonga/groonga/commit/fe9a32d6db7cf267e4373bf6bba48061cc7221f7

  Message:
    nfkc: add table based implementation
    
    It's fast same as the current switch based implementation.
    
    It doesn't require many CPU and memory resources on build.

  Modified files:
    lib/nfkc.rb

  Modified: lib/nfkc.rb (+177 -1)
===================================================================
--- lib/nfkc.rb    2015-12-05 22:06:09 +0900 (41bad0a)
+++ lib/nfkc.rb    2015-12-06 12:12:48 +0900 (5c77359)
@@ -276,6 +276,177 @@ grn_nfkc#{@unicode_version}_map2(const unsigned char *prefix, const unsigned cha
   end
 end
 
+class TableGenerator < SwitchGenerator
+  private
+  def generate_map1(hash)
+    generate_decompose(hash)
+  end
+
+  def generate_decompose(char_map)
+    byte_size_groups = char_map.keys.group_by do |from|
+      bytes = from.bytes
+      first_byte = bytes[0]
+      if first_byte < 0x80
+        []
+      elsif first_byte < 0xe0
+        bytes[0, 1]
+      elsif first_byte < 0xf0
+        bytes[0, 2]
+      elsif first_byte < 0xf8
+        bytes[0, 3]
+      elsif first_byte < 0xfc
+        bytes[0, 4]
+      elsif first_byte < 0xfe
+        bytes[0, 5]
+      end
+    end
+
+    generate_decompose_tables(char_map, byte_size_groups)
+
+    @output.puts(<<-HEADER)
+
+const char *
+grn_nfkc#{@unicode_version}_map1(const unsigned char *str)
+{
+    HEADER
+
+    prev_common_bytes = []
+    prev_n_common_bytes = 0
+    byte_size_groups.keys.sort.each do |common_bytes|
+      froms = byte_size_groups[common_bytes]
+      froms_bytes = froms.collect(&:bytes).sort
+      min = froms_bytes.first.last
+      max = froms_bytes.last.last
+      n_common_bytes = 0
+      if common_bytes.empty?
+        @output.puts(<<-BODY)
+  if (str[0] < 0x80) {
+    if (str[0] >= #{"%#04x" % min} && str[0] <= #{"%#04x" % max}) {
+      return #{decompose_table_name(common_bytes)}[str[0] - #{"%#04x" % min}];
+    } else {
+      return NULL;
+    }
+  } else {
+        BODY
+      else
+        found_different_byte = false
+        common_bytes.each_with_index do |common_byte, i|
+          unless found_different_byte
+            if prev_common_bytes[i] == common_byte
+              n_common_bytes += 1
+              next
+            end
+            found_different_byte = true
+          end
+          indent = "  " * i
+          # p [i, prev_common_bytes.collect{|x| "%#04x" % x}, common_bytes.collect{|x| "%#04x" % x}, "%#04x" % common_byte, n_common_bytes, prev_n_common_bytes]
+          if prev_common_bytes[i].nil?
+            # p nil
+            @output.puts(<<-BODY)
+    #{indent}switch (str[#{i}]) {
+            BODY
+          elsif i < prev_n_common_bytes
+            # p :prev
+            @output.puts(<<-BODY)
+    #{indent}  default :
+    #{indent}    break;
+    #{indent}  }
+    #{indent}  break;
+            BODY
+          elsif n_common_bytes < prev_n_common_bytes
+            # p :common_prev
+            @output.puts(<<-BODY)
+    #{indent}switch (str[#{i}]) {
+            BODY
+          end
+          @output.puts(<<-BODY)
+    #{indent}case #{"%#04x" % common_byte} :
+          BODY
+        end
+
+        n = froms_bytes.first.size - 1
+        indent = "  " * common_bytes.size
+        @output.puts(<<-BODY)
+    #{indent}if (str[#{n}] >= #{"%#04x" % min} && str[#{n}] <= #{"%#04x" % max}) {
+    #{indent}  return #{decompose_table_name(common_bytes)}[str[#{n}] - #{"%#04x" % min}];
+    #{indent}}
+    #{indent}break;
+        BODY
+      end
+
+      prev_common_bytes = common_bytes
+      prev_n_common_bytes = n_common_bytes
+    end
+
+    # p [prev_common_bytes.collect{|x| "%#04x" % x}, prev_n_common_bytes]
+
+    (prev_common_bytes.size - 1).step(0, -1) do |i|
+      indent = "  " * i
+      @output.puts(<<-BODY)
+    #{indent}default :
+    #{indent}  break;
+    #{indent}}
+      BODY
+    end
+
+    @output.puts(<<-FOOTER)
+  }
+
+  return NULL;
+}
+    FOOTER
+  end
+
+  def to_bytes_map(char_map)
+    bytes_map = {}
+    char_map.each_key do |from|
+      parent = bytes_map
+      from.bytes[0..-2].each do |byte|
+        parent[byte] ||= {}
+        parent = parent[byte]
+      end
+      parent[from.bytes.last] = char_map[from]
+    end
+    bytes_map
+  end
+
+  def decompose_table_name(common_bytes)
+    suffix = common_bytes.collect {|byte| "%02x" % byte}.join("")
+    "grn_nfkc#{@unicode_version}_decompose_table_#{suffix}"
+  end
+
+  def generate_decompose_tables(char_map, byte_size_groups)
+    byte_size_groups.keys.sort.each do |common_bytes|
+      froms = byte_size_groups[common_bytes]
+      @output.puts(<<-TABLE_HEADER)
+
+static const char *#{decompose_table_name(common_bytes)}[] = {
+      TABLE_HEADER
+
+      lines = []
+      last_bytes = froms.collect {|from| from.bytes.last}
+      last_bytes.min.step(last_bytes.max).each_slice(8) do |slice|
+        values = slice.collect do |last_byte|
+          from = (common_bytes + [last_byte]).pack("c*").force_encoding("UTF-8")
+          to = char_map[from]
+          if to
+            escaped_value = to.bytes.collect {|char| "\\x%02x" % char}.join("")
+            "\"#{escaped_value}\""
+          else
+            "NULL"
+          end
+        end
+        lines << ("  " + values.join(", "))
+      end
+      @output.puts(lines.join(",\n"))
+
+      @output.puts(<<-TABLE_FOOTER)
+};
+      TABLE_FOOTER
+    end
+  end
+end
+
 def ccpush(hash, src, dst)
   head = src.shift
   hash[head] = {} unless hash[head]
@@ -388,12 +559,17 @@ end
 
 ######## main #######
 
+generator_class = SwitchGenerator
 ARGV.each{|arg|
   case arg
   when /-*c/i
     $case_sensitive = true
   when /-*s/i
     $keep_space = true
+  when "--impl=switch"
+    generator_class = SwitchGenerator
+  when "--impl=table"
+    generator_class = TableGenerator
   end
 }
 
@@ -436,7 +612,7 @@ don't edit this file by hand. it generated automatically by nfkc.rb
 #ifdef GRN_WITH_NFKC
   HEADER
 
-  generator = SwitchGenerator.new(unicode_version, output)
+  generator = generator_class.new(unicode_version, output)
   generator.generate(map1, map2)
 
   output.puts(<<-FOOTER)
-------------- next part --------------
HTML����������������������������...
다운로드 



More information about the Groonga-commit mailing list
Back to archive index