Kouhei Sutou
null+****@clear*****
Sun Dec 6 12:12:48 JST 2015
Kouhei Sutou 2015-12-06 12:12:48 +0900 (Sun, 06 Dec 2015) New Revision: fe9a32d6db7cf267e4373bf6bba48061cc7221f7 https://github.com/groonga/groonga/commit/fe9a32d6db7cf267e4373bf6bba48061cc7221f7 Message: nfkc: add table based implementation It's fast same as the current switch based implementation. It doesn't require many CPU and memory resources on build. Modified files: lib/nfkc.rb Modified: lib/nfkc.rb (+177 -1) =================================================================== --- lib/nfkc.rb 2015-12-05 22:06:09 +0900 (41bad0a) +++ lib/nfkc.rb 2015-12-06 12:12:48 +0900 (5c77359) @@ -276,6 +276,177 @@ grn_nfkc#{@unicode_version}_map2(const unsigned char *prefix, const unsigned cha end end +class TableGenerator < SwitchGenerator + private + def generate_map1(hash) + generate_decompose(hash) + end + + def generate_decompose(char_map) + byte_size_groups = char_map.keys.group_by do |from| + bytes = from.bytes + first_byte = bytes[0] + if first_byte < 0x80 + [] + elsif first_byte < 0xe0 + bytes[0, 1] + elsif first_byte < 0xf0 + bytes[0, 2] + elsif first_byte < 0xf8 + bytes[0, 3] + elsif first_byte < 0xfc + bytes[0, 4] + elsif first_byte < 0xfe + bytes[0, 5] + end + end + + generate_decompose_tables(char_map, byte_size_groups) + + @output.puts(<<-HEADER) + +const char * +grn_nfkc#{@unicode_version}_map1(const unsigned char *str) +{ + HEADER + + prev_common_bytes = [] + prev_n_common_bytes = 0 + byte_size_groups.keys.sort.each do |common_bytes| + froms = byte_size_groups[common_bytes] + froms_bytes = froms.collect(&:bytes).sort + min = froms_bytes.first.last + max = froms_bytes.last.last + n_common_bytes = 0 + if common_bytes.empty? + @output.puts(<<-BODY) + if (str[0] < 0x80) { + if (str[0] >= #{"%#04x" % min} && str[0] <= #{"%#04x" % max}) { + return #{decompose_table_name(common_bytes)}[str[0] - #{"%#04x" % min}]; + } else { + return NULL; + } + } else { + BODY + else + found_different_byte = false + common_bytes.each_with_index do |common_byte, i| + unless found_different_byte + if prev_common_bytes[i] == common_byte + n_common_bytes += 1 + next + end + found_different_byte = true + end + indent = " " * i + # p [i, prev_common_bytes.collect{|x| "%#04x" % x}, common_bytes.collect{|x| "%#04x" % x}, "%#04x" % common_byte, n_common_bytes, prev_n_common_bytes] + if prev_common_bytes[i].nil? + # p nil + @output.puts(<<-BODY) + #{indent}switch (str[#{i}]) { + BODY + elsif i < prev_n_common_bytes + # p :prev + @output.puts(<<-BODY) + #{indent} default : + #{indent} break; + #{indent} } + #{indent} break; + BODY + elsif n_common_bytes < prev_n_common_bytes + # p :common_prev + @output.puts(<<-BODY) + #{indent}switch (str[#{i}]) { + BODY + end + @output.puts(<<-BODY) + #{indent}case #{"%#04x" % common_byte} : + BODY + end + + n = froms_bytes.first.size - 1 + indent = " " * common_bytes.size + @output.puts(<<-BODY) + #{indent}if (str[#{n}] >= #{"%#04x" % min} && str[#{n}] <= #{"%#04x" % max}) { + #{indent} return #{decompose_table_name(common_bytes)}[str[#{n}] - #{"%#04x" % min}]; + #{indent}} + #{indent}break; + BODY + end + + prev_common_bytes = common_bytes + prev_n_common_bytes = n_common_bytes + end + + # p [prev_common_bytes.collect{|x| "%#04x" % x}, prev_n_common_bytes] + + (prev_common_bytes.size - 1).step(0, -1) do |i| + indent = " " * i + @output.puts(<<-BODY) + #{indent}default : + #{indent} break; + #{indent}} + BODY + end + + @output.puts(<<-FOOTER) + } + + return NULL; +} + FOOTER + end + + def to_bytes_map(char_map) + bytes_map = {} + char_map.each_key do |from| + parent = bytes_map + from.bytes[0..-2].each do |byte| + parent[byte] ||= {} + parent = parent[byte] + end + parent[from.bytes.last] = char_map[from] + end + bytes_map + end + + def decompose_table_name(common_bytes) + suffix = common_bytes.collect {|byte| "%02x" % byte}.join("") + "grn_nfkc#{@unicode_version}_decompose_table_#{suffix}" + end + + def generate_decompose_tables(char_map, byte_size_groups) + byte_size_groups.keys.sort.each do |common_bytes| + froms = byte_size_groups[common_bytes] + @output.puts(<<-TABLE_HEADER) + +static const char *#{decompose_table_name(common_bytes)}[] = { + TABLE_HEADER + + lines = [] + last_bytes = froms.collect {|from| from.bytes.last} + last_bytes.min.step(last_bytes.max).each_slice(8) do |slice| + values = slice.collect do |last_byte| + from = (common_bytes + [last_byte]).pack("c*").force_encoding("UTF-8") + to = char_map[from] + if to + escaped_value = to.bytes.collect {|char| "\\x%02x" % char}.join("") + "\"#{escaped_value}\"" + else + "NULL" + end + end + lines << (" " + values.join(", ")) + end + @output.puts(lines.join(",\n")) + + @output.puts(<<-TABLE_FOOTER) +}; + TABLE_FOOTER + end + end +end + def ccpush(hash, src, dst) head = src.shift hash[head] = {} unless hash[head] @@ -388,12 +559,17 @@ end ######## main ####### +generator_class = SwitchGenerator ARGV.each{|arg| case arg when /-*c/i $case_sensitive = true when /-*s/i $keep_space = true + when "--impl=switch" + generator_class = SwitchGenerator + when "--impl=table" + generator_class = TableGenerator end } @@ -436,7 +612,7 @@ don't edit this file by hand. it generated automatically by nfkc.rb #ifdef GRN_WITH_NFKC HEADER - generator = SwitchGenerator.new(unicode_version, output) + generator = generator_class.new(unicode_version, output) generator.generate(map1, map2) output.puts(<<-FOOTER) -------------- next part -------------- HTML����������������������������...다운로드