depgraph using wtab (instead of ptab) (Anthy-dev 3823) - Anthy

janitor/mkdepgraph-cleanup branch からさらに分岐した
feature/depgraph-wt branch で下記の depgraph の変更を入れました。

使う際には janitor/no-ptab-use と merge されることが前提です。
(merge しないと、ptab.h で "数接尾辞" が WF_NONE と定義されている影響が出て、
例えば「100%」、「3番センター」の変換が変わってしまいます。)



wt based depgraph implementation.  (No ptab.)

2010-07-05  NIIBE Yutaka  <gniib****@fsij*****>

	* anthy/dic.h (anthy_get_seq_ent_wtype_freq0): New.

	* depgraph/indepword-wt.txt: New file.

	* depgraph/Makefile.am: Use indepword-wt.txt.

	* depgraph/mkdepgraph.c (INDEPWORD_INPUT_FILENAME): Use
	indepword-wt.txt.
	(parse_indep): Support #XX type description.
	(init_indep_word_seq_tab): '#' no longer starts a comment.

	* src-splitter/compose.c (enum_candidates): Use anthy_wtype_equal.

	* src-splitter/wordlist.c (make_word_list): Use
	anthy_get_seq_ent_wtype_freq0.

	* src-worddic/ext_ent.c (anthy_get_ext_seq_ent_wtype): Bug fix.
	Swap arg1 and arg2.

	* src-worddic/word_dic.c (anthy_get_seq_ent_wtype_freq0): New.
	(anthy_get_seq_ent_wtype_compound_freq): Use anthy_wtype_equal.

diff --git a/anthy/dic.h b/anthy/dic.h
index 6105999..6c5efaf 100644
--- a/anthy/dic.h
+++ b/anthy/dic.h
@@ -39,6 +39,7 @@ int anthy_get_nth_dic_ent_wtype(seq_ent_t, xstr *, int nth, wtype_t *w);
 int anthy_get_seq_ent_pos(seq_ent_t, int pos);
 int anthy_get_seq_ent_ct(seq_ent_t, int pos, int ct);
 int anthy_get_seq_ent_wtype_freq(seq_ent_t, wtype_t);
+int anthy_get_seq_ent_wtype_freq0(seq_ent_t, wtype_t);
 int anthy_get_seq_ent_indep(seq_ent_t se);
 /* 複合語 */
 compound_ent_t anthy_get_nth_compound_ent(seq_ent_t se, int nth);
diff --git a/depgraph/Makefile.am b/depgraph/Makefile.am
index 713cfcd..a12cb1e 100644
--- a/depgraph/Makefile.am
+++ b/depgraph/Makefile.am
@@ -4,14 +4,14 @@ DEPWORDS = conjugate.table conjugate.depword fix.depword noun.depword \
 	   a.depword  ajv.depword master.depword
 INCLUDES = -I$(top_srcdir)/ -DSRCDIR=\"$(srcdir)\"
 CLEANFILES = anthy.dep all.depword
-EXTRA_DIST = indepword.txt $(DEPWORDS)
+EXTRA_DIST = indepword-wt.txt $(DEPWORDS)

 # Generate the dictionary
 noinst_PROGRAMS = mkdepgraph
 mkdepgraph_SOURCES = mkdepgraph.c
 mkdepgraph_LDADD =  ../src-main/libanthy.la ../src-worddic/libanthydic.la

-anthy.dep : mkdepgraph all.depword indepword.txt
+anthy.dep : mkdepgraph all.depword indepword-wt.txt
 	./mkdepgraph

 all.depword: $(DEPWORDS)
diff --git a/depgraph/indepword-wt.txt b/depgraph/indepword-wt.txt
new file mode 100644
index 0000000..c64b2ef
--- /dev/null
+++ b/depgraph/indepword-wt.txt
@@ -0,0 +1,161 @@
+#kxi @カ変活用動詞連用形
+#kxo @カ変活用動詞未然形
+#kxoi @カ変活用動詞命令形
+#kxure @カ変活用動詞仮定形
+#kxuru @カ変活用動詞終止形
+#kxuru2
+#kxya
+#sxi @する未然形「し」
+#sxe @する未然形「せ」
+#sxi2 @する連用形「し」
+#sxuru
+#sxure
+#sxiro @する命令形「しろ」
+#sxeyo @する命令形「せよ」
+#sxya
+#T @名詞*のあと @名詞35のあと
+#B5 @バ行5段活用動詞語幹
+#B5r @バ行5段活用動詞語幹 @バ行5段活用動詞名詞化語幹
+#C5r @カ行C5段活用動詞語幹 @カ行C5段活用動詞名詞化語幹
+#CJ @接続詞
+#CN @名詞*のあと @地名のあと
+#CNPRE
+#CNS @名詞*のあと @地名のあと
+#CNSUC1
+#CNSUC2
+#D2KY @形容詞語幹
+#yasui @形容詞語幹
+#D2T16 @名詞*のあと @名詞35のあと
+#D2T35 @名詞*のあと @名詞15のあと @名詞35のあと
+#F00 @副詞 @副詞0のあと
+#F01 @副詞 @副詞1のあと
+#F02 @副詞 @副詞2のあと
+#F03 @副詞 @副詞3のあと
+#F04 @副詞 @副詞4のあと
+#F05 @副詞 @副詞5のあと
+#F06 @副詞 @副詞6のあと
+#F07 @副詞 @副詞7のあと
+#F08 @副詞 @副詞8のあと
+#F09 @副詞 @副詞9のあと
+#F10 @副詞 @副詞10のあと
+#F11 @副詞 @副詞11のあと
+#F12 @副詞 @副詞12のあと
+#F13 @副詞 @副詞13のあと
+#F14 @副詞 @副詞14のあと
+#F15 @副詞
+#G5 @ガ行5段活用動詞語幹
+#G5r @ガ行5段活用動詞語幹 @ガ行5段活用動詞名詞化語幹
+#JCN @名詞*のあと
+#JN @名詞*のあと @人名のあと
+#JNM @名詞*のあと @人名のあと
+#JNMUC @名詞*のあと
+#JNS @名詞*のあと @人名のあと
+#JNSSUC
+#JNSUC
+#JS
+#JSSUC
+#K2T15 @名詞*のあと
+#K2T16 @名詞*のあと
+#K2T35 @名詞*のあと
+#K5 @カ行5段活用動詞語幹
+#K5r @カ行5段活用動詞語幹 @カ行5段活用動詞名詞化語幹
+#KJ @単漢字
+#KK @名詞*のあと @団体名のあと
+#KN @名詞*のあと
+#KS @上下一段活用動詞語幹
+#KSr @上下一段活用動詞語幹 @上下一段活用動詞名詞化語幹
+#KY @形容詞語幹
+#KYs @形容詞語幹
+#KYE @形容詞語幹
+#KYI @形容詞語幹
+#KYU @形容詞語幹
+#KYii @形容詞語幹
+#KYn @形容詞語幹
+#KYy @形容詞語幹
+#KYme @形容詞語幹
+#KYmi @形容詞語幹
+#KYmime @形容詞語幹
+#KYna @形容詞語幹
+#KYT @形容詞語幹
+#L5 @ラ行L5段活用動詞語幹
+#M5 @マ行5段活用動詞語幹
+#M5r @マ行5段活用動詞語幹 @マ行5段活用動詞名詞化語幹
+#N00 @数詞のあと
+#N01 @数詞のあと
+#N02 @数詞のあと
+#N03 @数詞のあと
+#N2KYT @名詞*のあと
+#N2T10 @名詞*のあと @名詞35のあと
+#N2T16
+#N2T17
+#N2T30
+#N2T35 @名詞*のあと @名詞35のあと
+#N5 @ナ行5段活用動詞語幹
+#N5r @ナ行5段活用動詞語幹 @ナ行5段活用動詞名詞化語幹
+#ND2KY
+#NN @数詞のあと
+#NNPRE
+#OKX @動詞丁寧表現語幹
+#PRE
+#R5 @ラ行5段活用動詞語幹
+#R5r @ラ行5段活用動詞語幹 @ラ行5段活用動詞名詞化語幹
+#RT @連体詞
+#S5 @サ行5段活用動詞語幹
+#S5r @サ行5段活用動詞語幹 @サ行5段活用動詞名詞化語幹
+#SUC
+#SX @サ変活用動詞語幹 @動詞丁寧表現語幹
+#T00 @名詞*のあと @名詞0のあと
+#T01 @名詞*のあと @名詞1のあと
+#T02 @名詞*のあと @名詞2のあと
+#T03 @名詞*のあと @名詞3のあと
+#T04 @名詞*のあと @名詞4のあと
+#T05 @名詞*のあと @名詞5のあと
+#T06 @名詞*のあと @名詞6のあと
+#T07 @名詞*のあと @名詞7のあと
+#T08 @名詞*のあと @名詞8のあと
+#T09 @名詞*のあと @名詞9のあと
+#T10 @名詞*のあと @名詞10のあと
+#T11 @名詞*のあと @名詞11のあと
+#T12 @名詞*のあと @名詞12のあと
+#T13 @名詞*のあと @名詞13のあと
+#T14 @名詞*のあと @名詞14のあと
+#T15 @名詞*のあと @名詞15のあと
+#T16 @名詞*のあと @名詞16のあと
+#T17 @名詞*のあと @名詞17のあと
+#T18 @名詞*のあと @名詞18のあと
+#T19 @名詞*のあと @名詞19のあと
+#T20 @名詞*のあと @名詞20のあと
+#T21 @名詞*のあと @名詞21のあと
+#T22 @名詞*のあと @名詞22のあと
+#T23 @名詞*のあと @名詞23のあと
+#T24 @名詞*のあと @名詞24のあと
+#T25 @名詞*のあと @名詞25のあと
+#T26 @名詞*のあと @名詞26のあと
+#T27 @名詞*のあと @名詞27のあと
+#T28 @名詞*のあと @名詞28のあと
+#T29 @名詞*のあと @名詞29のあと
+#T30 @名詞*のあと @名詞30のあと
+#T31 @名詞*のあと @名詞31のあと
+#T32 @名詞*のあと @名詞32のあと
+#T33 @名詞*のあと @名詞33のあと
+#T34 @名詞*のあと @名詞34のあと
+#T35 @名詞*のあと @名詞35のあと
+#T36 @名詞*のあと @名詞36のあと
+#T37 @名詞*のあと @名詞37のあと
+#T38 @名詞*のあと @名詞38のあと
+#T39 @名詞*のあと @名詞39のあと
+#T5 @タ行5段活用動詞語幹
+#T5r @タ行5段活用動詞語幹 @タ行5段活用動詞名詞化語幹
+#U5 @ワ行U5段活用動詞語幹
+#U5r @ワ行U5段活用動詞語幹 @ワ行U5段活用動詞名詞化語幹
+#W5 @ワ行5段活用動詞語幹
+#W5r @ワ行5段活用動詞語幹 @ワ行5段活用動詞名詞化語幹
+#ZX @ザ変活用動詞語幹
+#aru @ラ変活用動詞語幹
+#arazu
+#NONE
+#N04 @数詞のあと
+#N05 @数詞のあと
+#SVSUC
+#OPEN @開き括弧
+#CLOSE @閉じ括弧
diff --git a/depgraph/mkdepgraph.c b/depgraph/mkdepgraph.c
index 4cc4778..fd5e7f6 100644
--- a/depgraph/mkdepgraph.c
+++ b/depgraph/mkdepgraph.c
@@ -256,7 +256,7 @@ get_tokens (char *buf, char **tokens, int n)
 #define MAX_TOKEN 256
 #define BUFSIZE 1024
 #define DEPWORD_INPUT_FILENAME "all.depword"
-#define INDEPWORD_INPUT_FILENAME "indepword.txt"
+#define INDEPWORD_INPUT_FILENAME "indepword-wt.txt"

 static void
 init_depword_tab(void)
@@ -305,25 +305,64 @@ init_depword_tab(void)
 static void
 parse_indep (char **tokens, int nr, int lineno)
 {
-  if (nr < 2) {
-    fprintf(stderr, "%d: Syntex error in indepword defs.\n", lineno);
-    return;
-  }
+  int node;
+  wtype_t wt;
+  int i;
+
+  if (anthy_type_to_wtype (tokens[0], &wt) == NULL)
+    {
+      fprintf (stderr, "%d: no such WT\n", lineno);
+      return;
+    }
+
+  for (i = 0; i < nrRules; i++)
+    if (anthy_wtype_equal (gRules[i].wt, wt))
+      return;

-  gRules = (struct wordseq_rule*)realloc (gRules, sizeof(struct wordseq_rule)*(nrRules+1));
+  gRules = (struct wordseq_rule *)realloc (gRules, sizeof (struct wordseq_rule)*(nrRules+1));
   if (gRules == NULL)
     {
       fprintf (stderr, "%d: malloc failed.\n", lineno);
       exit (1);
     }

-  /* 行の先頭には品詞の名前が入っている */
-  gRules[nrRules].wt = anthy_init_wtype_by_name(tokens[0]);
+  if (nr == 2)
+    node = get_node_id_by_name (tokens[1]);
+  else if (nr >= 3)
+    {
+      int i;
+      struct dep_branch *db;
+      struct dep_node *dn;
+      xstr *strs[1];
+
+      node = get_node_id_by_name (tokens[0]); /* New node */
+      dn = &gNodes[node];
+      strs[0] = anthy_cstr_to_xstr ("", ANTHY_EUC_JP_ENCODING);
+      db = find_branch (dn, strs, 1);

-  /* その次にはノード名が入っている */
-  gRules[nrRules].node_id = get_node_id_by_name(tokens[1]);
+      db->transition = (struct dep_transition *)realloc (db->transition,
+							 sizeof (struct dep_transition)*
+							 (db->nr_transitions + nr - 1));
+
+      for (i = 1; i < nr; i++)
+	{
+	  struct dep_transition *tr;
+
+	  tr = &db->transition[db->nr_transitions];
+	  parse_transition (tokens[i], tr);
+	  db->nr_transitions++;
+	}
+    }
+  else
+    {
+      if (nr != 1)
+	fprintf (stderr, "%d: syntax error (ignored).\n", lineno);
+      return;
+    }

-  nrRules ++;
+  gRules[nrRules].wt = wt;
+  gRules[nrRules].node_id = node;
+  nrRules++;
 }

 /** 自立語からの遷移表 */
@@ -350,8 +389,6 @@ init_indep_word_seq_tab (void)
 	goto error;

       *p = '\0';
-      if (buf[0] == '#')
-	continue;

       lineno++;
       nr = get_tokens (buf, tokens, MAX_TOKEN);
diff --git a/src-splitter/compose.c b/src-splitter/compose.c
index 774df7c..db3f21c 100644
--- a/src-splitter/compose.c
+++ b/src-splitter/compose.c
@@ -153,8 +153,7 @@ enum_candidates(struct seg_ent *seg,
     }
     anthy_get_nth_dic_ent_wtype(ce->elm[n].se, &ce->elm[n].str, i, &wt);

-    ce->elm[n].wt = anthy_get_wtype_with_ct(ce->elm[n].wt, CT_NONE);
-    if (anthy_wtype_include(ce->elm[n].wt, wt)) {
+    if (anthy_wtype_equal (ce->elm[n].wt, wt)) {
       xstr word, yomi;

       yomi.len = ce->elm[n].str.len;
diff --git a/src-splitter/wordlist.c b/src-splitter/wordlist.c
index f677a3c..8f27735 100644
--- a/src-splitter/wordlist.c
+++ b/src-splitter/wordlist.c
@@ -388,7 +388,7 @@ make_word_list(struct splitter_context *sc,
     int freq;
     anthy_get_nth_dep_rule(i, &rule);
     if (!is_compound) {
-      freq = anthy_get_seq_ent_wtype_freq(se, rule.wt);
+      freq = anthy_get_seq_ent_wtype_freq0 (se, rule.wt);
     } else {
       freq = anthy_get_seq_ent_wtype_compound_freq(se, rule.wt);
     }
diff --git a/src-worddic/ext_ent.c b/src-worddic/ext_ent.c
index e3de315..509e8bd 100644
--- a/src-worddic/ext_ent.c
+++ b/src-worddic/ext_ent.c
@@ -543,7 +543,7 @@ int
 anthy_get_ext_seq_ent_wtype(struct seq_ent *se, wtype_t w)
 {
   if (se == &num_ent) {
-    if (anthy_wtype_include(w, wt_num)) {
+    if (anthy_wtype_include(wt_num, w)) {
       /* 数字の場合 */
       return 10;
     }
diff --git a/src-worddic/word_dic.c b/src-worddic/word_dic.c
index d14bedd..47e1434 100644
--- a/src-worddic/word_dic.c
+++ b/src-worddic/word_dic.c
@@ -516,6 +516,32 @@ anthy_get_seq_ent_wtype_freq(seq_ent_t seq, wtype_t wt)
   return f;
 }

+int
+anthy_get_seq_ent_wtype_freq0(seq_ent_t seq, wtype_t wt)
+{ /* Largest freq among order == 0 entries whose type equals wt; 0 if seq is NULL or nothing matches. */
+  int i, f;
+
+  if (!seq) {
+    return 0;
+  }
+  /* No dictionary entries: delegate to anthy_get_ext_seq_ent_wtype(). */
+  if (seq->nr_dic_ents == 0) {
+    return anthy_get_ext_seq_ent_wtype(seq, wt);
+  }
+
+  f = 0;
+  /* Words: keep the largest freq among entries with order == 0 whose type equals wt. */
+  for (i = 0; i < seq->nr_dic_ents; i++) {
+    if (seq->dic_ents[i]->order == 0 &&
+	anthy_wtype_equal (wt, seq->dic_ents[i]->type)) {
+      if (f < seq->dic_ents[i]->freq) {
+	f = seq->dic_ents[i]->freq;
+      }
+    }
+  }
+  return f;
+}
+
 /*
  * wtの品詞を持つ複合語の中で最大の頻度を持つものを返す
  */
@@ -533,7 +559,7 @@ anthy_get_seq_ent_wtype_compound_freq(seq_ent_t se, wtype_t wt)
     if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
       continue;
     }
-    if (anthy_wtype_include(wt, s->dic_ents[i]->type)) {
+    if (anthy_wtype_equal (wt, s->dic_ents[i]->type)) {
       if (f < s->dic_ents[i]->freq) {
 	f = s->dic_ents[i]->freq;
       }
-- 



Anthy

[Anthy-dev 3823] depgraph using wtab (instead of ptab)