sumom****@users*****
sumom****@users*****
2011年 6月 24日 (金) 14:07:56 JST
Index: julius4/libsent/src/ngram/init_ngram.c diff -u julius4/libsent/src/ngram/init_ngram.c:1.7 julius4/libsent/src/ngram/init_ngram.c:1.8 --- julius4/libsent/src/ngram/init_ngram.c:1.7 Fri Apr 29 14:09:17 2011 +++ julius4/libsent/src/ngram/init_ngram.c Fri Jun 24 14:07:56 2011 @@ -12,7 +12,7 @@ * @author Akinobu LEE * @date Wed Feb 16 07:40:53 2005 * - * $Revision: 1.7 $ + * $Revision: 1.8 $ * */ /* @@ -50,6 +50,10 @@ jlog("Error: init_ngram: failed to close \"%s\"\n", bin_ngram_file); return FALSE; } + + /* set default unknown (=OOV) word id */ + set_default_unknown_id(ndata); + jlog("Stat: init_ngram: finished reading n-gram\n"); return TRUE; } @@ -83,8 +87,11 @@ jlog("Error: init_ngram: failed to close \"%s\"\n", ngram_file); return FALSE; } - jlog("Stat: init_ngram: finished reading n-gram\n"); + /* set default unknown (=OOV) word id */ + set_default_unknown_id(ndata); + + jlog("Stat: init_ngram: finished reading n-gram\n"); return TRUE; } @@ -159,32 +166,47 @@ } /** - * @brief Set unknown word ID to the N-gram data. - * + * @brief Set default unknown word ID to the N-gram data. + * If default "<unk>" is not found, also try "<UNK>". * * @param ndata [out] N-gram data to set unknown word ID. - * @param str [in] word name string of unknown word */ void -set_unknown_id(NGRAM_INFO *ndata, char *str) +set_default_unknown_id(NGRAM_INFO *ndata) { - ndata->unk_id = ngram_lookup_word(ndata, str); - if (ndata->unk_id == WORD_INVALID) { - if (strmatch(str, UNK_WORD_DEFAULT)) { - /* if default "<unk>" is not found, also try "<UNK>" */ - ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2); - if (ndata->unk_id == WORD_INVALID) { - jlog("Stat: init_ngram: either \"%s\" and \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); - ndata->isopen = FALSE; - return; - } + ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT); + if (ndata->unk_id != WORD_INVALID) { + jlog("Stat: init_ngram: found unknown word entry \"%s\"\n", UNK_WORD_DEFAULT); + ndata->isopen = TRUE; + } else { + ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2); + if (ndata->unk_id != WORD_INVALID) { + jlog("Stat: init_ngram: found unknown word entry \"%s\"\n", UNK_WORD_DEFAULT2); + ndata->isopen = TRUE; + } else{ + jlog("Stat: init_ngram: neither \"%s\" nor \"%s\" was found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); + ndata->isopen = FALSE; } } - if (ndata->unk_id == WORD_INVALID) { - jlog("Stat: init_ngram: \"%s\" not found, assuming close vocabulary LM\n", str); - ndata->isopen = FALSE; + ndata->unk_num = 0; +} + +/** + * @brief Set user-specified word ID to the N-gram data. + * + * @param ndata [out] N-gram data to set unknown word ID. + * @param str [in] word name string of unknown word + */ +void +set_unknown_id(NGRAM_INFO *ndata, char *str) +{ + WORD_ID w; + w = ngram_lookup_word(ndata, str); + if (w == WORD_INVALID) { + jlog("Stat: init_ngram: \"%s\" not found", str); } else { - jlog("Stat: init_ngram: unknown words will be mapped to \"%s\"\n", str); + jlog("Stat: init_ngram: unknown word entry was set to \"%s\"\n", str); + ndata->unk_id = w; ndata->isopen = TRUE; } } Index: julius4/libsent/src/ngram/ngram_util.c diff -u julius4/libsent/src/ngram/ngram_util.c:1.6 julius4/libsent/src/ngram/ngram_util.c:1.7 --- julius4/libsent/src/ngram/ngram_util.c:1.6 Fri Apr 29 14:09:17 2011 +++ julius4/libsent/src/ngram/ngram_util.c Fri Jun 24 14:07:56 2011 @@ -12,7 +12,7 @@ * @author Akinobu LEE * @date Wed Feb 16 17:18:55 2005 * - * $Revision: 1.6 $ + * $Revision: 1.7 $ * */ /* @@ -90,7 +90,9 @@ } if (ndata->isopen) { fprintf(fp, "\t OOV word = %s(id=%d)\n", ndata->wname[ndata->unk_id],ndata->unk_id); - fprintf(fp, "\t OOV size = %d words in dict\n", ndata->unk_num); + if (ndata->unk_num != 0) { + fprintf(fp, "\t OOV size = %d words in dict\n", ndata->unk_num); + } } else { fprintf(fp, "\t OOV word = none (assume close vocabulary)\n"); }