61 #include "ngram_model_internal.h"
68 ext = strrchr(file_name,
'.');
73 while (--ext >= file_name) {
74 if (*ext ==
'.')
break;
76 if (ext < file_name) {
81 while (--ext >= file_name) {
82 if (*ext ==
'.')
break;
84 if (ext < file_name) {
122 const char *file_name,
130 if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
132 if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
137 model = ngram_model_arpa_read(config, file_name, lmath);
140 model = ngram_model_dmp_read(config, file_name, lmath);
143 E_ERROR(
"language model file type not supported\n");
154 lw = cmd_ln_float32_r(config,
"-lw");
156 wip = cmd_ln_float32_r(config,
"-wip");
158 uw = cmd_ln_float32_r(config,
"-uw");
179 return ngram_model_arpa_write(model, file_name);
181 return ngram_model_dmp_write(model, file_name);
183 E_ERROR(
"language model file type not supported\n");
186 E_ERROR(
"language model file type not supported\n");
194 int32 n, int32 n_unigram)
203 if (base->
lmath != lmath) {
218 for (i = 0; i < base->
n_words; ++i) {
266 for (i = 0; i < model->
n_words; ++i) {
277 for (j = 0; j < lmclass->
n_words; ++j) {
280 for (j = 0; j < lmclass->
n_hash; ++j) {
281 if (lmclass->nword_hash[j].
wid != -1) {
288 ngram_class_free(model->
classes[i]);
312 for (i = 0; i < model->
n_words; ++i) {
321 if (outstr[0] ==
'<' || outstr[0] ==
'[') {
340 E_WARN(
"Duplicate word in dictionary after conversion: %s\n",
346 model->
wid = new_wid;
362 if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
374 for (i = 0; i < model->
n_words; ++i) {
375 if (strlen(model->
word_str[i]) > maxlen)
376 maxlen = strlen(model->
word_str[i]);
383 maxlen = maxlen *
sizeof(int) + 15;
388 for (i = 0; i < model->
n_words; ++i) {
389 ICONV_CONST
char *in;
391 size_t inleft, outleft, result;
394 in = (ICONV_CONST
char *)model->
word_str[i];
400 while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
401 if (errno != E2BIG) {
410 iconv(ic, NULL, NULL, NULL, NULL);
415 in = (ICONV_CONST
char *)model->
word_str[i];
420 if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
421 if (errno != E2BIG) {
430 iconv(ic, NULL, NULL, NULL, NULL);
435 goto start_conversion;
438 result = maxlen - outleft;
450 memcpy(model->
word_str[i], outbuf, result);
456 E_WARN(
"Duplicate word in dictionary after conversion: %s\n",
464 model->
wid = new_wid;
478 float32 lw, float32 wip, float32 uw)
487 if (out_log_wip) *out_log_wip = model->
log_wip;
488 if (out_log_uw) *out_log_uw = model->
log_uw;
495 int32 n_hist, int32 *n_used)
497 int32 score, class_weight = 0;
505 if (NGRAM_IS_CLASSWID(wid)) {
508 class_weight = ngram_class_prob(lmclass, wid);
509 if (class_weight == 1)
513 for (i = 0; i < n_hist; ++i) {
517 score = (*model->
funcs->
score)(model, wid, history, n_hist, n_used);
520 return score + class_weight;
533 va_start(history, word);
535 while ((hword = va_arg(history,
const char *)) != NULL)
540 va_start(history, word);
542 while ((hword = va_arg(history,
const char *)) != NULL) {
543 histid[n_hist] =
ngram_wid(model, hword);
549 histid, n_hist, &n_used);
571 int32 n_hist, int32 *n_used)
573 int32 prob, class_weight = 0;
581 if (NGRAM_IS_CLASSWID(wid)) {
584 class_weight = ngram_class_prob(lmclass, wid);
585 if (class_weight == 1)
589 for (i = 0; i < n_hist; ++i) {
596 return prob + class_weight;
609 va_start(history, word);
611 while ((hword = va_arg(history,
const char *)) != NULL)
616 va_start(history, word);
618 while ((hword = va_arg(history,
const char *)) != NULL) {
619 histid[n_hist] =
ngram_wid(model, hword);
625 histid, n_hist, &n_used);
638 prob = (int32)(prob / base->
lw);
680 int m,
int successor)
710 va_start(history, word);
712 while ((hword = va_arg(history,
const char *)) != NULL)
717 va_start(history, word);
719 while ((hword = va_arg(history,
const char *)) != NULL) {
720 histid[n_hist] =
ngram_wid(model, hword);
733 if (n_hist >= model->
n)
737 return (*model->
funcs->
iter)(model, wid, history, n_hist);
744 if (itor->
m == itor->model->
n - 1)
754 return (*itor->model->
funcs->
iter_get)(itor, out_score, out_bowt);
785 wid = NGRAM_BASEWID(wid);
805 wid = NGRAM_CLASSWID(wid, classid);
809 E_ERROR(
"Duplicate definition of word %s\n", word);
823 E_ERROR(
"Hash insertion failed for word %s => %p (should not happen)\n",
833 const char *word, float32 weight)
837 wid = ngram_add_word_internal(model, word, -1);
866 lmclass->nword_hash = NULL;
869 for (gn = classwords; gn; gn = gnode_next(gn)) {
870 tprob += gnode_float32(gn);
872 if (tprob > 1.1 || tprob < 0.9) {
873 E_WARN(
"Total class probability is %f, will normalize\n", tprob);
874 for (gn = classwords; gn; gn = gnode_next(gn)) {
875 gn->data.fl /= tprob;
878 for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
886 ngram_class_add_word(
ngram_class_t *lmclass, int32 wid, int32 lweight)
890 if (lmclass->nword_hash == NULL) {
892 lmclass->nword_hash =
ckd_malloc(NGRAM_HASH_SIZE *
sizeof(*lmclass->nword_hash));
893 memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE *
sizeof(*lmclass->nword_hash));
894 lmclass->
n_hash = NGRAM_HASH_SIZE;
900 hash = wid & (lmclass->
n_hash - 1);
901 if (lmclass->nword_hash[hash].
wid == -1) {
903 lmclass->nword_hash[hash].
wid = wid;
904 lmclass->nword_hash[hash].
prob1 = lweight;
911 while (lmclass->nword_hash[hash].
next != -1)
912 hash = lmclass->nword_hash[hash].
next;
917 lmclass->nword_hash =
ckd_realloc(lmclass->nword_hash,
918 lmclass->
n_hash * 2 *
sizeof(*lmclass->nword_hash));
919 memset(lmclass->nword_hash + lmclass->
n_hash,
920 0xff, lmclass->
n_hash *
sizeof(*lmclass->nword_hash));
927 for (next = 0; next < lmclass->
n_hash; ++next)
928 if (lmclass->nword_hash[next].
wid == -1)
931 assert(next != lmclass->
n_hash);
933 lmclass->nword_hash[next].
wid = wid;
934 lmclass->nword_hash[next].
prob1 = lweight;
935 lmclass->nword_hash[hash].
next = next;
951 const char *classname,
956 int32 classid, tag_wid, wid, i, scale;
964 E_ERROR(
"No such word or class tag: %s\n", classname);
967 for (classid = 0; classid < model->
n_classes; ++classid) {
973 E_ERROR(
"Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
976 lmclass = model->
classes[classid];
979 wid = ngram_add_word_internal(model, word, classid);
989 for (i = 0; i < lmclass->
n_words; ++i)
990 lmclass->
prob1[i] += scale;
991 for (i = 0; i < lmclass->
n_hash; ++i)
992 if (lmclass->nword_hash[i].
wid != -1)
993 lmclass->nword_hash[i].
prob1 += scale;
996 return ngram_class_add_word(lmclass, wid,
logmath_log(model->
lmath, fprob));
1001 const char *classname,
1002 float32 classweight,
1004 const float32 *weights,
1009 int32 i, start_wid = -1;
1010 int32 classid, tag_wid;
1020 E_ERROR(
"Number of classes cannot exceed 128 (sorry)\n");
1024 for (i = 0; i < n_words; ++i) {
1027 wid = ngram_add_word_internal(model, words[i], classid);
1030 if (start_wid == -1)
1031 start_wid = NGRAM_BASEWID(wid);
1035 lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
1037 if (lmclass == NULL)
1046 model->
classes[classid] = lmclass;
1053 int32 base_wid = NGRAM_BASEWID(wid);
1055 if (base_wid < lmclass->start_wid
1060 hash = wid & (lmclass->
n_hash - 1);
1061 while (hash != -1 && lmclass->nword_hash[hash].
wid != wid)
1062 hash = lmclass->nword_hash[hash].
next;
1065 return lmclass->nword_hash[hash].
prob1;
1073 read_classdef_file(
hash_table_t *classes,
const char *file_name)
1082 char *classname = NULL;
1084 if ((fp =
fopen_comp(file_name,
"r", &is_pipe)) == NULL) {
1085 E_ERROR(
"File %s not found\n", file_name);
1095 if (fgets(line,
sizeof(line), fp) == NULL)
1104 if (n_words == 2 && 0 == strcmp(wptr[0],
"END")) {
1109 if (classname == NULL || 0 != strcmp(wptr[1], classname))
1118 classdef->words =
ckd_calloc(classdef->n_words,
1119 sizeof(*classdef->words));
1120 classdef->weights =
ckd_calloc(classdef->n_words,
1121 sizeof(*classdef->weights));
1123 weight = classprobs;
1124 for (i = 0; i < classdef->n_words; ++i) {
1126 classdef->weights[i] = gnode_float32(weight);
1127 word = gnode_next(word);
1128 weight = gnode_next(weight);
1133 classdef_free(classdef);
1148 fprob = (float32)
atof_c(wptr[1]);
1158 if (n_words == 2 && 0 == strcmp(wptr[0],
"LMCLASS")) {
1172 for (gn = classwords; gn; gn = gnode_next(gn))
1185 for (i = 0; i < classdef->n_words; ++i)
1195 const char *file_name)
1203 if (read_classdef_file(classes, file_name) < 0) {
1210 for (gn = hl; gn; gn = gnode_next(gn)) {
1217 classdef->n_words) < 0)
1223 for (gn = hl; gn; gn = gnode_next(gn)) {
1226 classdef_free(he->
val);