52 #include "sphinxbase/byteorder.h"
55 #include "ngram_model_dmp.h"
/* Magic header string of a dump file.  The reader compares the stored
 * header length with strlen(darpa_hdr)+1 and the header bytes with this
 * string; the writer emits it including its trailing NUL. */
57 static const char darpa_hdr[] =
"Darpa Trigram LM";
/* Entry of lm3g.tseg_base[] selected by bigram index b; the table is
 * indexed by b >> LOG_BG_SEG_SZ. */
60 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* The `bigrams` field of unigram u in model m (presumably the index of
 * that unigram's first bigram -- confirm against the unigram type). */
61 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
/* TSEG_BASE for bigram b plus that bigram's `trigrams` field. */
62 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
65 new_unigram_table(int32 n_ug)
71 for (i = 0; i < n_ug; i++) {
72 table[i].
prob1.f = -99.0;
79 ngram_model_dmp_read(
cmd_ln_t *config,
80 const char *file_name,
88 int32 i, j, k, vn, n, ts;
97 char *map_base = NULL;
98 size_t offset = 0, filesize;
105 if ((fp =
fopen_comp(file_name,
"rb", &is_pipe)) == NULL) {
106 E_ERROR(
"Dump file %s not found\n", file_name);
110 if (is_pipe && do_mmap) {
111 E_WARN(
"Dump file is compressed, will not use memory-mapped I/O\n");
116 if (fread(&k,
sizeof(k), 1, fp) != 1)
118 if (k != strlen(darpa_hdr)+1) {
120 if (k != strlen(darpa_hdr)+1) {
121 E_ERROR(
"Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
126 if (fread(str, 1, k, fp) != (
size_t) k) {
127 E_ERROR(
"Cannot read header\n");
/* Reject the file when the header bytes differ from the expected magic
 * string.  The message has two %s conversions, so it is given two
 * arguments: the expected header and the name of the file being read. */
130 if (strncmp(str, darpa_hdr, k) != 0) {
131 E_ERROR(
"Wrong header %s: %s is not a dump file\n", darpa_hdr, file_name);
138 (
"Byteswapping required, will not use memory-mapped I/O for LM file\n");
142 E_INFO(
"Will use memory-mapped I/O for LM file\n");
143 #ifdef __ADSPBLACKFIN__
144 E_FATAL(
"memory mapping is not supported at the moment.");
150 if (fread(&k,
sizeof(k), 1, fp) != 1)
152 if (do_swap) SWAP_INT32(&k);
153 if (fread(str, 1, k, fp) != (
size_t) k) {
154 E_ERROR(
"Cannot read LM filename in header\n");
159 if (fread(&vn,
sizeof(vn), 1, fp) != 1)
161 if (do_swap) SWAP_INT32(&vn);
164 if (fread(&ts,
sizeof(ts), 1, fp) != 1)
166 if (do_swap) SWAP_INT32(&ts);
170 if (fread(&k,
sizeof(k), 1, fp) != 1)
172 if (do_swap) SWAP_INT32(&k);
175 if (fread(str, 1, k, fp) != (
size_t) k) {
176 E_ERROR(
"fread(word) failed\n");
181 if (fread(&n_unigram,
sizeof(n_unigram), 1, fp) != 1)
183 if (do_swap) SWAP_INT32(&n_unigram);
190 if (fread(&n_bigram,
sizeof(n_bigram), 1, fp) != 1)
192 if (do_swap) SWAP_INT32(&n_bigram);
193 if (fread(&n_trigram,
sizeof(n_trigram), 1, fp) != 1)
195 if (do_swap) SWAP_INT32(&n_trigram);
196 E_INFO(
"ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
203 else if (n_bigram > 0)
207 ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
214 model->
lm3g.unigrams = new_unigram_table(n_unigram + 1);
215 ugptr = model->
lm3g.unigrams;
216 for (i = 0; i <= n_unigram; ++i) {
218 if (fread(ugptr,
sizeof(int32), 1, fp) != 1) {
219 E_ERROR(
"fread(mapid[%d]) failed\n", i);
223 if (fread(ugptr,
sizeof(
unigram_t), 1, fp) != 1) {
224 E_ERROR(
"fread(unigrams) failed\n");
231 SWAP_INT32(&ugptr->
prob1.l);
232 SWAP_INT32(&ugptr->
bo_wt1.l);
238 E_DEBUG(2, (
"ug %d: prob %d bo %d bigrams %d\n",
242 E_INFO(
"%8d = LM.unigrams(+trailer) read\n", n_unigram);
247 fseek(fp, 0, SEEK_END);
248 filesize = ftell(fp);
249 fseek(fp, offset, SEEK_SET);
253 E_WARN(
"-mmap specified, but tseg_base is not word-aligned. Will not memory-map.\n");
271 offset += (n_bigram + 1) *
sizeof(
bigram_t);
274 model->
lm3g.bigrams =
276 if (fread(model->
lm3g.bigrams,
sizeof(
bigram_t), n_bigram + 1, fp)
277 != (
size_t) n_bigram + 1) {
278 E_ERROR(
"fread(bigrams) failed\n");
282 for (i = 0, bgptr = model->
lm3g.bigrams; i <= n_bigram;
284 SWAP_INT16(&bgptr->
wid);
285 SWAP_INT16(&bgptr->
prob2);
286 SWAP_INT16(&bgptr->
bo_wt2);
291 E_INFO(
"%8d = LM.bigrams(+trailer) read\n", n_bigram);
301 model->
lm3g.trigrams =
305 != (
size_t) n_trigram) {
306 E_ERROR(
"fread(trigrams) failed\n");
310 for (i = 0, tgptr = model->
lm3g.trigrams; i < n_trigram;
312 SWAP_INT16(&tgptr->
wid);
313 SWAP_INT16(&tgptr->
prob3);
317 E_INFO(
"%8d = LM.trigrams read\n", n_trigram);
326 fseek(fp, offset, SEEK_SET);
327 if (fread(&k,
sizeof(k), 1, fp) != 1)
329 if (do_swap) SWAP_INT32(&k);
333 E_ERROR(
"fread(prob2) failed\n");
336 for (i = 0; i < k; i++) {
342 E_INFO(
"%8d = LM.prob2 entries read\n", k);
347 if (fread(&k,
sizeof(k), 1, fp) != 1)
349 if (do_swap) SWAP_INT32(&k);
353 E_ERROR(
"fread(bo_wt2) failed\n");
356 for (i = 0; i < k; i++) {
362 E_INFO(
"%8d = LM.bo_wt2 entries read\n", k);
367 if (fread(&k,
sizeof(k), 1, fp) != 1)
369 if (do_swap) SWAP_INT32(&k);
373 E_ERROR(
"fread(prob3) failed\n");
376 for (i = 0; i < k; i++) {
382 E_INFO(
"%8d = LM.prob3 entries read\n", k);
390 memcpy(&k, map_base + offset,
sizeof(k));
391 offset +=
sizeof(int32);
393 offset += k *
sizeof(int32);
396 k = (n_bigram + 1) / BG_SEG_SZ + 1;
397 if (fread(&k,
sizeof(k), 1, fp) != 1)
399 if (do_swap) SWAP_INT32(&k);
403 E_ERROR(
"fread(tseg_base) failed\n");
407 for (i = 0; i < k; i++)
410 E_INFO(
"%8d = LM.tseg_base entries read\n", k);
415 memcpy(&k, map_base + offset,
sizeof(k));
416 offset +=
sizeof(int32);
417 tmp_word_str = (
char *) (map_base + offset);
422 if (fread(&k,
sizeof(k), 1, fp) != 1)
424 if (do_swap) SWAP_INT32(&k);
426 if (fread(tmp_word_str, 1, k, fp) != (
size_t) k) {
427 E_ERROR(
"fread(word-string) failed\n");
433 for (i = 0, j = 0; i < k; i++)
434 if (tmp_word_str[i] ==
'\0')
436 if (j != n_unigram) {
437 E_ERROR(
"Error reading word strings (%d doesn't match n_unigrams %d)\n",
445 for (i = 0; i < n_unigram; i++) {
446 base->
word_str[i] = tmp_word_str + j;
448 (
void *)(
long)i) != (
void *)(
long)i) {
456 for (i = 0; i < n_unigram; i++) {
459 (
void *)(
long)i) != (
void *)(
long)i) {
466 E_INFO(
"%8d = ascii word strings read\n", i);
489 int i, bgcount, tgcount, seg;
491 if (base->
funcs == &ngram_model_dmp_funcs) {
492 E_INFO(
"Using existing DMP model.\n");
497 E_INFO(
"Building DMP model...\n");
499 newbase = &model->
base;
500 ngram_model_init(newbase, &ngram_model_dmp_funcs,
509 model->
lm3g.unigrams = new_unigram_table(newbase->
n_counts[0] + 1);
518 model->
lm3g.unigrams[wids[0]].
prob1.l = prob1;
519 model->
lm3g.unigrams[wids[0]].
bo_wt1.l = bo_wt1;
522 newbase->
word_str[wids[0]], wids[0]))
524 E_WARN(
"Duplicate word in dictionary: %s\n", newbase->
word_str[wids[0]]);
535 init_sorted_list(&sorted_prob2);
536 if (newbase->
n > 2) {
537 init_sorted_list(&sorted_bo_wt2);
538 init_sorted_list(&sorted_prob3);
542 if (newbase->
n > 2) {
552 for (i = 0; i < newbase->
n_counts[0]; ++i) {
554 bgcount = bgptr - model->
lm3g.bigrams;
557 E_DEBUG(2, (
"unigram %d: %s => bigram %d\n", i, newbase->
word_str[i], bgcount));
568 assert (bgptr - model->
lm3g.bigrams < newbase->
n_counts[1]);
570 bgptr->
wid = wids[1];
571 bgptr->
prob2 = sorted_id(&sorted_prob2, &prob2);
572 if (newbase->
n > 2) {
573 tgcount = (tgptr - model->
lm3g.trigrams);
574 bgcount = (bgptr - model->
lm3g.bigrams);
577 bgptr->
bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
581 seg = bgcount >> LOG_BG_SEG_SZ;
585 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
589 E_DEBUG(2, (
"bigram %d %s %s => trigram %d:%d\n",
600 assert(tgptr - model->
lm3g.trigrams < newbase->
n_counts[2]);
602 tgptr->
wid = wids[2];
603 tgptr->
prob3 = sorted_id(&sorted_prob3, &prob3);
604 E_DEBUG(2, (
"trigram %d %s %s %s => prob %d\n",
616 bgcount = bgptr - model->
lm3g.bigrams;
617 tgcount = tgptr - model->
lm3g.trigrams;
618 seg = bgcount >> LOG_BG_SEG_SZ;
619 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
627 model->
lm3g.
prob2 = vals_in_sorted_list(&sorted_prob2);
630 free_sorted_list(&sorted_prob2);
631 if (newbase->
n > 2) {
634 model->
lm3g.
bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
635 free_sorted_list(&sorted_bo_wt2);
639 model->
lm3g.
prob3 = vals_in_sorted_list(&sorted_prob3);
642 free_sorted_list(&sorted_prob3);
652 fwrite_int32(FILE *fh, int32 val)
654 fwrite(&val, 4, 1, fh);
664 fwrite(&bogus, 4, 1, fh);
667 fwrite(&log10val, 4, 1, fh);
669 fwrite(&log10val, 4, 1, fh);
676 fwrite(bg,
sizeof(*bg), 1, fh);
682 fwrite(tg,
sizeof(*tg), 1, fh);
/* Human-readable description of the dump file layout, one string per
 * entry.  ngram_model_dmp_write_fmtdesc() walks this table until it
 * reaches a NULL entry and writes each string with its trailing NUL.
 * NOTE(review): the "lm_t.bcount+1 bigrams" entry opens "(including
 * sentinel" without a matching ")".  The text is emitted into files at
 * runtime, so it is left untouched here -- confirm before correcting. */
687 static char const *fmtdesc[] = {
688 "BEGIN FILE FORMAT DESCRIPTION",
689 "Header string length (int32) and string (including trailing 0)",
690 "Original LM filename string-length (int32) and filename (including trailing 0)",
691 "(int32) version number (present iff value <= 0)",
692 "(int32) original LM file modification timestamp (iff version# present)",
693 "(int32) string-length and string (including trailing 0) (iff version# present)",
694 "... previous entry continued any number of times (iff version# present)",
695 "(int32) 0 (terminating sequence of strings) (iff version# present)",
696 "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
697 "(int32) lm_t.ucount (must be > 0)",
698 "(int32) lm_t.bcount",
699 "(int32) lm_t.tcount",
700 "lm_t.ucount+1 unigrams (including sentinel)",
701 "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
702 "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
703 "(int32) lm_t.n_prob2",
704 "(int32) lm_t.prob2[]",
705 "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
706 "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
707 "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
708 "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
709 "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
710 "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
711 "(int32) Sum(all word string-lengths, including trailing 0 for each)",
712 "All word strings (including trailing 0 for each)",
713 "END FILE FORMAT DESCRIPTION",
718 ngram_model_dmp_write_header(FILE * fh)
721 k = strlen(darpa_hdr) + 1;
723 fwrite(darpa_hdr, 1, k, fh);
727 ngram_model_dmp_write_lm_filename(FILE * fh,
const char *lmfile)
731 k = strlen(lmfile) + 1;
733 fwrite(lmfile, 1, k, fh);
/* Version number that ngram_model_dmp_write_version() writes to the
 * dump file, immediately before the modification time. */
736 #define LMDMP_VERSION_TG_16BIT -1
741 ngram_model_dmp_write_version(FILE * fh, int32 mtime)
743 fwrite_int32(fh, LMDMP_VERSION_TG_16BIT);
744 fwrite_int32(fh, mtime);
748 ngram_model_dmp_write_ngram_counts(FILE * fh,
ngram_model_t *model)
750 fwrite_int32(fh, model->
n_counts[0]);
751 fwrite_int32(fh, model->
n_counts[1]);
752 fwrite_int32(fh, model->
n_counts[2]);
756 ngram_model_dmp_write_fmtdesc(FILE * fh)
762 for (i = 0; fmtdesc[i] != NULL; i++) {
763 k = strlen(fmtdesc[i]) + 1;
765 fwrite(fmtdesc[i], 1, k, fh);
771 fwrite_int32(fh, 4-k);
772 fwrite(
"!!!!", 1, 4-k, fh);
778 ngram_model_dmp_write_unigram(FILE *fh,
ngram_model_t *model)
783 for (i = 0; i <= model->
n_counts[0]; i++) {
784 fwrite_ug(fh, &(lm->
lm3g.unigrams[i]), model->
lmath);
795 for (i = 0; i <= model->
n_counts[1]; i++) {
796 fwrite_bg(fh, &(lm->
lm3g.bigrams[i]));
802 ngram_model_dmp_write_trigram(FILE *fh,
ngram_model_t *model)
807 for (i = 0; i < model->
n_counts[2]; i++) {
808 fwrite_tg(fh, &(lm->
lm3g.trigrams[i]));
821 fwrite(&log10val, 4, 1, fh);
834 fwrite(&log10val, 4, 1, fh);
847 fwrite(&log10val, 4, 1, fh);
852 ngram_model_dmp_write_tg_segbase(FILE *fh,
ngram_model_t *model)
857 k = (model->
n_counts[1] + 1) / BG_SEG_SZ + 1;
859 for (i = 0; i < k; i++)
864 ngram_model_dmp_write_wordstr(FILE *fh,
ngram_model_t *model)
869 for (i = 0; i < model->
n_counts[0]; i++)
870 k += strlen(model->
word_str[i]) + 1;
872 for (i = 0; i < model->
n_counts[0]; i++)
874 strlen(model->
word_str[i]) + 1, fh);
879 const char *file_name)
886 model = ngram_model_dmp_build(base);
887 newbase = &model->
base;
891 if ((fh = fopen(file_name,
"wb")) == NULL) {
892 E_ERROR(
"Cannot create file %s\n", file_name);
895 ngram_model_dmp_write_header(fh);
896 ngram_model_dmp_write_lm_filename(fh, file_name);
897 ngram_model_dmp_write_version(fh, 0);
898 ngram_model_dmp_write_fmtdesc(fh);
899 ngram_model_dmp_write_ngram_counts(fh, newbase);
900 ngram_model_dmp_write_unigram(fh, newbase);
901 if (newbase->
n > 1) {
902 ngram_model_dmp_write_bigram(fh, newbase);
903 if (newbase->
n > 2) {
904 ngram_model_dmp_write_trigram(fh, newbase);
906 ngram_model_dmp_write_bgprob(fh, newbase);
907 if (newbase->
n > 2) {
908 ngram_model_dmp_write_tgbowt(fh, newbase);
909 ngram_model_dmp_write_tgprob(fh, newbase);
910 ngram_model_dmp_write_tg_segbase(fh, newbase);
913 ngram_model_dmp_write_wordstr(fh, newbase);
920 ngram_model_dmp_apply_weights(
ngram_model_t *base, float32 lw,
921 float32 wip, float32 uw)
924 lm3g_apply_weights(base, &model->
lm3g, lw, wip, uw);
/* Define the concrete model type, then textually include the shared
 * template source.
 * NOTE(review): presumably lm3g_templates.c uses NGRAM_MODEL_TYPE to
 * generate the lm3g_template_* functions listed in this file's function
 * table -- confirm against that file. */
931 #define NGRAM_MODEL_TYPE ngram_model_dmp_t
932 #include "lm3g_templates.c"
956 lm3g_tginfo_free(base, &model->
lm3g);
960 ngram_model_dmp_free,
961 ngram_model_dmp_apply_weights,
963 lm3g_template_raw_score,
964 lm3g_template_add_ug,
967 lm3g_template_mgrams,
968 lm3g_template_successors,
969 lm3g_template_iter_get,
970 lm3g_template_iter_next,
971 lm3g_template_iter_free