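/*
 * Evaluates an N-Gram language model on a transcript file (-lsn) or a
 * single text string (-text), reporting cross-entropy in bits, perplexity,
 * and counts of out-of-vocabulary words and context cues.
 *
 * Build sketch (the file names are placeholders, and this assumes
 * sphinxbase's pkg-config file puts its header directory on the include
 * path):
 *
 *   cc -o lm_eval lm_eval.c $(pkg-config --cflags --libs sphinxbase)
 */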
#include <logmath.h>
#include <ngram_model.h>
#include <cmd_ln.h>
#include <ckd_alloc.h>
#include <err.h>
#include <pio.h>
#include <strfuncs.h>

#include <stdio.h>
#include <string.h>
#include <math.h>

static const arg_t defn[] = {
    { "-help",
      ARG_BOOLEAN,
      "no",
      "Shows the usage of the tool" },

    { "-logbase",
      ARG_FLOAT64,
      "1.0001",
      "Base in which all log-likelihoods are calculated" },

    { "-lm",
      ARG_STRING,
      NULL,
      "Language model file" },

    { "-probdef",
      ARG_STRING,
      NULL,
      "Probability definition file for classes in the LM" },

    { "-lmctlfn",
      ARG_STRING,
      NULL,
      "Control file listing a set of language models" },

    { "-lmname",
      ARG_STRING,
      NULL,
      "Name of language model in -lmctlfn to use for all utterances" },

    { "-lsn",
      ARG_STRING,
      NULL,
      "Transcription file to evaluate" },

    { "-text",
      ARG_STRING,
      NULL,
      "Text string to evaluate" },

    { "-mmap",
      ARG_BOOLEAN,
      "no",
      "Use memory-mapped I/O for reading binary LM files" },

    { "-lw",
      ARG_FLOAT32,
      "1.0",
      "Language model weight" },

    { "-wip",
      ARG_FLOAT32,
      "1.0",
      "Word insertion probability" },

    { "-uw",
      ARG_FLOAT32,
      "1.0",
      "Unigram probability weight (interpolated with uniform distribution)" },

    { NULL, 0, NULL, NULL }
};
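
/*
 * Example invocations (a sketch; the binary and file names below are
 * illustrative, not part of this source):
 *
 *   lm_eval -lm model.lm.bin -lsn transcripts.lsn
 *   lm_eval -lm model.lm.bin -text "this is a test sentence"
 *   lm_eval -lm model.lm.bin -lw 9.5 -wip 0.5 -lsn transcripts.lsn
 */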

static int
calc_entropy(ngram_model_t *lm, char **words, int32 n,
             int32 *out_n_ccs, int32 *out_n_oovs)
{
    int32 *wids;
    int32 startwid;
    int32 i, ch, nccs, noovs, unk;

    if (n == 0)
        return 0;

    unk = ngram_unknown_wid(lm);

    /* Reverse the utterance into an array of word IDs, since
     * ngram_ng_score() takes its history most-recent-first. */
    wids = ckd_calloc(n, sizeof(*wids));
    for (i = 0; i < n; ++i)
        wids[n-i-1] = ngram_wid(lm, words[i]);

    startwid = ngram_wid(lm, "<s>");

    /* Score each word, using the remainder of the (reversed) array
     * as its history. */
    ch = noovs = nccs = 0;
    for (i = 0; i < n; ++i) {
        int32 n_used;
        int32 prob;

        /* Skip <s>, which is a context cue, not a predicted word. */
        if (wids[i] == startwid) {
            ++nccs;
            continue;
        }

        /* Skip and count out-of-vocabulary words. */
        if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
            ++noovs;
            continue;
        }

        /* Accumulate the negative log probability of this N-gram. */
        prob = ngram_ng_score(lm,
                              wids[i], wids + i + 1,
                              n - i - 1, &n_used);
        ch -= prob;
    }
    ckd_free(wids);

    if (out_n_ccs) *out_n_ccs = nccs;
    if (out_n_oovs) *out_n_oovs = noovs;

    /* Cross-entropy CH = -1/N sum log P(w|h), with context cues and
     * OOVs excluded from N. */
    n -= (nccs + noovs);
    if (n <= 0)
        return 0;
    return ch / n;
}
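
/*
 * calc_entropy() returns the mean negative log probability in the logmath
 * base b selected by -logbase; the callers below rescale it.  Illustrative
 * numbers (not taken from any real model): with b = 1.0001, a return value
 * of 69315 is about 6.93 nats, i.e. 69315 * ln(1.0001)/ln(2) ~= 10 bits,
 * giving a perplexity of 2^10 = 1024.
 */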

static void
evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
{
    FILE *fh;
    lineiter_t *litor;
    int32 nccs, noovs, nwords;
    float64 ch, log_to_log2;

    if ((fh = fopen(lsnfn, "r")) == NULL)
        E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);

    /* Conversion factor from the logmath base to log base 2 (bits). */
    log_to_log2 = log(logmath_get_base(lmath)) / log(2);
    nccs = noovs = nwords = 0;
    ch = 0.0;
    for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
        char **words;
        int32 n, tmp_ch, tmp_noovs, tmp_nccs;

        n = str2words(litor->buf, NULL, 0);
        if (n < 0)
            E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
        if (n == 0)
            continue;
        words = ckd_calloc(n, sizeof(*words));
        str2words(litor->buf, words, n);

        /* Remove a trailing utterance ID of the form "(uttid)". */
        if (words[n-1][0] == '('
            && words[n-1][strlen(words[n-1])-1] == ')')
            n = n - 1;

        tmp_ch = calc_entropy(lm, words, n, &tmp_nccs, &tmp_noovs);

        /* Convert the per-utterance mean back to a sum, in bits. */
        ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
        nccs += tmp_nccs;
        noovs += tmp_noovs;
        nwords += n;

        ckd_free(words);
    }
    fclose(fh);

    /* Overall cross-entropy in bits per word. */
    ch /= (nwords - nccs - noovs);
    printf("cross-entropy: %f bits\n", ch);

    /* Perplexity is 2 to the power of the cross-entropy in bits. */
    printf("perplexity: %f\n", pow(2.0, ch));

    /* Report OOVs and context cues. */
    printf("%d words evaluated\n", nwords);
    printf("%d OOVs (%.2f%%), %d context cues removed\n",
           noovs, (double)noovs / nwords * 100, nccs);
}
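
/*
 * Transcript lines are whitespace-separated words with an optional trailing
 * "(utterance_id)" token, which is stripped above before scoring.  An
 * illustrative line (the ID is a placeholder):
 *
 *   hello world (utt_001)
 */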

static void
evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
{
    char *textfoo;
    char **words;
    int32 n, ch, noovs, nccs;

    /* str2words() modifies its input, so work on a copy of the text. */
    textfoo = ckd_salloc(text);
    n = str2words(textfoo, NULL, 0);
    if (n < 0)
        E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
    if (n == 0) {
        ckd_free(textfoo);
        return;
    }
    words = ckd_calloc(n, sizeof(*words));
    str2words(textfoo, words, n);

    ch = calc_entropy(lm, words, n, &nccs, &noovs);

    printf("input: %s\n", text);
    printf("cross-entropy: %f bits\n",
           ch * log(logmath_get_base(lmath)) / log(2));

    /* Exponentiating the mean score in the logmath base gives the
     * perplexity directly. */
    printf("perplexity: %f\n", logmath_exp(lmath, ch));

    /* Report OOVs and context cues. */
    printf("%d words evaluated\n", n);
    printf("%d OOVs, %d context cues removed\n",
           noovs, nccs);

    ckd_free(textfoo);
    ckd_free(words);
}
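
/*
 * Note that evaluate_file() and evaluate_string() compute the same quantity
 * two ways: 2 raised to the cross-entropy in bits, and the logmath base
 * raised to the mean score, respectively.  These agree because
 * PPL = b^(mean -log_b P) = 2^(mean -log_2 P).
 */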

int
main(int argc, char *argv[])
{
    cmd_ln_t *config;
    ngram_model_t *lm = NULL;
    logmath_t *lmath;
    const char *lmfn, *probdefn, *lsnfn, *text;

    if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
        return 1;

    /* Create the log math object in the requested base. */
    if ((lmath = logmath_init
         (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
        E_FATAL("Failed to initialize log math\n");
    }

    /* Load the language model, read an optional class probability
     * definition file, and apply the scaling parameters. */
    lmfn = cmd_ln_str_r(config, "-lm");
    if (lmfn == NULL
        || (lm = ngram_model_read(config, lmfn,
                                  NGRAM_AUTO, lmath)) == NULL) {
        E_FATAL("Failed to load language model from %s\n",
                cmd_ln_str_r(config, "-lm"));
    }
    if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
        ngram_model_read_classdef(lm, probdefn);
    ngram_model_apply_weights(lm,
                              cmd_ln_float32_r(config, "-lw"),
                              cmd_ln_float32_r(config, "-wip"),
                              cmd_ln_float32_r(config, "-uw"));

    /* Evaluate either a transcript file or a single text string
     * (the transcript file takes precedence). */
    lsnfn = cmd_ln_str_r(config, "-lsn");
    text = cmd_ln_str_r(config, "-text");
    if (lsnfn) {
        evaluate_file(lm, lmath, lsnfn);
    }
    else if (text) {
        evaluate_string(lm, lmath, text);
    }

    ngram_model_free(lm);
    logmath_free(lmath);
    cmd_ln_free_r(config);
    return 0;
}