• Main Page
  • Related Pages
  • Data Structures
  • Files
  • File List
  • Globals

src/sphinx_adtools/cont_fileseg.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2001 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * cont_fileseg.c -- Read input file, filter silence regions, and segment into utterances.
00039  * 
00040  * HISTORY
00041  * 
00042  * $Log: cont_fileseg.c,v $
00043  * Revision 1.1.1.1  2006/05/23 18:45:02  dhuggins
00044  * re-importation
00045  *
00046  * Revision 1.13  2005/06/30 00:28:46  rkm
00047  * Kept within-utterance silences in rawmode
00048  *
00049  * 
00050  * 28-Jun-2005  M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00051  *              Modified to use new state variables in cont_ad_t.
00052  * 
00053  * Revision 1.12  2005/05/31 15:54:38  rkm
00054  * *** empty log message ***
00055  *
00056  * Revision 1.11  2005/05/24 20:56:58  rkm
00057  * Added min/max-noise parameters to cont_fileseg
00058  *
00059  * Revision 1.10  2005/05/13 23:28:43  egouvea
00060  * Changed null device to system dependent one: NUL for windows, /dev/null for everything else
00061  * 
00062  * $Log: cont_fileseg.c,v $
00063  * Revision 1.1.1.1  2006/05/23 18:45:02  dhuggins
00064  * re-importation
00065  *
00066  * Revision 1.13  2005/06/30 00:28:46  rkm
00067  * Kept within-utterance silences in rawmode
00068  *
00069  * Revision 1.12  2005/05/31 15:54:38  rkm
00070  * *** empty log message ***
00071  *
00072  * Revision 1.11  2005/05/24 20:56:58  rkm
00073  * Added min/max-noise parameters to cont_fileseg
00074  *
00075  * Revision 1.9  2005/02/13 01:29:48  rkm
00076  * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode
00077  *
00078  * Revision 1.8  2005/02/01 22:21:13  rkm
00079  * Added raw data logging, and raw data pass-through mode to cont_ad
00080  *
00081  * Revision 1.7  2004/07/16 00:57:11  egouvea
00082  * Added Ravi's implementation of FSG support.
00083  *
00084  * Revision 1.3  2004/06/25 14:58:05  rkm
00085  * *** empty log message ***
00086  *
00087  * Revision 1.2  2004/06/23 20:32:08  rkm
00088  * Exposed several cont_ad config parameters
00089  *
00090  * 
00091  * 27-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00092  *              Created.
00093  */
00094 
00095 #include <stdio.h>
00096 #include <stdlib.h>
00097 #include <string.h>
00098 #include <assert.h>
00099 #include <math.h>
00100 
00101 #include <prim_type.h>
00102 #include <ad.h>
00103 #include <cont_ad.h>
00104 #include <err.h>
00105 
00106 static FILE *infp;              /* File being segmented */
00107 static int32 swap;
00108 
00109 /* Max size read by file_ad_read function on each invocation, for debugging */
00110 static int32 max_ad_read_size;
00111 
00112 #if defined(WIN32) && !defined(GNUWINCE)
00113 #define NULL_DEVICE "NUL"
00114 #else
00115 #define NULL_DEVICE "/dev/null"
00116 #endif
00117 
00118 
00119 /*
00120  * Need to provide cont_ad_init with a read function to read the input file.
00121  * This is it.  The ad_rec_t *r argument is ignored since there is no A/D
00122  * device involved.
00123  */
00124 static int32
00125 file_ad_read(ad_rec_t * r, int16 * buf, int32 max)
00126 {
00127     int32 i, k;
00128 
00129     if (max > max_ad_read_size)
00130         max = max_ad_read_size;
00131 
00132     k = fread(buf, sizeof(int16), max, infp);
00133     if (swap) {
00134         for (i = 0; i < k; i++) {
00135             buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00);
00136         }
00137     }
00138 
00139     return ((k > 0) ? k : -1);
00140 }
00141 
00142 
/*
 * Print the command-line synopsis through the E_INFO logging macros and
 * terminate the program.  This function does not return.
 *
 *   pgm  Program name to show in the synopsis (normally argv[0]).
 *
 * The process always exits with status 0, including when the caller invoked
 * this after reporting an invalid argument.
 */
static void
usagemsg(char *pgm)
{
    E_INFO("Usage: %s \\\n", pgm);
    E_INFOCONT("\t[-? | -h | -help] \\\n");
    E_INFOCONT("\t[-d | -debug] \\\n");
    E_INFOCONT("\t[-sps <sampling-rate> (16000)] \\\n");
    E_INFOCONT("\t[-b | -byteswap] \\\n");
    E_INFOCONT
        ("\t[{-s | -silsep} <length-silence-separator(sec)> (0.5)] \\\n");
    E_INFOCONT("\t[-w | -writeseg] \\\n");
    E_INFOCONT("\t[-min-noise <min-noise>] \\\n");
    E_INFOCONT("\t[-max-noise <max-noise>] \\\n");
    E_INFOCONT("\t[-delta-sil <delta-sil>] \\\n");
    E_INFOCONT("\t[-delta-speech <delta-speech>] \\\n");
    E_INFOCONT("\t[-sil-onset <sil-onset>] \\\n");
    E_INFOCONT("\t[-speech-onset <speech-onset>] \\\n");
    E_INFOCONT("\t[-adapt-rate <adapt-rate>] \\\n");
    E_INFOCONT("\t[-max-adreadsize <ad_read_blksize>] \\\n");
    E_INFOCONT("\t[-c <copy-input-file>] \\\n");
    E_INFOCONT("\t[-r | -rawmode] \\\n");
    E_INFOCONT("\t-i <input-file>\n");

    exit(0);
}
00168 
00169 /*
00170  * Read specified input file, segment it into utterances wherever a silence segment of
00171  * a given minimum duration is encountered.  Filter out long silences.
00172  * Utterances are written to files named 00000000.raw, 00000001.raw, 00000002.raw, etc.
00173  */
00174 int
00175 main(int32 argc, char **argv)
00176 {
00177     cont_ad_t *cont;
00178     int32 uttid, uttlen, starttime, siltime, sps, debug, writeseg, rawmode;
00179     int16 buf[4096];
00180     char *infile, *copyfile, segfile[1024];
00181     FILE *fp;
00182     float endsil;
00183     ad_rec_t ad;
00184     int32 i, k;
00185     int32 winsize, leader, trailer;
00186     int32 orig_min_noise, orig_max_noise;
00187     int32 orig_delta_sil, orig_delta_speech;
00188     int32 orig_speech_onset, orig_sil_onset;
00189     int32 min_noise, max_noise;
00190     int32 delta_sil, delta_speech;
00191     int32 sil_onset, speech_onset;
00192     float32 orig_adapt_rate;
00193     float32 adapt_rate;
00194     int32 total_speech_samples;
00195     float32 total_speech_sec;
00196     FILE *rawfp;
00197 
00198     /* Set argument defaults */
00199     cont = NULL;
00200     sps = 16000;
00201     swap = 0;
00202     endsil = 0.5;
00203     writeseg = 0;
00204     min_noise = max_noise = -1;
00205     delta_sil = delta_speech = -1;
00206     sil_onset = speech_onset = -1;
00207     adapt_rate = -1.0;
00208     max_ad_read_size = (int32) 0x7ffffff0;
00209     debug = 0;
00210     infile = NULL;
00211     copyfile = NULL;
00212     rawfp = NULL;
00213     rawmode = 0;
00214 
00215     /* Parse arguments */
00216     for (i = 1; i < argc; i++) {
00217         if ((strcmp(argv[i], "-help") == 0)
00218             || (strcmp(argv[i], "-h") == 0)
00219             || (strcmp(argv[i], "-?") == 0)) {
00220             usagemsg(argv[0]);
00221         }
00222         else if ((strcmp(argv[i], "-debug") == 0)
00223                  || (strcmp(argv[i], "-d") == 0)) {
00224             debug = 1;
00225         }
00226         else if (strcmp(argv[i], "-sps") == 0) {
00227             i++;
00228             if ((i == argc)
00229                 || (sscanf(argv[i], "%d", &sps) != 1)
00230                 || (sps <= 0)) {
00231                 E_ERROR("Invalid -sps argument\n");
00232                 usagemsg(argv[0]);
00233             }
00234         }
00235         else if ((strcmp(argv[i], "-byteswap") == 0)
00236                  || (strcmp(argv[i], "-b") == 0)) {
00237             swap = 1;
00238         }
00239         else if ((strcmp(argv[i], "-silsep") == 0)
00240                  || (strcmp(argv[i], "-s") == 0)) {
00241             i++;
00242             if ((i == argc)
00243                 || (sscanf(argv[i], "%f", &endsil) != 1)
00244                 || (endsil <= 0.0)) {
00245                 E_ERROR("Invalid -silsep argument\n");
00246                 usagemsg(argv[0]);
00247             }
00248         }
00249         else if ((strcmp(argv[i], "-writeseg") == 0)
00250                  || (strcmp(argv[i], "-w") == 0)) {
00251             writeseg = 1;
00252         }
00253         else if (strcmp(argv[i], "-min-noise") == 0) {
00254             i++;
00255             if ((i == argc) ||
00256                 (sscanf(argv[i], "%d", &min_noise) != 1) ||
00257                 (min_noise < 0)) {
00258                 E_ERROR("Invalid -min-noise argument\n");
00259                 usagemsg(argv[0]);
00260             }
00261         }
00262         else if (strcmp(argv[i], "-max-noise") == 0) {
00263             i++;
00264             if ((i == argc) ||
00265                 (sscanf(argv[i], "%d", &max_noise) != 1) ||
00266                 (max_noise < 0)) {
00267                 E_ERROR("Invalid -max-noise argument\n");
00268                 usagemsg(argv[0]);
00269             }
00270         }
00271         else if (strcmp(argv[i], "-delta-sil") == 0) {
00272             i++;
00273             if ((i == argc) ||
00274                 (sscanf(argv[i], "%d", &delta_sil) != 1) ||
00275                 (delta_sil < 0)) {
00276                 E_ERROR("Invalid -delta-sil argument\n");
00277                 usagemsg(argv[0]);
00278             }
00279         }
00280         else if (strcmp(argv[i], "-delta-speech") == 0) {
00281             i++;
00282             if ((i == argc) ||
00283                 (sscanf(argv[i], "%d", &delta_speech) != 1) ||
00284                 (delta_speech < 0)) {
00285                 E_ERROR("Invalid -delta-speech argument\n");
00286                 usagemsg(argv[0]);
00287             }
00288         }
00289         else if (strcmp(argv[i], "-sil-onset") == 0) {
00290             i++;
00291             if ((i == argc) ||
00292                 (sscanf(argv[i], "%d", &sil_onset) != 1) ||
00293                 (sil_onset < 1)) {
00294                 E_ERROR("Invalid -sil-onset argument\n");
00295                 usagemsg(argv[0]);
00296             }
00297         }
00298         else if (strcmp(argv[i], "-speech-onset") == 0) {
00299             i++;
00300             if ((i == argc) ||
00301                 (sscanf(argv[i], "%d", &speech_onset) != 1) ||
00302                 (speech_onset < 1)) {
00303                 E_ERROR("Invalid -speech-onset argument\n");
00304                 usagemsg(argv[0]);
00305             }
00306         }
00307         else if (strcmp(argv[i], "-adapt-rate") == 0) {
00308             i++;
00309             if ((i == argc) ||
00310                 (sscanf(argv[i], "%f", &adapt_rate) != 1) ||
00311                 (adapt_rate < 0.0) || (adapt_rate > 1.0)) {
00312                 E_ERROR("Invalid -adapt-rate argument\n");
00313                 usagemsg(argv[0]);
00314             }
00315         }
00316         else if (strcmp(argv[i], "-max-adreadsize") == 0) {
00317             i++;
00318             if ((i == argc) ||
00319                 (sscanf(argv[i], "%d", &max_ad_read_size) != 1) ||
00320                 (max_ad_read_size < 1)) {
00321                 E_ERROR("Invalid -max-adreadsize argument\n");
00322                 usagemsg(argv[0]);
00323             }
00324         }
00325         else if (strcmp(argv[i], "-c") == 0) {
00326             i++;
00327             if (i == argc) {
00328                 E_ERROR("Invalid -c argument\n");
00329                 usagemsg(argv[0]);
00330             }
00331             copyfile = argv[i];
00332         }
00333         else if ((strcmp(argv[i], "-rawmode") == 0)
00334                  || (strcmp(argv[i], "-r") == 0)) {
00335             rawmode = 1;
00336         }
00337         else if (strcmp(argv[i], "-i") == 0) {
00338             i++;
00339             if (i == argc) {
00340                 E_ERROR("Invalid -i argument\n");
00341                 usagemsg(argv[0]);
00342             }
00343             infile = argv[i];
00344         }
00345         else {
00346             usagemsg(argv[0]);
00347         }
00348     }
00349 
00350     if (infile == NULL) {
00351         E_ERROR("No input file specified\n");
00352         usagemsg(argv[0]);
00353     }
00354 
00355     if ((infp = fopen(infile, "rb")) == NULL)
00356         E_FATAL("fopen(%s,rb) failed\n", infile);
00357 
00358     /*
00359      * Associate continuous listening module with opened input file and read function.
00360      * No A/D device is involved, but need to fill in ad->sps.
00361      * Calibrate input data using first few seconds of file, but then rewind it!!
00362      */
00363     ad.sps = sps;
00364     ad.bps = sizeof(int16);
00365     if (!rawmode)
00366         cont = cont_ad_init(&ad, file_ad_read);
00367     else
00368         cont = cont_ad_init_rawmode(&ad, file_ad_read);
00369 
00370     printf("Calibrating ...");
00371     fflush(stdout);
00372     if (cont_ad_calib(cont) < 0)
00373         printf(" failed; file too short?\n");
00374     else
00375         printf(" done\n");
00376     rewind(infp);
00377 
00378     /* Convert desired min. inter-utterance silence duration to #samples */
00379     siltime = (int32) (endsil * sps);
00380 
00381     /* Enable writing raw input to output by the cont module if specified */
00382     if (copyfile) {
00383         if ((rawfp = fopen(copyfile, "wb")) == NULL)
00384             E_ERROR("fopen(%s,wb) failed; not dumping raw file\n",
00385                     copyfile);
00386         else
00387             cont_ad_set_rawfp(cont, rawfp);
00388     }
00389 
00390     cont_ad_get_params(cont,
00391                        &orig_delta_sil, &orig_delta_speech,
00392                        &orig_min_noise, &orig_max_noise,
00393                        &winsize,
00394                        &orig_speech_onset, &orig_sil_onset,
00395                        &leader, &trailer, &orig_adapt_rate);
00396 
00397     E_INFO("Default parameters:\n");
00398     E_INFOCONT("\tmin-noise = %d, max-noise = %d\n",
00399                orig_min_noise, orig_max_noise);
00400     E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n",
00401                orig_delta_sil, orig_delta_speech);
00402     E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n",
00403                orig_sil_onset, orig_speech_onset);
00404     E_INFOCONT("\tadapt_rate = %.3f\n", orig_adapt_rate);
00405 
00406     if (min_noise < 0)
00407         min_noise = orig_min_noise;
00408     if (max_noise < 0)
00409         max_noise = orig_max_noise;
00410     if (delta_sil < 0)
00411         delta_sil = orig_delta_sil;
00412     if (delta_speech < 0)
00413         delta_speech = orig_delta_speech;
00414     if (sil_onset < 0)
00415         sil_onset = orig_sil_onset;
00416     if (speech_onset < 0)
00417         speech_onset = orig_speech_onset;
00418     if (adapt_rate < 0.0)
00419         adapt_rate = orig_adapt_rate;
00420 
00421     cont_ad_set_params(cont,
00422                        delta_sil, delta_speech,
00423                        min_noise, max_noise,
00424                        winsize,
00425                        speech_onset, sil_onset,
00426                        leader, trailer, adapt_rate);
00427 
00428     E_INFO("Current parameters:\n");
00429     E_INFOCONT("\tmin-noise = %d, max-noise = %d\n", min_noise, max_noise);
00430     E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n", delta_sil,
00431                delta_speech);
00432     E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n", sil_onset,
00433                speech_onset);
00434     E_INFOCONT("\tadapt_rate = %.3f\n", adapt_rate);
00435 
00436     E_INFO("Sampling rate: %d", sps);
00437     E_INFOCONT("; Byteswap: %s", swap ? "Yes" : "No");
00438     E_INFOCONT("; Max ad-read size: %d\n", max_ad_read_size);
00439 
00440     if (debug)
00441         cont_ad_set_logfp(cont, stdout);
00442 
00443     total_speech_samples = 0;
00444     total_speech_sec = 0.0;
00445 
00446     uttid = 0;
00447     uttlen = 0;
00448     starttime = 0;
00449     fp = NULL;
00450 
00451     /* Process data */
00452     for (;;) {
00453         /* Get audio data from continuous listening module */
00454         k = cont_ad_read(cont, buf, 4096);
00455 
00456         if (k < 0) {            /* End of input audio file; close any open output file and exit */
00457             if (fp != NULL) {
00458                 fclose(fp);
00459                 fp = NULL;
00460 
00461                 printf
00462                     ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
00463                      uttid, (double) starttime / (double) sps,
00464                      (double) (starttime + uttlen) / (double) sps,
00465                      (double) uttlen / (double) sps, uttlen);
00466                 fflush(stdout);
00467 
00468                 total_speech_samples += uttlen;
00469                 total_speech_sec += (double) uttlen / (double) sps;
00470 
00471                 uttid++;
00472             }
00473 
00474             break;
00475         }
00476 
00477         if (cont->state == CONT_AD_STATE_SIL) { /* Silence data got */
00478             if (fp != NULL) {   /* Currently in an utterance */
00479                 if (cont->seglen > siltime) {   /* Long enough silence detected; end the utterance */
00480                     fclose(fp);
00481                     fp = NULL;
00482 
00483                     printf
00484                         ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
00485                          uttid, (double) starttime / (double) sps,
00486                          (double) (starttime + uttlen) / (double) sps,
00487                          (double) uttlen / (double) sps, uttlen);
00488                     fflush(stdout);
00489 
00490                     total_speech_samples += uttlen;
00491                     total_speech_sec += (double) uttlen / (double) sps;
00492 
00493                     uttid++;
00494                 }
00495                 else {
00496                     /*
00497                      * Short silence within utt; write it to output.  (Some extra trailing silence
00498                      * is included in the utterance, as a result.  Not to worry about it.)
00499                      */
00500                     if (k > 0) {
00501                         fwrite(buf, sizeof(int16), k, fp);
00502                         uttlen += k;
00503                     }
00504                 }
00505             }
00506         }
00507         else {
00508             assert(cont->state == CONT_AD_STATE_SPEECH);
00509 
00510             if (fp == NULL) {   /* Not in an utt; open a new output file */
00511                 if (writeseg)
00512                     sprintf(segfile, "%08d.raw", uttid);
00513                 else
00514                     strcpy(segfile, NULL_DEVICE);
00515                 if ((fp = fopen(segfile, "wb")) == NULL)
00516                     E_FATAL("fopen(%s,wb) failed\n", segfile);
00517 
00518                 starttime = cont->read_ts - k;
00519                 uttlen = 0;
00520             }
00521 
00522             /* Write data obtained to output file */
00523             if (k > 0) {
00524                 fwrite(buf, sizeof(int16), k, fp);
00525                 uttlen += k;
00526             }
00527         }
00528     }
00529 
00530     if (rawfp)
00531         fclose(rawfp);
00532 
00533     E_INFO("Total raw input speech = %d frames, %d samples, %.2f sec\n",
00534            cont->tot_frm, cont->tot_frm * cont->spf,
00535            (cont->tot_frm * cont->spf) / (float32) cont->sps);
00536     E_INFO("Total speech detected = %d samples, %.2f sec\n",
00537            total_speech_samples, total_speech_sec);
00538 
00539     cont_ad_close(cont);
00540 
00541     return 0;
00542 }

Generated on Thu Jan 6 2011 for SphinxBase by  doxygen 1.7.1