PocketSphinx 5prealpha
pocketsphinx.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
38/* System headers. */
39#include <stdio.h>
40#include <assert.h>
41
42#ifdef HAVE_UNISTD_H
43#include <unistd.h>
44#endif
45
46/* SphinxBase headers. */
47#include <sphinxbase/err.h>
48#include <sphinxbase/strfuncs.h>
49#include <sphinxbase/filename.h>
50#include <sphinxbase/pio.h>
51#include <sphinxbase/jsgf.h>
52#include <sphinxbase/hash_table.h>
53
54/* Local headers. */
55#include "cmdln_macro.h"
56#include "pocketsphinx.h"
58#include "ps_lattice_internal.h"
59#include "phone_loop_search.h"
60#include "kws_search.h"
61#include "fsg_search_internal.h"
62#include "ngram_search.h"
65#include "allphone_search.h"
66
/* Option table for the decoder: the POCKETSPHINX_OPTIONS macro expansion
 * followed by the CMDLN_EMPTY_OPTION terminator.  Returned to callers by
 * the accessor further down that does `return ps_args_def;`. */
67static const arg_t ps_args_def[] = {
68 POCKETSPHINX_OPTIONS,
69 CMDLN_EMPTY_OPTION
70};
71
/*
 * Probe whether a file can be opened for reading.
 *
 * There is no fully portable "does this exist" call, so the path is
 * opened in binary read mode and closed again immediately.
 *
 * path: file name to probe.
 * Returns 1 if the open succeeded, 0 otherwise.
 */
static int
file_exists(const char *path)
{
    FILE *fh = fopen(path, "rb");

    if (fh == NULL)
        return 0;
    fclose(fh);
    return 1;
}
82
#ifdef MODELDIR
/*
 * Check whether `path` looks like an acoustic model directory, i.e.
 * whether the file "<path>/mdef" can be opened for reading.
 *
 * Returns non-zero if the mdef file could be opened, 0 otherwise.
 */
static int
hmmdir_exists(const char *path)
{
    char *mdef_path = string_join(path, "/mdef", NULL);
    FILE *fh = fopen(mdef_path, "rb");
    int found = (fh != NULL);

    if (found)
        fclose(fh);
    ckd_free(mdef_path);
    return found;
}
#endif
96
97static void
98ps_expand_file_config(ps_decoder_t *ps, const char *arg, const char *extra_arg,
99 const char *hmmdir, const char *file)
100{
101 const char *val;
102 if ((val = cmd_ln_str_r(ps->config, arg)) != NULL) {
103 cmd_ln_set_str_extra_r(ps->config, extra_arg, val);
104 } else if (hmmdir == NULL) {
105 cmd_ln_set_str_extra_r(ps->config, extra_arg, NULL);
106 } else {
107 char *tmp = string_join(hmmdir, "/", file, NULL);
108 if (file_exists(tmp))
109 cmd_ln_set_str_extra_r(ps->config, extra_arg, tmp);
110 else
111 cmd_ln_set_str_extra_r(ps->config, extra_arg, NULL);
112 ckd_free(tmp);
113 }
114}
115
116/* Feature and front-end parameters that may be in feat.params */
/* Option table built from the waveform-to-cepstral and cepstral-to-feature
 * macro expansions, terminated by CMDLN_EMPTY_OPTION.  It is the definition
 * set handed to cmd_ln_parse_file_r() when the model's feat.params file is
 * parsed in ps_expand_model_config(). */
117static const arg_t feat_defn[] = {
118 waveform_to_cepstral_command_line_macro(),
119 cepstral_to_feature_command_line_macro(),
120 CMDLN_EMPTY_OPTION
121};
122
123static void
124ps_expand_model_config(ps_decoder_t *ps)
125{
126 char const *hmmdir, *featparams;
127
128 /* Disable memory mapping on Blackfin (FIXME: should be uClinux in general). */
129#ifdef __ADSPBLACKFIN__
130 E_INFO("Will not use mmap() on uClinux/Blackfin.");
131 cmd_ln_set_boolean_r(ps->config, "-mmap", FALSE);
132#endif
133
134 /* Get acoustic model filenames and add them to the command-line */
135 hmmdir = cmd_ln_str_r(ps->config, "-hmm");
136 ps_expand_file_config(ps, "-mdef", "_mdef", hmmdir, "mdef");
137 ps_expand_file_config(ps, "-mean", "_mean", hmmdir, "means");
138 ps_expand_file_config(ps, "-var", "_var", hmmdir, "variances");
139 ps_expand_file_config(ps, "-tmat", "_tmat", hmmdir, "transition_matrices");
140 ps_expand_file_config(ps, "-mixw", "_mixw", hmmdir, "mixture_weights");
141 ps_expand_file_config(ps, "-sendump", "_sendump", hmmdir, "sendump");
142 ps_expand_file_config(ps, "-fdict", "_fdict", hmmdir, "noisedict");
143 ps_expand_file_config(ps, "-lda", "_lda", hmmdir, "feature_transform");
144 ps_expand_file_config(ps, "-featparams", "_featparams", hmmdir, "feat.params");
145 ps_expand_file_config(ps, "-senmgau", "_senmgau", hmmdir, "senmgau");
146
147 /* Look for feat.params in acoustic model dir. */
148 if ((featparams = cmd_ln_str_r(ps->config, "_featparams"))) {
149 if (NULL !=
150 cmd_ln_parse_file_r(ps->config, feat_defn, featparams, FALSE))
151 E_INFO("Parsed model-specific feature parameters from %s\n",
152 featparams);
153 }
154
155 /* Print here because acmod_init might load feat.params file */
156 if (err_get_logfp() != NULL) {
157 cmd_ln_print_values_r(ps->config, err_get_logfp(), ps_args());
158 }
159}
160
161static void
162ps_free_searches(ps_decoder_t *ps)
163{
164 if (ps->searches) {
165 hash_iter_t *search_it;
166 for (search_it = hash_table_iter(ps->searches); search_it;
167 search_it = hash_table_iter_next(search_it)) {
168 ps_search_free(hash_entry_val(search_it->ent));
169 }
170 hash_table_free(ps->searches);
171 }
172
173 ps->searches = NULL;
174 ps->search = NULL;
175}
176
177static ps_search_t *
178ps_find_search(ps_decoder_t *ps, char const *name)
179{
180 void *search = NULL;
181 hash_table_lookup(ps->searches, name, &search);
182
183 return (ps_search_t *) search;
184}
185
186/* Set default acoustic and language models if they are not defined in configuration. */
187void
188ps_default_search_args(cmd_ln_t *config)
189{
190#ifdef MODELDIR
191 const char *hmmdir = cmd_ln_str_r(config, "-hmm");
192 const char *lmfile = cmd_ln_str_r(config, "-lm");
193 const char *dictfile = cmd_ln_str_r(config, "-dict");
194
195 if (hmmdir == NULL && hmmdir_exists(MODELDIR "/en-us/en-us")) {
196 hmmdir = MODELDIR "/en-us/en-us";
197 cmd_ln_set_str_r(config, "-hmm", hmmdir);
198 }
199
200 if (lmfile == NULL && !cmd_ln_str_r(config, "-fsg")
201 && !cmd_ln_str_r(config, "-jsgf")
202 && !cmd_ln_str_r(config, "-lmctl")
203 && !cmd_ln_str_r(config, "-kws")
204 && !cmd_ln_str_r(config, "-keyphrase")
205 && file_exists(MODELDIR "/en-us/en-us.lm.bin")) {
206 lmfile = MODELDIR "/en-us/en-us.lm.bin";
207 cmd_ln_set_str_r(config, "-lm", lmfile);
208 }
209
210 if (dictfile == NULL && file_exists(MODELDIR "/en-us/cmudict-en-us.dict")) {
211 dictfile = MODELDIR "/en-us/cmudict-en-us.dict";
212 cmd_ln_set_str_r(config, "-dict", dictfile);
213 }
214#endif
215}
216
217int
218ps_reinit(ps_decoder_t *ps, cmd_ln_t *config)
219{
220 const char *path;
221 const char *keyphrase;
222 int32 lw;
223
224 if (config && config != ps->config) {
225 cmd_ln_free_r(ps->config);
226 ps->config = cmd_ln_retain(config);
227 }
228
229 err_set_debug_level(cmd_ln_int32_r(ps->config, "-debug"));
230 /* Set up logging. We need to do this earlier because we want to dump
231 * the information to the configured log, not to the stderr. */
232 if (config && cmd_ln_str_r(ps->config, "-logfn")) {
233 if (err_set_logfile(cmd_ln_str_r(ps->config, "-logfn")) < 0) {
234 E_ERROR("Cannot redirect log output\n");
235 return -1;
236 }
237 }
238
239 ps->mfclogdir = cmd_ln_str_r(ps->config, "-mfclogdir");
240 ps->rawlogdir = cmd_ln_str_r(ps->config, "-rawlogdir");
241 ps->senlogdir = cmd_ln_str_r(ps->config, "-senlogdir");
242
243 /* Fill in some default arguments. */
244 ps_expand_model_config(ps);
245
246 /* Free old searches (do this before other reinit) */
247 ps_free_searches(ps);
248 ps->searches = hash_table_new(3, HASH_CASE_YES);
249
250 /* Free old acmod. */
251 acmod_free(ps->acmod);
252 ps->acmod = NULL;
253
254 /* Free old dictionary (must be done after the two things above) */
255 dict_free(ps->dict);
256 ps->dict = NULL;
257
258 /* Free d2p */
259 dict2pid_free(ps->d2p);
260 ps->d2p = NULL;
261
262 /* Logmath computation (used in acmod and search) */
263 if (ps->lmath == NULL
264 || (logmath_get_base(ps->lmath) !=
265 (float64)cmd_ln_float32_r(ps->config, "-logbase"))) {
266 if (ps->lmath)
267 logmath_free(ps->lmath);
268 ps->lmath = logmath_init
269 ((float64)cmd_ln_float32_r(ps->config, "-logbase"), 0,
270 cmd_ln_boolean_r(ps->config, "-bestpath"));
271 }
272
273 /* Acoustic model (this is basically everything that
274 * uttproc.c, senscr.c, and others used to do) */
275 if ((ps->acmod = acmod_init(ps->config, ps->lmath, NULL, NULL)) == NULL)
276 return -1;
277
278
279
280 if (cmd_ln_int32_r(ps->config, "-pl_window") > 0) {
281 /* Initialize an auxiliary phone loop search, which will run in
282 * "parallel" with FSG or N-Gram search. */
283 if ((ps->phone_loop =
284 phone_loop_search_init(ps->config, ps->acmod, ps->dict)) == NULL)
285 return -1;
286 hash_table_enter(ps->searches,
287 ps_search_name(ps->phone_loop),
288 ps->phone_loop);
289 }
290
291 /* Dictionary and triphone mappings (depends on acmod). */
292 /* FIXME: pass config, change arguments, implement LTS, etc. */
293 if ((ps->dict = dict_init(ps->config, ps->acmod->mdef)) == NULL)
294 return -1;
295 if ((ps->d2p = dict2pid_build(ps->acmod->mdef, ps->dict)) == NULL)
296 return -1;
297
298 lw = cmd_ln_float32_r(ps->config, "-lw");
299
300 /* Determine whether we are starting out in FSG or N-Gram search mode.
301 * If neither is used skip search initialization. */
302
303 /* Load KWS if one was specified in config */
304 if ((keyphrase = cmd_ln_str_r(ps->config, "-keyphrase"))) {
305 if (ps_set_keyphrase(ps, PS_DEFAULT_SEARCH, keyphrase))
306 return -1;
307 ps_set_search(ps, PS_DEFAULT_SEARCH);
308 }
309
310 if ((path = cmd_ln_str_r(ps->config, "-kws"))) {
311 if (ps_set_kws(ps, PS_DEFAULT_SEARCH, path))
312 return -1;
313 ps_set_search(ps, PS_DEFAULT_SEARCH);
314 }
315
316 /* Load an FSG if one was specified in config */
317 if ((path = cmd_ln_str_r(ps->config, "-fsg"))) {
318 fsg_model_t *fsg = fsg_model_readfile(path, ps->lmath, lw);
319 if (!fsg)
320 return -1;
321 if (ps_set_fsg(ps, PS_DEFAULT_SEARCH, fsg)) {
322 fsg_model_free(fsg);
323 return -1;
324 }
325 fsg_model_free(fsg);
326 ps_set_search(ps, PS_DEFAULT_SEARCH);
327 }
328
329 /* Or load a JSGF grammar */
330 if ((path = cmd_ln_str_r(ps->config, "-jsgf"))) {
331 if (ps_set_jsgf_file(ps, PS_DEFAULT_SEARCH, path)
332 || ps_set_search(ps, PS_DEFAULT_SEARCH))
333 return -1;
334 }
335
336 if ((path = cmd_ln_str_r(ps->config, "-allphone"))) {
337 if (ps_set_allphone_file(ps, PS_DEFAULT_SEARCH, path)
338 || ps_set_search(ps, PS_DEFAULT_SEARCH))
339 return -1;
340 }
341
342 if ((path = cmd_ln_str_r(ps->config, "-lm")) &&
343 !cmd_ln_boolean_r(ps->config, "-allphone")) {
344 if (ps_set_lm_file(ps, PS_DEFAULT_SEARCH, path)
345 || ps_set_search(ps, PS_DEFAULT_SEARCH))
346 return -1;
347 }
348
349 if ((path = cmd_ln_str_r(ps->config, "-lmctl"))) {
350 const char *name;
351 ngram_model_t *lmset;
352 ngram_model_set_iter_t *lmset_it;
353
354 if (!(lmset = ngram_model_set_read(ps->config, path, ps->lmath))) {
355 E_ERROR("Failed to read language model control file: %s\n", path);
356 return -1;
357 }
358
359 for(lmset_it = ngram_model_set_iter(lmset);
360 lmset_it; lmset_it = ngram_model_set_iter_next(lmset_it)) {
361 ngram_model_t *lm = ngram_model_set_iter_model(lmset_it, &name);
362 E_INFO("adding search %s\n", name);
363 if (ps_set_lm(ps, name, lm)) {
364 ngram_model_set_iter_free(lmset_it);
365 ngram_model_free(lmset);
366 return -1;
367 }
368 }
369 ngram_model_free(lmset);
370
371 name = cmd_ln_str_r(ps->config, "-lmname");
372 if (name)
373 ps_set_search(ps, name);
374 else {
375 E_ERROR("No default LM name (-lmname) for `-lmctl'\n");
376 return -1;
377 }
378 }
379
380 /* Initialize performance timer. */
381 ps->perf.name = "decode";
382 ptmr_init(&ps->perf);
383
384 return 0;
385}
386
388ps_init(cmd_ln_t *config)
389{
390 ps_decoder_t *ps;
391
392 if (!config) {
393 E_ERROR("No configuration specified");
394 return NULL;
395 }
396
397 ps = ckd_calloc(1, sizeof(*ps));
398 ps->refcount = 1;
399 if (ps_reinit(ps, config) < 0) {
400 ps_free(ps);
401 return NULL;
402 }
403 return ps;
404}
405
/* Returns the file-level option table ps_args_def.
 * NOTE(review): the line carrying the function name and parameter list is
 * missing from this listing (embedded numbering jumps 406 -> 408);
 * presumably this is ps_args(), which ps_expand_model_config() and
 * ps_load_dict() call -- restore from the original source. */
406arg_t const *
408{
409 return ps_args_def;
410}
411
/* Increments ps->refcount and returns the same decoder pointer.
 * NOTE(review): the return type and declarator lines are missing from this
 * listing (embedded numbering jumps 411 -> 414); restore them from the
 * original source.  There is no NULL check on ps here. */
414{
415 ++ps->refcount;
416 return ps;
417}
418
/* Drops one reference to the decoder and destroys it when none remain.
 * Returns 0 for a NULL decoder or once the decoder has been destroyed,
 * otherwise the remaining reference count.
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 419 -> 421); presumably ps_free(), which ps_init() calls. */
419int
421{
422 if (ps == NULL)
423 return 0;
/* Still referenced elsewhere: report the remaining count and keep it. */
424 if (--ps->refcount > 0)
425 return ps->refcount;
/* Last reference: release searches, dictionary, dict2pid, acoustic model,
 * log-math object and configuration, then the decoder itself. */
426 ps_free_searches(ps);
427 dict_free(ps->dict);
428 dict2pid_free(ps->d2p);
429 acmod_free(ps->acmod);
430 logmath_free(ps->lmath);
431 cmd_ln_free_r(ps->config);
432 ckd_free(ps);
433 return 0;
434}
435
/* Returns the decoder's configuration pointer (ps->config) as-is; no
 * cmd_ln_retain() is performed here.
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 436 -> 438); restore from the original source. */
436cmd_ln_t *
438{
439 return ps->config;
440}
441
/* Returns the decoder's log-math object (ps->lmath) as-is.
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 442 -> 444); presumably ps_get_logmath(), which the
 * backtrace code in the end-of-utterance routine calls. */
442logmath_t *
444{
445 return ps->lmath;
446}
447
/* Returns the front-end object held by the acoustic model (ps->acmod->fe).
 * Dereferences ps->acmod without a NULL check.
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 448 -> 450); restore from the original source. */
448fe_t *
450{
451 return ps->acmod->fe;
452}
453
/* Returns the feature-computation object held by the acoustic model
 * (ps->acmod->fcb).  Dereferences ps->acmod without a NULL check.
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 454 -> 456); restore from the original source. */
454feat_t *
456{
457 return ps->acmod->fcb;
458}
459
/* Forwards `mllr` to acmod_update_mllr() on the decoder's acoustic model
 * and returns that call's result.
 * NOTE(review): the declarator line (which declares `ps` and `mllr`) is
 * missing from this listing (embedded numbering jumps 460 -> 462);
 * restore from the original source. */
460ps_mllr_t *
462{
463 return acmod_update_mllr(ps->acmod, mllr);
464}
465
466int
467ps_set_search(ps_decoder_t *ps, const char *name)
468{
469 ps_search_t *search;
470
471 if (ps->acmod->state != ACMOD_ENDED && ps->acmod->state != ACMOD_IDLE) {
472 E_ERROR("Cannot change search while decoding, end utterance first\n");
473 return -1;
474 }
475
476 if (!(search = ps_find_search(ps, name))) {
477 return -1;
478 }
479
480 ps->search = search;
481 /* Set pl window depending on the search */
482 if (!strcmp(PS_SEARCH_TYPE_NGRAM, ps_search_type(search))) {
483 ps->pl_window = cmd_ln_int32_r(ps->config, "-pl_window");
484 } else {
485 ps->pl_window = 0;
486 }
487
488 return 0;
489}
490
/* Returns the key under which the currently active search (ps->search) is
 * registered in ps->searches, or NULL if no entry matches.
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 491 -> 493); restore from the original source.
 * NOTE(review): the loop leaves via `break` without hash_table_iter_free(),
 * whereas other early exits from such loops in this file (ps_load_dict,
 * the add-word routine) do free the iterator -- possible iterator leak,
 * confirm against the hash table API. */
491const char*
493{
494 hash_iter_t *search_it;
495 const char* name = NULL;
496 for (search_it = hash_table_iter(ps->searches); search_it;
497 search_it = hash_table_iter_next(search_it)) {
498 if (hash_entry_val(search_it->ent) == ps->search) {
499 name = hash_entry_key(search_it->ent);
500 break;
501 }
502 }
503 return name;
504}
505
506int
507ps_unset_search(ps_decoder_t *ps, const char *name)
508{
509 ps_search_t *search = hash_table_delete(ps->searches, name);
510 if (!search)
511 return -1;
512 if (ps->search == search)
513 ps->search = NULL;
514 ps_search_free(search);
515 return 0;
516}
517
/* Starts an iteration over ps->searches and returns the hash-table
 * iterator cast to ps_search_iter_t *.
 * NOTE(review): the return type and declarator lines are missing from this
 * listing (embedded numbering jumps 517 -> 520); restore from the original
 * source. */
520{
521 return (ps_search_iter_t *)hash_table_iter(ps->searches);
522}
523
/* Advances a search iterator: casts `itor` back to hash_iter_t *, calls
 * hash_table_iter_next(), and returns the result as ps_search_iter_t *.
 * NOTE(review): the return type and declarator lines are missing from this
 * listing (embedded numbering jumps 523 -> 526); restore from the original
 * source. */
526{
527 return (ps_search_iter_t *)hash_table_iter_next((hash_iter_t *)itor);
528}
529
/* Returns the key of the hash entry the iterator currently points at
 * (the name the search was registered under).
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 530 -> 532); restore from the original source. */
530const char*
532{
533 return (const char*)(((hash_iter_t *)itor)->ent->key);
534}
535
/* Releases a search iterator by handing it to hash_table_iter_free().
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 536 -> 538); restore from the original source. */
536void
538{
539 hash_table_iter_free((hash_iter_t *)itor);
540}
541
542ngram_model_t *
543ps_get_lm(ps_decoder_t *ps, const char *name)
544{
545 ps_search_t *search = ps_find_search(ps, name);
546 if (search && strcmp(PS_SEARCH_TYPE_NGRAM, ps_search_type(search)))
547 return NULL;
548 return search ? ((ngram_search_t *) search)->lmset : NULL;
549}
550
551fsg_model_t *
552ps_get_fsg(ps_decoder_t *ps, const char *name)
553{
554 ps_search_t *search = ps_find_search(ps, name);
555 if (search && strcmp(PS_SEARCH_TYPE_FSG, ps_search_type(search)))
556 return NULL;
557 return search ? ((fsg_search_t *) search)->fsg : NULL;
558}
559
560const char*
561ps_get_kws(ps_decoder_t *ps, const char* name)
562{
563 ps_search_t *search = ps_find_search(ps, name);
564 if (search && strcmp(PS_SEARCH_TYPE_KWS, ps_search_type(search)))
565 return NULL;
566 return search ? kws_search_get_keyphrases(search) : NULL;
567}
568
569static int
570set_search_internal(ps_decoder_t *ps, ps_search_t *search)
571{
572 ps_search_t *old_search;
573
574 if (!search)
575 return -1;
576
577 search->pls = ps->phone_loop;
578 old_search = (ps_search_t *) hash_table_replace(ps->searches, ps_search_name(search), search);
579 if (old_search != search)
580 ps_search_free(old_search);
581
582 return 0;
583}
584
585int
586ps_set_lm(ps_decoder_t *ps, const char *name, ngram_model_t *lm)
587{
588 ps_search_t *search;
589 search = ngram_search_init(name, lm, ps->config, ps->acmod, ps->dict, ps->d2p);
590 return set_search_internal(ps, search);
591}
592
593int
594ps_set_lm_file(ps_decoder_t *ps, const char *name, const char *path)
595{
596 ngram_model_t *lm;
597 int result;
598
599 lm = ngram_model_read(ps->config, path, NGRAM_AUTO, ps->lmath);
600 if (!lm)
601 return -1;
602
603 result = ps_set_lm(ps, name, lm);
604 ngram_model_free(lm);
605 return result;
606}
607
608int
609ps_set_allphone(ps_decoder_t *ps, const char *name, ngram_model_t *lm)
610{
611 ps_search_t *search;
612 search = allphone_search_init(name, lm, ps->config, ps->acmod, ps->dict, ps->d2p);
613 return set_search_internal(ps, search);
614}
615
616int
617ps_set_allphone_file(ps_decoder_t *ps, const char *name, const char *path)
618{
619 ngram_model_t *lm;
620 int result;
621
622 lm = NULL;
623 if (path)
624 lm = ngram_model_read(ps->config, path, NGRAM_AUTO, ps->lmath);
625 result = ps_set_allphone(ps, name, lm);
626 if (lm)
627 ngram_model_free(lm);
628 return result;
629}
630
631int
632ps_set_kws(ps_decoder_t *ps, const char *name, const char *keyfile)
633{
634 ps_search_t *search;
635 search = kws_search_init(name, NULL, keyfile, ps->config, ps->acmod, ps->dict, ps->d2p);
636 return set_search_internal(ps, search);
637}
638
639int
640ps_set_keyphrase(ps_decoder_t *ps, const char *name, const char *keyphrase)
641{
642 ps_search_t *search;
643 search = kws_search_init(name, keyphrase, NULL, ps->config, ps->acmod, ps->dict, ps->d2p);
644 return set_search_internal(ps, search);
645}
646
647int
648ps_set_fsg(ps_decoder_t *ps, const char *name, fsg_model_t *fsg)
649{
650 ps_search_t *search;
651 search = fsg_search_init(name, fsg, ps->config, ps->acmod, ps->dict, ps->d2p);
652 return set_search_internal(ps, search);
653}
654
655int
656ps_set_jsgf_file(ps_decoder_t *ps, const char *name, const char *path)
657{
658 fsg_model_t *fsg;
659 jsgf_rule_t *rule;
660 char const *toprule;
661 jsgf_t *jsgf = jsgf_parse_file(path, NULL);
662 float lw;
663 int result;
664
665 if (!jsgf)
666 return -1;
667
668 rule = NULL;
669 /* Take the -toprule if specified. */
670 if ((toprule = cmd_ln_str_r(ps->config, "-toprule"))) {
671 rule = jsgf_get_rule(jsgf, toprule);
672 if (rule == NULL) {
673 E_ERROR("Start rule %s not found\n", toprule);
674 jsgf_grammar_free(jsgf);
675 return -1;
676 }
677 } else {
678 rule = jsgf_get_public_rule(jsgf);
679 if (rule == NULL) {
680 E_ERROR("No public rules found in %s\n", path);
681 jsgf_grammar_free(jsgf);
682 return -1;
683 }
684 }
685
686 lw = cmd_ln_float32_r(ps->config, "-lw");
687 fsg = jsgf_build_fsg(jsgf, rule, ps->lmath, lw);
688 result = ps_set_fsg(ps, name, fsg);
689 fsg_model_free(fsg);
690 jsgf_grammar_free(jsgf);
691 return result;
692}
693
694int
695ps_set_jsgf_string(ps_decoder_t *ps, const char *name, const char *jsgf_string)
696{
697 fsg_model_t *fsg;
698 jsgf_rule_t *rule;
699 char const *toprule;
700 jsgf_t *jsgf = jsgf_parse_string(jsgf_string, NULL);
701 float lw;
702 int result;
703
704 if (!jsgf)
705 return -1;
706
707 rule = NULL;
708 /* Take the -toprule if specified. */
709 if ((toprule = cmd_ln_str_r(ps->config, "-toprule"))) {
710 rule = jsgf_get_rule(jsgf, toprule);
711 if (rule == NULL) {
712 E_ERROR("Start rule %s not found\n", toprule);
713 return -1;
714 }
715 } else {
716 rule = jsgf_get_public_rule(jsgf);
717 if (rule == NULL) {
718 E_ERROR("No public rules found in input string\n");
719 return -1;
720 }
721 }
722
723 lw = cmd_ln_float32_r(ps->config, "-lw");
724 fsg = jsgf_build_fsg(jsgf, rule, ps->lmath, lw);
725 result = ps_set_fsg(ps, name, fsg);
726 fsg_model_free(fsg);
727 return result;
728}
729
730
731int
732ps_load_dict(ps_decoder_t *ps, char const *dictfile,
733 char const *fdictfile, char const *format)
734{
735 dict2pid_t *d2p;
736 dict_t *dict;
737 hash_iter_t *search_it;
738 cmd_ln_t *newconfig;
739
740 /* Create a new scratch config to load this dict (so existing one
741 * won't be affected if it fails) */
742 newconfig = cmd_ln_init(NULL, ps_args(), TRUE, NULL);
743 cmd_ln_set_boolean_r(newconfig, "-dictcase",
744 cmd_ln_boolean_r(ps->config, "-dictcase"));
745 cmd_ln_set_str_r(newconfig, "-dict", dictfile);
746 if (fdictfile)
747 cmd_ln_set_str_extra_r(newconfig, "_fdict", fdictfile);
748 else
749 cmd_ln_set_str_extra_r(newconfig, "_fdict",
750 cmd_ln_str_r(ps->config, "_fdict"));
751
752 /* Try to load it. */
753 if ((dict = dict_init(newconfig, ps->acmod->mdef)) == NULL) {
754 cmd_ln_free_r(newconfig);
755 return -1;
756 }
757
758 /* Reinit the dict2pid. */
759 if ((d2p = dict2pid_build(ps->acmod->mdef, dict)) == NULL) {
760 cmd_ln_free_r(newconfig);
761 return -1;
762 }
763
764 /* Success! Update the existing config to reflect new dicts and
765 * drop everything into place. */
766 cmd_ln_free_r(newconfig);
767 dict_free(ps->dict);
768 ps->dict = dict;
769 dict2pid_free(ps->d2p);
770 ps->d2p = d2p;
771
772 /* And tell all searches to reconfigure themselves. */
773 for (search_it = hash_table_iter(ps->searches); search_it;
774 search_it = hash_table_iter_next(search_it)) {
775 if (ps_search_reinit(hash_entry_val(search_it->ent), dict, d2p) < 0) {
776 hash_table_iter_free(search_it);
777 return -1;
778 }
779 }
780
781 return 0;
782}
783
784int
785ps_save_dict(ps_decoder_t *ps, char const *dictfile,
786 char const *format)
787{
788 return dict_write(ps->dict, dictfile, format);
789}
790
791int
793 char const *word,
794 char const *phones,
795 int update)
796{
797 int32 wid;
798 s3cipid_t *pron;
799 hash_iter_t *search_it;
800 char **phonestr, *tmp;
801 int np, i, rv;
802
803 /* Parse phones into an array of phone IDs. */
804 tmp = ckd_salloc(phones);
805 np = str2words(tmp, NULL, 0);
806 phonestr = ckd_calloc(np, sizeof(*phonestr));
807 str2words(tmp, phonestr, np);
808 pron = ckd_calloc(np, sizeof(*pron));
809 for (i = 0; i < np; ++i) {
810 pron[i] = bin_mdef_ciphone_id(ps->acmod->mdef, phonestr[i]);
811 if (pron[i] == -1) {
812 E_ERROR("Unknown phone %s in phone string %s\n",
813 phonestr[i], tmp);
814 ckd_free(phonestr);
815 ckd_free(tmp);
816 ckd_free(pron);
817 return -1;
818 }
819 }
820 /* No longer needed. */
821 ckd_free(phonestr);
822 ckd_free(tmp);
823
824 /* Add it to the dictionary. */
825 if ((wid = dict_add_word(ps->dict, word, pron, np)) == -1) {
826 ckd_free(pron);
827 return -1;
828 }
829 /* No longer needed. */
830 ckd_free(pron);
831
832 /* Now we also have to add it to dict2pid. */
833 dict2pid_add_word(ps->d2p, wid);
834
835 /* TODO: we definitely need to refactor this */
836 for (search_it = hash_table_iter(ps->searches); search_it;
837 search_it = hash_table_iter_next(search_it)) {
838 ps_search_t *search = hash_entry_val(search_it->ent);
839 if (!strcmp(PS_SEARCH_TYPE_NGRAM, ps_search_type(search))) {
840 ngram_model_t *lmset = ((ngram_search_t *) search)->lmset;
841 if (ngram_model_add_word(lmset, word, 1.0) == NGRAM_INVALID_WID) {
842 hash_table_iter_free(search_it);
843 return -1;
844 }
845 }
846
847 if (update) {
848 if ((rv = ps_search_reinit(search, ps->dict, ps->d2p) < 0)) {
849 hash_table_iter_free(search_it);
850 return rv;
851 }
852 }
853 }
854
855 /* Rebuild the widmap and search tree if requested. */
856 return wid;
857}
858
859char *
860ps_lookup_word(ps_decoder_t *ps, const char *word)
861{
862 s3wid_t wid;
863 int32 phlen, j;
864 char *phones;
865 dict_t *dict = ps->dict;
866
867 wid = dict_wordid(dict, word);
868 if (wid == BAD_S3WID)
869 return NULL;
870
871 for (phlen = j = 0; j < dict_pronlen(dict, wid); ++j)
872 phlen += strlen(dict_ciphone_str(dict, wid, j)) + 1;
873 phones = ckd_calloc(1, phlen);
874 for (j = 0; j < dict_pronlen(dict, wid); ++j) {
875 strcat(phones, dict_ciphone_str(dict, wid, j));
876 if (j != dict_pronlen(dict, wid) - 1)
877 strcat(phones, " ");
878 }
879 return phones;
880}
881
/* Decodes raw 16-bit audio read from `rawfh` as one utterance and returns
 * the number of samples read (`total`).
 * NOTE(review): the declarator line (which declares `ps` and `rawfh`) is
 * missing from this listing (embedded numbering jumps 882 -> 884);
 * restore from the original source.
 * NOTE(review): the results of the start-stream / start-utterance /
 * end-utterance calls and of ps_process_raw() are not checked here. */
882long
884 long maxsamps)
885{
886 int16 *data;
887 long total, pos, endpos;
888
889 ps_start_stream(ps);
890 ps_start_utt(ps);
891
892 /* If this file is seekable or maxsamps is specified, then decode
893 * the whole thing at once. */
894 if (maxsamps != -1) {
/* Caller gave a sample budget: read up to that many samples in one go. */
895 data = ckd_calloc(maxsamps, sizeof(*data));
896 total = fread(data, sizeof(*data), maxsamps, rawfh);
897 ps_process_raw(ps, data, total, FALSE, TRUE);
898 ckd_free(data);
899 } else if ((pos = ftell(rawfh)) >= 0) {
/* Seekable: measure what remains from the current position to the end.
 * NOTE(review): endpos - pos is a byte count but is then used as a number
 * of int16 elements for ckd_calloc()/fread(); the read still stops at end
 * of file, so this looks like over-allocation only -- confirm. */
900 fseek(rawfh, 0, SEEK_END);
901 endpos = ftell(rawfh);
902 fseek(rawfh, pos, SEEK_SET);
903 maxsamps = endpos - pos;
904
905 data = ckd_calloc(maxsamps, sizeof(*data));
906 total = fread(data, sizeof(*data), maxsamps, rawfh);
907 ps_process_raw(ps, data, total, FALSE, TRUE);
908 ckd_free(data);
909 } else {
910 /* Otherwise decode it in a stream. */
911 total = 0;
912 while (!feof(rawfh)) {
/* This 256-sample buffer shadows the outer `data` pointer. */
913 int16 data[256];
914 size_t nread;
915
916 nread = fread(data, sizeof(*data), sizeof(data)/sizeof(*data), rawfh);
917 ps_process_raw(ps, data, nread, FALSE, FALSE);
918 total += nread;
919 }
920 }
921 ps_end_utt(ps);
922 return total;
923}
924
/* As visible here, simply returns 0.
 * NOTE(review): both the declarator line and one body line are missing from
 * this listing (embedded numbering jumps 925 -> 927 and 927 -> 929), so the
 * real function does more than what is shown; restore from the original
 * source before relying on this text. */
925int
927{
929 return 0;
930}
931
/* Begins a new utterance: checks state, resets the performance timer,
 * clears the previous result held by the active search, starts the
 * acoustic model, opens the optional per-utterance log files and starts
 * the phone loop search and the active search.
 * Returns the result of starting the active search, -1 on a state error or
 * when a log file cannot be opened, or the negative code from
 * acmod_start_utt().
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 932 -> 934); presumably ps_start_utt(), which the
 * raw-file decoding routine calls. */
932int
934{
935 int rv;
936 char uttid[16];
937
938 if (ps->acmod->state == ACMOD_STARTED || ps->acmod->state == ACMOD_PROCESSING) {
939 E_ERROR("Utterance already started\n");
940 return -1;
941 }
942
943 if (ps->search == NULL) {
944 E_ERROR("No search module is selected, did you forget to "
945 "specify a language model or grammar?\n");
946 return -1;
947 }
948
949 ptmr_reset(&ps->perf);
950 ptmr_start(&ps->perf);
951
/* Utterance id: zero-padded running counter, used below as the base name
 * of the log files. */
952 sprintf(uttid, "%09u", ps->uttno);
953 ++ps->uttno;
954
955 /* Remove any residual word lattice and hypothesis. */
/* NOTE(review): one line is missing from this listing here (embedded
 * numbering jumps 955 -> 957); as shown, the dag pointer is cleared without
 * a visible free -- check the original source. */
957 ps->search->dag = NULL;
958 ps->search->last_link = NULL;
959 ps->search->post = 0;
960 ckd_free(ps->search->hyp_str);
961 ps->search->hyp_str = NULL;
962 if ((rv = acmod_start_utt(ps->acmod)) < 0)
963 return rv;
964
965 /* Start logging features and audio if requested. */
966 if (ps->mfclogdir) {
967 char *logfn = string_join(ps->mfclogdir, "/",
968 uttid, ".mfc", NULL);
969 FILE *mfcfh;
970 E_INFO("Writing MFCC file: %s\n", logfn);
971 if ((mfcfh = fopen(logfn, "wb")) == NULL) {
972 E_ERROR_SYSTEM("Failed to open MFCC file %s", logfn);
973 ckd_free(logfn);
974 return -1;
975 }
976 ckd_free(logfn);
977 acmod_set_mfcfh(ps->acmod, mfcfh);
978 }
979 if (ps->rawlogdir) {
980 char *logfn = string_join(ps->rawlogdir, "/",
981 uttid, ".raw", NULL);
982 FILE *rawfh;
983 E_INFO("Writing raw audio file: %s\n", logfn);
984 if ((rawfh = fopen(logfn, "wb")) == NULL) {
985 E_ERROR_SYSTEM("Failed to open raw audio file %s", logfn);
986 ckd_free(logfn);
987 return -1;
988 }
989 ckd_free(logfn);
990 acmod_set_rawfh(ps->acmod, rawfh);
991 }
992 if (ps->senlogdir) {
993 char *logfn = string_join(ps->senlogdir, "/",
994 uttid, ".sen", NULL);
995 FILE *senfh;
996 E_INFO("Writing senone score file: %s\n", logfn);
997 if ((senfh = fopen(logfn, "wb")) == NULL) {
998 E_ERROR_SYSTEM("Failed to open senone score file %s", logfn);
999 ckd_free(logfn);
1000 return -1;
1001 }
1002 ckd_free(logfn);
1003 acmod_set_senfh(ps->acmod, senfh);
1004 }
1005
1006 /* Start auxiliary phone loop search. */
/* The phone loop's start result is not checked. */
1007 if (ps->phone_loop)
1008 ps_search_start(ps->phone_loop);
1009
1010 return ps_search_start(ps->search);
1011}
1012
1013static int
1014ps_search_forward(ps_decoder_t *ps)
1015{
1016 int nfr;
1017
1018 nfr = 0;
1019 while (ps->acmod->n_feat_frame > 0) {
1020 int k;
1021 if (ps->pl_window > 0)
1022 if ((k = ps_search_step(ps->phone_loop, ps->acmod->output_frame)) < 0)
1023 return k;
1024 if (ps->acmod->output_frame >= ps->pl_window)
1025 if ((k = ps_search_step(ps->search,
1026 ps->acmod->output_frame - ps->pl_window)) < 0)
1027 return k;
1028 acmod_advance(ps->acmod);
1029 ++ps->n_frame;
1030 ++nfr;
1031 }
1032 return nfr;
1033}
1034
/* Decodes one utterance from pre-computed senone scores read from `senfh`
 * and returns the number of frames searched, or the negative code from a
 * failing search pass.
 * NOTE(review): the declarator line (which declares `ps` and `senfh`) is
 * missing from this listing (embedded numbering jumps 1035 -> 1037);
 * restore from the original source.
 * NOTE(review): the result of ps_start_utt() is not checked, and the error
 * return inside the loop skips the acmod_set_insenfh(ps->acmod, NULL) reset
 * done on the normal path -- confirm whether that is intended. */
1035int
1037{
1038 int nfr, n_searchfr;
1039
1040 ps_start_utt(ps);
1041 n_searchfr = 0;
1042 acmod_set_insenfh(ps->acmod, senfh);
1043 while ((nfr = acmod_read_scores(ps->acmod)) > 0) {
1044 if ((nfr = ps_search_forward(ps)) < 0) {
1045 ps_end_utt(ps);
1046 return nfr;
1047 }
1048 n_searchfr += nfr;
1049 }
1050 ps_end_utt(ps);
1051 acmod_set_insenfh(ps->acmod, NULL);
1052
1053 return n_searchfr;
1054}
1055
/* Feeds raw audio samples to the acoustic model and, unless `no_search` is
 * set, runs the searches over the frames produced.
 * Returns the number of frames searched, a negative code from
 * acmod_process_raw() or the search pass, or 0 (after logging an error) when
 * the acoustic model is in the ACMOD_IDLE state.
 * NOTE(review): the declarator line (which declares `ps`) is missing from
 * this listing (embedded numbering jumps 1056 -> 1058); presumably
 * ps_process_raw(), which the raw-file decoding routine calls. */
1056int
1058 int16 const *data,
1059 size_t n_samples,
1060 int no_search,
1061 int full_utt)
1062{
1063 int n_searchfr = 0;
1064
1065 if (ps->acmod->state == ACMOD_IDLE) {
1066 E_ERROR("Failed to process data, utterance is not started. Use start_utt to start it\n");
1067 return 0;
1068 }
1069
/* Without searching, frames are not consumed here, so the acoustic model is
 * switched to grow mode first. */
1070 if (no_search)
1071 acmod_set_grow(ps->acmod, TRUE);
1072
/* &data and &n_samples are passed by address; the loop runs until
 * n_samples reaches zero. */
1073 while (n_samples) {
1074 int nfr;
1075
1076 /* Process some data into features. */
1077 if ((nfr = acmod_process_raw(ps->acmod, &data,
1078 &n_samples, full_utt)) < 0)
1079 return nfr;
1080
1081 /* Score and search as much data as possible */
1082 if (no_search)
1083 continue;
1084 if ((nfr = ps_search_forward(ps)) < 0)
1085 return nfr;
1086 n_searchfr += nfr;
1087 }
1088
1089 return n_searchfr;
1090}
1091
/* Feeds cepstral frames to the acoustic model and, unless `no_search` is
 * set, runs the searches over the feature frames produced.
 * Returns the number of frames searched, or a negative code from
 * acmod_process_cep() or the search pass.
 * NOTE(review): the declarator line (which declares `ps`) is missing from
 * this listing (embedded numbering jumps 1092 -> 1094); restore from the
 * original source.  Unlike the raw-audio variant above, there is no
 * ACMOD_IDLE state check here. */
1092int
1094 mfcc_t **data,
1095 int32 n_frames,
1096 int no_search,
1097 int full_utt)
1098{
1099 int n_searchfr = 0;
1100
1101 if (no_search)
1102 acmod_set_grow(ps->acmod, TRUE);
1103
/* &data and &n_frames are passed by address; the loop runs until n_frames
 * reaches zero. */
1104 while (n_frames) {
1105 int nfr;
1106
1107 /* Process some data into features. */
1108 if ((nfr = acmod_process_cep(ps->acmod, &data,
1109 &n_frames, full_utt)) < 0)
1110 return nfr;
1111
1112 /* Score and search as much data as possible */
1113 if (no_search)
1114 continue;
1115 if ((nfr = ps_search_forward(ps)) < 0)
1116 return nfr;
1117 n_searchfr += nfr;
1118 }
1119
1120 return n_searchfr;
1121}
1122
/* Ends the current utterance: closes the acoustic model's utterance,
 * searches the remaining frames, finishes the phone loop and the active
 * search, stops the performance timer and optionally logs a backtrace.
 * Returns the result of finishing the active search, a negative code from
 * an earlier failing step, or -1 if no utterance is in progress.
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 1123 -> 1125); presumably ps_end_utt(), which the
 * decoding routines above call. */
1123int
1125{
1126 int rv, i;
1127
1128 if (ps->acmod->state == ACMOD_ENDED || ps->acmod->state == ACMOD_IDLE) {
1129 E_ERROR("Utterance is not started\n");
1130 return -1;
1131 }
1132 acmod_end_utt(ps->acmod);
1133
1134 /* Search any remaining frames. */
1135 if ((rv = ps_search_forward(ps)) < 0) {
1136 ptmr_stop(&ps->perf);
1137 return rv;
1138 }
1139 /* Finish phone loop search. */
1140 if (ps->phone_loop) {
1141 if ((rv = ps_search_finish(ps->phone_loop)) < 0) {
1142 ptmr_stop(&ps->perf);
1143 return rv;
1144 }
1145 }
1146 /* Search any frames remaining in the lookahead window. */
/* The active search runs pl_window frames behind, so the last pl_window
 * frames are stepped here; the step results are not checked. */
1147 if (ps->acmod->output_frame >= ps->pl_window) {
1148 for (i = ps->acmod->output_frame - ps->pl_window;
1149 i < ps->acmod->output_frame; ++i)
1150 ps_search_step(ps->search, i);
1151 }
1152 /* Finish main search. */
1153 if ((rv = ps_search_finish(ps->search)) < 0) {
1154 ptmr_stop(&ps->perf);
1155 return rv;
1156 }
1157 ptmr_stop(&ps->perf);
1158
1159 /* Log a backtrace if requested. */
1160 if (cmd_ln_boolean_r(ps->config, "-backtrace")) {
1161 const char* hyp;
1162 ps_seg_t *seg;
1163 int32 score;
1164
1165 hyp = ps_get_hyp(ps, &score);
1166
1167 if (hyp != NULL) {
1168 E_INFO("%s (%d)\n", hyp, score);
1169 E_INFO_NOFN("%-20s %-5s %-5s %-5s %-10s %-10s %-3s\n",
1170 "word", "start", "end", "pprob", "ascr", "lscr", "lback");
/* One table row per word segment of the hypothesis. */
1171 for (seg = ps_seg_iter(ps); seg;
1172 seg = ps_seg_next(seg)) {
1173 char const *word;
1174 int sf, ef;
1175 int32 post, lscr, ascr, lback;
1176
1177 word = ps_seg_word(seg);
1178 ps_seg_frames(seg, &sf, &ef);
1179 post = ps_seg_prob(seg, &ascr, &lscr, &lback);
1180 E_INFO_NOFN("%-20s %-5d %-5d %-1.3f %-10d %-10d %-3d\n",
1181 word, sf, ef, logmath_exp(ps_get_logmath(ps), post),
1182 ascr, lscr, lback);
1183 }
1184 }
1185 }
1186 return rv;
1187}
1188
1189char const *
1190ps_get_hyp(ps_decoder_t *ps, int32 *out_best_score)
1191{
1192 char const *hyp;
1193
1194 ptmr_start(&ps->perf);
1195 hyp = ps_search_hyp(ps->search, out_best_score);
1196 ptmr_stop(&ps->perf);
1197 return hyp;
1198}
1199
/* Returns ps_search_prob() of the active search; the call is bracketed by
 * the decoder's performance timer.
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 1200 -> 1202); restore from the original source. */
1200int32
1202{
1203 int32 prob;
1204
1205 ptmr_start(&ps->perf);
1206 prob = ps_search_prob(ps->search);
1207 ptmr_stop(&ps->perf);
1208 return prob;
1209}
1210
/* Returns a segment iterator from the active search via
 * ps_search_seg_iter(); the call is bracketed by the decoder's performance
 * timer.
 * NOTE(review): the declarator line is missing from this listing (embedded
 * numbering jumps 1211 -> 1213); presumably ps_seg_iter(), which the
 * backtrace code in the end-of-utterance routine calls with one argument. */
1211ps_seg_t *
1213{
1214 ps_seg_t *itor;
1215
1216 ptmr_start(&ps->perf);
1217 itor = ps_search_seg_iter(ps->search);
1218 ptmr_stop(&ps->perf);
1219 return itor;
1220}
1221
1222ps_seg_t *
1224{
1225 return ps_search_seg_next(seg);
1226}
1227
1228char const *
1230{
1231 return seg->word;
1232}
1233
1234void
1235ps_seg_frames(ps_seg_t *seg, int *out_sf, int *out_ef)
1236{
1237 int uf;
1238 uf = acmod_stream_offset(seg->search->acmod);
1239 if (out_sf) *out_sf = seg->sf + uf;
1240 if (out_ef) *out_ef = seg->ef + uf;
1241}
1242
1243int32
1244ps_seg_prob(ps_seg_t *seg, int32 *out_ascr, int32 *out_lscr, int32 *out_lback)
1245{
1246 if (out_ascr) *out_ascr = seg->ascr;
1247 if (out_lscr) *out_lscr = seg->lscr;
1248 if (out_lback) *out_lback = seg->lback;
1249 return seg->prob;
1250}
1251
1252void
1254{
1255 ps_search_seg_free(seg);
1256}
1257
1260{
1261 return ps_search_lattice(ps->search);
1262}
1263
1264ps_nbest_t *
1266{
1267 ps_lattice_t *dag;
1268 ngram_model_t *lmset;
1269 ps_astar_t *nbest;
1270 float32 lwf;
1271
1272 if (ps->search == NULL)
1273 return NULL;
1274 if ((dag = ps_get_lattice(ps)) == NULL)
1275 return NULL;
1276
1277 /* FIXME: This is all quite specific to N-Gram search. Either we
1278 * should make N-best a method for each search module or it needs
1279 * to be abstracted to work for N-Gram and FSG. */
1280 if (0 != strcmp(ps_search_type(ps->search), PS_SEARCH_TYPE_NGRAM)) {
1281 lmset = NULL;
1282 lwf = 1.0f;
1283 } else {
1284 lmset = ((ngram_search_t *)ps->search)->lmset;
1285 lwf = ((ngram_search_t *)ps->search)->bestpath_fwdtree_lw_ratio;
1286 }
1287
1288 nbest = ps_astar_start(dag, lmset, lwf, 0, -1, -1, -1);
1289
1290 nbest = ps_nbest_next(nbest);
1291
1292 return (ps_nbest_t *)nbest;
1293}
1294
1295void
1297{
1298 ps_astar_finish(nbest);
1299}
1300
1301ps_nbest_t *
1303{
1304 ps_latpath_t *next;
1305
1306 next = ps_astar_next(nbest);
1307 if (next == NULL) {
1308 ps_nbest_free(nbest);
1309 return NULL;
1310 }
1311 return nbest;
1312}
1313
1314char const *
1315ps_nbest_hyp(ps_nbest_t *nbest, int32 *out_score)
1316{
1317 assert(nbest != NULL);
1318
1319 if (nbest->top == NULL)
1320 return NULL;
1321 if (out_score) *out_score = nbest->top->score;
1322 return ps_astar_hyp(nbest, nbest->top);
1323}
1324
1325ps_seg_t *
1327{
1328 if (nbest->top == NULL)
1329 return NULL;
1330
1331 return ps_astar_seg_iter(nbest, nbest->top, 1.0);
1332}
1333
1334int
1336{
1337 return ps->acmod->output_frame + 1;
1338}
1339
1340void
1341ps_get_utt_time(ps_decoder_t *ps, double *out_nspeech,
1342 double *out_ncpu, double *out_nwall)
1343{
1344 int32 frate;
1345
1346 frate = cmd_ln_int32_r(ps->config, "-frate");
1347 *out_nspeech = (double)ps->acmod->output_frame / frate;
1348 *out_ncpu = ps->perf.t_cpu;
1349 *out_nwall = ps->perf.t_elapsed;
1350}
1351
1352void
1353ps_get_all_time(ps_decoder_t *ps, double *out_nspeech,
1354 double *out_ncpu, double *out_nwall)
1355{
1356 int32 frate;
1357
1358 frate = cmd_ln_int32_r(ps->config, "-frate");
1359 *out_nspeech = (double)ps->n_frame / frate;
1360 *out_ncpu = ps->perf.t_tot_cpu;
1361 *out_nwall = ps->perf.t_tot_elapsed;
1362}
1363
1364uint8
1366{
1367 return fe_get_vad_state(ps->acmod->fe);
1368}
1369
1370void
1371ps_search_init(ps_search_t *search, ps_searchfuncs_t *vt,
1372 const char *type,
1373 const char *name,
1374 cmd_ln_t *config, acmod_t *acmod, dict_t *dict,
1375 dict2pid_t *d2p)
1376{
1377 search->vt = vt;
1378 search->name = ckd_salloc(name);
1379 search->type = ckd_salloc(type);
1380
1381 search->config = config;
1382 search->acmod = acmod;
1383 if (d2p)
1384 search->d2p = dict2pid_retain(d2p);
1385 else
1386 search->d2p = NULL;
1387 if (dict) {
1388 search->dict = dict_retain(dict);
1389 search->start_wid = dict_startwid(dict);
1390 search->finish_wid = dict_finishwid(dict);
1391 search->silence_wid = dict_silwid(dict);
1392 search->n_words = dict_size(dict);
1393 }
1394 else {
1395 search->dict = NULL;
1396 search->start_wid = search->finish_wid = search->silence_wid = -1;
1397 search->n_words = 0;
1398 }
1399}
1400
1401void
1402ps_search_base_free(ps_search_t *search)
1403{
1404 /* FIXME: We will have refcounting on acmod, config, etc, at which
1405 * point we will free them here too. */
1406 ckd_free(search->name);
1407 ckd_free(search->type);
1408 dict_free(search->dict);
1409 dict2pid_free(search->d2p);
1410 ckd_free(search->hyp_str);
1411 ps_lattice_free(search->dag);
1412}
1413
1414void
1415ps_search_base_reinit(ps_search_t *search, dict_t *dict,
1416 dict2pid_t *d2p)
1417{
1418 dict_free(search->dict);
1419 dict2pid_free(search->d2p);
1420 /* FIXME: _retain() should just return NULL if passed NULL. */
1421 if (dict) {
1422 search->dict = dict_retain(dict);
1423 search->start_wid = dict_startwid(dict);
1424 search->finish_wid = dict_finishwid(dict);
1425 search->silence_wid = dict_silwid(dict);
1426 search->n_words = dict_size(dict);
1427 }
1428 else {
1429 search->dict = NULL;
1430 search->start_wid = search->finish_wid = search->silence_wid = -1;
1431 search->n_words = 0;
1432 }
1433 if (d2p)
1434 search->d2p = dict2pid_retain(d2p);
1435 else
1436 search->d2p = NULL;
1437}
1438
1439void
1441{
1442 acmod_set_rawdata_size(ps->acmod, size);
1443}
1444
1445void
1446ps_get_rawdata(ps_decoder_t *ps, int16 **buffer, int32 *size)
1447{
1448 acmod_get_rawdata(ps->acmod, buffer, size);
1449}
void acmod_get_rawdata(acmod_t *acmod, int16 **buffer, int32 *size)
Retrieves the raw data collected during utterance decoding.
Definition: acmod.c:1332
int acmod_set_grow(acmod_t *acmod, int grow_feat)
Set memory allocation policy for utterance processing.
Definition: acmod.c:410
void acmod_free(acmod_t *acmod)
Finalize an acoustic model.
Definition: acmod.c:299
int acmod_process_raw(acmod_t *acmod, int16 const **inout_raw, size_t *inout_n_samps, int full_utt)
TODO: Set queue length for utterance processing.
Definition: acmod.c:607
int acmod_end_utt(acmod_t *acmod)
Mark the end of an utterance.
Definition: acmod.c:441
int acmod_advance(acmod_t *acmod)
Advance the frame index.
Definition: acmod.c:899
int acmod_set_mfcfh(acmod_t *acmod, FILE *logfh)
Start logging MFCCs to a filehandle.
Definition: acmod.c:375
int acmod_set_insenfh(acmod_t *acmod, FILE *senfh)
Set up a senone score dump file for input.
Definition: acmod.c:864
int acmod_read_scores(acmod_t *acmod)
Read one frame of scores from senone score dump file.
Definition: acmod.c:1012
int acmod_set_rawfh(acmod_t *acmod, FILE *logfh)
Start logging raw audio to a filehandle.
Definition: acmod.c:387
int32 acmod_stream_offset(acmod_t *acmod)
Get the offset of the utterance start of the current stream, helpful for stream-wide timing.
Definition: acmod.c:1308
void acmod_set_rawdata_size(acmod_t *acmod, int32 size)
Sets the limit of the raw audio data to store.
Definition: acmod.c:1321
int acmod_process_cep(acmod_t *acmod, mfcc_t ***inout_cep, int *inout_n_frames, int full_utt)
Feed acoustic feature data into the acoustic model for scoring.
Definition: acmod.c:699
int acmod_start_utt(acmod_t *acmod)
Mark the start of an utterance.
Definition: acmod.c:423
int acmod_set_senfh(acmod_t *acmod, FILE *logfh)
Start logging senone scores to a filehandle.
Definition: acmod.c:364
ps_mllr_t * acmod_update_mllr(acmod_t *acmod, ps_mllr_t *mllr)
Adapt acoustic model using a linear transform.
Definition: acmod.c:339
void acmod_start_stream(acmod_t *acmod)
Reset the current stream.
Definition: acmod.c:1314
acmod_t * acmod_init(cmd_ln_t *config, logmath_t *lmath, fe_t *fe, feat_t *fcb)
Initialize an acoustic model.
Definition: acmod.c:228
@ ACMOD_IDLE
Not in an utterance.
Definition: acmod.h:68
@ ACMOD_PROCESSING
Utterance in progress.
Definition: acmod.h:70
@ ACMOD_ENDED
Utterance ended, still buffering.
Definition: acmod.h:71
@ ACMOD_STARTED
Utterance started, no data yet.
Definition: acmod.h:69
dict2pid_t * dict2pid_retain(dict2pid_t *d2p)
Retain a pointer to dict2pid.
Definition: dict2pid.c:500
dict2pid_t * dict2pid_build(bin_mdef_t *mdef, dict_t *dict)
Build the dict2pid structure for the given model/dictionary.
Definition: dict2pid.c:388
int dict2pid_free(dict2pid_t *d2p)
Free the memory dict2pid structure.
Definition: dict2pid.c:507
int dict2pid_add_word(dict2pid_t *d2p, int32 wid)
Add a word to the dict2pid structure (after adding it to dict).
Definition: dict2pid.c:298
#define dict_size(d)
Packaged macro access to dictionary members.
Definition: dict.h:151
ps_search_t * ngram_search_init(const char *name, ngram_model_t *lm, cmd_ln_t *config, acmod_t *acmod, dict_t *dict, dict2pid_t *d2p)
Initialize the N-Gram search module.
Definition: ngram_search.c:140
N-Gram based multi-pass search ("FBS")
Flat lexicon based Viterbi search.
Lexicon tree based Viterbi search.
Fast and rough context-independent phoneme loop search.
Main header file for the PocketSphinx decoder.
POCKETSPHINX_EXPORT void ps_seg_frames(ps_seg_t *seg, int *out_sf, int *out_ef)
Get inclusive start and end frames from a segmentation iterator.
POCKETSPHINX_EXPORT ps_seg_t * ps_seg_next(ps_seg_t *seg)
Get the next segment in a word segmentation.
POCKETSPHINX_EXPORT void ps_get_rawdata(ps_decoder_t *ps, int16 **buffer, int32 *size)
Retrieves the raw data collected during utterance decoding.
POCKETSPHINX_EXPORT cmd_ln_t * ps_get_config(ps_decoder_t *ps)
Get the configuration object for this decoder.
Definition: pocketsphinx.c:437
POCKETSPHINX_EXPORT uint8 ps_get_in_speech(ps_decoder_t *ps)
Checks whether the last audio buffer fed to the decoder contained speech.
POCKETSPHINX_EXPORT char const * ps_seg_word(ps_seg_t *seg)
Get word string from a segmentation iterator.
POCKETSPHINX_EXPORT logmath_t * ps_get_logmath(ps_decoder_t *ps)
Get the log-math computation object for this decoder.
Definition: pocketsphinx.c:443
POCKETSPHINX_EXPORT char const * ps_get_hyp(ps_decoder_t *ps, int32 *out_best_score)
Get hypothesis string and path score.
POCKETSPHINX_EXPORT feat_t * ps_get_feat(ps_decoder_t *ps)
Get the dynamic feature computation object for this decoder.
Definition: pocketsphinx.c:455
POCKETSPHINX_EXPORT int ps_reinit(ps_decoder_t *ps, cmd_ln_t *config)
Reinitialize the decoder with updated configuration.
Definition: pocketsphinx.c:218
POCKETSPHINX_EXPORT long ps_decode_raw(ps_decoder_t *ps, FILE *rawfh, long maxsamps)
Decode a raw audio stream.
Definition: pocketsphinx.c:883
POCKETSPHINX_EXPORT int ps_decode_senscr(ps_decoder_t *ps, FILE *senfh)
Decode a senone score dump file.
POCKETSPHINX_EXPORT void ps_set_rawdata_size(ps_decoder_t *ps, int32 size)
Sets the limit of the raw audio data to store in the decoder so that it can be retrieved later with ps_get_rawdata.
POCKETSPHINX_EXPORT int ps_process_raw(ps_decoder_t *ps, int16 const *data, size_t n_samples, int no_search, int full_utt)
Decode raw audio data.
POCKETSPHINX_EXPORT int ps_save_dict(ps_decoder_t *ps, char const *dictfile, char const *format)
Dump the current pronunciation dictionary to a file.
Definition: pocketsphinx.c:785
POCKETSPHINX_EXPORT int ps_add_word(ps_decoder_t *ps, char const *word, char const *phones, int update)
Add a word to the pronunciation dictionary.
Definition: pocketsphinx.c:792
POCKETSPHINX_EXPORT void ps_get_all_time(ps_decoder_t *ps, double *out_nspeech, double *out_ncpu, double *out_nwall)
Get overall performance information.
POCKETSPHINX_EXPORT ps_seg_t * ps_nbest_seg(ps_nbest_t *nbest)
Get the word segmentation from an N-best list iterator.
POCKETSPHINX_EXPORT arg_t const * ps_args(void)
Returns the argument definitions used in ps_init().
Definition: pocketsphinx.c:407
POCKETSPHINX_EXPORT int ps_load_dict(ps_decoder_t *ps, char const *dictfile, char const *fdictfile, char const *format)
Reload the pronunciation dictionary from a file.
Definition: pocketsphinx.c:732
POCKETSPHINX_EXPORT void ps_nbest_free(ps_nbest_t *nbest)
Finish N-best search early, releasing resources.
POCKETSPHINX_EXPORT void ps_default_search_args(cmd_ln_t *)
Sets default grammar and language model if they are not set explicitly and are present in the default...
Definition: pocketsphinx.c:188
POCKETSPHINX_EXPORT int ps_start_stream(ps_decoder_t *ps)
Start processing of the stream of speech.
Definition: pocketsphinx.c:926
POCKETSPHINX_EXPORT fe_t * ps_get_fe(ps_decoder_t *ps)
Get the feature extraction object for this decoder.
Definition: pocketsphinx.c:449
POCKETSPHINX_EXPORT void ps_get_utt_time(ps_decoder_t *ps, double *out_nspeech, double *out_ncpu, double *out_nwall)
Get performance information for the current utterance.
POCKETSPHINX_EXPORT ps_seg_t * ps_seg_iter(ps_decoder_t *ps)
Get an iterator over the word segmentation for the best hypothesis.
POCKETSPHINX_EXPORT char * ps_lookup_word(ps_decoder_t *ps, const char *word)
Look up the word in the dictionary and return its phone transcription.
Definition: pocketsphinx.c:860
POCKETSPHINX_EXPORT ps_decoder_t * ps_retain(ps_decoder_t *ps)
Retain a pointer to the decoder.
Definition: pocketsphinx.c:413
POCKETSPHINX_EXPORT int ps_end_utt(ps_decoder_t *ps)
End utterance processing.
POCKETSPHINX_EXPORT ps_nbest_t * ps_nbest(ps_decoder_t *ps)
Get an iterator over the best hypotheses.
POCKETSPHINX_EXPORT int ps_get_n_frames(ps_decoder_t *ps)
Get the number of frames of data searched.
POCKETSPHINX_EXPORT ps_mllr_t * ps_update_mllr(ps_decoder_t *ps, ps_mllr_t *mllr)
Adapt current acoustic model using a linear transform.
Definition: pocketsphinx.c:461
POCKETSPHINX_EXPORT void ps_seg_free(ps_seg_t *seg)
Finish iterating over a word segmentation early, freeing resources.
POCKETSPHINX_EXPORT int32 ps_get_prob(ps_decoder_t *ps)
Get posterior probability.
POCKETSPHINX_EXPORT int ps_free(ps_decoder_t *ps)
Finalize the decoder.
Definition: pocketsphinx.c:420
POCKETSPHINX_EXPORT int ps_process_cep(ps_decoder_t *ps, mfcc_t **data, int n_frames, int no_search, int full_utt)
Decode acoustic feature data.
POCKETSPHINX_EXPORT int ps_start_utt(ps_decoder_t *ps)
Start utterance processing.
Definition: pocketsphinx.c:933
POCKETSPHINX_EXPORT int32 ps_seg_prob(ps_seg_t *seg, int32 *out_ascr, int32 *out_lscr, int32 *out_lback)
Get language, acoustic, and posterior probabilities from a segmentation iterator.
POCKETSPHINX_EXPORT ps_nbest_t * ps_nbest_next(ps_nbest_t *nbest)
Move an N-best list iterator forward.
POCKETSPHINX_EXPORT char const * ps_nbest_hyp(ps_nbest_t *nbest, int32 *out_score)
Get the hypothesis string from an N-best list iterator.
POCKETSPHINX_EXPORT ps_decoder_t * ps_init(cmd_ln_t *config)
Initialize the decoder from a configuration object.
Definition: pocketsphinx.c:388
POCKETSPHINX_EXPORT ps_lattice_t * ps_get_lattice(ps_decoder_t *ps)
Get word lattice.
Internal implementation of PocketSphinx decoder.
char const * ps_astar_hyp(ps_astar_t *nbest, ps_latpath_t *path)
Get hypothesis string from A* search.
Definition: ps_lattice.c:1804
ps_astar_t * ps_astar_start(ps_lattice_t *dag, ngram_model_t *lmset, float32 lwf, int sf, int ef, int w1, int w2)
Begin N-Gram based A* search on a word graph.
Definition: ps_lattice.c:1712
void ps_astar_finish(ps_astar_t *nbest)
Finish N-best search, releasing resources associated with it.
Definition: ps_lattice.c:1925
ps_seg_t * ps_astar_seg_iter(ps_astar_t *astar, ps_latpath_t *path, float32 lwf)
Get hypothesis segmentation from A* search.
Definition: ps_lattice.c:1898
ps_latpath_t * ps_astar_next(ps_astar_t *nbest)
Find next best hypothesis of A* on a word graph.
Definition: ps_lattice.c:1771
POCKETSPHINX_EXPORT int ps_lattice_free(ps_lattice_t *dag)
Free a lattice.
Definition: ps_lattice.c:665
Word graph search implementation.
POCKETSPHINX_EXPORT int ps_set_kws(ps_decoder_t *ps, const char *name, const char *keyfile)
Adds keyphrases from a file to spotting.
Definition: pocketsphinx.c:632
POCKETSPHINX_EXPORT int ps_unset_search(ps_decoder_t *ps, const char *name)
Unsets the search and releases related resources.
Definition: pocketsphinx.c:507
POCKETSPHINX_EXPORT int ps_set_jsgf_string(ps_decoder_t *ps, const char *name, const char *jsgf_string)
Adds new search using JSGF model.
Definition: pocketsphinx.c:695
POCKETSPHINX_EXPORT ps_search_iter_t * ps_search_iter(ps_decoder_t *ps)
Returns iterator over current searches.
Definition: pocketsphinx.c:519
POCKETSPHINX_EXPORT int ps_set_search(ps_decoder_t *ps, const char *name)
Activates the search with the provided name.
Definition: pocketsphinx.c:467
POCKETSPHINX_EXPORT void ps_search_iter_free(ps_search_iter_t *itor)
Delete an unfinished search iterator.
Definition: pocketsphinx.c:537
POCKETSPHINX_EXPORT int ps_set_allphone_file(ps_decoder_t *ps, const char *name, const char *path)
Adds new search based on phone N-gram language model.
Definition: pocketsphinx.c:617
POCKETSPHINX_EXPORT const char * ps_get_search(ps_decoder_t *ps)
Returns the name of the current search in the decoder.
Definition: pocketsphinx.c:492
POCKETSPHINX_EXPORT const char * ps_get_kws(ps_decoder_t *ps, const char *name)
Get the current Key phrase to spot.
Definition: pocketsphinx.c:561
POCKETSPHINX_EXPORT fsg_model_t * ps_get_fsg(ps_decoder_t *ps, const char *name)
Get the finite-state grammar set object for this decoder.
Definition: pocketsphinx.c:552
POCKETSPHINX_EXPORT int ps_set_lm_file(ps_decoder_t *ps, const char *name, const char *path)
Adds new search based on N-gram language model.
Definition: pocketsphinx.c:594
POCKETSPHINX_EXPORT int ps_set_keyphrase(ps_decoder_t *ps, const char *name, const char *keyphrase)
Adds new keyphrase to spot.
Definition: pocketsphinx.c:640
POCKETSPHINX_EXPORT const char * ps_search_iter_val(ps_search_iter_t *itor)
Retrieves the name of the search the iterator points to.
Definition: pocketsphinx.c:531
POCKETSPHINX_EXPORT int ps_set_allphone(ps_decoder_t *ps, const char *name, ngram_model_t *lm)
Adds new search based on phone N-gram language model.
Definition: pocketsphinx.c:609
POCKETSPHINX_EXPORT int ps_set_lm(ps_decoder_t *ps, const char *name, ngram_model_t *lm)
Adds new search based on N-gram language model.
Definition: pocketsphinx.c:586
POCKETSPHINX_EXPORT ngram_model_t * ps_get_lm(ps_decoder_t *ps, const char *name)
Get the language model set object for this decoder.
Definition: pocketsphinx.c:543
POCKETSPHINX_EXPORT ps_search_iter_t * ps_search_iter_next(ps_search_iter_t *itor)
Updates search iterator to point to the next position.
Definition: pocketsphinx.c:525
POCKETSPHINX_EXPORT int ps_set_jsgf_file(ps_decoder_t *ps, const char *name, const char *path)
Adds new search using JSGF model.
Definition: pocketsphinx.c:656
POCKETSPHINX_EXPORT int ps_set_fsg(ps_decoder_t *ps, const char *name, fsg_model_t *fsg)
Adds new search based on finite state grammar.
Definition: pocketsphinx.c:648
#define BAD_S3WID
Dictionary word id.
Definition: s3types.h:90
int16 s3cipid_t
Size definitions for more semantically meaningful units.
Definition: s3types.h:63
Acoustic model structure.
Definition: acmod.h:148
uint8 state
State of utterance processing.
Definition: acmod.h:187
bin_mdef_t * mdef
Model definition.
Definition: acmod.h:159
feat_t * fcb
Dynamic feature computation.
Definition: acmod.h:156
fe_t * fe
Acoustic feature computation.
Definition: acmod.h:155
frame_idx_t n_feat_frame
Number of frames active in feat_buf.
Definition: acmod.h:199
frame_idx_t output_frame
Index of next frame of dynamic features.
Definition: acmod.h:194
Building composite triphone (as well as word internal triphones) with the dictionary.
Definition: dict2pid.h:84
a structure for a dictionary.
Definition: dict.h:76
Implementation of FSG search (and "FSG set") structure.
N-Gram search module structure.
Definition: ngram_search.h:197
A* search structure.
Decoder object.
cmd_ln_t * config
Configuration.
ps_search_t * phone_loop
Phone loop search for lookahead.
char const * senlogdir
Log directory for senone score files.
int pl_window
Window size for phoneme lookahead.
uint32 uttno
Utterance counter.
uint32 n_frame
Total number of frames processed.
hash_table_t * searches
Set of search modules.
char const * mfclogdir
Log directory for MFCC files.
char const * rawlogdir
Log directory for audio files.
int refcount
Reference count.
ptmr_t perf
Performance counter for all of decoding.
logmath_t * lmath
Log math computation.
ps_search_t * search
Currently active search module.
dict2pid_t * d2p
Dictionary to senone mapping.
dict_t * dict
Pronunciation dictionary.
acmod_t * acmod
Acoustic model.
Partial path structure used in N-best (A*) search.
int32 score
Exact score from start node up to node->sf.
Word graph structure used in bestpath/nbest search.
Feature space linear transform structure.
Definition: acmod.h:82
Base structure for search module.
int32 finish_wid
Finish word ID.
acmod_t * acmod
Acoustic model.
ps_search_t * pls
Phoneme loop for lookahead.
int32 post
Utterance posterior probability.
dict2pid_t * d2p
Dictionary to senone mappings.
ps_lattice_t * dag
Current hypothesis word graph.
dict_t * dict
Pronunciation dictionary.
ps_latlink_t * last_link
Final link in best path.
char * hyp_str
Current hypothesis string.
ps_searchfuncs_t * vt
V-table of search methods.
cmd_ln_t * config
Configuration.
int32 silence_wid
Silence word ID.
int32 n_words
Number of words known to search (may be less than in the dictionary)
int32 start_wid
Start word ID.
V-table for search algorithm.
Base structure for hypothesis segmentation iterator.
ps_search_t * search
Search object from whence this came.
int32 lback
Language model backoff.
int32 lscr
Language model score.
int32 ascr
Acoustic score.
frame_idx_t sf
Start frame.
char const * word
Word string (pointer into dictionary hash)
frame_idx_t ef
End frame.
int32 prob
Log posterior probability.