/* ngram_build_main.cc -- Edinburgh Speech Tools 2.4-release */
/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1995,1996                         */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Authors:  Simon King                                  */
/*                 Date   :  July 1995                                   */
/*-----------------------------------------------------------------------*/
/*                 EST_Ngrammar build program                            */
/*                                                                       */
/*=======================================================================*/

#include <cstdlib>

#include "EST.h"
#include "EST_Ngrammar.h"
#include "EST_Pathname.h"


/** @name <command>ngram_build</command> <emphasis>Train n-gram language model</emphasis>
    @id ngram_build_manual
  * @toc
 */

//@{

/**@name Synopsis
 */
//@{

//@synopsis

/**
ngram_build offers basic ngram language model estimation.

<formalpara>
<para><title>Input data format</title></para>

<para> Two input formats are supported. In sentence_per_line format,
the program will deal with start and end of sentence (if required) by
using special vocabulary items specified by -prev_tag, -prev_prev_tag
and -last_tag. For example, the input sentence: </para>

<screen>
the cat sat on the mat
</screen>

would be treated as

<screen>
... prev_prev_tag prev_prev_tag prev_tag the cat sat on the mat last_tag
</screen>

where prev_prev_tag is the argument to -prev_prev_tag, and so on. A
default set of tag names is also available. This input format is only
useful for sliding-window type applications (e.g. language modelling
for speech recognition).

The second input format is ngram_per_line which is useful for either
non-sliding-window applications, or where the user requires an
alternative treatment of start/end of sentence to that provided
above. Now the input file simply contains a complete ngram per
line. For the same example as above (to build a trigram model) this
would be:

<para>
<screen>
prev_prev_tag prev_tag the
prev_tag the cat
the cat sat
cat sat on
sat on the
on the mat
the mat last_tag
</screen>
</para>

</formalpara>


<formalpara>
<para><title>Representation</title></para>

<para> The internal representation of the model becomes important for
higher values of N where, if V is the vocabulary size, \(V^N\) becomes
very large. In such cases, we cannot explicitly hold probabilities for
all possible ngrams, and a sparse representation must be used
(i.e. only non-zero probabilities are stored).</para>
</formalpara>

<formalpara>
<para><title>Getting more robust probability estimates</title></para>
<para>The common techniques for getting better estimates of the low/zero
frequency ngrams are provided: namely smoothing and backing-off.</para>
</formalpara>

<formalpara>
<para><title>Testing an ngram model</title></para>
Use the <link linkend=ngram-test-manual>ngram_test</link> program.
</formalpara>

*/

//@}

/**@name OPTIONS
 */
//@{

//@options

//@}

146int main(int argc, char **argv)
147{
148 int order;
150 EST_Option al, op;
154 EST_Ngrammar::representation_t representation =
155 EST_Ngrammar::dense;
156
159 bool trace=false;
160 double floor=0.0;
161
162 parse_command_line
163 (argc, argv,
164 EST_String("[input file0] [input file1] ... -o [output file]\n")+
165 "-w <ifile> filename containing word list (required)\n"+
166 "-p <ifile> filename containing predictee word list\n"+
167 " (default is to use wordlist given by -w)\n"+
168 "-order <int> order, 1=unigram, 2=bigram etc. (default 2)\n"+
169 "-smooth <int> Good-Turing smooth the grammar up to the\n"+
170 " given frequency\n"+
171 "-o <ofile> Output file for constructed ngram\n"+
172 "\n"
173 "-input_format <string>\n"+
174 " format of input data (default sentence_per_line)\n"+
175 " may be sentence_per_file, ngram_per_line.\n"+
176 "-otype <string> format of output file, one of cstr_ascii\n"+
177 " cstr_bin or htk_ascii\n"+
178 "-sparse build ngram in sparse representation\n"+
179 "-dense build ngram in dense representation (default)\n"+
180 "-backoff <int>\n"+
181 " build backoff ngram (requires -smooth)\n"+
182 "-floor <double>\n"+
183 " frequency floor value used with some ngrams\n"+
184 "-freqsmooth <int>\n"+
185 " build frequency backed off smoothed ngram, this\n"+
186 " requires -smooth option\n"+
187 "-trace give verbose outout about build process\n"+
188 "-save_compressed save ngram in gzipped format\n"+
189 "-oov_mode <string>\n"+
190 " what to do about out-of-vocabulary words,\n"+
191 " one of skip_ngram, skip_sentence (default),\n"+
192 " skip_file, or use_oov_marker\n"+
193 "-oov_marker <string>\n"+
194 " special word for oov words (default "+OOV_MARKER+")\n"+
195 " (use in conjunction with '-oov_mode use_oov_marker'\n"+
196 "\n"+
197 "Pseudo-words :\n"+
198 "-prev_tag <string>\n"+
199 " tag before sentence start\n"+
200 "-prev_prev_tag <string>\n"+
201 " all words before 'prev_tag'\n"+
202 "-last_tag <string>\n"+
203 " after sentence end\n"+
204 "-default_tags use default tags of "+SENTENCE_START_MARKER+
205 ","+SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
206 " respectively\n",
207 files, al);
208
209 if (al.present("-input_format"))
210 input_format = al.val("-input_format");
211 else
212 input_format = "sentence_per_line";
213
214 if (al.present("-oov_mode"))
215 oov_mode = al.val("-oov_mode");
216 else
217 oov_mode = "skip_sentence";
218
219
220 if(al.present("-oov_marker"))
221 {
222 if(oov_mode != "use_oov_marker")
223 {
224 cerr << "Error : can only use -oov_marker with '-oov_mode use_oov_marker'" << endl;
225 exit(1);
226 }
227 else
228 oov_marker = al.val("-oov_marker");
229
230 // should check oov marker is/isn't (?) in vocab
231 // ......
232 }
233
234 if( (oov_mode != "skip_ngram") &&
235 (oov_mode != "skip_sentence") &&
236 (oov_mode != "skip_file") &&
237 (oov_mode != "use_oov_marker") )
238 {
239 cerr << oov_mode << " is not a valid oov_mode !" << endl;
240 exit(1);
241 }
242
243 if (al.present("-w"))
244 wordlist_file = al.val("-w");
245 else{
246 cerr << "build_ngram: Must specify a wordlist with -w" << endl;
247 exit(1);
248 }
249
250 if (load_StrList(wordlist_file,wordlist) != format_ok)
251 {
252 cerr << "build_ngram: Could not read wordlist from file "
253 << wordlist_file << endl;
254 exit(1);
255 }
256
257
258 if (al.present("-p"))
259 {
260
261 if(input_format != "ngram_per_line")
262 {
263 cerr << "Can't have differering predictor/predictee lists unless data is in ngram_per_line format !" << endl;
264 exit(1);
265 }
266
267 wordlist_file2 = al.val("-p");
268 if (load_StrList(wordlist_file2,wordlist2) != format_ok)
269 {
270 cerr << "build_ngram: Could not read predictee list from file "
271 << wordlist_file2 << endl;
272 exit(1);
273 }
274 }
275
276 if (al.present("-trace"))
277 trace=true;
278
279 if (al.present("-o"))
280 out_file = al.val("-o");
281 else
282 out_file = "-";
283
284 if (al.present("-default_tags"))
285 {
286 prev_tag = SENTENCE_START_MARKER;
287 prev_prev_tag = SENTENCE_END_MARKER;
288 last_tag = SENTENCE_END_MARKER;
289
290 wordlist.append(SENTENCE_START_MARKER);
291 wordlist.append(SENTENCE_END_MARKER);
292
293 if (al.present("-p"))
294 {
295 wordlist2.append(SENTENCE_START_MARKER);
296 wordlist2.append(SENTENCE_END_MARKER);
297 }
298 }
299
300 if (al.present("-prev_tag"))
301 {
302 if (al.present("-default_tags"))
303 cerr << "build_ngram: WARNING : -prev_tag overrides -default_tags"
304 << endl;
305 prev_tag = al.val("-prev_tag");
306 }
307
308 if (al.present("-prev_prev_tag"))
309 {
310 if (al.present("-default_tags"))
311 cerr << "build_ngram: WARNING : -prev_prev_tag overrides -default_tags"
312 << endl;
313 prev_prev_tag = al.val("-prev_prev_tag");
314 }
315
316 if (al.present("-last_tag"))
317 {
318 if (al.present("-default_tags"))
319 cerr << "build_ngram: WARNING : -last_tag overrides -default_tags"
320 << endl;
321 last_tag = al.val("-last_tag");
322 }
323
324 if ( ( (prev_tag=="") || (prev_prev_tag=="") || (last_tag=="") )
325 && ( (prev_tag!="") || (prev_prev_tag!="") || (last_tag!="") ) )
326 {
327 cerr << "build_ngram: ERROR : if any tags are given, ALL must be given"
328 << endl;
329 exit(1);
330 }
331
332 if (al.present("-order"))
333 order = al.ival("-order");
334 else
335 {
336 cerr << "build_ngram: WARNING : No order specified with -order : defaulting to bigram"
337 << endl;
338 order = 2;
339 }
340
341 if (al.present("-otype"))
342 format = al.val("-otype");
343 else
344 format = "";
345
346 if (al.present("-floor"))
347 floor = al.dval("-floor");
348 else
349 floor = 0.0;
350
351 if (al.present("-backoff"))
352 if (!al.present("-smooth"))
353 {
354 cerr << "build_ngram: backoff requires smooth value" << endl;
355 exit(-1);
356 }
357 if (al.present("-freqsmooth"))
358 if (!al.present("-smooth"))
359 {
360 cerr << "build_ngram: frequency smooth requires smooth value"
361 << endl;
362 exit(-1);
363 }
364
365 if (al.present("-dense"))
366 representation = EST_Ngrammar::dense;
367 else if (al.present("-sparse"))
368 {
369 cerr << "build_ngram: Sorry, sparse representation is not yet available " << endl;
370 exit(1);
371 representation = EST_Ngrammar::sparse;
372 }
373 else if (al.present("-backoff"))
374 representation = EST_Ngrammar::backoff;
375 else
376 cerr << "build_ngram: Defaulting to dense representation" << endl;
377
378 if (al.present("-p"))
379 {
380 if (!ngrammar.init(order,representation,wordlist,wordlist2))
381 {
382 cerr << "build_ngram: Failed to initialise " << order << "-gram" << endl;
383 exit(1);
384 }
385 }
386 else
387 {
388 if (!ngrammar.init(order,representation,wordlist))
389 {
390 cerr << "build_ngram: Failed to initialise " << order << "-gram" << endl;
391 exit(1);
392 }
393 }
394
395
396 if ( al.present("-backoff") )
397 {
400 al.ival("-backoff"),al.ival("-smooth")))
401 {
402 cerr << "build_ngram: Failed to build backoff " << order
403 << "-gram" << endl;
404 exit(1);
405 }
406 else if (trace)
407 cerr << "build_ngram: Built backoff " << order <<
408 "-gram" << endl;
409 }
410 else
411 {
414 {
415 cerr << "build_ngram: Failed to build " << order << "-gram" << endl;
416 exit(1);
417 }
418 else
419 if(trace)
420 cerr << "build_ngram: Built " << order << "-gram" << endl;
421 }
422
423
424 // Posit processing functions
425 if (al.present("-freqsmooth"))
426 {
427 Ngram_freqsmooth(ngrammar,al.ival("-smooth"),al.ival("-freqsmooth"));
428 }
429 else if (al.present("-smooth") && !al.present("-backoff"))
430 {
431 int smoothcount = atoi(al.val("-smooth"));
432 if(!Good_Turing_smooth(ngrammar,smoothcount,0))
433 {
434 cerr << "build_ngram: Failed to smooth " << order << "-gram" << endl;
435 exit(1);
436 }
437 else
438 if(trace)
439 cerr << "build_ngram: Good Turing smoothed " << order << "-gram" << endl;
440
441 }
442
443 // save
444 if (al.present("-save_compressed"))
445 {
446 EST_String tmp_file = make_tmp_filename();
447 if (ngrammar.save(tmp_file,format,trace,floor) == write_ok)
448 {
450 EST_Pathname tmp(out_file);
451 if (tmp.extension() == GZIP_FILENAME_EXTENSION)
452 prog_name = "gzip --stdout";
453 else if (tmp.extension() == COMPRESS_FILENAME_EXTENSION)
454 prog_name = "compress -c";
455 else // default
456 {
457 prog_name = "gzip --stdout";
458 if(out_file != "-")
459 out_file = out_file + "." + GZIP_FILENAME_EXTENSION;
460 }
461
462 if (trace)
463 cerr << "build_ngram: Compressing with '" << prog_name << "'" << endl;
464
465 // now compress
466 if(compress_file(tmp_file,out_file,prog_name) != 0)
467 {
468 cerr << "build_ngram: Failed to compress to file "
469 << out_file << endl;
470 (void)delete_file(tmp_file);
471 exit(1);
472 }
473
474 (void)delete_file(tmp_file);
475
476 if(trace)
477 cerr << "build_ngram: Saved in compressed " << format
478 << " format to " << out_file << endl;
479 }
480 else
481 {
482 cerr << "build_ngram: Failed to write temporary file "
483 << tmp_file << endl;
484 exit(1);
485 }
486
487
488 }
489 else
490 {
491 if (ngrammar.save(out_file,format,trace,floor) == write_ok)
492 {
493 if(trace)
494 cerr << "build_ngram: Saved in " << format
495 << " format to " << out_file << endl;
496 }
497 else
498 {
499 cerr << "build_ngram: Failed to save " << format << " format data to "
500 << out_file << endl;
501 exit(1);
502 }
503 }
504
505
506 // everything went okay
507 return 0;
508}