3 * Generate ngrams data from text files.
4 * Run: php felis.php INPUTDIR OUTPUTDIR
5 * INPUTDIR should contain text files e.g. english.txt
6 * OUTPUTDIR will contain ngrams files e.g. english.lm
9 // Language model generation failing?
10 // Increase your memory limit or set $minFreq > 0 in TextCat.php.
11 // ini_set('memory_limit', '2000000000');
// Load TextCat.php from the directory this script lives in.
13 require_once __DIR__.'/TextCat.php';
14 // TODO: add option to control model ngram count
// Print the usage line (script name followed by the two expected arguments) and stop.
// NOTE(review): the condition guarding this call is not visible in this excerpt —
// presumably an argument-count check; confirm.
18 die( "Use $argv[0] INPUTDIR OUTPUTDIR\n" );
// Create the output path (second argument) when nothing exists there yet;
// mkdir is called with mode 0755 and the recursive flag set.
20 if ( !file_exists( $argv[2] ) ) {
21 mkdir( $argv[2], 0755, true );
// The output directory is handed to the TextCat constructor.
// NOTE(review): what TextCat does with it is defined in TextCat.php — confirm.
23 $cat = new TextCat( $argv[2] );
// Walk every entry of the input directory (first argument).
25 foreach ( new DirectoryIterator( $argv[1] ) as $file ) {
// Branch taken for entries that are not regular files.
// NOTE(review): the branch body is not visible here — presumably it skips the entry; confirm.
26 if ( !$file->isFile() ) {
// Read the whole file into memory and pass its contents to createLM together with $maxNgrams.
// NOTE(review): $maxNgrams is not assigned anywhere in the visible lines — confirm where it is set.
29 $ngrams = $cat->createLM( file_get_contents( $file->getPathname() ), $maxNgrams );
// Write the result to OUTPUTDIR/<name>.lm, where <name> is the input file's basename
// with a trailing ".txt" removed (names without that suffix are kept whole).
30 $cat->writeLanguageFile( $ngrams, $argv[2] . "/" . $file->getBasename( ".txt" ) . ".lm" );