<?php
/**
 * Language-model generation script for TextCat
 * (vendor/wikimedia/textcat/felis.php).
 *
 * Usage: php felis.php INPUTDIR OUTPUTDIR
 *
 * For every regular file directly inside INPUTDIR, the file contents are
 * passed to $cat->createLM() together with $maxNgrams, and the result is
 * handed to $cat->writeLanguageFile() with the target path
 * OUTPUTDIR/<name>.lm, where <name> is the file name with a trailing
 * ".txt" removed. OUTPUTDIR is created (recursively, mode 0755) if missing.
 *
 * Exit status: 0 on success; 1 on a usage error, an unusable directory,
 * or when at least one input file could not be read.
 */

// Very large input texts may need a higher memory limit; uncomment to raise it.
// NOTE(review): the original note also referred to a setting in TextCat.php,
// but the beginning of that comment is not recoverable here - check upstream.
// ini_set('memory_limit', '2000000000');

require_once __DIR__ . '/TextCat.php';

// TODO: add option to control model ngram count
$maxNgrams = 4000;

if ( $argc !== 3 ) {
	// Report usage errors on stderr with a non-zero status so that calling
	// shells and scripts can detect the failure (die() with a string
	// message prints to stdout and exits with status 0).
	fwrite( STDERR, "Use $argv[0] INPUTDIR OUTPUTDIR\n" );
	exit( 1 );
}

$inputDir = $argv[1];
$outputDir = $argv[2];

// Check the input directory up front instead of letting DirectoryIterator
// throw an uncaught exception later.
if ( !is_dir( $inputDir ) ) {
	fwrite( STDERR, "Input directory $inputDir does not exist or is not a directory\n" );
	exit( 1 );
}

// is_dir() rather than file_exists(): an existing regular file at this path
// must not be mistaken for a usable output directory. The trailing is_dir()
// tolerates another process creating the directory concurrently.
if ( !is_dir( $outputDir ) && !mkdir( $outputDir, 0755, true ) && !is_dir( $outputDir ) ) {
	fwrite( STDERR, "Cannot create output directory $outputDir\n" );
	exit( 1 );
}

$cat = new TextCat( $outputDir );

$failures = 0;
foreach ( new DirectoryIterator( $inputDir ) as $file ) {
	// Skip ".", "..", subdirectories and anything else that is not a regular file.
	if ( !$file->isFile() ) {
		continue;
	}

	$text = file_get_contents( $file->getPathname() );
	if ( $text === false ) {
		// Do not feed `false` into createLM(); report it and keep going
		// with the remaining files.
		fwrite( STDERR, "Cannot read " . $file->getPathname() . ", skipping\n" );
		$failures++;
		continue;
	}

	$ngrams = $cat->createLM( $text, $maxNgrams );
	$cat->writeLanguageFile( $ngrams, $outputDir . "/" . $file->getBasename( ".txt" ) . ".lm" );
}

exit( $failures > 0 ? 1 : 0 );