]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blobdiff - vendor/wikimedia/textcat/TextCat.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / vendor / wikimedia / textcat / TextCat.php
diff --git a/vendor/wikimedia/textcat/TextCat.php b/vendor/wikimedia/textcat/TextCat.php
new file mode 100644 (file)
index 0000000..2d4ae02
--- /dev/null
@@ -0,0 +1,331 @@
+<?php
+
+/**
+ * TextCat language classifier
+ * See http://odur.let.rug.nl/~vannoord/TextCat/
+ */
+class TextCat {
+
+       const STATUSTOOSHORT = 'Input is too short.';
+       const STATUSNOMATCH = 'No match found.';
+       const STATUSAMBIGUOUS = 'Cannot determine language.';
+
+       /**
+        * Minimum input length to be considered for
+        * classification
+        * @var string
+        */
+       private $resultStatus = '';
+
+       /**
+        * Number of ngrams to be used.
+        * @var int
+        */
+       private $maxNgrams = 3000;
+
+       /**
+        * Minimum frequency of ngram to be counted.
+        * ( For language model generation set this
+        *   >0 to decrease CPU and memory reqs. )
+        * @var int
+        */
+       private $minFreq = 0;
+
+       /**
+        * Regexp used as word separator
+        * @var string
+        */
+       private $wordSeparator = '0-9\s\(\)';
+
+       /**
+        * List of language files
+        * @var string[]
+        */
+       private $langFiles = array();
+
+       /**
+        * Minimum input length to be considered for
+        * classification
+        * @var int
+        */
+       private $minInputLength = 0;
+
+       /**
+        * Maximum ratio of the score between a given
+        * candidate and the best candidate for the
+        * given candidate to be considered an alternative.
+        * @var float
+        */
+       private $resultsRatio = 1.05;
+
+       /**
+        * Maximum number of languages to return, within
+        * the resultsRatio. If there are more, the result
+        * is too ambiguous.
+        * @var int
+        */
+       private $maxReturnedLanguages = 10;
+
+       /**
+        * Maximum proportion of maximum score allowed.
+        * Compare score to worst possible score, and if
+        * it is too close, consider it not worth reporting.
+        * @var float
+        */
+       private $maxProportion = 1.00;
+
+       /**
+        * Amount to boost scores for languages
+        * specified by $boostedLangs; typical
+        * values are 0 to 0.15;
+        * @var float
+        */
+       private $langBoostScore = 0.00;
+
+       /**
+        * List of languages to boost by $langBoostScore
+        * @var string[]
+        */
+       private $boostedLangs = array();
+
+       /**
+        * @param
+        */
+       public function getResultStatus() {
+               return $this->resultStatus;
+       }
+
+       /**
+        * @param int $maxNgrams
+        */
+       public function setMaxNgrams( $maxNgrams ) {
+               $this->maxNgrams = $maxNgrams;
+       }
+
+       /**
+        * @param int $minFreq
+        */
+       public function setMinFreq( $minFreq ) {
+               $this->minFreq = $minFreq;
+       }
+
+       /**
+        * @param int $minInputLength
+        */
+       public function setMinInputLength( $minInputLength ) {
+               $this->minInputLength = $minInputLength;
+       }
+
+       /**
+        * @param float $resultsRatio
+        */
+       public function setResultsRatio( $resultsRatio ) {
+               $this->resultsRatio = $resultsRatio;
+       }
+
+       /**
+        * @param int $maxReturnedLanguages
+        */
+       public function setMaxReturnedLanguages( $maxReturnedLanguages ) {
+               $this->maxReturnedLanguages = $maxReturnedLanguages;
+       }
+
+       /**
+        * @param float $maxProportion
+        */
+       public function setMaxProportion( $maxProportion ) {
+               $this->maxProportion = $maxProportion;
+       }
+
+       /**
+        * @param float $langBoostScore
+        */
+       public function setLangBoostScore( $langBoostScore ) {
+               $this->langBoostScore = $langBoostScore;
+       }
+
+       /**
+        * @param float $langBoostScore
+        */
+       public function setBoostedLangs( $boostedLangs = array() ) {
+               // flip for more efficient lookups
+               $this->boostedLangs = array_flip( $boostedLangs );
+       }
+
+       /**
+        * @param string $wordSeparator
+        */
+       public function setWordSeparator( $wordSeparator ) {
+               $this->wordSeparator = $wordSeparator;
+       }
+
+       /**
+        * @param string|array $dirs
+        */
+       public function __construct( $dirs = array() ) {
+               if ( empty( $dirs ) ) {
+                       $dirs = array( __DIR__."/LM" );
+               }
+               if ( !is_array( $dirs ) ) {
+                       $dirs = array( $dirs );
+               }
+               foreach ( $dirs as $dir ) {
+                       foreach ( new DirectoryIterator( $dir ) as $file ) {
+                               if ( !$file->isFile() ) {
+                                       continue;
+                               }
+                               if ( $file->getExtension() == "lm" &&
+                                    !isset( $this->langFiles[$file->getBasename( ".lm" )] ) ) {
+                                       $this->langFiles[$file->getBasename( ".lm" )] = $file->getPathname();
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Create ngrams list for text.
+        * @param string $text
+        * @param int $maxNgrams How many ngrams to use.
+        * @return int[]
+        */
+       public function createLM( $text, $maxNgrams ) {
+               $ngram = array();
+               foreach ( preg_split( "/[{$this->wordSeparator}]+/u", $text ) as $word ) {
+                       if ( empty( $word ) ) {
+                               continue;
+                       }
+                       $word = "_".$word."_";
+                       $len = mb_strlen( $word, "UTF-8" );
+                       for ( $i=0;$i<$len;$i++ ) {
+                               $rlen = $len - $i;
+                               if ( $rlen > 4 ) {
+                                       @$ngram[mb_substr( $word, $i, 5, "UTF-8" )]++;
+                               }
+                               if ( $rlen > 3 ) {
+                                       @$ngram[mb_substr( $word, $i, 4, "UTF-8" )]++;
+                               }
+                               if ( $rlen > 2 ) {
+                                       @$ngram[mb_substr( $word, $i, 3, "UTF-8" )]++;
+                               }
+                               if ( $rlen > 1 ) {
+                                       @$ngram[mb_substr( $word, $i, 2, "UTF-8" )]++;
+                               }
+                               @$ngram[mb_substr( $word, $i, 1, "UTF-8" )]++;
+                       }
+               }
+               if ( $this->minFreq ) {
+                       $min = $this->minFreq;
+                       $ngram = array_filter( $ngram, function ( $v ) use( $min ) { return $v > $min;
+
+         } );
+               }
+               uksort( $ngram, function( $k1, $k2 ) use( $ngram ) {
+                               if ( $ngram[$k1] == $ngram[$k2] ) {
+                                       return strcmp( $k1, $k2 );
+                               }
+                               return $ngram[$k2] - $ngram[$k1];
+               } );
+               if ( count( $ngram ) > $maxNgrams ) {
+                       array_splice( $ngram, $maxNgrams );
+               }
+               return $ngram;
+       }
+
+       /**
+        * Load data from language file.
+        * @param string $langFile
+        * @return int[] Language file data
+        */
+       public function loadLanguageFile( $langFile ) {
+               include $langFile;
+               array_splice( $ranks, $this->maxNgrams );
+               return $ranks;
+       }
+
+       /**
+        * Write ngrams to file in PHP format
+        * @param int[] $ngrams
+        * @param string $outfile Output filename
+        */
+       public function writeLanguageFile( $ngrams, $outfile ) {
+               $out = fopen( $outfile, "w" );
+               // write original array as "$ngrams"
+               fwrite( $out, '<?php $ngrams = ' . var_export( $ngrams, true ) . ";\n" );
+               // write reduced array as "$ranks"
+               $rank = 1;
+               $ranks = array_map( function ( $x ) use( &$rank ) { return $rank++;
+
+        }, $ngrams );
+               fwrite( $out, '$ranks = ' . var_export( $ranks, true ) . ";\n" );
+               fclose( $out );
+       }
+
+       /**
+        * Classify text.
+        * @param string $text
+        * @param string[] $candidates List of candidate languages.
+        * @return int[] Array with keys of language names and values of score.
+        *                               Sorted by ascending score, with first result being the best.
+        */
+       public function classify( $text, $candidates = null ) {
+               $results = array();
+               $this->resultStatus = '';
+
+               // strip non-word characters before checking for min length, don't assess empty strings
+               $wordLength = mb_strlen( preg_replace( "/[{$this->wordSeparator}]+/", "", $text ) );
+               if ( $wordLength < $this->minInputLength || $wordLength == 0 ) {
+                       $this->resultStatus = self::STATUSTOOSHORT;
+                       return $results;
+               }
+
+               $inputgrams = array_keys( $this->createLM( $text, $this->maxNgrams ) );
+               if ( $candidates ) {
+                       // flip for more efficient lookups
+                       $candidates = array_flip( $candidates );
+               }
+               foreach ( $this->langFiles as $language => $langFile ) {
+                       if ( $candidates && !isset( $candidates[$language] ) ) {
+                               continue;
+                       }
+                       $ngrams = $this->loadLanguageFile( $langFile );
+                       $p = 0;
+                       foreach ( $inputgrams as $i => $ingram ) {
+                               if ( !empty( $ngrams[$ingram] ) ) {
+                                       $p += abs( $ngrams[$ingram] - $i );
+                               } else {
+                                       $p += $this->maxNgrams;
+                               }
+                       }
+                       if ( isset( $this->boostedLangs[$language] ) ) {
+                               $p = round( $p * ( 1 - $this->langBoostScore ) );
+                       }
+                       $results[$language] = $p;
+               }
+
+               asort( $results );
+
+               // ignore any item that scores higher than best * resultsRatio
+               $max = reset( $results ) * $this->resultsRatio;
+               $results = array_filter( $results, function ( $res ) use ( $max ) { return $res <= $max;
+               } );
+
+               // if more than maxReturnedLanguages remain, the result is too ambiguous, so bail
+               if ( count( $results ) > $this->maxReturnedLanguages ) {
+                       $this->resultStatus = self::STATUSAMBIGUOUS;
+                       return array();
+               }
+
+               // filter max proportion of max score after ambiguity check; reuse $max variable
+               $max = count( $inputgrams ) * $this->maxNgrams * $this->maxProportion;
+               $results = array_filter( $results, function ( $res ) use ( $max ) { return $res <= $max;
+               } );
+
+               if ( count( $results ) == 0 ) {
+                       $this->resultStatus = self::STATUSNOMATCH;
+                       return $results;
+               }
+
+               return $results;
+       }
+}
+