X-Git-Url: https://scripts.mit.edu/gitweb/autoinstallsdev/mediawiki.git/blobdiff_plain/19e297c21b10b1b8a3acad5e73fc71dcb35db44a..6932310fd58ebef145fa01eb76edf7150284d8ea:/includes/Collation.php diff --git a/includes/Collation.php b/includes/Collation.php deleted file mode 100644 index f00b568f..00000000 --- a/includes/Collation.php +++ /dev/null @@ -1,307 +0,0 @@ -lang = Language::factory( 'en' ); - } - - function getSortKey( $string ) { - return $this->lang->uc( $string ); - } - - function getFirstLetter( $string ) { - if ( $string[0] == "\0" ) { - $string = substr( $string, 1 ); - } - return $this->lang->ucfirst( $this->lang->firstChar( $string ) ); - } -} - -class IcuCollation extends Collation { - var $primaryCollator, $mainCollator, $locale; - var $firstLetterData; - - /** - * Unified CJK blocks. - * - * The same definition of a CJK block must be used for both Collation and - * generateCollationData.php. These blocks are omitted from the first - * letter data, as an optimisation measure and because the default UCA table - * is pretty useless for sorting Chinese text anyway. Japanese and Korean - * blocks are not included here, because they are smaller and more useful. - */ - static $cjkBlocks = array( - array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement - array( 0x2F00, 0x2FDF ), // Kangxi Radicals - array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters - array( 0x3000, 0x303F ), // CJK Symbols and Punctuation - array( 0x31C0, 0x31EF ), // CJK Strokes - array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months - array( 0x3300, 0x33FF ), // CJK Compatibility - array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A - array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs - array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs - array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms - array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B - array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C - array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D - array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement - ); - - const RECORD_LENGTH = 14; - - function __construct( $locale ) { - if ( !extension_loaded( 'intl' ) ) { - throw new MWException( 'An ICU collation was requested, ' . - 'but the intl extension is not available.' ); - } - $this->locale = $locale; - $this->mainCollator = Collator::create( $locale ); - if ( !$this->mainCollator ) { - throw new MWException( "Invalid ICU locale specified for collation: $locale" ); - } - - $this->primaryCollator = Collator::create( $locale ); - $this->primaryCollator->setStrength( Collator::PRIMARY ); - } - - function getSortKey( $string ) { - // intl extension produces non null-terminated - // strings. Appending '' fixes it so that it doesn't generate - // a warning on each access in debug php. - wfSuppressWarnings(); - $key = $this->mainCollator->getSortKey( $string ) . ''; - wfRestoreWarnings(); - return $key; - } - - function getPrimarySortKey( $string ) { - wfSuppressWarnings(); - $key = $this->primaryCollator->getSortKey( $string ) . ''; - wfRestoreWarnings(); - return $key; - } - - function getFirstLetter( $string ) { - $string = strval( $string ); - if ( $string === '' ) { - return ''; - } - - // Check for CJK - $firstChar = mb_substr( $string, 0, 1, 'UTF-8' ); - if ( ord( $firstChar ) > 0x7f - && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) - { - return $firstChar; - } - - $sortKey = $this->getPrimarySortKey( $string ); - - // Do a binary search to find the correct letter to sort under - $min = $this->findLowerBound( - array( $this, 'getSortKeyByLetterIndex' ), - $this->getFirstLetterCount(), - 'strcmp', - $sortKey ); - - if ( $min === false ) { - // Before the first letter - return ''; - } - return $this->getLetterByIndex( $min ); - } - - function getFirstLetterData() { - if ( $this->firstLetterData !== null ) { - return $this->firstLetterData; - } - - $cache = wfGetCache( CACHE_ANYTHING ); - $cacheKey = wfMemcKey( 'first-letters', $this->locale ); - $cacheEntry = $cache->get( $cacheKey ); - - if ( $cacheEntry ) { - $this->firstLetterData = $cacheEntry; - return $this->firstLetterData; - } - - // Generate data from serialized data file - - $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" ); - if ( $letters === false ) { - throw new MWException( "MediaWiki does not support ICU locale " . - "\"{$this->locale}\"" ); - } - - // Sort the letters. - // - // It's impossible to have the precompiled data file properly sorted, - // because the sort order changes depending on ICU version. If the - // array is not properly sorted, the binary search will return random - // results. - // - // We also take this opportunity to remove primary collisions. - $letterMap = array(); - foreach ( $letters as $letter ) { - $key = $this->getPrimarySortKey( $letter ); - if ( isset( $letterMap[$key] ) ) { - // Primary collision - // Keep whichever one sorts first in the main collator - if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) { - $letterMap[$key] = $letter; - } - } else { - $letterMap[$key] = $letter; - } - } - ksort( $letterMap, SORT_STRING ); - $data = array( - 'chars' => array_values( $letterMap ), - 'keys' => array_keys( $letterMap ) - ); - - // Reduce memory usage before caching - unset( $letterMap ); - - // Save to cache - $this->firstLetterData = $data; - $cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ ); - return $data; - } - - function getLetterByIndex( $index ) { - if ( $this->firstLetterData === null ) { - $this->getFirstLetterData(); - } - return $this->firstLetterData['chars'][$index]; - } - - function getSortKeyByLetterIndex( $index ) { - if ( $this->firstLetterData === null ) { - $this->getFirstLetterData(); - } - return $this->firstLetterData['keys'][$index]; - } - - function getFirstLetterCount() { - if ( $this->firstLetterData === null ) { - $this->getFirstLetterData(); - } - return count( $this->firstLetterData['chars'] ); - } - - /** - * Do a binary search, and return the index of the largest item that sorts - * less than or equal to the target value. - * - * @param $valueCallback A function to call to get the value with - * a given array index. - * @param $valueCount The number of items accessible via $valueCallback, - * indexed from 0 to $valueCount - 1 - * @param $comparisonCallback A callback to compare two values, returning - * -1, 0 or 1 in the style of strcmp(). - * @param $target The target value to find. - * - * @return The item index of the lower bound, or false if the target value - * sorts before all items. - */ - function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) { - $min = 0; - $max = $valueCount - 1; - do { - $mid = $min + ( ( $max - $min ) >> 1 ); - $item = call_user_func( $valueCallback, $mid ); - $comparison = call_user_func( $comparisonCallback, $target, $item ); - if ( $comparison > 0 ) { - $min = $mid; - } elseif ( $comparison == 0 ) { - $min = $mid; - break; - } else { - $max = $mid; - } - } while ( $min < $max - 1 ); - - if ( $min == 0 && $max == 0 && $comparison > 0 ) { - // Before the first item - return false; - } else { - return $min; - } - } - - static function isCjk( $codepoint ) { - foreach ( self::$cjkBlocks as $block ) { - if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) { - return true; - } - } - return false; - } -} -