<?php
# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
# http://www.mediawiki.org/
-#
+#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
+# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
-#
+#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html
/**
- * Unicode normalization routines for working with UTF-8 strings.
- * Currently assumes that input strings are valid UTF-8!
- *
- * Not as fast as I'd like, but should be usable for most purposes.
- * UtfNormal::toNFC() will bail early if given ASCII text or text
- * it can quickly deterimine is already normalized.
- *
- * All functions can be called static.
- *
- * See description of forms at http://www.unicode.org/reports/tr15/
- *
- * @package UtfNormal
+ * @defgroup UtfNormal UtfNormal
*/
/** */
-require_once 'UtfNormalUtil.php';
+require_once dirname(__FILE__).'/UtfNormalUtil.php';
global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp;
-$utfCombiningClass = NULL;
-$utfCanonicalComp = NULL;
-$utfCanonicalDecomp = NULL;
+$utfCombiningClass = null;
+$utfCanonicalComp = null;
+$utfCanonicalDecomp = null;
# Load compatibility decompositions on demand if they are needed.
global $utfCompatibilityDecomp;
-$utfCompatibilityDecomp = NULL;
-
-define( 'UNICODE_HANGUL_FIRST', 0xac00 );
-define( 'UNICODE_HANGUL_LAST', 0xd7a3 );
-
-define( 'UNICODE_HANGUL_LBASE', 0x1100 );
-define( 'UNICODE_HANGUL_VBASE', 0x1161 );
-define( 'UNICODE_HANGUL_TBASE', 0x11a7 );
-
-define( 'UNICODE_HANGUL_LCOUNT', 19 );
-define( 'UNICODE_HANGUL_VCOUNT', 21 );
-define( 'UNICODE_HANGUL_TCOUNT', 28 );
-define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT );
-
-define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 );
-define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
-define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
-
-define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
-define( 'UNICODE_SURROGATE_LAST', 0xdfff );
-define( 'UNICODE_MAX', 0x10ffff );
-define( 'UNICODE_REPLACEMENT', 0xfffd );
-
-
-define( 'UTF8_HANGUL_FIRST', codepointToUtf8( UNICODE_HANGUL_FIRST ) );
-define( 'UTF8_HANGUL_LAST', codepointToUtf8( UNICODE_HANGUL_LAST ) );
-
-define( 'UTF8_HANGUL_LBASE', codepointToUtf8( UNICODE_HANGUL_LBASE ) );
-define( 'UTF8_HANGUL_VBASE', codepointToUtf8( UNICODE_HANGUL_VBASE ) );
-define( 'UTF8_HANGUL_TBASE', codepointToUtf8( UNICODE_HANGUL_TBASE ) );
-
-define( 'UTF8_HANGUL_LEND', codepointToUtf8( UNICODE_HANGUL_LEND ) );
-define( 'UTF8_HANGUL_VEND', codepointToUtf8( UNICODE_HANGUL_VEND ) );
-define( 'UTF8_HANGUL_TEND', codepointToUtf8( UNICODE_HANGUL_TEND ) );
-
-define( 'UTF8_SURROGATE_FIRST', codepointToUtf8( UNICODE_SURROGATE_FIRST ) );
-define( 'UTF8_SURROGATE_LAST', codepointToUtf8( UNICODE_SURROGATE_LAST ) );
-define( 'UTF8_MAX', codepointToUtf8( UNICODE_MAX ) );
-define( 'UTF8_REPLACEMENT', codepointToUtf8( UNICODE_REPLACEMENT ) );
-#define( 'UTF8_REPLACEMENT', '!' );
-
-define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
-define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
-define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );
-
-# These two ranges are illegal
-define( 'UTF8_FDD0', codepointToUtf8( 0xfdd0 ) );
-define( 'UTF8_FDEF', codepointToUtf8( 0xfdef ) );
-define( 'UTF8_FFFE', codepointToUtf8( 0xfffe ) );
-define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
-
-define( 'UTF8_HEAD', false );
-define( 'UTF8_TAIL', true );
-
+$utfCompatibilityDecomp = null;
/**
* For using the ICU wrapper
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
/**
+ * Unicode normalization routines for working with UTF-8 strings.
+ * Currently assumes that input strings are valid UTF-8!
*
- * @package MediaWiki
+ * Not as fast as I'd like, but should be usable for most purposes.
+ * UtfNormal::toNFC() will bail early if given ASCII text or text
+ * it can quickly determine is already normalized.
+ *
+ * All functions can be called static.
+ *
+ * See description of forms at http://www.unicode.org/reports/tr15/
+ *
+ * @ingroup UtfNormal
*/
class UtfNormal {
/**
* Fast return for pure ASCII strings; some lesser optimizations for
* strings containing only known-good characters. Not as fast as toNFC().
*
- * @param string $string a UTF-8 string
+ * @param $string String: a UTF-8 string
* @return string a clean, shiny, normalized UTF-8 string
*/
- function cleanUp( $string ) {
+ static function cleanUp( $string ) {
if( NORMALIZE_ICU ) {
# We exclude a few chars that ICU would not.
$string = preg_replace(
$string );
$string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
$string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
-
+
# UnicodeString constructor fails if the string ends with a
# head byte. Add a junk char at the end, we'll strip it off.
return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
* Fast return for pure ASCII strings; some lesser optimizations for
* strings containing only known-good characters.
*
- * @param string $string a valid UTF-8 string. Input is not validated.
+ * @param $string String: a valid UTF-8 string. Input is not validated.
* @return string a UTF-8 string in normal form C
*/
- function toNFC( $string ) {
+ static function toNFC( $string ) {
if( NORMALIZE_ICU )
return utf8_normalize( $string, UNORM_NFC );
elseif( UtfNormal::quickIsNFC( $string ) )
else
return UtfNormal::NFC( $string );
}
-
+
/**
* Convert a UTF-8 string to normal form D, canonical decomposition.
* Fast return for pure ASCII strings.
*
- * @param string $string a valid UTF-8 string. Input is not validated.
+ * @param $string String: a valid UTF-8 string. Input is not validated.
* @return string a UTF-8 string in normal form D
*/
- function toNFD( $string ) {
+ static function toNFD( $string ) {
if( NORMALIZE_ICU )
return utf8_normalize( $string, UNORM_NFD );
elseif( preg_match( '/[\x80-\xff]/', $string ) )
else
return $string;
}
-
+
/**
* Convert a UTF-8 string to normal form KC, compatibility composition.
* This may cause irreversible information loss, use judiciously.
* Fast return for pure ASCII strings.
*
- * @param string $string a valid UTF-8 string. Input is not validated.
+ * @param $string String: a valid UTF-8 string. Input is not validated.
* @return string a UTF-8 string in normal form KC
*/
- function toNFKC( $string ) {
+ static function toNFKC( $string ) {
if( NORMALIZE_ICU )
return utf8_normalize( $string, UNORM_NFKC );
elseif( preg_match( '/[\x80-\xff]/', $string ) )
else
return $string;
}
-
+
/**
* Convert a UTF-8 string to normal form KD, compatibility decomposition.
* This may cause irreversible information loss, use judiciously.
* Fast return for pure ASCII strings.
*
- * @param string $string a valid UTF-8 string. Input is not validated.
+ * @param $string String: a valid UTF-8 string. Input is not validated.
* @return string a UTF-8 string in normal form KD
*/
- function toNFKD( $string ) {
+ static function toNFKD( $string ) {
if( NORMALIZE_ICU )
return utf8_normalize( $string, UNORM_NFKD );
elseif( preg_match( '/[\x80-\xff]/', $string ) )
else
return $string;
}
-
+
/**
* Load the basic composition data if necessary
- * @access private
+ *
+ * Includes UtfNormalData.inc from this file's own directory, but only while
+ * the global $utfCombiningClass is still unset or null (the value it is
+ * initialised to at the top of this file), so repeated calls skip the include.
+ * The data file is presumably what fills in $utfCombiningClass and the
+ * related lookup tables; its contents are not visible from here.
+ *
+ * @private
*/
- function loadData() {
- global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp;
+ static function loadData() {
+ global $utfCombiningClass;
if( !isset( $utfCombiningClass ) ) {
- require_once( 'UtfNormalData.inc' );
+ require_once( dirname(__FILE__) . '/UtfNormalData.inc' );
}
}
-
+
/**
* Returns true if the string is _definitely_ in NFC.
* Returns false if not or uncertain.
- * @param string $string a valid UTF-8 string. Input is not validated.
+ * @param $string String: a valid UTF-8 string. Input is not validated.
* @return bool
*/
- function quickIsNFC( $string ) {
+ static function quickIsNFC( $string ) {
# ASCII is always valid NFC!
# If it's pure ASCII, let it through.
if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
-
+
UtfNormal::loadData();
global $utfCheckNFC, $utfCombiningClass;
$len = strlen( $string );
/**
* Returns true if the string is _definitely_ in NFC.
* Returns false if not or uncertain.
- * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
+ * @param $string String: a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
*/
- function quickIsNFCVerify( &$string ) {
+ static function quickIsNFCVerify( &$string ) {
# Screen out some characters that eg won't be allowed in XML
$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
-
+
# ASCII is always valid NFC!
# If we're only ever given plain ASCII, we can avoid the overhead
# of initializing the decomposition tables by skipping out early.
if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
-
+
static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
if( !isset( $checkit ) ) {
# Load/build some scary lookup tables...
UtfNormal::loadData();
global $utfCheckNFC, $utfCombiningClass;
-
+
$utfCheckOrCombining = array_merge( $utfCheckNFC, $utfCombiningClass );
# Head bytes for sequences which we should do further validity checks
array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
-
+
# Each UTF-8 head byte is followed by a certain
# number of tail bytes.
$tailBytes = array();
$tailBytes[chr($n)] = $remaining;
}
}
-
+
# Chop the text into pure-ASCII and non-ASCII areas;
# large ASCII parts can be handled much more quickly.
# Don't chop up Unicode areas for punctuation, though,
# that wastes energy.
+ $matches = array();
preg_match_all(
'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
$string, $matches );
-
+
$looksNormal = true;
$base = 0;
$replace = array();
foreach( $matches[1] as $str ) {
$chunk = strlen( $str );
-
+
if( $str{0} < "\x80" ) {
# ASCII chunk: guaranteed to be valid UTF-8
# and in normal form C, so skip over it.
$base += $chunk;
continue;
}
-
+
# We'll have to examine the chunk byte by byte to ensure
# that it consists of valid UTF-8 sequences, and to see
# if any of them might not be normalized.
#
# Since PHP is not the fastest language on earth, some of
# this code is a little ugly with inner loop optimizations.
-
+
$head = '';
$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
-
+
for( $i = -1; --$len; ) {
if( $remaining = $tailBytes[$c = $str{++$i}] ) {
# UTF-8 head byte!
# 0xed is relatively frequent in Korean, which
# abuts the surrogate area, so we're doing
# this check separately to speed things up.
-
+
if( $sequence >= UTF8_SURROGATE_FIRST ) {
# Surrogates are legal only in UTF-16 code.
# They are totally forbidden here in UTF-8
($n < 0xc2 && $sequence <= UTF8_OVERLONG_A)
|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
-
+
# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
- || ($n == 0xef &&
+ || ($n == 0xef &&
($sequence == UTF8_FFFE)
|| ($sequence == UTF8_FFFF) )
-
+
# Unicode has been limited to 21 bits; longer
# sequences are not allowed.
|| ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
-
+
$replace[] = array( UTF8_REPLACEMENT,
- $base + $i + 1 - strlen( $sequence ),
+ $base + $i + 1 - strlen( $sequence ),
strlen( $sequence ) );
$head = '';
continue;
}
}
}
-
+
if( isset( $utfCheckOrCombining[$sequence] ) ) {
# If it's NO or MAYBE, we'll have to rip
# the string apart and put it back together.
# That's going to be mighty slow.
$looksNormal = false;
}
-
+
# The sequence is legal!
$head = '';
} elseif( $c < "\x80" ) {
}
return $looksNormal;
}
-
+
# These take a string and run the normalization on them, without
# checking for validity or any optimization etc. Input must be
# VALID UTF-8!
/**
- * @param string $string
+ * Unconditionally convert a string to normal form C: decompose it with
+ * NFD(), then recombine the result with fastCompose(). No validity check
+ * and no ASCII/quick-check shortcut is applied here; use toNFC() for that.
+ *
+ * @param $string string: valid UTF-8; input is not validated
* @return string
- * @access private
+ * @private
*/
- function NFC( $string ) {
+ static function NFC( $string ) {
return UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
}
-
+
/**
- * @param string $string
+ * Unconditionally convert a string to normal form D. Makes sure the
+ * normalization tables are loaded, expands the string through
+ * fastDecompose() using the global $utfCanonicalDecomp map, and then puts
+ * combining characters into canonical order with fastCombiningSort().
+ *
+ * @param $string string: valid UTF-8; input is not validated
* @return string
- * @access private
+ * @private
*/
- function NFD( $string ) {
+ static function NFD( $string ) {
UtfNormal::loadData();
global $utfCanonicalDecomp;
return UtfNormal::fastCombiningSort(
UtfNormal::fastDecompose( $string, $utfCanonicalDecomp ) );
}
-
+
/**
- * @param string $string
+ * Unconditionally convert a string to normal form KC: decompose it with
+ * NFKD(), then recombine the result with fastCompose(). No validity check
+ * and no ASCII shortcut is applied here; use toNFKC() for that.
+ *
+ * @param $string string: valid UTF-8; input is not validated
* @return string
- * @access private
+ * @private
*/
- function NFKC( $string ) {
+ static function NFKC( $string ) {
return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
}
-
+
/**
- * @param string $string
+ * Unconditionally convert a string to normal form KD. The compatibility
+ * decomposition map is loaded on first use (while the global
+ * $utfCompatibilityDecomp is still unset or null) from UtfNormalDataK.inc in
+ * this file's own directory, so the lookup does not depend on include_path
+ * or the current working directory. The string is then expanded through
+ * fastDecompose() and its combining characters are put into canonical
+ * order with fastCombiningSort().
+ *
+ * @param $string string: valid UTF-8; input is not validated
* @return string
- * @access private
+ * @private
*/
- function NFKD( $string ) {
+ static function NFKD( $string ) {
global $utfCompatibilityDecomp;
if( !isset( $utfCompatibilityDecomp ) ) {
- require_once( 'UtfNormalDataK.inc' );
+ require_once( dirname(__FILE__) . '/UtfNormalDataK.inc' );
return UtfNormal::fastCombiningSort(
UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp ) );
}
-
-
+
+
/**
* Perform decomposition of a UTF-8 string into either D or KD form
* (depending on which decomposition map is passed to us).
* Input is assumed to be *valid* UTF-8. Invalid code will break.
- * @access private
- * @param string $string Valid UTF-8 string
- * @param array $map hash of expanded decomposition map
+ * @private
+ * @param $string String: valid UTF-8 string
+ * @param $map Array: hash of expanded decomposition map
* @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
*/
- function fastDecompose( $string, &$map ) {
+ static function fastDecompose( $string, $map ) {
UtfNormal::loadData();
$len = strlen( $string );
$out = '';
| (ord( $c{1} ) & 0x3f) << 6
| (ord( $c{2} ) & 0x3f) )
- UNICODE_HANGUL_FIRST;
- $l = IntVal( $index / UNICODE_HANGUL_NCOUNT );
- $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
+ $l = intval( $index / UNICODE_HANGUL_NCOUNT );
+ $v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
$t = $index % UNICODE_HANGUL_TCOUNT;
$out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
if( $t >= 25 ) {
/**
* Sorts combining characters into canonical order. This is the
* final step in creating decomposed normal forms D and KD.
- * @access private
- * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
+ * @private
+ * @param $string String: a valid, decomposed UTF-8 string. Input is not validated.
* @return string a UTF-8 string with combining characters sorted in canonical order
*/
- function fastCombiningSort( $string ) {
+ static function fastCombiningSort( $string ) {
UtfNormal::loadData();
global $utfCombiningClass;
$len = strlen( $string );
}
if( isset( $utfCombiningClass[$c] ) ) {
$lastClass = $utfCombiningClass[$c];
- @$combiners[$lastClass] .= $c;
+ if( isset( $combiners[$lastClass] ) ) {
+ $combiners[$lastClass] .= $c;
+ } else {
+ $combiners[$lastClass] = $c;
+ }
continue;
}
}
/**
* Produces canonically composed sequences, i.e. normal form C or KC.
*
- * @access private
- * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
+ * @private
+ * @param $string String: a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
* @return string a UTF-8 string with canonical precomposed characters used where possible
*/
- function fastCompose( $string ) {
+ static function fastCompose( $string ) {
UtfNormal::loadData();
global $utfCanonicalComp, $utfCombiningClass;
$len = strlen( $string );
$hangulPoint = UNICODE_HANGUL_FIRST +
UNICODE_HANGUL_TCOUNT *
(UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
-
+
# Hardcode the limited-range UTF-8 conversion:
$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
$tIndex = ord( $c{2} ) - 0xa7;
if( $tIndex < 0 ) $tIndex = ord( $c{2} ) - 0x80 + (0x11c0 - 0x11a7);
-
+
# Increment the code point by $tIndex, without
# the function overhead of decoding and recoding UTF-8
#
$startChar{1} = chr( $mid );
}
$startChar{2} = chr( $tail );
-
+
# If there's another jamo char after this, *don't* try to merge it.
$lastHangul = 1;
continue;
$out .= $startChar . $combining;
return $out;
}
-
+
/**
* This is just used for the benchmark, comparing how long it takes to
- * interate through a string without really doing anything of substance.
+ * iterate through a string without really doing anything of substance.
- * @param string $string
+ * @param $string string
* @return string
*/
- function placebo( $string ) {
+ static function placebo( $string ) {
$len = strlen( $string );
$out = '';
for( $i = 0; $i < $len; $i++ ) {
return $out;
}
}
-
-?>