--- /dev/null
+<?php
+/**
+ * @file
+ * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
+ */
+
+namespace Wikimedia\CSS\Parser;
+
+/**
+ * Character set conversion for CSS
+ * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-byte-stream
+ */
+class Encoder {
+
+ /**
+ * @var array Mapping from CSS encoding tags to mbstring/iconv encodings
+ * @see https://encoding.spec.whatwg.org/#concept-encoding-get
+ */
+ protected static $encodings = [
+ 'unicode-1-1-utf-8' => 'UTF-8',
+ 'utf-8' => 'UTF-8',
+ 'utf8' => 'UTF-8',
+ '866' => 'CP866',
+ 'cp866' => 'CP866',
+ 'csibm866' => 'CP866',
+ 'ibm866' => 'CP866',
+ 'csisolatin2' => 'ISO-8859-2',
+ 'iso-8859-2' => 'ISO-8859-2',
+ 'iso-ir-101' => 'ISO-8859-2',
+ 'iso8859-2' => 'ISO-8859-2',
+ 'iso88592' => 'ISO-8859-2',
+ 'iso_8859-2' => 'ISO-8859-2',
+ 'iso_8859-2:1987' => 'ISO-8859-2',
+ 'l2' => 'ISO-8859-2',
+ 'latin2' => 'ISO-8859-2',
+ 'csisolatin3' => 'ISO-8859-3',
+ 'iso-8859-3' => 'ISO-8859-3',
+ 'iso-ir-109' => 'ISO-8859-3',
+ 'iso8859-3' => 'ISO-8859-3',
+ 'iso88593' => 'ISO-8859-3',
+ 'iso_8859-3' => 'ISO-8859-3',
+ 'iso_8859-3:1988' => 'ISO-8859-3',
+ 'l3' => 'ISO-8859-3',
+ 'latin3' => 'ISO-8859-3',
+ 'csisolatin4' => 'ISO-8859-4',
+ 'iso-8859-4' => 'ISO-8859-4',
+ 'iso-ir-110' => 'ISO-8859-4',
+ 'iso8859-4' => 'ISO-8859-4',
+ 'iso88594' => 'ISO-8859-4',
+ 'iso_8859-4' => 'ISO-8859-4',
+ 'iso_8859-4:1988' => 'ISO-8859-4',
+ 'l4' => 'ISO-8859-4',
+ 'latin4' => 'ISO-8859-4',
+ 'csisolatincyrillic' => 'ISO-8859-5',
+ 'cyrillic' => 'ISO-8859-5',
+ 'iso-8859-5' => 'ISO-8859-5',
+ 'iso-ir-144' => 'ISO-8859-5',
+ 'iso8859-5' => 'ISO-8859-5',
+ 'iso88595' => 'ISO-8859-5',
+ 'iso_8859-5' => 'ISO-8859-5',
+ 'iso_8859-5:1988' => 'ISO-8859-5',
+ 'arabic' => 'ISO-8859-6',
+ 'asmo-708' => 'ISO-8859-6',
+ 'csiso88596e' => 'ISO-8859-6',
+ 'csiso88596i' => 'ISO-8859-6',
+ 'csisolatinarabic' => 'ISO-8859-6',
+ 'ecma-114' => 'ISO-8859-6',
+ 'iso-8859-6' => 'ISO-8859-6',
+ 'iso-8859-6-e' => 'ISO-8859-6',
+ 'iso-8859-6-i' => 'ISO-8859-6',
+ 'iso-ir-127' => 'ISO-8859-6',
+ 'iso8859-6' => 'ISO-8859-6',
+ 'iso88596' => 'ISO-8859-6',
+ 'iso_8859-6' => 'ISO-8859-6',
+ 'iso_8859-6:1987' => 'ISO-8859-6',
+ 'csisolatingreek' => 'ISO-8859-7',
+ 'ecma-118' => 'ISO-8859-7',
+ 'elot_928' => 'ISO-8859-7',
+ 'greek' => 'ISO-8859-7',
+ 'greek8' => 'ISO-8859-7',
+ 'iso-8859-7' => 'ISO-8859-7',
+ 'iso-ir-126' => 'ISO-8859-7',
+ 'iso8859-7' => 'ISO-8859-7',
+ 'iso88597' => 'ISO-8859-7',
+ 'iso_8859-7' => 'ISO-8859-7',
+ 'iso_8859-7:1987' => 'ISO-8859-7',
+ 'sun_eu_greek' => 'ISO-8859-7',
+ 'csiso88598e' => 'ISO-8859-8',
+ 'csisolatinhebrew' => 'ISO-8859-8',
+ 'hebrew' => 'ISO-8859-8',
+ 'iso-8859-8' => 'ISO-8859-8',
+ 'iso-8859-8-e' => 'ISO-8859-8',
+ 'iso-ir-138' => 'ISO-8859-8',
+ 'iso8859-8' => 'ISO-8859-8',
+ 'iso88598' => 'ISO-8859-8',
+ 'iso_8859-8' => 'ISO-8859-8',
+ 'iso_8859-8:1988' => 'ISO-8859-8',
+ 'visual' => 'ISO-8859-8',
+ 'csiso88598i' => 'ISO-8859-8', // ISO-8859-8-I?
+ 'iso-8859-8-i' => 'ISO-8859-8', // ISO-8859-8-I?
+ 'logical' => 'ISO-8859-8', // ISO-8859-8-I?
+ 'csisolatin6' => 'ISO-8859-10',
+ 'iso-8859-10' => 'ISO-8859-10',
+ 'iso-ir-157' => 'ISO-8859-10',
+ 'iso8859-10' => 'ISO-8859-10',
+ 'iso885910' => 'ISO-8859-10',
+ 'l6' => 'ISO-8859-10',
+ 'latin6' => 'ISO-8859-10',
+ 'iso-8859-13' => 'ISO-8859-13',
+ 'iso8859-13' => 'ISO-8859-13',
+ 'iso885913' => 'ISO-8859-13',
+ 'iso-8859-14' => 'ISO-8859-14',
+ 'iso8859-14' => 'ISO-8859-14',
+ 'iso885914' => 'ISO-8859-14',
+ 'csisolatin9' => 'ISO-8859-15',
+ 'iso-8859-15' => 'ISO-8859-15',
+ 'iso8859-15' => 'ISO-8859-15',
+ 'iso885915' => 'ISO-8859-15',
+ 'iso_8859-15' => 'ISO-8859-15',
+ 'l9' => 'ISO-8859-15',
+ 'iso-8859-16' => 'ISO-8859-16',
+ 'cskoi8r' => 'KOI8-R',
+ 'koi' => 'KOI8-R',
+ 'koi8' => 'KOI8-R',
+ 'koi8-r' => 'KOI8-R',
+ 'koi8_r' => 'KOI8-R',
+ 'koi8-ru' => 'KOI8-U',
+ 'koi8-u' => 'KOI8-U',
+ 'csmacintosh' => 'macintosh',
+ 'mac' => 'macintosh',
+ 'macintosh' => 'macintosh',
+ 'x-mac-roman' => 'macintosh',
+ 'dos-874' => 'Windows-874',
+ 'iso-8859-11' => 'Windows-874',
+ 'iso8859-11' => 'Windows-874',
+ 'iso885911' => 'Windows-874',
+ 'tis-620' => 'Windows-874',
+ 'windows-874' => 'Windows-874',
+ 'cp1250' => 'Windows-1250',
+ 'windows-1250' => 'Windows-1250',
+ 'x-cp1250' => 'Windows-1250',
+ 'cp1251' => 'Windows-1251',
+ 'windows-1251' => 'Windows-1251',
+ 'x-cp1251' => 'Windows-1251',
+ 'ansi_x3.4-1968' => 'Windows-1252',
+ 'ascii' => 'Windows-1252',
+ 'cp1252' => 'Windows-1252',
+ 'cp819' => 'Windows-1252',
+ 'csisolatin1' => 'Windows-1252',
+ 'ibm819' => 'Windows-1252',
+ 'iso-8859-1' => 'Windows-1252',
+ 'iso-ir-100' => 'Windows-1252',
+ 'iso8859-1' => 'Windows-1252',
+ 'iso88591' => 'Windows-1252',
+ 'iso_8859-1' => 'Windows-1252',
+ 'iso_8859-1:1987' => 'Windows-1252',
+ 'l1' => 'Windows-1252',
+ 'latin1' => 'Windows-1252',
+ 'us-ascii' => 'Windows-1252',
+ 'windows-1252' => 'Windows-1252',
+ 'x-cp1252' => 'Windows-1252',
+ 'cp1253' => 'Windows-1253',
+ 'windows-1253' => 'Windows-1253',
+ 'x-cp1253' => 'Windows-1253',
+ 'cp1254' => 'Windows-1254',
+ 'csisolatin5' => 'Windows-1254',
+ 'iso-8859-9' => 'Windows-1254',
+ 'iso-ir-148' => 'Windows-1254',
+ 'iso8859-9' => 'Windows-1254',
+ 'iso88599' => 'Windows-1254',
+ 'iso_8859-9' => 'Windows-1254',
+ 'iso_8859-9:1989' => 'Windows-1254',
+ 'l5' => 'Windows-1254',
+ 'latin5' => 'Windows-1254',
+ 'windows-1254' => 'Windows-1254',
+ 'x-cp1254' => 'Windows-1254',
+ 'cp1255' => 'Windows-1255',
+ 'windows-1255' => 'Windows-1255',
+ 'x-cp1255' => 'Windows-1255',
+ 'cp1256' => 'Windows-1256',
+ 'windows-1256' => 'Windows-1256',
+ 'x-cp1256' => 'Windows-1256',
+ 'cp1257' => 'Windows-1257',
+ 'windows-1257' => 'Windows-1257',
+ 'x-cp1257' => 'Windows-1257',
+ 'cp1258' => 'Windows-1258',
+ 'windows-1258' => 'Windows-1258',
+ 'x-cp1258' => 'Windows-1258',
+ 'x-mac-cyrillic' => 'mac-cyrillic',
+ 'x-mac-ukrainian' => 'mac-cyrillic',
+ 'chinese' => 'GB18030', // GBK
+ 'csgb2312' => 'GB18030', // GBK
+ 'csiso58gb231280' => 'GB18030', // GBK
+ 'gb2312' => 'GB18030', // GBK
+ 'gb_2312' => 'GB18030', // GBK
+ 'gb_2312-80' => 'GB18030', // GBK
+ 'gbk' => 'GB18030', // GBK
+ 'iso-ir-58' => 'GB18030', // GBK
+ 'x-gbk' => 'GB18030', // GBK
+ 'gb18030' => 'GB18030',
+ 'big5' => 'BIG-5',
+ 'big5-hkscs' => 'BIG-5',
+ 'cn-big5' => 'BIG-5',
+ 'csbig5' => 'BIG-5',
+ 'x-x-big5' => 'BIG-5',
+ 'cseucpkdfmtjapanese' => 'EUC-JP',
+ 'euc-jp' => 'EUC-JP',
+ 'x-euc-jp' => 'EUC-JP',
+ 'csiso2022jp' => 'ISO-2022-JP',
+ 'iso-2022-jp' => 'ISO-2022-JP',
+ 'csshiftjis' => 'SJIS',
+ 'ms932' => 'SJIS',
+ 'ms_kanji' => 'SJIS',
+ 'shift-jis' => 'SJIS',
+ 'shift_jis' => 'SJIS',
+ 'sjis' => 'SJIS',
+ 'windows-31j' => 'SJIS',
+ 'x-sjis' => 'SJIS',
+ 'cseuckr' => 'EUC-KR',
+ 'csksc56011987' => 'EUC-KR',
+ 'euc-kr' => 'EUC-KR',
+ 'iso-ir-149' => 'EUC-KR',
+ 'korean' => 'EUC-KR',
+ 'ks_c_5601-1987' => 'EUC-KR',
+ 'ks_c_5601-1989' => 'EUC-KR',
+ 'ksc5601' => 'EUC-KR',
+ 'ksc_5601' => 'EUC-KR',
+ 'windows-949' => 'EUC-KR',
+ 'csiso2022kr' => 'replacement',
+ 'hz-gb-2312' => 'replacement',
+ 'iso-2022-cn' => 'replacement',
+ 'iso-2022-cn-ext' => 'replacement',
+ 'iso-2022-kr' => 'replacement',
+ 'utf-16be' => 'UTF-16BE',
+ 'utf-16' => 'UTF-16LE',
+ 'utf-16le' => 'UTF-16LE',
+ 'x-user-defined' => 'x-user-defined',
+ ];
+
+ /**
+ * Convert CSS text to UTF-8
+ * @param string $text Text being detected
+ * @param string[] $encodings Encodings to use at various points in the algorithm:
+ * - transport: Encoding from HTTP or the like
+ * - environment: Encoding from HTML `<link>` or the like
+ * @return string
+ */
+ public static function convert( $text, $encodings = [] ) {
+ // First, check for a BOM and honor that if it's present.
+ if ( substr( $text, 0, 3 ) === "\xef\xbb\xbf" ) {
+ // UTF-8 with BOM (convert it anyway in case the BOM is a lie)
+ return self::doConvert( 'UTF-8', substr( $text, 3 ) );
+ }
+ $start = substr( $text, 0, 2 );
+ if ( $start === "\xfe\xff" ) {
+ return self::doConvert( 'UTF-16BE', substr( $text, 2 ) );
+ }
+ if ( $start === "\xff\xfe" ) {
+ return self::doConvert( 'UTF-16LE', substr( $text, 2 ) );
+ }
+
+ // 1. Transport encoding
+ $encoding = isset( $encodings['transport'] )
+ ? trim( strtolower( $encodings['transport'] ), "\t\n\f\r " )
+ : null;
+ if ( $encoding !== null && isset( self::$encodings[$encoding] ) ) {
+ return self::doConvert( self::$encodings[$encoding], $text );
+ }
+
+ // 2. @charset rule
+ if ( preg_match( '/^@charset "([\x00-\x21\x23-\x7f]{0,1012})";/', $text, $m ) ) {
+ $encoding = trim( strtolower( $m[1] ), "\t\n\f\r " );
+ if ( $encoding === 'utf-16be' || $encoding === 'utf-16le' ) {
+ // It's obviously lying.
+ $encoding = 'utf-8';
+ }
+ if ( isset( self::$encodings[$encoding] ) ) {
+ return self::doConvert( self::$encodings[$encoding], $text );
+ }
+ }
+
+ // 3. Environment encoding
+ $encoding = isset( $encodings['environment'] )
+ ? trim( strtolower( $encodings['environment'] ), "\t\n\f\r " )
+ : null;
+ if ( $encoding !== null && isset( self::$encodings[$encoding] ) ) {
+ return self::doConvert( self::$encodings[$encoding], $text );
+ }
+
+ // 4. Just use UTF-8
+ return self::doConvert( 'UTF-8', $text );
+ }
+
+ /**
+ * Actually perform the conversion
+ * @param string $encoding
+ * @param string $text
+ * @return string
+ */
+ protected static function doConvert( $encoding, $text ) {
+ // Pseudo-encoding that just outputs one replacement character
+ if ( $encoding === 'replacement' ) {
+ return \UtfNormal\Constants::UTF8_REPLACEMENT;
+ }
+
+ // Pseudo-encoding that shifts non-ASCII bytes to the BMP private use area
+ if ( $encoding === 'x-user-defined' ) {
+ return preg_replace_callback( '/[\x80-\xff]/', function ( $m ) {
+ return \UtfNormal\Utils::codepointToUtf8( 0xf700 + ord( $m[0] ) );
+ }, $text );
+ }
+
+ // We prefer mbstring because it has sane handling of invalid input,
+ // where iconv just chokes and returns false. But we need iconv for
+ // some encodings mbstring doesn't support.
+ if ( in_array( $encoding, mb_list_encodings(), true ) ) {
+ $old = mb_substitute_character();
+ mb_substitute_character( \UtfNormal\Constants::UNICODE_REPLACEMENT );
+ $text = mb_convert_encoding( $text, 'UTF-8', $encoding );
+ mb_substitute_character( $old );
+ return $text;
+ }
+
+ $ret = \MediaWiki\quietCall( 'iconv', $encoding, 'UTF-8', $text );
+ if ( $ret === false ) {
+ throw new \RuntimeException( "Cannot convert '$text' from $encoding" );
+ }
+ return $ret;
+ }
+}