X-Git-Url: https://scripts.mit.edu/gitweb/autoinstallsdev/mediawiki.git/blobdiff_plain/19e297c21b10b1b8a3acad5e73fc71dcb35db44a..6932310fd58ebef145fa01eb76edf7150284d8ea:/vendor/wikimedia/css-sanitizer/src/Parser/Encoder.php diff --git a/vendor/wikimedia/css-sanitizer/src/Parser/Encoder.php b/vendor/wikimedia/css-sanitizer/src/Parser/Encoder.php new file mode 100644 index 00000000..5691d987 --- /dev/null +++ b/vendor/wikimedia/css-sanitizer/src/Parser/Encoder.php @@ -0,0 +1,330 @@ + 'UTF-8', + 'utf-8' => 'UTF-8', + 'utf8' => 'UTF-8', + '866' => 'CP866', + 'cp866' => 'CP866', + 'csibm866' => 'CP866', + 'ibm866' => 'CP866', + 'csisolatin2' => 'ISO-8859-2', + 'iso-8859-2' => 'ISO-8859-2', + 'iso-ir-101' => 'ISO-8859-2', + 'iso8859-2' => 'ISO-8859-2', + 'iso88592' => 'ISO-8859-2', + 'iso_8859-2' => 'ISO-8859-2', + 'iso_8859-2:1987' => 'ISO-8859-2', + 'l2' => 'ISO-8859-2', + 'latin2' => 'ISO-8859-2', + 'csisolatin3' => 'ISO-8859-3', + 'iso-8859-3' => 'ISO-8859-3', + 'iso-ir-109' => 'ISO-8859-3', + 'iso8859-3' => 'ISO-8859-3', + 'iso88593' => 'ISO-8859-3', + 'iso_8859-3' => 'ISO-8859-3', + 'iso_8859-3:1988' => 'ISO-8859-3', + 'l3' => 'ISO-8859-3', + 'latin3' => 'ISO-8859-3', + 'csisolatin4' => 'ISO-8859-4', + 'iso-8859-4' => 'ISO-8859-4', + 'iso-ir-110' => 'ISO-8859-4', + 'iso8859-4' => 'ISO-8859-4', + 'iso88594' => 'ISO-8859-4', + 'iso_8859-4' => 'ISO-8859-4', + 'iso_8859-4:1988' => 'ISO-8859-4', + 'l4' => 'ISO-8859-4', + 'latin4' => 'ISO-8859-4', + 'csisolatincyrillic' => 'ISO-8859-5', + 'cyrillic' => 'ISO-8859-5', + 'iso-8859-5' => 'ISO-8859-5', + 'iso-ir-144' => 'ISO-8859-5', + 'iso8859-5' => 'ISO-8859-5', + 'iso88595' => 'ISO-8859-5', + 'iso_8859-5' => 'ISO-8859-5', + 'iso_8859-5:1988' => 'ISO-8859-5', + 'arabic' => 'ISO-8859-6', + 'asmo-708' => 'ISO-8859-6', + 'csiso88596e' => 'ISO-8859-6', + 'csiso88596i' => 'ISO-8859-6', + 'csisolatinarabic' => 'ISO-8859-6', + 'ecma-114' => 'ISO-8859-6', + 'iso-8859-6' => 'ISO-8859-6', + 'iso-8859-6-e' => 'ISO-8859-6', + 
'iso-8859-6-i' => 'ISO-8859-6', + 'iso-ir-127' => 'ISO-8859-6', + 'iso8859-6' => 'ISO-8859-6', + 'iso88596' => 'ISO-8859-6', + 'iso_8859-6' => 'ISO-8859-6', + 'iso_8859-6:1987' => 'ISO-8859-6', + 'csisolatingreek' => 'ISO-8859-7', + 'ecma-118' => 'ISO-8859-7', + 'elot_928' => 'ISO-8859-7', + 'greek' => 'ISO-8859-7', + 'greek8' => 'ISO-8859-7', + 'iso-8859-7' => 'ISO-8859-7', + 'iso-ir-126' => 'ISO-8859-7', + 'iso8859-7' => 'ISO-8859-7', + 'iso88597' => 'ISO-8859-7', + 'iso_8859-7' => 'ISO-8859-7', + 'iso_8859-7:1987' => 'ISO-8859-7', + 'sun_eu_greek' => 'ISO-8859-7', + 'csiso88598e' => 'ISO-8859-8', + 'csisolatinhebrew' => 'ISO-8859-8', + 'hebrew' => 'ISO-8859-8', + 'iso-8859-8' => 'ISO-8859-8', + 'iso-8859-8-e' => 'ISO-8859-8', + 'iso-ir-138' => 'ISO-8859-8', + 'iso8859-8' => 'ISO-8859-8', + 'iso88598' => 'ISO-8859-8', + 'iso_8859-8' => 'ISO-8859-8', + 'iso_8859-8:1988' => 'ISO-8859-8', + 'visual' => 'ISO-8859-8', + 'csiso88598i' => 'ISO-8859-8', // ISO-8859-8-I? + 'iso-8859-8-i' => 'ISO-8859-8', // ISO-8859-8-I? + 'logical' => 'ISO-8859-8', // ISO-8859-8-I? 
+ 'csisolatin6' => 'ISO-8859-10', + 'iso-8859-10' => 'ISO-8859-10', + 'iso-ir-157' => 'ISO-8859-10', + 'iso8859-10' => 'ISO-8859-10', + 'iso885910' => 'ISO-8859-10', + 'l6' => 'ISO-8859-10', + 'latin6' => 'ISO-8859-10', + 'iso-8859-13' => 'ISO-8859-13', + 'iso8859-13' => 'ISO-8859-13', + 'iso885913' => 'ISO-8859-13', + 'iso-8859-14' => 'ISO-8859-14', + 'iso8859-14' => 'ISO-8859-14', + 'iso885914' => 'ISO-8859-14', + 'csisolatin9' => 'ISO-8859-15', + 'iso-8859-15' => 'ISO-8859-15', + 'iso8859-15' => 'ISO-8859-15', + 'iso885915' => 'ISO-8859-15', + 'iso_8859-15' => 'ISO-8859-15', + 'l9' => 'ISO-8859-15', + 'iso-8859-16' => 'ISO-8859-16', + 'cskoi8r' => 'KOI8-R', + 'koi' => 'KOI8-R', + 'koi8' => 'KOI8-R', + 'koi8-r' => 'KOI8-R', + 'koi8_r' => 'KOI8-R', + 'koi8-ru' => 'KOI8-U', + 'koi8-u' => 'KOI8-U', + 'csmacintosh' => 'macintosh', + 'mac' => 'macintosh', + 'macintosh' => 'macintosh', + 'x-mac-roman' => 'macintosh', + 'dos-874' => 'Windows-874', + 'iso-8859-11' => 'Windows-874', + 'iso8859-11' => 'Windows-874', + 'iso885911' => 'Windows-874', + 'tis-620' => 'Windows-874', + 'windows-874' => 'Windows-874', + 'cp1250' => 'Windows-1250', + 'windows-1250' => 'Windows-1250', + 'x-cp1250' => 'Windows-1250', + 'cp1251' => 'Windows-1251', + 'windows-1251' => 'Windows-1251', + 'x-cp1251' => 'Windows-1251', + 'ansi_x3.4-1968' => 'Windows-1252', + 'ascii' => 'Windows-1252', + 'cp1252' => 'Windows-1252', + 'cp819' => 'Windows-1252', + 'csisolatin1' => 'Windows-1252', + 'ibm819' => 'Windows-1252', + 'iso-8859-1' => 'Windows-1252', + 'iso-ir-100' => 'Windows-1252', + 'iso8859-1' => 'Windows-1252', + 'iso88591' => 'Windows-1252', + 'iso_8859-1' => 'Windows-1252', + 'iso_8859-1:1987' => 'Windows-1252', + 'l1' => 'Windows-1252', + 'latin1' => 'Windows-1252', + 'us-ascii' => 'Windows-1252', + 'windows-1252' => 'Windows-1252', + 'x-cp1252' => 'Windows-1252', + 'cp1253' => 'Windows-1253', + 'windows-1253' => 'Windows-1253', + 'x-cp1253' => 'Windows-1253', + 'cp1254' => 'Windows-1254', + 
'csisolatin5' => 'Windows-1254', + 'iso-8859-9' => 'Windows-1254', + 'iso-ir-148' => 'Windows-1254', + 'iso8859-9' => 'Windows-1254', + 'iso88599' => 'Windows-1254', + 'iso_8859-9' => 'Windows-1254', + 'iso_8859-9:1989' => 'Windows-1254', + 'l5' => 'Windows-1254', + 'latin5' => 'Windows-1254', + 'windows-1254' => 'Windows-1254', + 'x-cp1254' => 'Windows-1254', + 'cp1255' => 'Windows-1255', + 'windows-1255' => 'Windows-1255', + 'x-cp1255' => 'Windows-1255', + 'cp1256' => 'Windows-1256', + 'windows-1256' => 'Windows-1256', + 'x-cp1256' => 'Windows-1256', + 'cp1257' => 'Windows-1257', + 'windows-1257' => 'Windows-1257', + 'x-cp1257' => 'Windows-1257', + 'cp1258' => 'Windows-1258', + 'windows-1258' => 'Windows-1258', + 'x-cp1258' => 'Windows-1258', + 'x-mac-cyrillic' => 'mac-cyrillic', + 'x-mac-ukrainian' => 'mac-cyrillic', + 'chinese' => 'GB18030', // GBK + 'csgb2312' => 'GB18030', // GBK + 'csiso58gb231280' => 'GB18030', // GBK + 'gb2312' => 'GB18030', // GBK + 'gb_2312' => 'GB18030', // GBK + 'gb_2312-80' => 'GB18030', // GBK + 'gbk' => 'GB18030', // GBK + 'iso-ir-58' => 'GB18030', // GBK + 'x-gbk' => 'GB18030', // GBK + 'gb18030' => 'GB18030', + 'big5' => 'BIG-5', + 'big5-hkscs' => 'BIG-5', + 'cn-big5' => 'BIG-5', + 'csbig5' => 'BIG-5', + 'x-x-big5' => 'BIG-5', + 'cseucpkdfmtjapanese' => 'EUC-JP', + 'euc-jp' => 'EUC-JP', + 'x-euc-jp' => 'EUC-JP', + 'csiso2022jp' => 'ISO-2022-JP', + 'iso-2022-jp' => 'ISO-2022-JP', + 'csshiftjis' => 'SJIS', + 'ms932' => 'SJIS', + 'ms_kanji' => 'SJIS', + 'shift-jis' => 'SJIS', + 'shift_jis' => 'SJIS', + 'sjis' => 'SJIS', + 'windows-31j' => 'SJIS', + 'x-sjis' => 'SJIS', + 'cseuckr' => 'EUC-KR', + 'csksc56011987' => 'EUC-KR', + 'euc-kr' => 'EUC-KR', + 'iso-ir-149' => 'EUC-KR', + 'korean' => 'EUC-KR', + 'ks_c_5601-1987' => 'EUC-KR', + 'ks_c_5601-1989' => 'EUC-KR', + 'ksc5601' => 'EUC-KR', + 'ksc_5601' => 'EUC-KR', + 'windows-949' => 'EUC-KR', + 'csiso2022kr' => 'replacement', + 'hz-gb-2312' => 'replacement', + 'iso-2022-cn' => 
'replacement',
		'iso-2022-cn-ext' => 'replacement',
		'iso-2022-kr' => 'replacement',
		'utf-16be' => 'UTF-16BE',
		'utf-16' => 'UTF-16LE',
		'utf-16le' => 'UTF-16LE',
		'x-user-defined' => 'x-user-defined',
	];

	/**
	 * Resolve an encoding label to the canonical name stored in self::$encodings
	 *
	 * The label is lowercased and stripped of ASCII whitespace before lookup.
	 *
	 * @param string|null $label Label supplied by the caller or found in the stylesheet
	 * @return string|null Canonical encoding name, or null when the label is
	 *  null or not present in the table
	 */
	private static function lookupEncoding( $label ) {
		if ( $label === null ) {
			return null;
		}
		$label = trim( strtolower( $label ), "\t\n\f\r " );
		return isset( self::$encodings[$label] ) ? self::$encodings[$label] : null;
	}

	/**
	 * Convert CSS text to UTF-8
	 *
	 * The source encoding is chosen from the first of these that applies:
	 * a byte order mark, the transport encoding, a leading `@charset` rule,
	 * the environment encoding, and finally UTF-8.
	 *
	 * @param string $text Text being detected
	 * @param string[] $encodings Encodings to use at various points in the algorithm:
	 *  - transport: Encoding from HTTP or the like
	 *  - environment: Encoding from the referencing document (e.g. a `charset`
	 *    attribute in HTML) or the like
	 *  Labels that are not present in the encoding table are ignored.
	 * @return string
	 */
	public static function convert( $text, $encodings = [] ) {
		// First, check for a BOM and honor that if it's present. The casts guard
		// against substr() returning false for BOM-only input on older PHP.
		if ( substr( $text, 0, 3 ) === "\xef\xbb\xbf" ) {
			// UTF-8 with BOM (convert it anyway in case the BOM is a lie)
			return self::doConvert( 'UTF-8', (string)substr( $text, 3 ) );
		}
		$start = substr( $text, 0, 2 );
		if ( $start === "\xfe\xff" ) {
			return self::doConvert( 'UTF-16BE', (string)substr( $text, 2 ) );
		}
		if ( $start === "\xff\xfe" ) {
			return self::doConvert( 'UTF-16LE', (string)substr( $text, 2 ) );
		}

		// 1. Transport encoding
		$encoding = self::lookupEncoding(
			isset( $encodings['transport'] ) ? $encodings['transport'] : null
		);
		if ( $encoding !== null ) {
			return self::doConvert( $encoding, $text );
		}

		// 2. @charset rule
		if ( preg_match( '/^@charset "([\x00-\x21\x23-\x7f]{0,1012})";/', $text, $m ) ) {
			$encoding = self::lookupEncoding( $m[1] );
			if ( $encoding === 'UTF-16BE' || $encoding === 'UTF-16LE' ) {
				// It's obviously lying: the rule was just matched as ASCII bytes.
				// Test the resolved encoding rather than the raw label so that
				// aliases such as "utf-16" are caught too.
				$encoding = 'UTF-8';
			}
			if ( $encoding !== null ) {
				return self::doConvert( $encoding, $text );
			}
		}

		// 3. Environment encoding
		$encoding = self::lookupEncoding(
			isset( $encodings['environment'] ) ? $encodings['environment'] : null
		);
		if ( $encoding !== null ) {
			return self::doConvert( $encoding, $text );
		}

		// 4. Just use UTF-8
		return self::doConvert( 'UTF-8', $text );
	}

	/**
	 * Actually perform the conversion
	 *
	 * @param string $encoding Canonical encoding name (a value from self::$encodings)
	 * @param string $text Bytes to convert
	 * @return string UTF-8 text
	 * @throws \RuntimeException If the iconv fallback fails to convert the text
	 */
	protected static function doConvert( $encoding, $text ) {
		// Pseudo-encoding that just outputs one replacement character. Empty
		// input produces empty output, since the replacement decoder only
		// reports its single error once it has seen at least one byte.
		if ( $encoding === 'replacement' ) {
			return (string)$text === '' ? '' : \UtfNormal\Constants::UTF8_REPLACEMENT;
		}

		// Pseudo-encoding that shifts non-ASCII bytes to the BMP private use area
		if ( $encoding === 'x-user-defined' ) {
			return preg_replace_callback( '/[\x80-\xff]/', function ( $m ) {
				return \UtfNormal\Utils::codepointToUtf8( 0xf700 + ord( $m[0] ) );
			}, $text );
		}

		// We prefer mbstring because it has sane handling of invalid input,
		// where iconv just chokes and returns false. But we need iconv for
		// some encodings mbstring doesn't support.
		if ( in_array( $encoding, mb_list_encodings(), true ) ) {
			// The substitute character is global mbstring state: swap in the
			// replacement character for this call only, then put the old one back.
			$old = mb_substitute_character();
			mb_substitute_character( \UtfNormal\Constants::UNICODE_REPLACEMENT );
			$text = mb_convert_encoding( $text, 'UTF-8', $encoding );
			mb_substitute_character( $old );
			return $text;
		}

		$ret = \MediaWiki\quietCall( 'iconv', $encoding, 'UTF-8', $text );
		if ( $ret === false ) {
			throw new \RuntimeException( "Cannot convert '$text' from $encoding" );
		}
		return $ret;
	}
}