]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blobdiff - vendor/wikimedia/css-sanitizer/src/Parser/Encoder.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / vendor / wikimedia / css-sanitizer / src / Parser / Encoder.php
diff --git a/vendor/wikimedia/css-sanitizer/src/Parser/Encoder.php b/vendor/wikimedia/css-sanitizer/src/Parser/Encoder.php
new file mode 100644 (file)
index 0000000..5691d98
--- /dev/null
@@ -0,0 +1,330 @@
+<?php
+/**
+ * @file
+ * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
+ */
+
+namespace Wikimedia\CSS\Parser;
+
+/**
+ * Character set conversion for CSS
+ * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-byte-stream
+ */
+class Encoder {
+
+       /**
+        * Lookup table from encoding labels to the encoding names given to doConvert().
+        *
+        * Keys are lowercase labels: convert() lowercases and trims a candidate
+        * label before looking it up here, and ignores labels that are not present.
+        * Values are either encoding names handed to mbstring/iconv, or one of the
+        * pseudo-encodings 'replacement' and 'x-user-defined', which doConvert()
+        * handles itself.
+        *
+        * @var array Mapping from CSS encoding tags to mbstring/iconv encodings
+        * @see https://encoding.spec.whatwg.org/#concept-encoding-get
+        */
+       protected static $encodings = [
+               'unicode-1-1-utf-8'     => 'UTF-8',
+               'utf-8'                 => 'UTF-8',
+               'utf8'                  => 'UTF-8',
+               '866'                   => 'CP866',
+               'cp866'                 => 'CP866',
+               'csibm866'              => 'CP866',
+               'ibm866'                => 'CP866',
+               'csisolatin2'           => 'ISO-8859-2',
+               'iso-8859-2'            => 'ISO-8859-2',
+               'iso-ir-101'            => 'ISO-8859-2',
+               'iso8859-2'             => 'ISO-8859-2',
+               'iso88592'              => 'ISO-8859-2',
+               'iso_8859-2'            => 'ISO-8859-2',
+               'iso_8859-2:1987'       => 'ISO-8859-2',
+               'l2'                    => 'ISO-8859-2',
+               'latin2'                => 'ISO-8859-2',
+               'csisolatin3'           => 'ISO-8859-3',
+               'iso-8859-3'            => 'ISO-8859-3',
+               'iso-ir-109'            => 'ISO-8859-3',
+               'iso8859-3'             => 'ISO-8859-3',
+               'iso88593'              => 'ISO-8859-3',
+               'iso_8859-3'            => 'ISO-8859-3',
+               'iso_8859-3:1988'       => 'ISO-8859-3',
+               'l3'                    => 'ISO-8859-3',
+               'latin3'                => 'ISO-8859-3',
+               'csisolatin4'           => 'ISO-8859-4',
+               'iso-8859-4'            => 'ISO-8859-4',
+               'iso-ir-110'            => 'ISO-8859-4',
+               'iso8859-4'             => 'ISO-8859-4',
+               'iso88594'              => 'ISO-8859-4',
+               'iso_8859-4'            => 'ISO-8859-4',
+               'iso_8859-4:1988'       => 'ISO-8859-4',
+               'l4'                    => 'ISO-8859-4',
+               'latin4'                => 'ISO-8859-4',
+               'csisolatincyrillic'    => 'ISO-8859-5',
+               'cyrillic'              => 'ISO-8859-5',
+               'iso-8859-5'            => 'ISO-8859-5',
+               'iso-ir-144'            => 'ISO-8859-5',
+               'iso8859-5'             => 'ISO-8859-5',
+               'iso88595'              => 'ISO-8859-5',
+               'iso_8859-5'            => 'ISO-8859-5',
+               'iso_8859-5:1988'       => 'ISO-8859-5',
+               'arabic'                => 'ISO-8859-6',
+               'asmo-708'              => 'ISO-8859-6',
+               'csiso88596e'           => 'ISO-8859-6',
+               'csiso88596i'           => 'ISO-8859-6',
+               'csisolatinarabic'      => 'ISO-8859-6',
+               'ecma-114'              => 'ISO-8859-6',
+               'iso-8859-6'            => 'ISO-8859-6',
+               'iso-8859-6-e'          => 'ISO-8859-6',
+               'iso-8859-6-i'          => 'ISO-8859-6',
+               'iso-ir-127'            => 'ISO-8859-6',
+               'iso8859-6'             => 'ISO-8859-6',
+               'iso88596'              => 'ISO-8859-6',
+               'iso_8859-6'            => 'ISO-8859-6',
+               'iso_8859-6:1987'       => 'ISO-8859-6',
+               'csisolatingreek'       => 'ISO-8859-7',
+               'ecma-118'              => 'ISO-8859-7',
+               'elot_928'              => 'ISO-8859-7',
+               'greek'                 => 'ISO-8859-7',
+               'greek8'                => 'ISO-8859-7',
+               'iso-8859-7'            => 'ISO-8859-7',
+               'iso-ir-126'            => 'ISO-8859-7',
+               'iso8859-7'             => 'ISO-8859-7',
+               'iso88597'              => 'ISO-8859-7',
+               'iso_8859-7'            => 'ISO-8859-7',
+               'iso_8859-7:1987'       => 'ISO-8859-7',
+               'sun_eu_greek'          => 'ISO-8859-7',
+               'csiso88598e'           => 'ISO-8859-8',
+               'csisolatinhebrew'      => 'ISO-8859-8',
+               'hebrew'                => 'ISO-8859-8',
+               'iso-8859-8'            => 'ISO-8859-8',
+               'iso-8859-8-e'          => 'ISO-8859-8',
+               'iso-ir-138'            => 'ISO-8859-8',
+               'iso8859-8'             => 'ISO-8859-8',
+               'iso88598'              => 'ISO-8859-8',
+               'iso_8859-8'            => 'ISO-8859-8',
+               'iso_8859-8:1988'       => 'ISO-8859-8',
+               'visual'                => 'ISO-8859-8',
+               'csiso88598i'           => 'ISO-8859-8', // ISO-8859-8-I?
+               'iso-8859-8-i'          => 'ISO-8859-8', // ISO-8859-8-I?
+               'logical'               => 'ISO-8859-8', // ISO-8859-8-I?
+               'csisolatin6'           => 'ISO-8859-10',
+               'iso-8859-10'           => 'ISO-8859-10',
+               'iso-ir-157'            => 'ISO-8859-10',
+               'iso8859-10'            => 'ISO-8859-10',
+               'iso885910'             => 'ISO-8859-10',
+               'l6'                    => 'ISO-8859-10',
+               'latin6'                => 'ISO-8859-10',
+               'iso-8859-13'           => 'ISO-8859-13',
+               'iso8859-13'            => 'ISO-8859-13',
+               'iso885913'             => 'ISO-8859-13',
+               'iso-8859-14'           => 'ISO-8859-14',
+               'iso8859-14'            => 'ISO-8859-14',
+               'iso885914'             => 'ISO-8859-14',
+               'csisolatin9'           => 'ISO-8859-15',
+               'iso-8859-15'           => 'ISO-8859-15',
+               'iso8859-15'            => 'ISO-8859-15',
+               'iso885915'             => 'ISO-8859-15',
+               'iso_8859-15'           => 'ISO-8859-15',
+               'l9'                    => 'ISO-8859-15',
+               'iso-8859-16'           => 'ISO-8859-16',
+               'cskoi8r'               => 'KOI8-R',
+               'koi'                   => 'KOI8-R',
+               'koi8'                  => 'KOI8-R',
+               'koi8-r'                => 'KOI8-R',
+               'koi8_r'                => 'KOI8-R',
+               'koi8-ru'               => 'KOI8-U',
+               'koi8-u'                => 'KOI8-U',
+               'csmacintosh'           => 'macintosh',
+               'mac'                   => 'macintosh',
+               'macintosh'             => 'macintosh',
+               'x-mac-roman'           => 'macintosh',
+               // Note: the ISO-8859-11 and TIS-620 labels map to Windows-874 here.
+               'dos-874'               => 'Windows-874',
+               'iso-8859-11'           => 'Windows-874',
+               'iso8859-11'            => 'Windows-874',
+               'iso885911'             => 'Windows-874',
+               'tis-620'               => 'Windows-874',
+               'windows-874'           => 'Windows-874',
+               'cp1250'                => 'Windows-1250',
+               'windows-1250'          => 'Windows-1250',
+               'x-cp1250'              => 'Windows-1250',
+               'cp1251'                => 'Windows-1251',
+               'windows-1251'          => 'Windows-1251',
+               'x-cp1251'              => 'Windows-1251',
+               // Note: the ASCII and ISO-8859-1 (latin1) labels map to Windows-1252 here.
+               'ansi_x3.4-1968'        => 'Windows-1252',
+               'ascii'                 => 'Windows-1252',
+               'cp1252'                => 'Windows-1252',
+               'cp819'                 => 'Windows-1252',
+               'csisolatin1'           => 'Windows-1252',
+               'ibm819'                => 'Windows-1252',
+               'iso-8859-1'            => 'Windows-1252',
+               'iso-ir-100'            => 'Windows-1252',
+               'iso8859-1'             => 'Windows-1252',
+               'iso88591'              => 'Windows-1252',
+               'iso_8859-1'            => 'Windows-1252',
+               'iso_8859-1:1987'       => 'Windows-1252',
+               'l1'                    => 'Windows-1252',
+               'latin1'                => 'Windows-1252',
+               'us-ascii'              => 'Windows-1252',
+               'windows-1252'          => 'Windows-1252',
+               'x-cp1252'              => 'Windows-1252',
+               'cp1253'                => 'Windows-1253',
+               'windows-1253'          => 'Windows-1253',
+               'x-cp1253'              => 'Windows-1253',
+               // Note: the ISO-8859-9 (latin5) labels map to Windows-1254 here.
+               'cp1254'                => 'Windows-1254',
+               'csisolatin5'           => 'Windows-1254',
+               'iso-8859-9'            => 'Windows-1254',
+               'iso-ir-148'            => 'Windows-1254',
+               'iso8859-9'             => 'Windows-1254',
+               'iso88599'              => 'Windows-1254',
+               'iso_8859-9'            => 'Windows-1254',
+               'iso_8859-9:1989'       => 'Windows-1254',
+               'l5'                    => 'Windows-1254',
+               'latin5'                => 'Windows-1254',
+               'windows-1254'          => 'Windows-1254',
+               'x-cp1254'              => 'Windows-1254',
+               'cp1255'                => 'Windows-1255',
+               'windows-1255'          => 'Windows-1255',
+               'x-cp1255'              => 'Windows-1255',
+               'cp1256'                => 'Windows-1256',
+               'windows-1256'          => 'Windows-1256',
+               'x-cp1256'              => 'Windows-1256',
+               'cp1257'                => 'Windows-1257',
+               'windows-1257'          => 'Windows-1257',
+               'x-cp1257'              => 'Windows-1257',
+               'cp1258'                => 'Windows-1258',
+               'windows-1258'          => 'Windows-1258',
+               'x-cp1258'              => 'Windows-1258',
+               'x-mac-cyrillic'        => 'mac-cyrillic',
+               'x-mac-ukrainian'       => 'mac-cyrillic',
+               'chinese'               => 'GB18030', // GBK
+               'csgb2312'              => 'GB18030', // GBK
+               'csiso58gb231280'       => 'GB18030', // GBK
+               'gb2312'                => 'GB18030', // GBK
+               'gb_2312'               => 'GB18030', // GBK
+               'gb_2312-80'            => 'GB18030', // GBK
+               'gbk'                   => 'GB18030', // GBK
+               'iso-ir-58'             => 'GB18030', // GBK
+               'x-gbk'                 => 'GB18030', // GBK
+               'gb18030'               => 'GB18030',
+               'big5'                  => 'BIG-5',
+               'big5-hkscs'            => 'BIG-5',
+               'cn-big5'               => 'BIG-5',
+               'csbig5'                => 'BIG-5',
+               'x-x-big5'              => 'BIG-5',
+               'cseucpkdfmtjapanese'   => 'EUC-JP',
+               'euc-jp'                => 'EUC-JP',
+               'x-euc-jp'              => 'EUC-JP',
+               'csiso2022jp'           => 'ISO-2022-JP',
+               'iso-2022-jp'           => 'ISO-2022-JP',
+               'csshiftjis'            => 'SJIS',
+               'ms932'                 => 'SJIS',
+               'ms_kanji'              => 'SJIS',
+               'shift-jis'             => 'SJIS',
+               'shift_jis'             => 'SJIS',
+               'sjis'                  => 'SJIS',
+               'windows-31j'           => 'SJIS',
+               'x-sjis'                => 'SJIS',
+               'cseuckr'               => 'EUC-KR',
+               'csksc56011987'         => 'EUC-KR',
+               'euc-kr'                => 'EUC-KR',
+               'iso-ir-149'            => 'EUC-KR',
+               'korean'                => 'EUC-KR',
+               'ks_c_5601-1987'        => 'EUC-KR',
+               'ks_c_5601-1989'        => 'EUC-KR',
+               'ksc5601'               => 'EUC-KR',
+               'ksc_5601'              => 'EUC-KR',
+               'windows-949'           => 'EUC-KR',
+               // Pseudo-encoding: doConvert() does not decode these, it returns a
+               // single replacement character instead.
+               'csiso2022kr'           => 'replacement',
+               'hz-gb-2312'            => 'replacement',
+               'iso-2022-cn'           => 'replacement',
+               'iso-2022-cn-ext'       => 'replacement',
+               'iso-2022-kr'           => 'replacement',
+               // A bare 'utf-16' label is treated as little-endian.
+               'utf-16be'              => 'UTF-16BE',
+               'utf-16'                => 'UTF-16LE',
+               'utf-16le'              => 'UTF-16LE',
+               // Pseudo-encoding: doConvert() maps bytes 0x80-0xff to 0xf700 + byte.
+               'x-user-defined'        => 'x-user-defined',
+       ];
+
+       /**
+        * Convert CSS text to UTF-8
+        *
+        * The source encoding is the first of the following that applies:
+        *  0. A UTF-8 or UTF-16 byte order mark at the very start of $text
+        *     (the mark itself is stripped from the result).
+        *  1. The 'transport' encoding, if it is a recognized label.
+        *  2. A leading `@charset "...";` rule naming a recognized label. The rule
+        *     is matched as ASCII-compatible bytes, so a label that resolves to
+        *     UTF-16 (`utf-16be`, `utf-16le`, or the bare `utf-16`) cannot be
+        *     truthful and is treated as UTF-8 instead.
+        *  3. The 'environment' encoding, if it is a recognized label.
+        *  4. UTF-8.
+        * Labels are matched case-insensitively after trimming ASCII whitespace;
+        * unrecognized labels are skipped and the next step is tried.
+        *
+        * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-byte-stream
+        * @param string $text Text to convert
+        * @param string[] $encodings Encodings to use at various points in the algorithm:
+        *  - transport: Encoding from HTTP or the like
+        *  - environment: Encoding from HTML `<link>` or the like
+        * @return string
+        * @throws \RuntimeException If doConvert() is unable to convert the text
+        */
+       public static function convert( $text, $encodings = [] ) {
+               // First, check for a BOM and honor that if it's present.
+               if ( substr( $text, 0, 3 ) === "\xef\xbb\xbf" ) {
+                       // UTF-8 with BOM (convert it anyway in case the BOM is a lie)
+                       return self::doConvert( 'UTF-8', substr( $text, 3 ) );
+               }
+               $start = substr( $text, 0, 2 );
+               if ( $start === "\xfe\xff" ) {
+                       return self::doConvert( 'UTF-16BE', substr( $text, 2 ) );
+               }
+               if ( $start === "\xff\xfe" ) {
+                       return self::doConvert( 'UTF-16LE', substr( $text, 2 ) );
+               }
+
+               // 1. Transport encoding
+               $encoding = isset( $encodings['transport'] )
+                       ? trim( strtolower( $encodings['transport'] ), "\t\n\f\r " )
+                       : null;
+               if ( $encoding !== null && isset( self::$encodings[$encoding] ) ) {
+                       return self::doConvert( self::$encodings[$encoding], $text );
+               }
+
+               // 2. @charset rule
+               // The label may be at most 1012 bytes so that the whole rule, including
+               // the 10-byte `@charset "` prefix and 2-byte `";` suffix, fits in the
+               // first 1024 bytes of the stylesheet.
+               if ( preg_match( '/^@charset "([\x00-\x21\x23-\x7f]{0,1012})";/', $text, $m ) ) {
+                       $label = trim( strtolower( $m[1] ), "\t\n\f\r " );
+                       if ( isset( self::$encodings[$label] ) ) {
+                               $encoding = self::$encodings[$label];
+                               // Check the *resolved* encoding rather than the label, so
+                               // that aliases such as the bare 'utf-16' are caught too.
+                               if ( $encoding === 'UTF-16BE' || $encoding === 'UTF-16LE' ) {
+                                       // It's obviously lying.
+                                       $encoding = 'UTF-8';
+                               }
+                               return self::doConvert( $encoding, $text );
+                       }
+               }
+
+               // 3. Environment encoding
+               $encoding = isset( $encodings['environment'] )
+                       ? trim( strtolower( $encodings['environment'] ), "\t\n\f\r " )
+                       : null;
+               if ( $encoding !== null && isset( self::$encodings[$encoding] ) ) {
+                       return self::doConvert( self::$encodings[$encoding], $text );
+               }
+
+               // 4. Just use UTF-8
+               return self::doConvert( 'UTF-8', $text );
+       }
+
+       /**
+        * Convert text from a single, already-resolved encoding to UTF-8
+        *
+        * Handles the 'replacement' and 'x-user-defined' pseudo-encodings itself,
+        * uses mbstring for every encoding mbstring lists, and falls back to iconv
+        * for the rest.
+        *
+        * @param string $encoding Encoding name, or one of the pseudo-encodings
+        * @param string $text Text to convert
+        * @return string
+        * @throws \RuntimeException If the iconv fallback returns false
+        */
+       protected static function doConvert( $encoding, $text ) {
+               if ( $encoding === 'replacement' ) {
+                       // Pseudo-encoding: the input is discarded and a lone replacement
+                       // character is produced in its place.
+                       return \UtfNormal\Constants::UTF8_REPLACEMENT;
+               }
+
+               if ( $encoding === 'x-user-defined' ) {
+                       // Pseudo-encoding: every non-ASCII byte is moved into the BMP
+                       // private use area, ASCII bytes pass through untouched.
+                       $toPrivateUse = function ( $match ) {
+                               $codepoint = 0xf700 + ord( $match[0] );
+                               return \UtfNormal\Utils::codepointToUtf8( $codepoint );
+                       };
+                       return preg_replace_callback( '/[\x80-\xff]/', $toPrivateUse, $text );
+               }
+
+               // mbstring is the preferred converter since it copes with invalid input
+               // by substituting a character, whereas iconv gives up and returns false.
+               // iconv is still required for the encodings mbstring does not know.
+               $mbSupported = in_array( $encoding, mb_list_encodings(), true );
+               if ( !$mbSupported ) {
+                       $converted = \MediaWiki\quietCall( 'iconv', $encoding, 'UTF-8', $text );
+                       if ( $converted === false ) {
+                               throw new \RuntimeException( "Cannot convert '$text' from $encoding" );
+                       }
+                       return $converted;
+               }
+
+               // Temporarily swap the substitute character, then restore the caller's
+               // setting once the conversion is done.
+               $previousSubstitute = mb_substitute_character();
+               mb_substitute_character( \UtfNormal\Constants::UNICODE_REPLACEMENT );
+               $converted = mb_convert_encoding( $text, 'UTF-8', $encoding );
+               mb_substitute_character( $previousSubstitute );
+               return $converted;
+       }
+}