]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - vendor/wikimedia/css-sanitizer/src/Parser/Encoder.php
MediaWiki 1.30.2
[autoinstalls/mediawiki.git] / vendor / wikimedia / css-sanitizer / src / Parser / Encoder.php
1 <?php
2 /**
3  * @file
4  * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
5  */
6
7 namespace Wikimedia\CSS\Parser;
8
/**
 * Character set conversion for CSS
 *
 * Picks the encoding of a raw stylesheet byte string (byte order mark,
 * transport encoding, `@charset` rule, environment encoding, then UTF-8)
 * and converts the stylesheet to UTF-8.
 *
 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-byte-stream
 */
class Encoder {

	/**
	 * @var array Mapping from CSS encoding tags to mbstring/iconv encodings
	 *
	 * Keys are lowercase encoding labels; values are the names passed to
	 * doConvert(). The values 'replacement' and 'x-user-defined' are
	 * pseudo-encodings that doConvert() handles itself instead of passing
	 * them to mbstring or iconv.
	 *
	 * @see https://encoding.spec.whatwg.org/#concept-encoding-get
	 */
	protected static $encodings = [
		'unicode-1-1-utf-8'     => 'UTF-8',
		'utf-8'                 => 'UTF-8',
		'utf8'                  => 'UTF-8',
		'866'                   => 'CP866',
		'cp866'                 => 'CP866',
		'csibm866'              => 'CP866',
		'ibm866'                => 'CP866',
		'csisolatin2'           => 'ISO-8859-2',
		'iso-8859-2'            => 'ISO-8859-2',
		'iso-ir-101'            => 'ISO-8859-2',
		'iso8859-2'             => 'ISO-8859-2',
		'iso88592'              => 'ISO-8859-2',
		'iso_8859-2'            => 'ISO-8859-2',
		'iso_8859-2:1987'       => 'ISO-8859-2',
		'l2'                    => 'ISO-8859-2',
		'latin2'                => 'ISO-8859-2',
		'csisolatin3'           => 'ISO-8859-3',
		'iso-8859-3'            => 'ISO-8859-3',
		'iso-ir-109'            => 'ISO-8859-3',
		'iso8859-3'             => 'ISO-8859-3',
		'iso88593'              => 'ISO-8859-3',
		'iso_8859-3'            => 'ISO-8859-3',
		'iso_8859-3:1988'       => 'ISO-8859-3',
		'l3'                    => 'ISO-8859-3',
		'latin3'                => 'ISO-8859-3',
		'csisolatin4'           => 'ISO-8859-4',
		'iso-8859-4'            => 'ISO-8859-4',
		'iso-ir-110'            => 'ISO-8859-4',
		'iso8859-4'             => 'ISO-8859-4',
		'iso88594'              => 'ISO-8859-4',
		'iso_8859-4'            => 'ISO-8859-4',
		'iso_8859-4:1988'       => 'ISO-8859-4',
		'l4'                    => 'ISO-8859-4',
		'latin4'                => 'ISO-8859-4',
		'csisolatincyrillic'    => 'ISO-8859-5',
		'cyrillic'              => 'ISO-8859-5',
		'iso-8859-5'            => 'ISO-8859-5',
		'iso-ir-144'            => 'ISO-8859-5',
		'iso8859-5'             => 'ISO-8859-5',
		'iso88595'              => 'ISO-8859-5',
		'iso_8859-5'            => 'ISO-8859-5',
		'iso_8859-5:1988'       => 'ISO-8859-5',
		'arabic'                => 'ISO-8859-6',
		'asmo-708'              => 'ISO-8859-6',
		'csiso88596e'           => 'ISO-8859-6',
		'csiso88596i'           => 'ISO-8859-6',
		'csisolatinarabic'      => 'ISO-8859-6',
		'ecma-114'              => 'ISO-8859-6',
		'iso-8859-6'            => 'ISO-8859-6',
		'iso-8859-6-e'          => 'ISO-8859-6',
		'iso-8859-6-i'          => 'ISO-8859-6',
		'iso-ir-127'            => 'ISO-8859-6',
		'iso8859-6'             => 'ISO-8859-6',
		'iso88596'              => 'ISO-8859-6',
		'iso_8859-6'            => 'ISO-8859-6',
		'iso_8859-6:1987'       => 'ISO-8859-6',
		'csisolatingreek'       => 'ISO-8859-7',
		'ecma-118'              => 'ISO-8859-7',
		'elot_928'              => 'ISO-8859-7',
		'greek'                 => 'ISO-8859-7',
		'greek8'                => 'ISO-8859-7',
		'iso-8859-7'            => 'ISO-8859-7',
		'iso-ir-126'            => 'ISO-8859-7',
		'iso8859-7'             => 'ISO-8859-7',
		'iso88597'              => 'ISO-8859-7',
		'iso_8859-7'            => 'ISO-8859-7',
		'iso_8859-7:1987'       => 'ISO-8859-7',
		'sun_eu_greek'          => 'ISO-8859-7',
		'csiso88598e'           => 'ISO-8859-8',
		'csisolatinhebrew'      => 'ISO-8859-8',
		'hebrew'                => 'ISO-8859-8',
		'iso-8859-8'            => 'ISO-8859-8',
		'iso-8859-8-e'          => 'ISO-8859-8',
		'iso-ir-138'            => 'ISO-8859-8',
		'iso8859-8'             => 'ISO-8859-8',
		'iso88598'              => 'ISO-8859-8',
		'iso_8859-8'            => 'ISO-8859-8',
		'iso_8859-8:1988'       => 'ISO-8859-8',
		'visual'                => 'ISO-8859-8',
		'csiso88598i'           => 'ISO-8859-8', // ISO-8859-8-I?
		'iso-8859-8-i'          => 'ISO-8859-8', // ISO-8859-8-I?
		'logical'               => 'ISO-8859-8', // ISO-8859-8-I?
		'csisolatin6'           => 'ISO-8859-10',
		'iso-8859-10'           => 'ISO-8859-10',
		'iso-ir-157'            => 'ISO-8859-10',
		'iso8859-10'            => 'ISO-8859-10',
		'iso885910'             => 'ISO-8859-10',
		'l6'                    => 'ISO-8859-10',
		'latin6'                => 'ISO-8859-10',
		'iso-8859-13'           => 'ISO-8859-13',
		'iso8859-13'            => 'ISO-8859-13',
		'iso885913'             => 'ISO-8859-13',
		'iso-8859-14'           => 'ISO-8859-14',
		'iso8859-14'            => 'ISO-8859-14',
		'iso885914'             => 'ISO-8859-14',
		'csisolatin9'           => 'ISO-8859-15',
		'iso-8859-15'           => 'ISO-8859-15',
		'iso8859-15'            => 'ISO-8859-15',
		'iso885915'             => 'ISO-8859-15',
		'iso_8859-15'           => 'ISO-8859-15',
		'l9'                    => 'ISO-8859-15',
		'iso-8859-16'           => 'ISO-8859-16',
		'cskoi8r'               => 'KOI8-R',
		'koi'                   => 'KOI8-R',
		'koi8'                  => 'KOI8-R',
		'koi8-r'                => 'KOI8-R',
		'koi8_r'                => 'KOI8-R',
		'koi8-ru'               => 'KOI8-U',
		'koi8-u'                => 'KOI8-U',
		'csmacintosh'           => 'macintosh',
		'mac'                   => 'macintosh',
		'macintosh'             => 'macintosh',
		'x-mac-roman'           => 'macintosh',
		'dos-874'               => 'Windows-874',
		'iso-8859-11'           => 'Windows-874',
		'iso8859-11'            => 'Windows-874',
		'iso885911'             => 'Windows-874',
		'tis-620'               => 'Windows-874',
		'windows-874'           => 'Windows-874',
		'cp1250'                => 'Windows-1250',
		'windows-1250'          => 'Windows-1250',
		'x-cp1250'              => 'Windows-1250',
		'cp1251'                => 'Windows-1251',
		'windows-1251'          => 'Windows-1251',
		'x-cp1251'              => 'Windows-1251',
		'ansi_x3.4-1968'        => 'Windows-1252',
		'ascii'                 => 'Windows-1252',
		'cp1252'                => 'Windows-1252',
		'cp819'                 => 'Windows-1252',
		'csisolatin1'           => 'Windows-1252',
		'ibm819'                => 'Windows-1252',
		'iso-8859-1'            => 'Windows-1252',
		'iso-ir-100'            => 'Windows-1252',
		'iso8859-1'             => 'Windows-1252',
		'iso88591'              => 'Windows-1252',
		'iso_8859-1'            => 'Windows-1252',
		'iso_8859-1:1987'       => 'Windows-1252',
		'l1'                    => 'Windows-1252',
		'latin1'                => 'Windows-1252',
		'us-ascii'              => 'Windows-1252',
		'windows-1252'          => 'Windows-1252',
		'x-cp1252'              => 'Windows-1252',
		'cp1253'                => 'Windows-1253',
		'windows-1253'          => 'Windows-1253',
		'x-cp1253'              => 'Windows-1253',
		'cp1254'                => 'Windows-1254',
		'csisolatin5'           => 'Windows-1254',
		'iso-8859-9'            => 'Windows-1254',
		'iso-ir-148'            => 'Windows-1254',
		'iso8859-9'             => 'Windows-1254',
		'iso88599'              => 'Windows-1254',
		'iso_8859-9'            => 'Windows-1254',
		'iso_8859-9:1989'       => 'Windows-1254',
		'l5'                    => 'Windows-1254',
		'latin5'                => 'Windows-1254',
		'windows-1254'          => 'Windows-1254',
		'x-cp1254'              => 'Windows-1254',
		'cp1255'                => 'Windows-1255',
		'windows-1255'          => 'Windows-1255',
		'x-cp1255'              => 'Windows-1255',
		'cp1256'                => 'Windows-1256',
		'windows-1256'          => 'Windows-1256',
		'x-cp1256'              => 'Windows-1256',
		'cp1257'                => 'Windows-1257',
		'windows-1257'          => 'Windows-1257',
		'x-cp1257'              => 'Windows-1257',
		'cp1258'                => 'Windows-1258',
		'windows-1258'          => 'Windows-1258',
		'x-cp1258'              => 'Windows-1258',
		'x-mac-cyrillic'        => 'mac-cyrillic',
		'x-mac-ukrainian'       => 'mac-cyrillic',
		'chinese'               => 'GB18030', // GBK
		'csgb2312'              => 'GB18030', // GBK
		'csiso58gb231280'       => 'GB18030', // GBK
		'gb2312'                => 'GB18030', // GBK
		'gb_2312'               => 'GB18030', // GBK
		'gb_2312-80'            => 'GB18030', // GBK
		'gbk'                   => 'GB18030', // GBK
		'iso-ir-58'             => 'GB18030', // GBK
		'x-gbk'                 => 'GB18030', // GBK
		'gb18030'               => 'GB18030',
		'big5'                  => 'BIG-5',
		'big5-hkscs'            => 'BIG-5',
		'cn-big5'               => 'BIG-5',
		'csbig5'                => 'BIG-5',
		'x-x-big5'              => 'BIG-5',
		'cseucpkdfmtjapanese'   => 'EUC-JP',
		'euc-jp'                => 'EUC-JP',
		'x-euc-jp'              => 'EUC-JP',
		'csiso2022jp'           => 'ISO-2022-JP',
		'iso-2022-jp'           => 'ISO-2022-JP',
		'csshiftjis'            => 'SJIS',
		'ms932'                 => 'SJIS',
		'ms_kanji'              => 'SJIS',
		'shift-jis'             => 'SJIS',
		'shift_jis'             => 'SJIS',
		'sjis'                  => 'SJIS',
		'windows-31j'           => 'SJIS',
		'x-sjis'                => 'SJIS',
		'cseuckr'               => 'EUC-KR',
		'csksc56011987'         => 'EUC-KR',
		'euc-kr'                => 'EUC-KR',
		'iso-ir-149'            => 'EUC-KR',
		'korean'                => 'EUC-KR',
		'ks_c_5601-1987'        => 'EUC-KR',
		'ks_c_5601-1989'        => 'EUC-KR',
		'ksc5601'               => 'EUC-KR',
		'ksc_5601'              => 'EUC-KR',
		'windows-949'           => 'EUC-KR',
		'csiso2022kr'           => 'replacement',
		'hz-gb-2312'            => 'replacement',
		'iso-2022-cn'           => 'replacement',
		'iso-2022-cn-ext'       => 'replacement',
		'iso-2022-kr'           => 'replacement',
		'utf-16be'              => 'UTF-16BE',
		'utf-16'                => 'UTF-16LE',
		'utf-16le'              => 'UTF-16LE',
		'x-user-defined'        => 'x-user-defined',
	];

	/**
	 * Convert CSS text to UTF-8
	 *
	 * The encoding is chosen in this order: a byte order mark, the
	 * 'transport' encoding, a leading `@charset "...";` rule, the
	 * 'environment' encoding, and finally UTF-8. A transport or environment
	 * label that is not in self::$encodings is skipped.
	 *
	 * @param string $text Raw CSS bytes to convert
	 * @param string[] $encodings Encodings to use at various points in the algorithm:
	 *  - transport: Encoding from HTTP or the like
	 *  - environment: Encoding from HTML `<link>` or the like
	 * @return string
	 * @throws \RuntimeException If doConvert() cannot perform the conversion
	 */
	public static function convert( $text, $encodings = [] ) {
		// First, check for a BOM and honor that if it's present. The casts
		// guard against substr() returning false for a BOM-only input.
		if ( substr( $text, 0, 3 ) === "\xef\xbb\xbf" ) {
			// UTF-8 with BOM (convert it anyway in case the BOM is a lie)
			return self::doConvert( 'UTF-8', (string)substr( $text, 3 ) );
		}
		$start = substr( $text, 0, 2 );
		if ( $start === "\xfe\xff" ) {
			return self::doConvert( 'UTF-16BE', (string)substr( $text, 2 ) );
		}
		if ( $start === "\xff\xfe" ) {
			return self::doConvert( 'UTF-16LE', (string)substr( $text, 2 ) );
		}

		// 1. Transport encoding
		if ( isset( $encodings['transport'] ) ) {
			$encoding = self::lookupLabel( $encodings['transport'] );
			if ( $encoding !== null ) {
				return self::doConvert( $encoding, $text );
			}
		}

		// 2. @charset rule
		if ( preg_match( '/^@charset "([\x00-\x21\x23-\x7f]{0,1012})";/', $text, $m ) ) {
			$encoding = self::lookupLabel( $m[1] );
			// The rule was just read as ASCII-compatible bytes, so a claim of
			// UTF-16 is obviously a lie. Test the resolved encoding rather than
			// the label, because the label "utf-16" also maps to UTF-16LE.
			if ( $encoding === 'UTF-16BE' || $encoding === 'UTF-16LE' ) {
				$encoding = 'UTF-8';
			}
			if ( $encoding !== null ) {
				return self::doConvert( $encoding, $text );
			}
		}

		// 3. Environment encoding
		if ( isset( $encodings['environment'] ) ) {
			$encoding = self::lookupLabel( $encodings['environment'] );
			if ( $encoding !== null ) {
				return self::doConvert( $encoding, $text );
			}
		}

		// 4. Just use UTF-8
		return self::doConvert( 'UTF-8', $text );
	}

	/**
	 * Look up an encoding label in self::$encodings
	 *
	 * The label is ASCII-lowercased and stripped of leading and trailing
	 * tab, LF, FF, CR and space before the lookup. The case fold is done with
	 * strtr() because strtolower() depends on the current locale before
	 * PHP 8.2.
	 *
	 * @param string $label Encoding label
	 * @return string|null Value from self::$encodings, or null if the label is unknown
	 */
	private static function lookupLabel( $label ) {
		$label = trim(
			strtr( $label, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz' ),
			"\t\n\f\r "
		);
		return isset( self::$encodings[$label] ) ? self::$encodings[$label] : null;
	}

	/**
	 * Actually perform the conversion
	 * @param string $encoding A value from self::$encodings
	 * @param string $text Bytes to convert
	 * @return string
	 * @throws \RuntimeException If the iconv fallback returns false
	 */
	protected static function doConvert( $encoding, $text ) {
		// Pseudo-encoding that just outputs one replacement character
		if ( $encoding === 'replacement' ) {
			return \UtfNormal\Constants::UTF8_REPLACEMENT;
		}

		// Pseudo-encoding that shifts non-ASCII bytes to the BMP private use area
		if ( $encoding === 'x-user-defined' ) {
			return preg_replace_callback( '/[\x80-\xff]/', function ( $m ) {
				return \UtfNormal\Utils::codepointToUtf8( 0xf700 + ord( $m[0] ) );
			}, $text );
		}

		// We prefer mbstring because it has sane handling of invalid input,
		// where iconv just chokes and returns false. But we need iconv for
		// some encodings mbstring doesn't support.
		if ( in_array( $encoding, mb_list_encodings(), true ) ) {
			$old = mb_substitute_character();
			mb_substitute_character( \UtfNormal\Constants::UNICODE_REPLACEMENT );
			try {
				return mb_convert_encoding( $text, 'UTF-8', $encoding );
			} finally {
				// Restore the global substitute character even if the
				// conversion raised (e.g. a warning turned into an exception).
				mb_substitute_character( $old );
			}
		}

		$ret = \MediaWiki\quietCall( 'iconv', $encoding, 'UTF-8', $text );
		if ( $ret === false ) {
			throw new \RuntimeException( "Cannot convert '$text' from $encoding" );
		}
		return $ret;
	}
}