4 * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
7 namespace Wikimedia\CSS\Parser;
10 * Character set conversion for CSS
11 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-byte-stream
16 * @var array Mapping from CSS encoding tags to mbstring/iconv encodings
17 * @see https://encoding.spec.whatwg.org/#concept-encoding-get
19 protected static $encodings = [
// Keys are lowercase encoding labels; convert() lowercases and trims a label
// before looking it up here. Values are the names passed on to doConvert().
20 'unicode-1-1-utf-8' => 'UTF-8',
25 'csibm866' => 'CP866',
27 'csisolatin2' => 'ISO-8859-2',
28 'iso-8859-2' => 'ISO-8859-2',
29 'iso-ir-101' => 'ISO-8859-2',
30 'iso8859-2' => 'ISO-8859-2',
31 'iso88592' => 'ISO-8859-2',
32 'iso_8859-2' => 'ISO-8859-2',
33 'iso_8859-2:1987' => 'ISO-8859-2',
35 'latin2' => 'ISO-8859-2',
36 'csisolatin3' => 'ISO-8859-3',
37 'iso-8859-3' => 'ISO-8859-3',
38 'iso-ir-109' => 'ISO-8859-3',
39 'iso8859-3' => 'ISO-8859-3',
40 'iso88593' => 'ISO-8859-3',
41 'iso_8859-3' => 'ISO-8859-3',
42 'iso_8859-3:1988' => 'ISO-8859-3',
44 'latin3' => 'ISO-8859-3',
45 'csisolatin4' => 'ISO-8859-4',
46 'iso-8859-4' => 'ISO-8859-4',
47 'iso-ir-110' => 'ISO-8859-4',
48 'iso8859-4' => 'ISO-8859-4',
49 'iso88594' => 'ISO-8859-4',
50 'iso_8859-4' => 'ISO-8859-4',
51 'iso_8859-4:1988' => 'ISO-8859-4',
53 'latin4' => 'ISO-8859-4',
54 'csisolatincyrillic' => 'ISO-8859-5',
55 'cyrillic' => 'ISO-8859-5',
56 'iso-8859-5' => 'ISO-8859-5',
57 'iso-ir-144' => 'ISO-8859-5',
58 'iso8859-5' => 'ISO-8859-5',
59 'iso88595' => 'ISO-8859-5',
60 'iso_8859-5' => 'ISO-8859-5',
61 'iso_8859-5:1988' => 'ISO-8859-5',
62 'arabic' => 'ISO-8859-6',
63 'asmo-708' => 'ISO-8859-6',
64 'csiso88596e' => 'ISO-8859-6',
65 'csiso88596i' => 'ISO-8859-6',
66 'csisolatinarabic' => 'ISO-8859-6',
67 'ecma-114' => 'ISO-8859-6',
68 'iso-8859-6' => 'ISO-8859-6',
69 'iso-8859-6-e' => 'ISO-8859-6',
70 'iso-8859-6-i' => 'ISO-8859-6',
71 'iso-ir-127' => 'ISO-8859-6',
72 'iso8859-6' => 'ISO-8859-6',
73 'iso88596' => 'ISO-8859-6',
74 'iso_8859-6' => 'ISO-8859-6',
75 'iso_8859-6:1987' => 'ISO-8859-6',
76 'csisolatingreek' => 'ISO-8859-7',
77 'ecma-118' => 'ISO-8859-7',
78 'elot_928' => 'ISO-8859-7',
79 'greek' => 'ISO-8859-7',
80 'greek8' => 'ISO-8859-7',
81 'iso-8859-7' => 'ISO-8859-7',
82 'iso-ir-126' => 'ISO-8859-7',
83 'iso8859-7' => 'ISO-8859-7',
84 'iso88597' => 'ISO-8859-7',
85 'iso_8859-7' => 'ISO-8859-7',
86 'iso_8859-7:1987' => 'ISO-8859-7',
87 'sun_eu_greek' => 'ISO-8859-7',
88 'csiso88598e' => 'ISO-8859-8',
89 'csisolatinhebrew' => 'ISO-8859-8',
90 'hebrew' => 'ISO-8859-8',
91 'iso-8859-8' => 'ISO-8859-8',
92 'iso-8859-8-e' => 'ISO-8859-8',
93 'iso-ir-138' => 'ISO-8859-8',
94 'iso8859-8' => 'ISO-8859-8',
95 'iso88598' => 'ISO-8859-8',
96 'iso_8859-8' => 'ISO-8859-8',
97 'iso_8859-8:1988' => 'ISO-8859-8',
98 'visual' => 'ISO-8859-8',
// The next three labels are mapped to plain ISO-8859-8; the trailing "?" notes
// record an open question about whether ISO-8859-8-I should be used instead.
99 'csiso88598i' => 'ISO-8859-8', // ISO-8859-8-I?
100 'iso-8859-8-i' => 'ISO-8859-8', // ISO-8859-8-I?
101 'logical' => 'ISO-8859-8', // ISO-8859-8-I?
102 'csisolatin6' => 'ISO-8859-10',
103 'iso-8859-10' => 'ISO-8859-10',
104 'iso-ir-157' => 'ISO-8859-10',
105 'iso8859-10' => 'ISO-8859-10',
106 'iso885910' => 'ISO-8859-10',
107 'l6' => 'ISO-8859-10',
108 'latin6' => 'ISO-8859-10',
109 'iso-8859-13' => 'ISO-8859-13',
110 'iso8859-13' => 'ISO-8859-13',
111 'iso885913' => 'ISO-8859-13',
112 'iso-8859-14' => 'ISO-8859-14',
113 'iso8859-14' => 'ISO-8859-14',
114 'iso885914' => 'ISO-8859-14',
115 'csisolatin9' => 'ISO-8859-15',
116 'iso-8859-15' => 'ISO-8859-15',
117 'iso8859-15' => 'ISO-8859-15',
118 'iso885915' => 'ISO-8859-15',
119 'iso_8859-15' => 'ISO-8859-15',
120 'l9' => 'ISO-8859-15',
121 'iso-8859-16' => 'ISO-8859-16',
122 'cskoi8r' => 'KOI8-R',
125 'koi8-r' => 'KOI8-R',
126 'koi8_r' => 'KOI8-R',
// Note: 'koi8-ru' resolves to KOI8-U, not KOI8-R.
127 'koi8-ru' => 'KOI8-U',
128 'koi8-u' => 'KOI8-U',
129 'csmacintosh' => 'macintosh',
130 'mac' => 'macintosh',
131 'macintosh' => 'macintosh',
132 'x-mac-roman' => 'macintosh',
// The ISO-8859-11 and TIS-620 labels resolve to Windows-874 in this table.
133 'dos-874' => 'Windows-874',
134 'iso-8859-11' => 'Windows-874',
135 'iso8859-11' => 'Windows-874',
136 'iso885911' => 'Windows-874',
137 'tis-620' => 'Windows-874',
138 'windows-874' => 'Windows-874',
139 'cp1250' => 'Windows-1250',
140 'windows-1250' => 'Windows-1250',
141 'x-cp1250' => 'Windows-1250',
142 'cp1251' => 'Windows-1251',
143 'windows-1251' => 'Windows-1251',
144 'x-cp1251' => 'Windows-1251',
// The ASCII and ISO-8859-1 (Latin-1) labels all resolve to Windows-1252 in this table.
145 'ansi_x3.4-1968' => 'Windows-1252',
146 'ascii' => 'Windows-1252',
147 'cp1252' => 'Windows-1252',
148 'cp819' => 'Windows-1252',
149 'csisolatin1' => 'Windows-1252',
150 'ibm819' => 'Windows-1252',
151 'iso-8859-1' => 'Windows-1252',
152 'iso-ir-100' => 'Windows-1252',
153 'iso8859-1' => 'Windows-1252',
154 'iso88591' => 'Windows-1252',
155 'iso_8859-1' => 'Windows-1252',
156 'iso_8859-1:1987' => 'Windows-1252',
157 'l1' => 'Windows-1252',
158 'latin1' => 'Windows-1252',
159 'us-ascii' => 'Windows-1252',
160 'windows-1252' => 'Windows-1252',
161 'x-cp1252' => 'Windows-1252',
162 'cp1253' => 'Windows-1253',
163 'windows-1253' => 'Windows-1253',
164 'x-cp1253' => 'Windows-1253',
// The ISO-8859-9 (Latin-5) labels resolve to Windows-1254 in this table.
165 'cp1254' => 'Windows-1254',
166 'csisolatin5' => 'Windows-1254',
167 'iso-8859-9' => 'Windows-1254',
168 'iso-ir-148' => 'Windows-1254',
169 'iso8859-9' => 'Windows-1254',
170 'iso88599' => 'Windows-1254',
171 'iso_8859-9' => 'Windows-1254',
172 'iso_8859-9:1989' => 'Windows-1254',
173 'l5' => 'Windows-1254',
174 'latin5' => 'Windows-1254',
175 'windows-1254' => 'Windows-1254',
176 'x-cp1254' => 'Windows-1254',
177 'cp1255' => 'Windows-1255',
178 'windows-1255' => 'Windows-1255',
179 'x-cp1255' => 'Windows-1255',
180 'cp1256' => 'Windows-1256',
181 'windows-1256' => 'Windows-1256',
182 'x-cp1256' => 'Windows-1256',
183 'cp1257' => 'Windows-1257',
184 'windows-1257' => 'Windows-1257',
185 'x-cp1257' => 'Windows-1257',
186 'cp1258' => 'Windows-1258',
187 'windows-1258' => 'Windows-1258',
188 'x-cp1258' => 'Windows-1258',
189 'x-mac-cyrillic' => 'mac-cyrillic',
190 'x-mac-ukrainian' => 'mac-cyrillic',
// Entries tagged "// GBK" are GBK/GB2312 labels that this table converts as GB18030.
191 'chinese' => 'GB18030', // GBK
192 'csgb2312' => 'GB18030', // GBK
193 'csiso58gb231280' => 'GB18030', // GBK
194 'gb2312' => 'GB18030', // GBK
195 'gb_2312' => 'GB18030', // GBK
196 'gb_2312-80' => 'GB18030', // GBK
197 'gbk' => 'GB18030', // GBK
198 'iso-ir-58' => 'GB18030', // GBK
199 'x-gbk' => 'GB18030', // GBK
200 'gb18030' => 'GB18030',
202 'big5-hkscs' => 'BIG-5',
203 'cn-big5' => 'BIG-5',
205 'x-x-big5' => 'BIG-5',
206 'cseucpkdfmtjapanese' => 'EUC-JP',
207 'euc-jp' => 'EUC-JP',
208 'x-euc-jp' => 'EUC-JP',
209 'csiso2022jp' => 'ISO-2022-JP',
210 'iso-2022-jp' => 'ISO-2022-JP',
211 'csshiftjis' => 'SJIS',
213 'ms_kanji' => 'SJIS',
214 'shift-jis' => 'SJIS',
215 'shift_jis' => 'SJIS',
217 'windows-31j' => 'SJIS',
219 'cseuckr' => 'EUC-KR',
220 'csksc56011987' => 'EUC-KR',
221 'euc-kr' => 'EUC-KR',
222 'iso-ir-149' => 'EUC-KR',
223 'korean' => 'EUC-KR',
224 'ks_c_5601-1987' => 'EUC-KR',
225 'ks_c_5601-1989' => 'EUC-KR',
226 'ksc5601' => 'EUC-KR',
227 'ksc_5601' => 'EUC-KR',
228 'windows-949' => 'EUC-KR',
// 'replacement' is a pseudo-encoding rather than a converter name: doConvert()
// returns a single replacement character for it instead of decoding the input.
229 'csiso2022kr' => 'replacement',
230 'hz-gb-2312' => 'replacement',
231 'iso-2022-cn' => 'replacement',
232 'iso-2022-cn-ext' => 'replacement',
233 'iso-2022-kr' => 'replacement',
234 'utf-16be' => 'UTF-16BE',
// A bare 'utf-16' label (no endianness) resolves to little-endian.
235 'utf-16' => 'UTF-16LE',
236 'utf-16le' => 'UTF-16LE',
// 'x-user-defined' is also handled specially by doConvert(), which remaps
// bytes 0x80-0xFF to code points 0xF780-0xF7FF.
237 'x-user-defined' => 'x-user-defined',
/**
 * Convert CSS text to UTF-8
 *
 * The source encoding is chosen from, in order of precedence: a byte order
 * mark, the transport encoding, an `@charset` rule at the very start of the
 * text, the environment encoding, and finally UTF-8 as the default.
 *
 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-byte-stream
 * @param string $text Raw CSS text (bytes) to convert
 * @param string[] $encodings Encodings to use at various points in the algorithm:
 *  - transport: Encoding from HTTP or the like
 *  - environment: Encoding from HTML `<link>` or the like
 *  Labels that are not keys of self::$encodings are ignored.
 * @return string The text converted to UTF-8
 * @throws \RuntimeException Propagated from doConvert() when the conversion fails
 */
public static function convert( $text, $encodings = [] ) {
	// Labels are matched lowercased, with surrounding ASCII whitespace removed.
	$normalize = function ( $label ) {
		return trim( strtolower( $label ), "\t\n\f\r " );
	};

	// First, check for a BOM and honor that if it's present.
	if ( substr( $text, 0, 3 ) === "\xef\xbb\xbf" ) {
		// UTF-8 with BOM (convert it anyway in case the BOM is a lie)
		return self::doConvert( 'UTF-8', substr( $text, 3 ) );
	}
	$start = substr( $text, 0, 2 );
	if ( $start === "\xfe\xff" ) {
		return self::doConvert( 'UTF-16BE', substr( $text, 2 ) );
	}
	if ( $start === "\xff\xfe" ) {
		return self::doConvert( 'UTF-16LE', substr( $text, 2 ) );
	}

	// 1. Transport encoding
	$encoding = isset( $encodings['transport'] )
		? $normalize( $encodings['transport'] )
		: null;
	if ( $encoding !== null && isset( self::$encodings[$encoding] ) ) {
		return self::doConvert( self::$encodings[$encoding], $text );
	}

	// 2. @charset
	if ( preg_match( '/^@charset "([\x00-\x21\x23-\x7f]{0,1012})";/', $text, $m ) ) {
		$encoding = $normalize( $m[1] );
		if ( isset( self::$encodings[$encoding] ) ) {
			$target = self::$encodings[$encoding];
			if ( $target === 'UTF-16BE' || $target === 'UTF-16LE' ) {
				// It's obviously lying: the rule was just matched as
				// single-byte ASCII, so the text cannot be UTF-16. Test the
				// resolved encoding rather than the label so that a bare
				// "utf-16" is caught as well, and fall back to UTF-8.
				$target = 'UTF-8';
			}
			return self::doConvert( $target, $text );
		}
	}

	// 3. Environment encoding
	$encoding = isset( $encodings['environment'] )
		? $normalize( $encodings['environment'] )
		: null;
	if ( $encoding !== null && isset( self::$encodings[$encoding] ) ) {
		return self::doConvert( self::$encodings[$encoding], $text );
	}

	// 4. Default
	return self::doConvert( 'UTF-8', $text );
}
295 * Actually perform the conversion
296 * @param string $encoding
297 * @param string $text
300 protected static function doConvert( $encoding, $text ) {
301 // Pseudo-encoding that just outputs one replacement character
302 if ( $encoding === 'replacement' ) {
303 return \UtfNormal\Constants::UTF8_REPLACEMENT;
306 // Pseudo-encoding that shifts non-ASCII bytes to the BMP private use area
307 if ( $encoding === 'x-user-defined' ) {
308 return preg_replace_callback( '/[\x80-\xff]/', function ( $m ) {
309 return \UtfNormal\Utils::codepointToUtf8( 0xf700 + ord( $m[0] ) );
313 // We prefer mbstring because it has sane handling of invalid input,
314 // where iconv just chokes and returns false. But we need iconv for
315 // some encodings mbstring doesn't support.
316 if ( in_array( $encoding, mb_list_encodings(), true ) ) {
317 $old = mb_substitute_character();
318 mb_substitute_character( \UtfNormal\Constants::UNICODE_REPLACEMENT );
319 $text = mb_convert_encoding( $text, 'UTF-8', $encoding );
320 mb_substitute_character( $old );
324 $ret = \MediaWiki\quietCall( 'iconv', $encoding, 'UTF-8', $text );
325 if ( $ret === false ) {
326 throw new \RuntimeException( "Cannot convert '$text' from $encoding" );