]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - includes/libs/mime/IEContentAnalyzer.php
MediaWiki 1.30.2-scripts2
[autoinstalls/mediawiki.git] / includes / libs / mime / IEContentAnalyzer.php
1 <?php
2 /**
3  * Simulation of Microsoft Internet Explorer's MIME type detection algorithm.
4  *
5  * @file
6  * @todo Define the exact license of this file.
7  */
8
9 /**
10  * This class simulates Microsoft Internet Explorer's terribly broken and
11  * insecure MIME type detection algorithm. It can be used to check web uploads
12  * with an apparently safe type, to see if IE will reinterpret them to produce
13  * something dangerous.
14  *
15  * It is full of bugs and strange design choices, and should not under any
16  * circumstances be used to determine a MIME type to present to a user or
17  * client. (Apple Safari developers, this means you too.)
18  *
19  * This class is based on a disassembly of IE 5.0, 6.0 and 7.0. Although I have
20  * attempted to ensure that this code works in exactly the same way as Internet
21  * Explorer, it does not share any source code, or creative choices such as
22  * variable names, thus I (Tim Starling) claim copyright on it.
23  *
24  * It may be redistributed without restriction. To aid reuse, this class does
25  * not depend on any MediaWiki module.
26  */
27 class IEContentAnalyzer {
28         /**
29          * Relevant data taken from the type table in IE 5
30          */
31         protected $baseTypeTable = [
32                 'ambiguous' /*1*/ => [
33                         'text/plain',
34                         'application/octet-stream',
35                         'application/x-netcdf', // [sic]
36                 ],
37                 'text' /*3*/ => [
38                         'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
39                         'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
40                 ],
41                 'binary' /*4*/ => [
42                         'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
43                         'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
44                         'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
45                         'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
46                         'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
47                         'application/x-msdownload'
48                 ],
49                 'html' /*5*/ => [ 'text/html' ],
50         ];
51
52         /**
53          * Changes to the type table in later versions of IE
54          */
55         protected $addedTypes = [
56                 'ie07' => [
57                         'text' => [ 'text/xml', 'application/xml' ]
58                 ],
59         ];
60
61         /**
62          * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a
63          * typical Windows installation.
64          *
65          * Used for extension to MIME type mapping if detection fails.
66          */
67         protected $registry = [
68                 '.323' => 'text/h323',
69                 '.3g2' => 'video/3gpp2',
70                 '.3gp' => 'video/3gpp',
71                 '.3gp2' => 'video/3gpp2',
72                 '.3gpp' => 'video/3gpp',
73                 '.aac' => 'audio/aac',
74                 '.ac3' => 'audio/ac3',
75                 '.accda' => 'application/msaccess',
76                 '.accdb' => 'application/msaccess',
77                 '.accdc' => 'application/msaccess',
78                 '.accde' => 'application/msaccess',
79                 '.accdr' => 'application/msaccess',
80                 '.accdt' => 'application/msaccess',
81                 '.ade' => 'application/msaccess',
82                 '.adp' => 'application/msaccess',
83                 '.adts' => 'audio/aac',
84                 '.ai' => 'application/postscript',
85                 '.aif' => 'audio/aiff',
86                 '.aifc' => 'audio/aiff',
87                 '.aiff' => 'audio/aiff',
88                 '.amc' => 'application/x-mpeg',
89                 '.application' => 'application/x-ms-application',
90                 '.asf' => 'video/x-ms-asf',
91                 '.asx' => 'video/x-ms-asf',
92                 '.au' => 'audio/basic',
93                 '.avi' => 'video/avi',
94                 '.bmp' => 'image/bmp',
95                 '.caf' => 'audio/x-caf',
96                 '.cat' => 'application/vnd.ms-pki.seccat',
97                 '.cbo' => 'application/sha',
98                 '.cdda' => 'audio/aiff',
99                 '.cer' => 'application/x-x509-ca-cert',
100                 '.conf' => 'text/plain',
101                 '.crl' => 'application/pkix-crl',
102                 '.crt' => 'application/x-x509-ca-cert',
103                 '.css' => 'text/css',
104                 '.csv' => 'application/vnd.ms-excel',
105                 '.der' => 'application/x-x509-ca-cert',
106                 '.dib' => 'image/bmp',
107                 '.dif' => 'video/x-dv',
108                 '.dll' => 'application/x-msdownload',
109                 '.doc' => 'application/msword',
110                 '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
111                 '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
112                 '.dot' => 'application/msword',
113                 '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
114                 '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
115                 '.dv' => 'video/x-dv',
116                 '.dwfx' => 'model/vnd.dwfx+xps',
117                 '.edn' => 'application/vnd.adobe.edn',
118                 '.eml' => 'message/rfc822',
119                 '.eps' => 'application/postscript',
120                 '.etd' => 'application/x-ebx',
121                 '.exe' => 'application/x-msdownload',
122                 '.fdf' => 'application/vnd.fdf',
123                 '.fif' => 'application/fractals',
124                 '.gif' => 'image/gif',
125                 '.gsm' => 'audio/x-gsm',
126                 '.hqx' => 'application/mac-binhex40',
127                 '.hta' => 'application/hta',
128                 '.htc' => 'text/x-component',
129                 '.htm' => 'text/html',
130                 '.html' => 'text/html',
131                 '.htt' => 'text/webviewhtml',
132                 '.hxa' => 'application/xml',
133                 '.hxc' => 'application/xml',
134                 '.hxd' => 'application/octet-stream',
135                 '.hxe' => 'application/xml',
136                 '.hxf' => 'application/xml',
137                 '.hxh' => 'application/octet-stream',
138                 '.hxi' => 'application/octet-stream',
139                 '.hxk' => 'application/xml',
140                 '.hxq' => 'application/octet-stream',
141                 '.hxr' => 'application/octet-stream',
142                 '.hxs' => 'application/octet-stream',
143                 '.hxt' => 'application/xml',
144                 '.hxv' => 'application/xml',
145                 '.hxw' => 'application/octet-stream',
146                 '.ico' => 'image/x-icon',
147                 '.iii' => 'application/x-iphone',
148                 '.ins' => 'application/x-internet-signup',
149                 '.iqy' => 'text/x-ms-iqy',
150                 '.isp' => 'application/x-internet-signup',
151                 '.jfif' => 'image/jpeg',
152                 '.jnlp' => 'application/x-java-jnlp-file',
153                 '.jpe' => 'image/jpeg',
154                 '.jpeg' => 'image/jpeg',
155                 '.jpg' => 'image/jpeg',
156                 '.jtx' => 'application/x-jtx+xps',
157                 '.latex' => 'application/x-latex',
158                 '.log' => 'text/plain',
159                 '.m1v' => 'video/mpeg',
160                 '.m2v' => 'video/mpeg',
161                 '.m3u' => 'audio/x-mpegurl',
162                 '.mac' => 'image/x-macpaint',
163                 '.man' => 'application/x-troff-man',
164                 '.mda' => 'application/msaccess',
165                 '.mdb' => 'application/msaccess',
166                 '.mde' => 'application/msaccess',
167                 '.mfp' => 'application/x-shockwave-flash',
168                 '.mht' => 'message/rfc822',
169                 '.mhtml' => 'message/rfc822',
170                 '.mid' => 'audio/mid',
171                 '.midi' => 'audio/mid',
172                 '.mod' => 'video/mpeg',
173                 '.mov' => 'video/quicktime',
174                 '.mp2' => 'video/mpeg',
175                 '.mp2v' => 'video/mpeg',
176                 '.mp3' => 'audio/mpeg',
177                 '.mp4' => 'video/mp4',
178                 '.mpa' => 'video/mpeg',
179                 '.mpe' => 'video/mpeg',
180                 '.mpeg' => 'video/mpeg',
181                 '.mpf' => 'application/vnd.ms-mediapackage',
182                 '.mpg' => 'video/mpeg',
183                 '.mpv2' => 'video/mpeg',
184                 '.mqv' => 'video/quicktime',
185                 '.NMW' => 'application/nmwb',
186                 '.nws' => 'message/rfc822',
187                 '.odc' => 'text/x-ms-odc',
188                 '.ols' => 'application/vnd.ms-publisher',
189                 '.p10' => 'application/pkcs10',
190                 '.p12' => 'application/x-pkcs12',
191                 '.p7b' => 'application/x-pkcs7-certificates',
192                 '.p7c' => 'application/pkcs7-mime',
193                 '.p7m' => 'application/pkcs7-mime',
194                 '.p7r' => 'application/x-pkcs7-certreqresp',
195                 '.p7s' => 'application/pkcs7-signature',
196                 '.pct' => 'image/pict',
197                 '.pdf' => 'application/pdf',
198                 '.pdx' => 'application/vnd.adobe.pdx',
199                 '.pfx' => 'application/x-pkcs12',
200                 '.pic' => 'image/pict',
201                 '.pict' => 'image/pict',
202                 '.pinstall' => 'application/x-picasa-detect',
203                 '.pko' => 'application/vnd.ms-pki.pko',
204                 '.png' => 'image/png',
205                 '.pnt' => 'image/x-macpaint',
206                 '.pntg' => 'image/x-macpaint',
207                 '.pot' => 'application/vnd.ms-powerpoint',
208                 '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
209                 '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
210                 '.ppa' => 'application/vnd.ms-powerpoint',
211                 '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
212                 '.pps' => 'application/vnd.ms-powerpoint',
213                 '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
214                 '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
215                 '.ppt' => 'application/vnd.ms-powerpoint',
216                 '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
217                 '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
218                 '.prf' => 'application/pics-rules',
219                 '.ps' => 'application/postscript',
220                 '.pub' => 'application/vnd.ms-publisher',
221                 '.pwz' => 'application/vnd.ms-powerpoint',
222                 '.py' => 'text/plain',
223                 '.pyw' => 'text/plain',
224                 '.qht' => 'text/x-html-insertion',
225                 '.qhtm' => 'text/x-html-insertion',
226                 '.qt' => 'video/quicktime',
227                 '.qti' => 'image/x-quicktime',
228                 '.qtif' => 'image/x-quicktime',
229                 '.qtl' => 'application/x-quicktimeplayer',
230                 '.rat' => 'application/rat-file',
231                 '.rmf' => 'application/vnd.adobe.rmf',
232                 '.rmi' => 'audio/mid',
233                 '.rqy' => 'text/x-ms-rqy',
234                 '.rtf' => 'application/msword',
235                 '.sct' => 'text/scriptlet',
236                 '.sd2' => 'audio/x-sd2',
237                 '.sdp' => 'application/sdp',
238                 '.shtml' => 'text/html',
239                 '.sit' => 'application/x-stuffit',
240                 '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
241                 '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
242                 '.slk' => 'application/vnd.ms-excel',
243                 '.snd' => 'audio/basic',
244                 '.so' => 'application/x-apachemodule',
245                 '.sol' => 'text/plain',
246                 '.sor' => 'text/plain',
247                 '.spc' => 'application/x-pkcs7-certificates',
248                 '.spl' => 'application/futuresplash',
249                 '.sst' => 'application/vnd.ms-pki.certstore',
250                 '.stl' => 'application/vnd.ms-pki.stl',
251                 '.swf' => 'application/x-shockwave-flash',
252                 '.thmx' => 'application/vnd.ms-officetheme',
253                 '.tif' => 'image/tiff',
254                 '.tiff' => 'image/tiff',
255                 '.txt' => 'text/plain',
256                 '.uls' => 'text/iuls',
257                 '.vcf' => 'text/x-vcard',
258                 '.vdx' => 'application/vnd.ms-visio.viewer',
259                 '.vsd' => 'application/vnd.ms-visio.viewer',
260                 '.vss' => 'application/vnd.ms-visio.viewer',
261                 '.vst' => 'application/vnd.ms-visio.viewer',
262                 '.vsx' => 'application/vnd.ms-visio.viewer',
263                 '.vtx' => 'application/vnd.ms-visio.viewer',
264                 '.wav' => 'audio/wav',
265                 '.wax' => 'audio/x-ms-wax',
266                 '.wbk' => 'application/msword',
267                 '.wdp' => 'image/vnd.ms-photo',
268                 '.wiz' => 'application/msword',
269                 '.wm' => 'video/x-ms-wm',
270                 '.wma' => 'audio/x-ms-wma',
271                 '.wmd' => 'application/x-ms-wmd',
272                 '.wmv' => 'video/x-ms-wmv',
273                 '.wmx' => 'video/x-ms-wmx',
274                 '.wmz' => 'application/x-ms-wmz',
275                 '.wpl' => 'application/vnd.ms-wpl',
276                 '.wsc' => 'text/scriptlet',
277                 '.wvx' => 'video/x-ms-wvx',
278                 '.xaml' => 'application/xaml+xml',
279                 '.xbap' => 'application/x-ms-xbap',
280                 '.xdp' => 'application/vnd.adobe.xdp+xml',
281                 '.xfdf' => 'application/vnd.adobe.xfdf',
282                 '.xht' => 'application/xhtml+xml',
283                 '.xhtml' => 'application/xhtml+xml',
284                 '.xla' => 'application/vnd.ms-excel',
285                 '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
286                 '.xlk' => 'application/vnd.ms-excel',
287                 '.xll' => 'application/vnd.ms-excel',
288                 '.xlm' => 'application/vnd.ms-excel',
289                 '.xls' => 'application/vnd.ms-excel',
290                 '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
291                 '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
292                 '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
293                 '.xlt' => 'application/vnd.ms-excel',
294                 '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
295                 '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
296                 '.xlw' => 'application/vnd.ms-excel',
297                 '.xml' => 'text/xml',
298                 '.xps' => 'application/vnd.ms-xpsdocument',
299                 '.xsl' => 'text/xml',
300         ];
301
302         /**
303          * IE versions which have been analysed to bring you this class, and for
304          * which some substantive difference exists. These will appear as keys
305          * in the return value of getRealMimesFromData(). The names are chosen to sort correctly.
306          */
307         protected $versions = [ 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' ];
308
309         /**
310          * Type table with versions expanded
311          */
312         protected $typeTable = [];
313
/**
 * Build the per-version type tables.
 *
 * Walks $this->versions in order, starting from a copy of
 * $this->baseTypeTable and folding in the entries of $this->addedTypes as
 * each version is reached. Additions are cumulative: once a type has been
 * added for a version, every later version in the list keeps it. The
 * resulting table for each version is stored in $this->typeTable, keyed by
 * version name.
 */
public function __construct() {
	$types = $this->baseTypeTable;
	foreach ( $this->versions as $version ) {
		if ( isset( $this->addedTypes[$version] ) ) {
			foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {
				// A format that is absent from the base table starts out as an
				// empty list instead of handing an undefined value to array_merge().
				$existing = isset( $types[$format] ) ? $types[$format] : [];
				$types[$format] = array_merge( $existing, $addedTypes );
			}
		}
		// Arrays are copied by value, so each version gets its own snapshot.
		$this->typeTable[$version] = $types;
	}
}
327
/**
 * Run the detection for every simulated IE version and report the results
 * with IE's private type names replaced by commonly understood ones.
 *
 * This is getMimesFromData() followed by translateMimeType() on each value;
 * the version keys are kept as they are.
 *
 * @param string $fileName the file name, passed through to getMimesFromData()
 * @param string $chunk the leading bytes of the file
 * @param string $proposed the MIME type proposed by the server
 *
 * @return array map of IE version to detected MIME type
 */
public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
	$translated = [];
	$ieTypes = $this->getMimesFromData( $fileName, $chunk, $proposed );
	foreach ( $ieTypes as $version => $ieType ) {
		$translated[$version] = $this->translateMimeType( $ieType );
	}
	return $translated;
}
343
/**
 * Map one of IE's private MIME type names to its more commonly understood
 * equivalent. Types without an entry in the table are returned unchanged.
 *
 * @param string $type
 * @return string
 */
public function translateMimeType( $type ) {
	// IE-specific name => conventional name
	static $ieToCommon = [
		'image/pjpeg' => 'image/jpeg',
		'image/x-png' => 'image/png',
		'image/x-wmf' => 'application/x-msmetafile',
		'image/bmp' => 'image/x-bmp',
		'application/x-zip-compressed' => 'application/zip',
		'application/x-compressed' => 'application/x-compress',
		'application/x-gzip-compressed' => 'application/x-gzip',
		'audio/mid' => 'audio/midi',
	];

	return isset( $ieToCommon[$type] ) ? $ieToCommon[$type] : $type;
}
366
/**
 * Run the detection once per entry in $this->versions and collect the
 * results without translating IE's private type names.
 *
 * @param string $fileName the file name, passed through to getMimeTypeForVersion()
 * @param string $chunk the leading bytes of the file
 * @param string $proposed the MIME type proposed by the server
 *
 * @return array map of IE version to detected MIME type
 */
public function getMimesFromData( $fileName, $chunk, $proposed ) {
	$versions = $this->versions;
	$detected = array_map(
		function ( $version ) use ( $fileName, $chunk, $proposed ) {
			return $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
		},
		$versions
	);
	// Pair each version name with the type detected for it, in list order.
	return array_combine( $versions, $detected );
}
383
/**
 * Determine the MIME type that a single simulated IE version would assign.
 *
 * Any ";..." suffix is removed from the proposed type first. If
 * getDataFormat() reports the proposed type as 'unknown', it is returned
 * as-is, unless it is multipart/mixed or multipart/x-mixed-replace. An empty
 * chunk also returns the proposed type. Otherwise at most 255 bytes of the
 * chunk are sampled and the rules in the body are tried in order. If the
 * detected type is still 'ambiguous' and the proposed format does not settle
 * the matter, the extension of $fileName is looked up in $this->registry
 * (case-insensitively, as Windows registry key names are not case sensitive).
 *
 * @param string $version Version name, such as an entry of $this->versions
 * @param string $fileName File name; only the part from the last "." onwards is used
 * @param string $chunk Leading bytes of the file; only the first 255 are examined
 * @param string $proposed MIME type proposed by the server
 * @return string Detected MIME type, using IE's own type names
 */
protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
	// Strip text after a semicolon
	$semiPos = strpos( $proposed, ';' );
	if ( $semiPos !== false ) {
		$proposed = substr( $proposed, 0, $semiPos );
	}

	$proposedFormat = $this->getDataFormat( $version, $proposed );
	if ( $proposedFormat == 'unknown'
		&& $proposed != 'multipart/mixed'
		&& $proposed != 'multipart/x-mixed-replace'
	) {
		return $proposed;
	}
	if ( strval( $chunk ) === '' ) {
		return $proposed;
	}

	// Truncate chunk at 255 bytes
	$chunk = substr( $chunk, 0, 255 );

	// IE does the Check*Headers() calls last, and instead does the following image
	// type checks by directly looking for the magic numbers. What I do here should
	// have the same effect since the magic number checks are identical in both cases.
	$result = $this->sampleData( $version, $chunk );
	$sampleFound = $result['found'];
	$counters = $result['counters'];
	$binaryType = $this->checkBinaryHeaders( $version, $chunk );
	$textType = $this->checkTextHeaders( $version, $chunk );

	if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
		return 'text/html';
	}
	if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
		return 'image/gif';
	}
	if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
		&& $binaryType == 'image/pjpeg'
	) {
		return $proposed;
	}
	// PNG check added in IE 7. The version names are chosen so that a plain
	// string comparison orders them correctly.
	if ( $version >= 'ie07'
		&& ( $proposed == 'image/x-png' || $proposed == 'image/png' )
		&& $binaryType == 'image/x-png'
	) {
		return $proposed;
	}

	// CDF was removed in IE 7 so it won't be in $sampleFound for later versions
	if ( isset( $sampleFound['cdf'] ) ) {
		return 'application/x-cdf';
	}

	// RSS and Atom were added in IE 7 so they won't be in $sampleFound for
	// previous versions
	if ( isset( $sampleFound['rss'] ) ) {
		return 'application/rss+xml';
	}
	if ( isset( $sampleFound['rdf-tag'] )
		&& isset( $sampleFound['rdf-url'] )
		&& isset( $sampleFound['rdf-purl'] )
	) {
		return 'application/rss+xml';
	}
	if ( isset( $sampleFound['atom'] ) ) {
		return 'application/atom+xml';
	}

	if ( isset( $sampleFound['xml'] ) ) {
		// TODO: I'm not sure under what circumstances this flag is enabled
		if ( strpos( $version, 'strict' ) !== false ) {
			if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
				return 'text/xml';
			}
		} else {
			return 'text/xml';
		}
	}
	if ( isset( $sampleFound['html'] ) ) {
		// TODO: I'm not sure under what circumstances this flag is enabled
		if ( strpos( $version, 'nohtml' ) !== false ) {
			if ( $proposed == 'text/plain' ) {
				return 'text/html';
			}
		} else {
			return 'text/html';
		}
	}
	if ( isset( $sampleFound['xbm'] ) ) {
		return 'image/x-bitmap';
	}
	if ( isset( $sampleFound['binhex'] ) ) {
		return 'application/macbinhex40';
	}
	if ( isset( $sampleFound['scriptlet'] ) ) {
		if ( strpos( $version, 'strict' ) !== false ) {
			if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
				return 'text/scriptlet';
			}
		} else {
			return 'text/scriptlet';
		}
	}

	// Freaky heuristics to determine if the data is text or binary
	// The heuristic is of course broken for non-ASCII text
	if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
		< ( $counters['ctrl'] + $counters['high'] ) * 16
	) {
		// Looks binary: prefer a binary header match over a text one
		$kindOfBinary = true;
		$type = $binaryType ? $binaryType : $textType;
		if ( $type === false ) {
			$type = 'application/octet-stream';
		}
	} else {
		// Looks like text: prefer a text header match over a binary one
		$kindOfBinary = false;
		$type = $textType ? $textType : $binaryType;
		if ( $type === false ) {
			$type = 'text/plain';
		}
	}

	// Check if the output format is ambiguous
	// This generally means that detection failed, real types aren't ambiguous
	$detectedFormat = $this->getDataFormat( $version, $type );
	if ( $detectedFormat != 'ambiguous' ) {
		return $type;
	}

	if ( $proposedFormat != 'ambiguous' ) {
		// FormatAgreesWithData()
		if ( $proposedFormat == 'text' && !$kindOfBinary ) {
			return $proposed;
		}
		if ( $proposedFormat == 'binary' && $kindOfBinary ) {
			return $proposed;
		}
		if ( $proposedFormat == 'html' ) {
			return $proposed;
		}
	}

	// Find a MIME type by searching the registry for the file extension.
	$dotPos = strrpos( $fileName, '.' );
	if ( $dotPos === false ) {
		return $type;
	}
	// Windows registry key names are not case sensitive, so "FOO.HTML" must
	// match the '.html' entry and "foo.nmw" the '.NMW' one. Fold both the
	// extension and the table keys to lower case before comparing.
	$ext = strtolower( substr( $fileName, $dotPos ) );
	$registry = array_change_key_case( $this->registry, CASE_LOWER );
	if ( isset( $registry[$ext] ) ) {
		return $registry[$ext];
	}

	// TODO: If the extension has an application registered to it, IE will return
	// application/octet-stream. We'll skip that, so we could erroneously
	// return text/plain or application/x-netcdf where application/octet-stream
	// would be correct.

	return $type;
}
552
/**
 * Look for one of the text-format signatures at the very start of the chunk.
 * Confirmed same in 5 and 7.
 *
 * @param string $version Not consulted; the checks are the same for every version
 * @param string $chunk
 * @return bool|string The matching MIME type, or false if no signature matches
 */
private function checkTextHeaders( $version, $chunk ) {
	// Leading bytes => MIME type. The comparison is case-sensitive.
	$signatures = [
		'%PDF' => 'application/pdf',
		'%!' => 'application/postscript',
		'{\\rtf' => 'text/richtext',
		'begin' => 'application/base64',
	];

	foreach ( $signatures as $magic => $mime ) {
		if ( strncmp( $chunk, $magic, strlen( $magic ) ) === 0 ) {
			return $mime;
		}
	}
	return false;
}
578
579         /**
580          * Check for binary headers at the start of the chunk
581          * Confirmed same in 5 and 7.
582          * @param string $version
583          * @param string $chunk
584          * @return bool|string
585          */
586         private function checkBinaryHeaders( $version, $chunk ) {
587                 $chunk2 = substr( $chunk, 0, 2 );
588                 $chunk3 = substr( $chunk, 0, 3 );
589                 $chunk4 = substr( $chunk, 0, 4 );
590                 $chunk5 = substr( $chunk, 0, 5 );
591                 $chunk5uc = strtoupper( $chunk5 );
592                 $chunk8 = substr( $chunk, 0, 8 );
593                 if ( $chunk5uc == 'GIF87' || $chunk5uc == 'GIF89' ) {
594                         return 'image/gif';
595                 }
596                 if ( $chunk2 == "\xff\xd8" ) {
597                         return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
598                 }
599
600                 if ( $chunk2 == 'BM'
601                         && substr( $chunk, 6, 2 ) == "\000\000"
602                         && substr( $chunk, 8, 2 ) == "\000\000"
603                 ) {
604                         return 'image/bmp'; // another non-standard MIME
605                 }
606                 if ( $chunk4 == 'RIFF'
607                         && substr( $chunk, 8, 4 ) == 'WAVE'
608                 ) {
609                         return 'audio/wav';
610                 }
611                 // These were integer literals in IE
612                 // Perhaps the author was not sure what the target endianness was
613                 if ( $chunk4 == ".sd\000"
614                         || $chunk4 == ".snd"
615                         || $chunk4 == "\000ds."
616                         || $chunk4 == "dns."
617                 ) {
618                         return 'audio/basic';
619                 }
620                 if ( $chunk3 == "MM\000" ) {
621                         return 'image/tiff';
622                 }
623                 if ( $chunk2 == 'MZ' ) {
624                         return 'application/x-msdownload';
625                 }
626                 if ( $chunk8 == "\x89PNG\x0d\x0a\x1a\x0a" ) {
627                         return 'image/x-png'; // [sic]
628                 }
629                 if ( strlen( $chunk ) >= 5 ) {
630                         $byte2 = ord( $chunk[2] );
631                         $byte4 = ord( $chunk[4] );
632                         if ( $byte2 >= 3 && $byte2 <= 31 && $byte4 == 0 && $chunk2 == 'JG' ) {
633                                 return 'image/x-jg';
634                         }
635                 }
636                 // More endian confusion?
637                 if ( $chunk4 == 'MROF' ) {
638                         return 'audio/x-aiff';
639                 }
640                 $chunk4_8 = substr( $chunk, 8, 4 );
641                 if ( $chunk4 == 'FORM' && ( $chunk4_8 == 'AIFF' || $chunk4_8 == 'AIFC' ) ) {
642                         return 'audio/x-aiff';
643                 }
644                 if ( $chunk4 == 'RIFF' && $chunk4_8 == 'AVI ' ) {
645                         return 'video/avi';
646                 }
647                 if ( $chunk4 == "\x00\x00\x01\xb3" || $chunk4 == "\x00\x00\x01\xba" ) {
648                         return 'video/mpeg';
649                 }
650                 if ( $chunk4 == "\001\000\000\000"
651                         && substr( $chunk, 40, 4 ) == ' EMF'
652                 ) {
653                         return 'image/x-emf';
654                 }
655                 if ( $chunk4 == "\xd7\xcd\xc6\x9a" ) {
656                         return 'image/x-wmf';
657                 }
658                 if ( $chunk4 == "\xca\xfe\xba\xbe" ) {
659                         return 'application/java';
660                 }
661                 if ( $chunk2 == 'PK' ) {
662                         return 'application/x-zip-compressed';
663                 }
664                 if ( $chunk2 == "\x1f\x9d" ) {
665                         return 'application/x-compressed';
666                 }
667                 if ( $chunk2 == "\x1f\x8b" ) {
668                         return 'application/x-gzip-compressed';
669                 }
670                 // Skip redundant check for ZIP
671                 if ( $chunk5 == "MThd\000" ) {
672                         return 'audio/mid';
673                 }
674                 if ( $chunk4 == '%PDF' ) {
675                         return 'application/pdf';
676                 }
677                 return false;
678         }
679
/**
 * Do heuristic checks on the bulk of the data sample.
 *
 * Walks the chunk one byte at a time, tallying character classes and
 * recording which content markers appear in it: the XML declaration,
 * scriptlet and HTML tags, CHANNEL/RSS/rdf:RDF/FEED tags, the two RDF
 * namespace URLs, and the XBM and BinHex magic strings.
 *
 * Scanning stops early once a scriptlet, RSS, FEED or complete XBM marker
 * is found, or once the rdf:RDF tag and both namespace URLs have been seen,
 * so the counters only cover the bytes examined up to that point.
 *
 * @param string $version Version key; values comparing below 'ie07' look
 *   for a CHANNEL tag, all others look for RSS, rdf:RDF and FEED tags
 * @param string $chunk Data sample to scan
 * @return array Map with two entries:
 *   - 'found': marker name => true for every marker detected
 *   - 'counters': byte counts keyed by 'ctrl', 'high', 'low', 'lf', 'cr', 'ff'
 */
protected function sampleData( $version, $chunk ) {
	$found = [];
	$counters = [
		'ctrl' => 0,
		'high' => 0,
		'low' => 0,
		'lf' => 0,
		'cr' => 0,
		'ff' => 0
	];
	$htmlTags = [
		'html',
		'head',
		'title',
		'body',
		'script',
		'a href',
		'pre',
		'img',
		'plaintext',
		'table'
	];
	$rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
	$rdfPurl = 'http://purl.org/rss/1.0/';
	$xbmMagic1 = '#define';
	$xbmMagic2 = '_width';
	$xbmMagic3 = '_bits';
	$binhexMagic = 'converted with BinHex';

	// Loop-invariant lengths, computed once
	$rdfUrlLength = strlen( $rdfUrl );
	$rdfPurlLength = strlen( $rdfPurl );
	$xbmMagic1Length = strlen( $xbmMagic1 );
	$xbmMagic2Length = strlen( $xbmMagic2 );
	$xbmMagic3Length = strlen( $xbmMagic3 );
	$binhexMagicLength = strlen( $binhexMagic );
	$chunkLength = strlen( $chunk );

	for ( $offset = 0; $offset < $chunkLength; $offset++ ) {
		$curChar = $chunk[$offset];

		// Line breaks, form feeds, control bytes and high-bit bytes are only
		// counted; none of them can start a marker. A tab counts as 'low'.
		if ( $curChar === "\x0a" ) {
			$counters['lf']++;
			continue;
		} elseif ( $curChar === "\x0d" ) {
			$counters['cr']++;
			continue;
		} elseif ( $curChar === "\x0c" ) {
			$counters['ff']++;
			continue;
		} elseif ( $curChar === "\t" ) {
			$counters['low']++;
			continue;
		} elseif ( ord( $curChar ) < 32 ) {
			$counters['ctrl']++;
			continue;
		} elseif ( ord( $curChar ) >= 128 ) {
			$counters['high']++;
			continue;
		}

		$counters['low']++;
		if ( $curChar === '<' ) {
			// Everything after the opening angle bracket
			$remainder = substr( $chunk, $offset + 1 );

			// XML
			if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
				$nextChar = substr( $chunk, $offset + 5, 1 );
				if ( $nextChar === ':' || $nextChar === ' ' || $nextChar === "\t" ) {
					$found['xml'] = true;
				}
			}
			// Scriptlet (JSP)
			if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
				$found['scriptlet'] = true;
				break;
			}
			// HTML
			foreach ( $htmlTags as $tag ) {
				if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
					$found['html'] = true;
					// One hit is enough; the flag carries no more detail
					break;
				}
			}
			// Skip broken check for additional tags (HR etc.)

			// CHANNEL replaced by RSS, RDF and FEED in IE 7
			if ( $version < 'ie07' ) {
				if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
					$found['cdf'] = true;
				}
			} else {
				// RSS
				if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
					$found['rss'] = true;
					break; // return from SampleData
				}
				if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
					$found['rdf-tag'] = true;
					// no break
				}
				if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
					$found['atom'] = true;
					break;
				}
			}
			continue;
		}
		// Skip broken check for -->

		// RSS URL checks
		// For some reason both URLs must appear before it is recognised
		$remainder = substr( $chunk, $offset );
		if ( !strncasecmp( $remainder, $rdfUrl, $rdfUrlLength ) ) {
			$found['rdf-url'] = true;
			if ( isset( $found['rdf-tag'] )
				&& isset( $found['rdf-purl'] ) // [sic]
			) {
				break;
			}
			continue;
		}

		if ( !strncasecmp( $remainder, $rdfPurl, $rdfPurlLength ) ) {
			// Record the match. Without this the 'rdf-purl' test in the
			// branch above could never succeed, and the returned map would
			// never show that this namespace was present.
			$found['rdf-purl'] = true;
			if ( isset( $found['rdf-tag'] )
				&& isset( $found['rdf-url'] ) // [sic]
			) {
				break;
			}
			continue;
		}

		// XBM checks: "#define", then "_width", then "_bits", in that order
		if ( !strncasecmp( $remainder, $xbmMagic1, $xbmMagic1Length ) ) {
			$found['xbm1'] = true;
			continue;
		}
		if ( $curChar === '_' ) {
			if ( isset( $found['xbm2'] ) ) {
				if ( !strncasecmp( $remainder, $xbmMagic3, $xbmMagic3Length ) ) {
					$found['xbm'] = true;
					break;
				}
			} elseif ( isset( $found['xbm1'] ) ) {
				if ( !strncasecmp( $remainder, $xbmMagic2, $xbmMagic2Length ) ) {
					$found['xbm2'] = true;
				}
			}
		}

		// BinHex (case-sensitive, unlike the other markers)
		if ( !strncmp( $remainder, $binhexMagic, $binhexMagicLength ) ) {
			$found['binhex'] = true;
		}
	}
	return [ 'found' => $found, 'counters' => $counters ];
}
833
/**
 * Classify a MIME type against the type table of the given version.
 *
 * @param string $version Key into $this->typeTable
 * @param string|null $type MIME type to look up
 * @return int|string 'ambiguous' when the type is empty or the literal
 *   string '(null)'; otherwise the key of the first list in the type table
 *   that contains the type, or 'unknown' when no list contains it
 */
protected function getDataFormat( $version, $type ) {
	// Fetch the table first, as before, so that lookup behaviour for an
	// unrecognised version key is unchanged.
	$formatLists = $this->typeTable[$version];

	$hasNoType = ( $type == '(null)' || strval( $type ) === '' );
	if ( $hasNoType ) {
		return 'ambiguous';
	}

	$match = 'unknown';
	foreach ( $formatLists as $formatName => $mimeTypes ) {
		if ( in_array( $type, $mimeTypes ) ) {
			$match = $formatName;
			break;
		}
	}
	return $match;
}
851 }