]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - includes/IEContentAnalyzer.php
MediaWiki 1.14.0
[autoinstallsdev/mediawiki.git] / includes / IEContentAnalyzer.php
1 <?php
2
3 /**
4  * This class simulates Microsoft Internet Explorer's terribly broken and 
5  * insecure MIME type detection algorithm. It can be used to check web uploads
6  * with an apparently safe type, to see if IE will reinterpret them to produce 
7  * something dangerous.
8  *
9  * It is full of bugs and strange design choices, and should not under any
10  * circumstances be used to determine a MIME type to present to a user or 
11  * client. (Apple Safari developers, this means you too.)
12  *
13  * This class is based on a disassembly of IE 5.0, 6.0 and 7.0. Although I have 
14  * attempted to ensure that this code works in exactly the same way as Internet 
15  * Explorer, it does not share any source code, or creative choices such as 
16  * variable names, thus I (Tim Starling) claim copyright on it. 
17  *
18  * It may be redistributed without restriction. To aid reuse, this class does
19  * not depend on any MediaWiki module.
20  */
21 class IEContentAnalyzer {
22         /**
23          * Relevant data taken from the type table in IE 5
24          */
25         protected $baseTypeTable = array(
26                 'ambiguous' /*1*/ => array(
27                         'text/plain', 
28                         'application/octet-stream', 
29                         'application/x-netcdf', // [sic]
30                 ),
31                 'text' /*3*/ => array(
32                         'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
33                         'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
34                 ),
35                 'binary' /*4*/ => array(
36                         'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
37                         'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp', 
38                         'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi', 
39                         'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
40                         'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
41                         'application/x-msdownload'
42                 ),
43                 'html' /*5*/ => array( 'text/html' ),
44         );
45
46         /**
47          * Changes to the type table in later versions of IE
48          */
49         protected $addedTypes = array(
50                 'ie07' => array(
51                         'text' => array( 'text/xml', 'application/xml' )
52                 ),
53         );
54
55         /**
56          * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a
57          * typical Windows installation.
58          *
59          * Used for extension to MIME type mapping if detection fails.
60          */
61         protected $registry = array(
62                 '.323' => 'text/h323',
63                 '.3g2' => 'video/3gpp2',
64                 '.3gp' => 'video/3gpp',
65                 '.3gp2' => 'video/3gpp2',
66                 '.3gpp' => 'video/3gpp',
67                 '.aac' => 'audio/aac',
68                 '.ac3' => 'audio/ac3',
69                 '.accda' => 'application/msaccess',
70                 '.accdb' => 'application/msaccess',
71                 '.accdc' => 'application/msaccess',
72                 '.accde' => 'application/msaccess',
73                 '.accdr' => 'application/msaccess',
74                 '.accdt' => 'application/msaccess',
75                 '.ade' => 'application/msaccess',
76                 '.adp' => 'application/msaccess',
77                 '.adts' => 'audio/aac',
78                 '.ai' => 'application/postscript',
79                 '.aif' => 'audio/aiff',
80                 '.aifc' => 'audio/aiff',
81                 '.aiff' => 'audio/aiff',
82                 '.amc' => 'application/x-mpeg',
83                 '.application' => 'application/x-ms-application',
84                 '.asf' => 'video/x-ms-asf',
85                 '.asx' => 'video/x-ms-asf',
86                 '.au' => 'audio/basic',
87                 '.avi' => 'video/avi',
88                 '.bmp' => 'image/bmp',
89                 '.caf' => 'audio/x-caf',
90                 '.cat' => 'application/vnd.ms-pki.seccat',
91                 '.cbo' => 'application/sha',
92                 '.cdda' => 'audio/aiff',
93                 '.cer' => 'application/x-x509-ca-cert',
94                 '.conf' => 'text/plain',
95                 '.crl' => 'application/pkix-crl',
96                 '.crt' => 'application/x-x509-ca-cert',
97                 '.css' => 'text/css',
98                 '.csv' => 'application/vnd.ms-excel',
99                 '.der' => 'application/x-x509-ca-cert',
100                 '.dib' => 'image/bmp',
101                 '.dif' => 'video/x-dv',
102                 '.dll' => 'application/x-msdownload',
103                 '.doc' => 'application/msword',
104                 '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
105                 '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
106                 '.dot' => 'application/msword',
107                 '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
108                 '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
109                 '.dv' => 'video/x-dv',
110                 '.dwfx' => 'model/vnd.dwfx+xps',
111                 '.edn' => 'application/vnd.adobe.edn',
112                 '.eml' => 'message/rfc822',
113                 '.eps' => 'application/postscript',
114                 '.etd' => 'application/x-ebx',
115                 '.exe' => 'application/x-msdownload',
116                 '.fdf' => 'application/vnd.fdf',
117                 '.fif' => 'application/fractals',
118                 '.gif' => 'image/gif',
119                 '.gsm' => 'audio/x-gsm',
120                 '.hqx' => 'application/mac-binhex40',
121                 '.hta' => 'application/hta',
122                 '.htc' => 'text/x-component',
123                 '.htm' => 'text/html',
124                 '.html' => 'text/html',
125                 '.htt' => 'text/webviewhtml',
126                 '.hxa' => 'application/xml',
127                 '.hxc' => 'application/xml',
128                 '.hxd' => 'application/octet-stream',
129                 '.hxe' => 'application/xml',
130                 '.hxf' => 'application/xml',
131                 '.hxh' => 'application/octet-stream',
132                 '.hxi' => 'application/octet-stream',
133                 '.hxk' => 'application/xml',
134                 '.hxq' => 'application/octet-stream',
135                 '.hxr' => 'application/octet-stream',
136                 '.hxs' => 'application/octet-stream',
137                 '.hxt' => 'application/xml',
138                 '.hxv' => 'application/xml',
139                 '.hxw' => 'application/octet-stream',
140                 '.ico' => 'image/x-icon',
141                 '.iii' => 'application/x-iphone',
142                 '.ins' => 'application/x-internet-signup',
143                 '.iqy' => 'text/x-ms-iqy',
144                 '.isp' => 'application/x-internet-signup',
145                 '.jfif' => 'image/jpeg',
146                 '.jnlp' => 'application/x-java-jnlp-file',
147                 '.jpe' => 'image/jpeg',
148                 '.jpeg' => 'image/jpeg',
149                 '.jpg' => 'image/jpeg',
150                 '.jtx' => 'application/x-jtx+xps',
151                 '.latex' => 'application/x-latex',
152                 '.log' => 'text/plain',
153                 '.m1v' => 'video/mpeg',
154                 '.m2v' => 'video/mpeg',
155                 '.m3u' => 'audio/x-mpegurl',
156                 '.mac' => 'image/x-macpaint',
157                 '.man' => 'application/x-troff-man',
158                 '.mda' => 'application/msaccess',
159                 '.mdb' => 'application/msaccess',
160                 '.mde' => 'application/msaccess',
161                 '.mfp' => 'application/x-shockwave-flash',
162                 '.mht' => 'message/rfc822',
163                 '.mhtml' => 'message/rfc822',
164                 '.mid' => 'audio/mid',
165                 '.midi' => 'audio/mid',
166                 '.mod' => 'video/mpeg',
167                 '.mov' => 'video/quicktime',
168                 '.mp2' => 'video/mpeg',
169                 '.mp2v' => 'video/mpeg',
170                 '.mp3' => 'audio/mpeg',
171                 '.mp4' => 'video/mp4',
172                 '.mpa' => 'video/mpeg',
173                 '.mpe' => 'video/mpeg',
174                 '.mpeg' => 'video/mpeg',
175                 '.mpf' => 'application/vnd.ms-mediapackage',
176                 '.mpg' => 'video/mpeg',
177                 '.mpv2' => 'video/mpeg',
178                 '.mqv' => 'video/quicktime',
179                 '.NMW' => 'application/nmwb',
180                 '.nws' => 'message/rfc822',
181                 '.odc' => 'text/x-ms-odc',
182                 '.ols' => 'application/vnd.ms-publisher',
183                 '.p10' => 'application/pkcs10',
184                 '.p12' => 'application/x-pkcs12',
185                 '.p7b' => 'application/x-pkcs7-certificates',
186                 '.p7c' => 'application/pkcs7-mime',
187                 '.p7m' => 'application/pkcs7-mime',
188                 '.p7r' => 'application/x-pkcs7-certreqresp',
189                 '.p7s' => 'application/pkcs7-signature',
190                 '.pct' => 'image/pict',
191                 '.pdf' => 'application/pdf',
192                 '.pdx' => 'application/vnd.adobe.pdx',
193                 '.pfx' => 'application/x-pkcs12',
194                 '.pic' => 'image/pict',
195                 '.pict' => 'image/pict',
196                 '.pinstall' => 'application/x-picasa-detect',
197                 '.pko' => 'application/vnd.ms-pki.pko',
198                 '.png' => 'image/png',
199                 '.pnt' => 'image/x-macpaint',
200                 '.pntg' => 'image/x-macpaint',
201                 '.pot' => 'application/vnd.ms-powerpoint',
202                 '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
203                 '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
204                 '.ppa' => 'application/vnd.ms-powerpoint',
205                 '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
206                 '.pps' => 'application/vnd.ms-powerpoint',
207                 '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
208                 '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
209                 '.ppt' => 'application/vnd.ms-powerpoint',
210                 '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
211                 '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
212                 '.prf' => 'application/pics-rules',
213                 '.ps' => 'application/postscript',
214                 '.pub' => 'application/vnd.ms-publisher',
215                 '.pwz' => 'application/vnd.ms-powerpoint',
216                 '.py' => 'text/plain',
217                 '.pyw' => 'text/plain',
218                 '.qht' => 'text/x-html-insertion',
219                 '.qhtm' => 'text/x-html-insertion',
220                 '.qt' => 'video/quicktime',
221                 '.qti' => 'image/x-quicktime',
222                 '.qtif' => 'image/x-quicktime',
223                 '.qtl' => 'application/x-quicktimeplayer',
224                 '.rat' => 'application/rat-file',
225                 '.rmf' => 'application/vnd.adobe.rmf',
226                 '.rmi' => 'audio/mid',
227                 '.rqy' => 'text/x-ms-rqy',
228                 '.rtf' => 'application/msword',
229                 '.sct' => 'text/scriptlet',
230                 '.sd2' => 'audio/x-sd2',
231                 '.sdp' => 'application/sdp',
232                 '.shtml' => 'text/html',
233                 '.sit' => 'application/x-stuffit',
234                 '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
235                 '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
236                 '.slk' => 'application/vnd.ms-excel',
237                 '.snd' => 'audio/basic',
238                 '.so' => 'application/x-apachemodule',
239                 '.sol' => 'text/plain',
240                 '.sor' => 'text/plain',
241                 '.spc' => 'application/x-pkcs7-certificates',
242                 '.spl' => 'application/futuresplash',
243                 '.sst' => 'application/vnd.ms-pki.certstore',
244                 '.stl' => 'application/vnd.ms-pki.stl',
245                 '.swf' => 'application/x-shockwave-flash',
246                 '.thmx' => 'application/vnd.ms-officetheme',
247                 '.tif' => 'image/tiff',
248                 '.tiff' => 'image/tiff',
249                 '.txt' => 'text/plain',
250                 '.uls' => 'text/iuls',
251                 '.vcf' => 'text/x-vcard',
252                 '.vdx' => 'application/vnd.ms-visio.viewer',
253                 '.vsd' => 'application/vnd.ms-visio.viewer',
254                 '.vss' => 'application/vnd.ms-visio.viewer',
255                 '.vst' => 'application/vnd.ms-visio.viewer',
256                 '.vsx' => 'application/vnd.ms-visio.viewer',
257                 '.vtx' => 'application/vnd.ms-visio.viewer',
258                 '.wav' => 'audio/wav',
259                 '.wax' => 'audio/x-ms-wax',
260                 '.wbk' => 'application/msword',
261                 '.wdp' => 'image/vnd.ms-photo',
262                 '.wiz' => 'application/msword',
263                 '.wm' => 'video/x-ms-wm',
264                 '.wma' => 'audio/x-ms-wma',
265                 '.wmd' => 'application/x-ms-wmd',
266                 '.wmv' => 'video/x-ms-wmv',
267                 '.wmx' => 'video/x-ms-wmx',
268                 '.wmz' => 'application/x-ms-wmz',
269                 '.wpl' => 'application/vnd.ms-wpl',
270                 '.wsc' => 'text/scriptlet',
271                 '.wvx' => 'video/x-ms-wvx',
272                 '.xaml' => 'application/xaml+xml',
273                 '.xbap' => 'application/x-ms-xbap',
274                 '.xdp' => 'application/vnd.adobe.xdp+xml',
275                 '.xfdf' => 'application/vnd.adobe.xfdf',
276                 '.xht' => 'application/xhtml+xml',
277                 '.xhtml' => 'application/xhtml+xml',
278                 '.xla' => 'application/vnd.ms-excel',
279                 '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
280                 '.xlk' => 'application/vnd.ms-excel',
281                 '.xll' => 'application/vnd.ms-excel',
282                 '.xlm' => 'application/vnd.ms-excel',
283                 '.xls' => 'application/vnd.ms-excel',
284                 '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
285                 '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
286                 '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
287                 '.xlt' => 'application/vnd.ms-excel',
288                 '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
289                 '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
290                 '.xlw' => 'application/vnd.ms-excel',
291                 '.xml' => 'text/xml',
292                 '.xps' => 'application/vnd.ms-xpsdocument',
293                 '.xsl' => 'text/xml',
294         );
295
296         /** 
297          * IE versions which have been analysed to bring you this class, and for 
298          * which some substantive difference exists. These will appear as keys 
299          * in the return value of getRealMimesFromData(). The names are chosen to sort correctly.
300          */
301         protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' );
302
303         /**
304          * Type table with versions expanded 
305          */
306         protected $typeTable = array();
307
/**
 * Build the per-version type table.
 *
 * Versions are visited in the order given by $this->versions while a single
 * running copy of the base table is carried along, so any types added for
 * one version remain present for every version that follows it.
 */
public function __construct() {
        $cumulative = $this->baseTypeTable;
        foreach ( $this->versions as $ieVersion ) {
                // Versions without an entry in $addedTypes contribute nothing new
                $additions = isset( $this->addedTypes[$ieVersion] )
                        ? $this->addedTypes[$ieVersion]
                        : array();
                foreach ( $additions as $format => $extraTypes ) {
                        $cumulative[$format] = array_merge( $cumulative[$format], $extraTypes );
                }
                // Arrays are copied by value, so this is a snapshot for this version
                $this->typeTable[$ieVersion] = $cumulative;
        }
}
321
/**
 * Run the detection for every known IE version and translate each result
 * from IE's private MIME type names into conventional ones.
 *
 * @param string $fileName The file name (its extension is consulted as a last-resort fallback)
 * @param string $chunk Leading bytes of the file; only the first 255 are examined
 * @param string $proposed The MIME type proposed by the server
 *
 * @return array Map of IE version to detected (translated) MIME type
 */
public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
        // array_map() over a single array keeps the version keys intact
        return array_map(
                array( $this, 'translateMimeType' ),
                $this->getMimesFromData( $fileName, $chunk, $proposed )
        );
}
337
/**
 * Map one of IE's private MIME type names onto its more widely understood
 * equivalent.
 *
 * @param string $type MIME type as produced by the detection code
 *
 * @return string The conventional name when a mapping exists, otherwise
 *   $type unchanged
 */
public function translateMimeType( $type ) {
        static $ieToCommon = array(
                'image/pjpeg' => 'image/jpeg',
                'image/x-png' => 'image/png',
                'image/x-wmf' => 'application/x-msmetafile',
                'image/bmp' => 'image/x-bmp',
                'application/x-zip-compressed' => 'application/zip',
                'application/x-compressed' => 'application/x-compress',
                'application/x-gzip-compressed' => 'application/x-gzip',
                'audio/mid' => 'audio/midi',
        );
        return isset( $ieToCommon[$type] ) ? $ieToCommon[$type] : $type;
}
358
/**
 * Run the detection once per known IE version, leaving the results in IE's
 * own (untranslated) MIME type vocabulary.
 *
 * @param string $fileName The file name (its extension is consulted as a last-resort fallback)
 * @param string $chunk Leading bytes of the file; only the first 255 are examined
 * @param string $proposed The MIME type proposed by the server
 *
 * @return array Map of IE version to detected MIME type
 */
public function getMimesFromData( $fileName, $chunk, $proposed ) {
        $detected = array();
        foreach ( $this->versions as $ieVersion ) {
                $mime = $this->getMimeTypeForVersion( $ieVersion, $fileName, $chunk, $proposed );
                $detected[$ieVersion] = $mime;
        }
        return $detected;
}
375
/**
 * Get the MIME type that a given named IE version would settle on.
 *
 * Outline: strip parameters from the proposed type; return it untouched if
 * its format is 'unknown' (multipart types excepted) or if there is no data;
 * sample the data; honour the "proposed type agrees with the magic number"
 * shortcuts; try the markup/text detectors; apply the text-versus-binary
 * heuristic; and finally fall back to the file-extension registry.
 *
 * @param string $version One of the names listed in $this->versions
 * @param string $fileName The file name; only its extension is used, and
 *   only in the final registry fallback
 * @param string $chunk Leading bytes of the file; only the first 255 are examined
 * @param string $proposed The MIME type proposed by the server
 *
 * @return string The detected MIME type, in IE's own (untranslated) vocabulary
 */
protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
        // Strip text after a semicolon
        $semiPos = strpos( $proposed, ';' );
        if ( $semiPos !== false ) {
                $proposed = substr( $proposed, 0, $semiPos );
        }

        $proposedFormat = $this->getDataFormat( $version, $proposed );
        if ( $proposedFormat == 'unknown'
                && $proposed != 'multipart/mixed'
                && $proposed != 'multipart/x-mixed-replace' )
        {
                return $proposed;
        }
        if ( strval( $chunk ) === '' ) {
                return $proposed;
        }

        // Truncate chunk at 255 bytes
        $chunk = substr( $chunk, 0, 255 );

        // IE does the Check*Headers() calls last, and instead does the following image
        // type checks by directly looking for the magic numbers. What I do here should
        // have the same effect since the magic number checks are identical in both cases.
        $result = $this->sampleData( $version, $chunk );
        $sampleFound = $result['found'];
        $counters = $result['counters'];
        $binaryType = $this->checkBinaryHeaders( $version, $chunk );
        $textType = $this->checkTextHeaders( $version, $chunk );

        if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
                return 'text/html';
        }
        if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
                return 'image/gif';
        }
        if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
                && $binaryType == 'image/pjpeg' )
        {
                return $proposed;
        }
        // PNG check added in IE 7.
        // This is a plain string comparison; it works because the version
        // names were chosen to sort correctly.
        if ( $version >= 'ie07'
                && ( $proposed == 'image/x-png' || $proposed == 'image/png' )
                && $binaryType == 'image/x-png' )
        {
                return $proposed;
        }

        // CDF was removed in IE 7 so it won't be in $sampleFound for later versions
        if ( isset( $sampleFound['cdf'] ) ) {
                return 'application/x-cdf';
        }

        // RSS and Atom were added in IE 7 so they won't be in $sampleFound for
        // previous versions
        if ( isset( $sampleFound['rss'] ) ) {
                return 'application/rss+xml';
        }
        if ( isset( $sampleFound['rdf-tag'] )
                && isset( $sampleFound['rdf-url'] )
                && isset( $sampleFound['rdf-purl'] ) )
        {
                return 'application/rss+xml';
        }
        if ( isset( $sampleFound['atom'] ) ) {
                return 'application/atom+xml';
        }

        if ( isset( $sampleFound['xml'] ) ) {
                // TODO: I'm not sure under what circumstances this flag is enabled
                if ( strpos( $version, 'strict' ) !== false ) {
                        if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
                                return 'text/xml';
                        }
                } else {
                        return 'text/xml';
                }
        }
        if ( isset( $sampleFound['html'] ) ) {
                // TODO: I'm not sure under what circumstances this flag is enabled
                if ( strpos( $version, 'nohtml' ) !== false ) {
                        if ( $proposed == 'text/plain' ) {
                                return 'text/html';
                        }
                } else {
                        return 'text/html';
                }
        }
        if ( isset( $sampleFound['xbm'] ) ) {
                return 'image/x-bitmap';
        }
        if ( isset( $sampleFound['binhex'] ) ) {
                return 'application/macbinhex40';
        }
        if ( isset( $sampleFound['scriptlet'] ) ) {
                if ( strpos( $version, 'strict' ) !== false ) {
                        if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
                                return 'text/scriptlet';
                        }
                } else {
                        return 'text/scriptlet';
                }
        }

        // Freaky heuristics to determine if the data is text or binary
        // The heuristic is of course broken for non-ASCII text
        if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
                < ( $counters['ctrl'] + $counters['high'] ) * 16 )
        {
                // Looks binary: prefer a binary magic number over a text one
                $kindOfBinary = true;
                $type = $binaryType ? $binaryType : $textType;
                if ( $type === false ) {
                        $type = 'application/octet-stream';
                }
        } else {
                // Looks like text: prefer a text header over a binary magic number
                $kindOfBinary = false;
                $type = $textType ? $textType : $binaryType;
                if ( $type === false ) {
                        $type = 'text/plain';
                }
        }

        // Check if the output format is ambiguous
        // This generally means that detection failed, real types aren't ambiguous
        $detectedFormat = $this->getDataFormat( $version, $type );
        if ( $detectedFormat != 'ambiguous' ) {
                return $type;
        }

        if ( $proposedFormat != 'ambiguous' ) {
                // FormatAgreesWithData()
                if ( $proposedFormat == 'text' && !$kindOfBinary ) {
                        return $proposed;
                }
                if ( $proposedFormat == 'binary' && $kindOfBinary ) {
                        return $proposed;
                }
                if ( $proposedFormat == 'html' ) {
                        return $proposed;
                }
        }

        // Find a MIME type by searching the registry for the file extension.
        $dotPos = strrpos( $fileName, '.' );
        if ( $dotPos === false ) {
                return $type;
        }
        $ext = substr( $fileName, $dotPos );
        if ( isset( $this->registry[$ext] ) ) {
                return $this->registry[$ext];
        }
        // Windows registry key names are not case-sensitive, so "FOO.HTML" must
        // hit the ".html" entry (and ".nmw" the ".NMW" one). The exact lookup
        // above is only a fast path; fall back to a case-folded scan.
        $extLower = strtolower( $ext );
        foreach ( $this->registry as $regExt => $regType ) {
                if ( strtolower( $regExt ) === $extLower ) {
                        return $regType;
                }
        }

        // TODO: If the extension has an application registered to it, IE will return
        // application/octet-stream. We'll skip that, so we could erroneously
        // return text/plain or application/x-netcdf where application/octet-stream
        // would be correct.

        return $type;
}
539
/**
 * Look for one of the known text-format signatures at the very start of
 * the chunk. Confirmed same in 5 and 7.
 *
 * @param string $version IE version name (not consulted by this method)
 * @param string $chunk Data sample to inspect
 *
 * @return string|bool The matching MIME type, or false if nothing matched
 */
private function checkTextHeaders( $version, $chunk ) {
        // Signature => type, tested in this order; the first hit wins
        static $signatures = array(
                '%PDF' => 'application/pdf',
                '%!' => 'application/postscript',
                '{\\rtf' => 'text/richtext',
                'begin' => 'application/base64',
        );
        foreach ( $signatures as $magic => $mime ) {
                // A chunk shorter than the signature can never compare equal
                if ( strncmp( $chunk, $magic, strlen( $magic ) ) === 0 ) {
                        return $mime;
                }
        }
        return false;
}
562
/**
 * Look for one of the known binary magic numbers at the start of the chunk.
 * Confirmed same in 5 and 7.
 *
 * The checks run in a fixed order and the first match wins. The returned
 * names are IE's own, some of which are non-standard.
 *
 * @param string $version IE version name (not consulted by this method)
 * @param string $chunk Data sample to inspect
 *
 * @return string|bool The matching MIME type, or false if nothing matched
 */
private function checkBinaryHeaders( $version, $chunk ) {
        // Four bytes at offset 8, shared by the RIFF and FORM container checks
        $tagAt8 = substr( $chunk, 8, 4 );

        // GIF is the only signature that is matched case-insensitively
        $gifMagic = strtoupper( substr( $chunk, 0, 5 ) );
        if ( $gifMagic === 'GIF87' || $gifMagic === 'GIF89' ) {
                return 'image/gif';
        }
        if ( strncmp( $chunk, "\xff\xd8", 2 ) === 0 ) {
                return 'image/pjpeg'; // actually plain JPEG but this is what IE returns
        }

        // "BM" followed (at offset 6) by four zero bytes
        if ( strncmp( $chunk, 'BM', 2 ) === 0
                && substr( $chunk, 6, 4 ) === "\000\000\000\000" )
        {
                return 'image/bmp'; // another non-standard MIME
        }
        if ( strncmp( $chunk, 'RIFF', 4 ) === 0 && $tagAt8 === 'WAVE' ) {
                return 'audio/wav';
        }
        // These were integer literals in IE
        // Perhaps the author was not sure what the target endianness was
        $auMagics = array( ".sd\000", ".snd", "\000ds.", "dns." );
        if ( in_array( substr( $chunk, 0, 4 ), $auMagics, true ) ) {
                return 'audio/basic';
        }
        if ( strncmp( $chunk, "MM\000", 3 ) === 0 ) {
                return 'image/tiff';
        }
        if ( strncmp( $chunk, 'MZ', 2 ) === 0 ) {
                return 'application/x-msdownload';
        }
        if ( strncmp( $chunk, "\x89PNG\x0d\x0a\x1a\x0a", 8 ) === 0 ) {
                return 'image/x-png'; // [sic]
        }
        // "JG", then a third byte in 3..31 and a zero fifth byte
        if ( strlen( $chunk ) >= 5 && strncmp( $chunk, 'JG', 2 ) === 0 ) {
                $third = ord( $chunk[2] );
                if ( $third >= 3 && $third <= 31 && ord( $chunk[4] ) === 0 ) {
                        return 'image/x-jg';
                }
        }
        // More endian confusion?
        if ( strncmp( $chunk, 'MROF', 4 ) === 0 ) {
                return 'audio/x-aiff';
        }
        if ( strncmp( $chunk, 'FORM', 4 ) === 0
                && ( $tagAt8 === 'AIFF' || $tagAt8 === 'AIFC' ) )
        {
                return 'audio/x-aiff';
        }
        if ( strncmp( $chunk, 'RIFF', 4 ) === 0 && $tagAt8 === 'AVI ' ) {
                return 'video/avi';
        }
        if ( strncmp( $chunk, "\x00\x00\x01\xb3", 4 ) === 0
                || strncmp( $chunk, "\x00\x00\x01\xba", 4 ) === 0 )
        {
                return 'video/mpeg';
        }
        if ( strncmp( $chunk, "\001\000\000\000", 4 ) === 0
                && substr( $chunk, 40, 4 ) === ' EMF' )
        {
                return 'image/x-emf';
        }
        if ( strncmp( $chunk, "\xd7\xcd\xc6\x9a", 4 ) === 0 ) {
                return 'image/x-wmf';
        }
        if ( strncmp( $chunk, "\xca\xfe\xba\xbe", 4 ) === 0 ) {
                return 'application/java';
        }
        if ( strncmp( $chunk, 'PK', 2 ) === 0 ) {
                return 'application/x-zip-compressed';
        }
        if ( strncmp( $chunk, "\x1f\x9d", 2 ) === 0 ) {
                return 'application/x-compressed';
        }
        if ( strncmp( $chunk, "\x1f\x8b", 2 ) === 0 ) {
                return 'application/x-gzip-compressed';
        }
        // Skip redundant check for ZIP
        if ( strncmp( $chunk, "MThd\000", 5 ) === 0 ) {
                return 'audio/mid';
        }
        if ( strncmp( $chunk, '%PDF', 4 ) === 0 ) {
                return 'application/pdf';
        }
        return false;
}
660
661         /**
662          * Do heuristic checks on the bulk of the data sample.
663          * Search for HTML tags.
664          */
665         protected function sampleData( $version, $chunk ) {
666                 $found = array();
667                 $counters = array(
668                         'ctrl' => 0,
669                         'high' => 0,
670                         'low' => 0,
671                         'lf' => 0,
672                         'cr' => 0,
673                         'ff' => 0
674                 );
675                 $htmlTags = array(
676                         'html',
677                         'head',
678                         'title',
679                         'body',
680                         'script',
681                         'a href',
682                         'pre',
683                         'img',
684                         'plaintext',
685                         'table'
686                 );
687                 $rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
688                 $rdfPurl = 'http://purl.org/rss/1.0/';
689                 $xbmMagic1 = '#define';
690                 $xbmMagic2 = '_width';
691                 $xbmMagic3 = '_bits';
692                 $binhexMagic = 'converted with BinHex';
693
694                 for ( $offset = 0; $offset < strlen( $chunk ); $offset++ ) {
695                         $curChar = $chunk[$offset];
696                         if ( $curChar == "\x0a" ) {
697                                 $counters['lf']++;
698                                 continue;
699                         } elseif ( $curChar == "\x0d" ) {
700                                 $counters['cr']++;
701                                 continue;
702                         } elseif ( $curChar == "\x0c" ) {
703                                 $counters['ff']++;
704                                 continue;
705                         } elseif ( $curChar == "\t" ) {
706                                 $counters['low']++;
707                                 continue;
708                         } elseif ( ord( $curChar ) < 32 ) {
709                                 $counters['ctrl']++;
710                                 continue;
711                         } elseif ( ord( $curChar ) >= 128 ) {
712                                 $counters['high']++;
713                                 continue;
714                         }
715
716                         $counters['low']++;
717                         if ( $curChar == '<' ) {
718                                 // XML
719                                 $remainder = substr( $chunk, $offset + 1 );
720                                 if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
721                                         $nextChar = substr( $chunk, $offset + 5, 1 );
722                                         if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
723                                                 $found['xml'] = true;
724                                         }
725                                 }
726                                 // Scriptlet (JSP)
727                                 if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
728                                         $found['scriptlet'] = true;
729                                         break;
730                                 }
731                                 // HTML
732                                 foreach ( $htmlTags as $tag ) {
733                                         if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
734                                                 $found['html'] = true;
735                                         }
736                                 }
737                                 // Skip broken check for additional tags (HR etc.)
738
739                                 // CHANNEL replaced by RSS, RDF and FEED in IE 7
740                                 if ( $version < 'ie07' ) {
741                                         if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
742                                                 $found['cdf'] = true;
743                                         }
744                                 } else {
745                                         // RSS
746                                         if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
747                                                 $found['rss'] = true;
748                                                 break; // return from SampleData
749                                         }
750                                         if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
751                                                 $found['rdf-tag'] = true;
752                                                 // no break
753                                         }
754                                         if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
755                                                 $found['atom'] = true;
756                                                 break;
757                                         }
758                                 }
759                                 continue;
760                         }
761                         // Skip broken check for -->
762
763                         // RSS URL checks
764                         // For some reason both URLs must appear before it is recognised
765                         $remainder = substr( $chunk, $offset );
766                         if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
767                                 $found['rdf-url'] = true;
768                                 if ( isset( $found['rdf-tag'] )
769                                         && isset( $found['rdf-purl'] ) ) // [sic]
770                                 {
771                                         break;
772                                 }
773                                 continue;
774                         }
775
776                         if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
777                                 if ( isset( $found['rdf-tag'] ) 
778                                         && isset( $found['rdf-url'] ) ) // [sic]
779                                 {
780                                         break;
781                                 }
782                                 continue;
783                         }
784
785                         // XBM checks
786                         if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
787                                 $found['xbm1'] = true;
788                                 continue;
789                         }
790                         if ( $curChar == '_' ) {
791                                 if ( isset( $found['xbm2'] ) ) {
792                                         if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
793                                                 $found['xbm'] = true;
794                                                 break;
795                                         }
796                                 } elseif ( isset( $found['xbm1'] ) ) {
797                                         if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
798                                                 $found['xbm2'] = true;
799                                         }
800                                 }
801                         }
802
803                         // BinHex
804                         if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
805                                 $found['binhex'] = true;
806                         }
807                 }
808                 return array( 'found' => $found, 'counters' => $counters );
809         }
810
811         protected function getDataFormat( $version, $type ) {
812                 $types = $this->typeTable[$version];
813                 if ( $type == '(null)' || strval( $type ) === '' ) {
814                         return 'ambiguous';
815                 }
816                 foreach ( $types as $format => $list ) {
817                         if ( in_array( $type, $list ) ) {
818                                 return $format;
819                         }
820                 }
821                 return 'unknown';
822         }
823 }
824