]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - includes/media/DjVuImage.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / includes / media / DjVuImage.php
1 <?php
2 /**
3  * DjVu image handler.
4  *
5  * Copyright © 2006 Brion Vibber <brion@pobox.com>
6  * https://www.mediawiki.org/
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License along
19  * with this program; if not, write to the Free Software Foundation, Inc.,
20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21  * http://www.gnu.org/copyleft/gpl.html
22  *
23  * @file
24  * @ingroup Media
25  */
26
27 /**
28  * Support for detecting/validating DjVu image files and getting
29  * some basic file metadata (resolution etc)
30  *
31  * File format docs are available in source package for DjVuLibre:
32  * http://djvulibre.djvuzone.org/
33  *
34  * @ingroup Media
35  */
class DjVuImage {
	/**
	 * @const DJVUTXT_MEMORY_LIMIT Memory limit for the DjVu description software
	 */
	const DJVUTXT_MEMORY_LIMIT = 300000;

	/**
	 * Path of the DjVu file this object inspects.
	 *
	 * Declared explicitly so it is no longer created as a dynamic property;
	 * kept public because the implicitly created property was public.
	 *
	 * @var string
	 */
	public $mFilename;

	/**
	 * @param string $filename The DjVu file name.
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * Check if the given file is indeed a valid DjVu image file
	 * @return bool
	 */
	public function isValid() {
		return $this->getInfo() !== false;
	}

	/**
	 * Return data in the style of getimagesize()
	 * @return array|bool Array or false on failure
	 */
	public function getImageSize() {
		$data = $this->getInfo();
		if ( $data === false ) {
			return false;
		}

		$width = $data['width'];
		$height = $data['height'];

		return [ $width, $height, 'DjVu',
			"width=\"$width\" height=\"$height\"" ];
	}

	// ---------

	/**
	 * For debugging; dump the IFF chunk structure to standard output.
	 *
	 * Prints a one-line diagnostic instead of the structure when the file
	 * cannot be opened or is too short to hold the 12-byte leading header.
	 */
	public function dump() {
		$file = fopen( $this->mFilename, 'rb' );
		if ( $file === false ) {
			echo "Could not open file\n";

			return;
		}

		$header = fread( $file, 12 );
		if ( strlen( $header ) < 12 ) {
			// unpack() would warn and return false on a short header
			echo "File too short for an IFF header\n";
		} else {
			$arr = unpack( 'a4magic/a4chunk/NchunkLength', $header );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo "$chunk $chunkLength\n";
			$this->dumpForm( $file, $chunkLength, 1 );
		}
		fclose( $file );
	}

	/**
	 * Print the chunks contained in one FORM, recursing into nested FORMs.
	 *
	 * @param resource $file Open file handle positioned at the FORM's secondary id
	 * @param int $length Declared length of the FORM contents
	 * @param int $indent Nesting depth; each level indents the output by 4 spaces
	 */
	private function dumpForm( $file, $length, $indent ) {
		$start = ftell( $file );
		$secondary = fread( $file, 4 );
		$prefix = str_repeat( ' ', $indent * 4 );
		echo $prefix . "($secondary)\n";
		while ( ftell( $file ) - $start < $length ) {
			$chunkHeader = fread( $file, 8 );
			if ( strlen( $chunkHeader ) < 8 ) {
				// End of file, or a truncated chunk header that cannot be unpacked
				break;
			}
			$arr = unpack( 'a4chunk/NchunkLength', $chunkHeader );
			$chunk = $arr['chunk'];
			$chunkLength = $arr['chunkLength'];
			echo $prefix . "$chunk $chunkLength\n";

			if ( $chunk == 'FORM' ) {
				$this->dumpForm( $file, $chunkLength, $indent + 1 );
			} else {
				$this->skipChunk( $file, $chunkLength );
			}
		}
	}

	/**
	 * Read the file header and return basic information about the first page.
	 *
	 * @return array|bool Array with the keys 'width', 'height', 'version',
	 *  'resolution' and 'gamma', or false if the file cannot be opened or
	 *  is not recognized as a DjVu file
	 */
	public function getInfo() {
		MediaWiki\suppressWarnings();
		$file = fopen( $this->mFilename, 'rb' );
		MediaWiki\restoreWarnings();
		if ( $file === false ) {
			wfDebug( __METHOD__ . ": missing or failed file read\n" );

			return false;
		}

		$header = fread( $file, 16 );
		$info = false;

		if ( strlen( $header ) < 16 ) {
			wfDebug( __METHOD__ . ": too short file header\n" );
		} else {
			$arr = unpack( 'a4magic/a4form/NformLength/a4subtype', $header );

			$subtype = $arr['subtype'];
			if ( $arr['magic'] != 'AT&T' ) {
				wfDebug( __METHOD__ . ": not a DjVu file\n" );
			} elseif ( $subtype == 'DJVU' ) {
				// Single-page document
				$info = $this->getPageInfo( $file );
			} elseif ( $subtype == 'DJVM' ) {
				// Multi-page document
				$info = $this->getMultiPageInfo( $file, $arr['formLength'] );
			} else {
				wfDebug( __METHOD__ . ": unrecognized DJVU file type '{$arr['subtype']}'\n" );
			}
		}
		fclose( $file );

		return $info;
	}

	/**
	 * Read an 8-byte chunk header (4-byte id, 32-bit big-endian length).
	 *
	 * @param resource $file Open file handle
	 * @return array [ chunk id, length ], or [ false, 0 ] if fewer than
	 *  8 bytes could be read
	 */
	private function readChunk( $file ) {
		$header = fread( $file, 8 );
		if ( strlen( $header ) < 8 ) {
			return [ false, 0 ];
		}

		$arr = unpack( 'a4chunk/Nlength', $header );

		return [ $arr['chunk'], $arr['length'] ];
	}

	/**
	 * Seek past the body of a chunk, including the padding byte that
	 * follows a chunk of odd length.
	 *
	 * @param resource $file Open file handle positioned at the chunk body
	 * @param int $chunkLength Number of body bytes to skip
	 */
	private function skipChunk( $file, $chunkLength ) {
		fseek( $file, $chunkLength, SEEK_CUR );

		// Parenthesized on purpose: '==' binds tighter than '&' in PHP
		if ( ( $chunkLength & 1 ) === 1 && !feof( $file ) ) {
			// padding byte
			fseek( $file, 1, SEEK_CUR );
		}
	}

	/**
	 * Find the first FORM:DJVU page of a multi-page document and return
	 * its information.
	 *
	 * @param resource $file Open file handle positioned after the DJVM subtype
	 * @param int $formLength Declared length of the enclosing FORM
	 * @return array|bool Page information as returned by getPageInfo(),
	 *  or false if no page was found
	 */
	private function getMultiPageInfo( $file, $formLength ) {
		// For now, we'll just look for the first page in the file
		// and report its information, hoping others are the same size.
		$start = ftell( $file );
		do {
			list( $chunk, $length ) = $this->readChunk( $file );
			if ( !$chunk ) {
				break;
			}

			if ( $chunk == 'FORM' ) {
				$subtype = fread( $file, 4 );
				if ( $subtype == 'DJVU' ) {
					wfDebug( __METHOD__ . ": found first subpage\n" );

					return $this->getPageInfo( $file );
				}
				// The 4 subtype bytes just read count towards the chunk length
				$this->skipChunk( $file, $length - 4 );
			} else {
				wfDebug( __METHOD__ . ": skipping '$chunk' chunk\n" );
				$this->skipChunk( $file, $length );
			}
		} while ( $length != 0 && !feof( $file ) && ftell( $file ) - $start < $formLength );

		wfDebug( __METHOD__ . ": multi-page DJVU file contained no pages\n" );

		return false;
	}

	/**
	 * Parse the INFO chunk expected at the current file position.
	 *
	 * @param resource $file Open file handle positioned at the INFO chunk header
	 * @return array|bool Array with the keys 'width', 'height', 'version',
	 *  'resolution' and 'gamma', or false if the chunk is missing, shorter
	 *  than 9 bytes or cut off
	 */
	private function getPageInfo( $file ) {
		list( $chunk, $length ) = $this->readChunk( $file );
		if ( $chunk != 'INFO' ) {
			wfDebug( __METHOD__ . ": expected INFO chunk, got '$chunk'\n" );

			return false;
		}

		if ( $length < 9 ) {
			wfDebug( __METHOD__ . ": INFO should be 9 or 10 bytes, found $length\n" );

			return false;
		}
		$data = fread( $file, $length );
		if ( strlen( $data ) < $length ) {
			wfDebug( __METHOD__ . ": INFO chunk cut off\n" );

			return false;
		}

		// width and height are unpacked big-endian ('n'), resolution little-endian ('v')
		$arr = unpack(
			'nwidth/' .
			'nheight/' .
			'Cminor/' .
			'Cmajor/' .
			'vresolution/' .
			'Cgamma', $data );

		# Newer files have rotation info in byte 10, but we don't use it yet.

		return [
			'width' => $arr['width'],
			'height' => $arr['height'],
			'version' => "{$arr['major']}.{$arr['minor']}",
			'resolution' => $arr['resolution'],
			'gamma' => $arr['gamma'] / 10.0 ];
	}

	/**
	 * Return an XML string describing the DjVu image
	 *
	 * The structure comes from $wgDjvuDump if set, otherwise from
	 * $wgDjvuToXML. If $wgDjvuTxt is set as well and a structure document
	 * was produced, the text layer is appended and both parts are wrapped
	 * in an <mw-djvu> element.
	 *
	 * @return string|bool|null XML string; false if the file is not a valid
	 *  DjVu file or the djvudump output could not be converted; null if
	 *  neither $wgDjvuDump nor $wgDjvuToXML is set
	 */
	public function retrieveMetaData() {
		global $wgDjvuToXML, $wgDjvuDump, $wgDjvuTxt;

		if ( !$this->isValid() ) {
			return false;
		}

		if ( isset( $wgDjvuDump ) ) {
			# djvudump is faster as of version 3.5
			# https://sourceforge.net/p/djvu/bugs/71/
			$cmd = wfEscapeShellArg( $wgDjvuDump ) . ' ' . wfEscapeShellArg( $this->mFilename );
			$dump = wfShellExec( $cmd );
			$xml = $this->convertDumpToXML( $dump );
		} elseif ( isset( $wgDjvuToXML ) ) {
			$cmd = wfEscapeShellArg( $wgDjvuToXML ) . ' --without-anno --without-text ' .
				wfEscapeShellArg( $this->mFilename );
			$xml = wfShellExec( $cmd );
		} else {
			$xml = null;
		}

		# Text layer
		# Only added when there is a base document to wrap: splicing it into
		# a null or false $xml would produce an unbalanced </mw-djvu> fragment
		# and hide the failure value from the caller.
		if ( isset( $wgDjvuTxt ) && is_string( $xml ) && $xml !== '' ) {
			$cmd = wfEscapeShellArg( $wgDjvuTxt ) . ' --detail=page ' . wfEscapeShellArg( $this->mFilename );
			wfDebug( __METHOD__ . ": $cmd\n" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval, [], [ 'memory' => self::DJVUTXT_MEMORY_LIMIT ] );
			if ( $retval == 0 ) {
				# Strip some control characters
				$txt = preg_replace( "/[\013\035\037]/", "", $txt );
				$reg = <<<EOR
					/\(page\s[\d-]*\s[\d-]*\s[\d-]*\s[\d-]*\s*"
					((?>    # Text to match is composed of atoms of either:
						\\\\. # - any escaped character
						|     # - any character different from " and \
						[^"\\\\]+
					)*?)
					"\s*\)
					| # Or page can be empty ; in this case, djvutxt dumps ()
					\(\s*()\)/sx
EOR;
				$txt = preg_replace_callback( $reg, [ $this, 'pageTextCallback' ], $txt );
				$txt = "<DjVuTxt>\n<HEAD></HEAD>\n<BODY>\n" . $txt . "</BODY>\n</DjVuTxt>\n";
				$xml = preg_replace( "/<DjVuXML>/", "<mw-djvu><DjVuXML>", $xml, 1 );
				$xml = $xml . $txt . '</mw-djvu>';
			}
		}

		return $xml;
	}

	/**
	 * preg_replace_callback() handler turning one page of djvutxt output
	 * into a <PAGE> element.
	 *
	 * @param array $matches Regex matches; index 1 holds the page text
	 * @return string
	 */
	public function pageTextCallback( $matches ) {
		# Get rid of invalid UTF-8, strip control characters
		$val = htmlspecialchars( UtfNormal\Validator::cleanUp( stripcslashes( $matches[1] ) ) );
		# "\xEF\xBF\xBD" is U+FFFD (replacement character) encoded as UTF-8
		$val = str_replace( [ "\n", "\xEF\xBF\xBD" ], [ '&#10;', '' ], $val );

		return '<PAGE value="' . $val . '" />';
	}

	/**
	 * Hack to temporarily work around djvutoxml bug
	 *
	 * Builds a DjVuXML document from the text output of djvudump.
	 *
	 * @param string $dump
	 * @return string|bool XML string, or false if the dump is empty,
	 *  describes an indirect multi-page document, or contains no page
	 *  with a parsable INFO line
	 */
	public function convertDumpToXML( $dump ) {
		if ( strval( $dump ) == '' ) {
			return false;
		}

		$xml = <<<EOT
<?xml version="1.0" ?>
<!DOCTYPE DjVuXML PUBLIC "-//W3C//DTD DjVuXML 1.1//EN" "pubtext/DjVuXML-s.dtd">
<DjVuXML>
<HEAD></HEAD>
<BODY>
EOT;

		$dump = str_replace( "\r", '', $dump );
		// strtok() keeps its position between calls; parseFormDjvu() continues
		// from the same tokenizer state.
		$line = strtok( $dump, "\n" );
		$m = false;
		$good = false;
		if ( preg_match( '/^( *)FORM:DJVU/', $line, $m ) ) {
			# Single-page
			if ( $this->parseFormDjvu( $line, $xml ) ) {
				$good = true;
			} else {
				return false;
			}
		} elseif ( preg_match( '/^( *)FORM:DJVM/', $line, $m ) ) {
			# Multi-page
			$parentLevel = strlen( $m[1] );
			# Find DIRM
			$line = strtok( "\n" );
			while ( $line !== false ) {
				$childLevel = strspn( $line, ' ' );
				if ( $childLevel <= $parentLevel ) {
					# End of chunk
					break;
				}

				if ( preg_match( '/^ *DIRM.*indirect/', $line ) ) {
					wfDebug( "Indirect multi-page DjVu document, bad for server!\n" );

					return false;
				}
				if ( preg_match( '/^ *FORM:DJVU/', $line ) ) {
					# Found page
					if ( $this->parseFormDjvu( $line, $xml ) ) {
						$good = true;
					} else {
						return false;
					}
				}
				$line = strtok( "\n" );
			}
		}
		if ( !$good ) {
			return false;
		}

		$xml .= "</BODY>\n</DjVuXML>\n";

		return $xml;
	}

	/**
	 * Consume the lines of one FORM:DJVU section from the current strtok()
	 * stream and append an <OBJECT> element for its INFO line to $xml.
	 *
	 * @param string $line The FORM:DJVU line that opens the section
	 * @param string &$xml XML being built; appended to on success
	 * @return bool True if an INFO line was found and converted
	 */
	public function parseFormDjvu( $line, &$xml ) {
		$parentLevel = strspn( $line, ' ' );
		$line = strtok( "\n" );

		# Find INFO
		while ( $line !== false ) {
			$childLevel = strspn( $line, ' ' );
			if ( $childLevel <= $parentLevel ) {
				# End of chunk
				break;
			}

			if ( preg_match(
				'/^ *INFO *\[\d*\] *DjVu *(\d+)x(\d+), *\w*, *(\d+) *dpi, *gamma=([0-9.-]+)/',
				$line,
				$m
			) ) {
				$xml .= Xml::tags(
					'OBJECT',
					[
						# 'data' => '',
						# 'type' => 'image/x.djvu',
						'height' => $m[2],
						'width' => $m[1],
						# 'usemap' => '',
					],
					"\n" .
						Xml::element( 'PARAM', [ 'name' => 'DPI', 'value' => $m[3] ] ) . "\n" .
						Xml::element( 'PARAM', [ 'name' => 'GAMMA', 'value' => $m[4] ] ) . "\n"
				) . "\n";

				return true;
			}
			$line = strtok( "\n" );
		}

		# Not found
		return false;
	}
}