]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - extensions/PdfHandler/PdfHandler.image.php
MediaWiki 1.30.2
[autoinstalls/mediawiki.git] / extensions / PdfHandler / PdfHandler.image.php
1 <?php
2 /**
3  *
4  * Copyright © 2007 Xarax <jodeldi@gmx.de>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License along
17  * with this program; if not, write to the Free Software Foundation, Inc.,
18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19  * http://www.gnu.org/copyleft/gpl.html
20  */
21
22 use MediaWiki\Logger\LoggerFactory;
23 use UtfNormal\Validator;
24
/**
 * Reads page dimensions, document metadata and the text layer of a PDF file
 * by shelling out to the command line tools configured in $wgPdfInfo and
 * $wgPdftoText.
 *
 * inspired by djvuimage from Brion Vibber
 * modified and written by xarax
 */
class PdfImage {

	/**
	 * Path of the PDF file that is passed to the external tools.
	 *
	 * Declared explicitly instead of being created on the fly by the
	 * constructor; kept public so code that already reads it keeps working.
	 *
	 * @var string
	 */
	public $mFilename;

	/**
	 * @param string $filename Path of the PDF file to inspect
	 */
	public function __construct( $filename ) {
		$this->mFilename = $filename;
	}

	/**
	 * No validation is performed; every file is reported as valid.
	 *
	 * @return bool Always true
	 */
	public function isValid() {
		return true;
	}

	/**
	 * Get the dimensions of the first page in a getimagesize()-like layout.
	 *
	 * @return array|bool [ width, height, 'Pdf', html attribute string ], or
	 *  false when no usable page size could be determined
	 */
	public function getImageSize() {
		$data = $this->retrieveMetaData();
		$size = self::getPageSize( $data, 1 );

		if ( !$size ) {
			return false;
		}

		$width = $size['width'];
		$height = $size['height'];

		return [
			$width,
			$height,
			'Pdf',
			"width=\"$width\" height=\"$height\""
		];
	}

	/**
	 * Compute the pixel size of a page from the parsed pdfinfo data.
	 *
	 * A per-page entry ($data['pages'][$page]) takes precedence over the
	 * document wide 'Page size' / 'Page rot' entries. The size value is
	 * expected to look like "612 x 792 pts (letter)"; both numbers are
	 * divided by 72 and multiplied by $wgPdfHandlerDpi.
	 *
	 * @param array|bool|null $data Metadata as returned by retrieveMetaData()
	 * @param int|string $page Page number used as key into $data['pages']
	 * @return array|bool [ 'width' => int, 'height' => int ], or false when
	 *  the size is missing or cannot be parsed
	 */
	public static function getPageSize( $data, $page ) {
		global $wgPdfHandlerDpi;

		if ( isset( $data['pages'][$page]['Page size'] ) ) {
			$rawSize = $data['pages'][$page]['Page size'];
		} elseif ( isset( $data['Page size'] ) ) {
			$rawSize = $data['Page size'];
		} else {
			return false;
		}

		if ( !is_string( $rawSize ) ) {
			return false;
		}

		// explode() never returns an empty array, so the element count has to
		// be checked explicitly: without an "x" there is no height component.
		$parts = explode( 'x', $rawSize, 2 );
		if ( count( $parts ) < 2 ) {
			return false;
		}

		$rawWidth = trim( $parts[0] );
		// The height is followed by the unit and paper name ("792 pts (letter)")
		$heightBits = explode( ' ', trim( $parts[1] ), 2 );
		$rawHeight = trim( $heightBits[0] );

		// Arithmetic on non-numeric strings warns on PHP 7 and throws on
		// PHP 8, so treat unparsable values as "size unknown".
		if ( !is_numeric( $rawWidth ) || !is_numeric( $rawHeight ) ) {
			return false;
		}

		if ( isset( $data['pages'][$page]['Page rot'] ) ) {
			$rotation = $data['pages'][$page]['Page rot'];
		} elseif ( isset( $data['Page rot'] ) ) {
			$rotation = $data['Page rot'];
		} else {
			$rotation = 0;
		}

		$width = intval( $rawWidth / 72 * $wgPdfHandlerDpi );
		$height = intval( $rawHeight / 72 * $wgPdfHandlerDpi );

		// An odd number of quarter turns (90, 270, ...) means the page is
		// displayed rotated, so width and height trade places.
		$quarterTurns = is_numeric( $rotation ) ? (int)( $rotation / 90 ) : 0;
		if ( $quarterTurns & 1 ) {
			$swap = $width;
			$width = $height;
			$height = $swap;
		}

		return [
			'width' => $width,
			'height' => $height
		];
	}

	/**
	 * Collect metadata and the text layer of the file.
	 *
	 * When $wgPdfInfo is set, the tool is run twice (once with -meta, once
	 * with -l to list the page sizes) and the combined output is parsed by
	 * convertDumpToArray(). When $wgPdftoText is set and the tool exits with
	 * status 0, its output is split on form feeds and stored per page under
	 * the 'text' key.
	 *
	 * @return array|bool|null Parsed data; null when $wgPdfInfo is not set and
	 *  no text was extracted, false when the pdfinfo output was empty and no
	 *  text was extracted
	 */
	public function retrieveMetaData() {
		global $wgPdfInfo, $wgPdftoText;

		if ( $wgPdfInfo ) {
			// Note in poppler 0.26 the -meta and page data options worked together,
			// but as of poppler 0.48 they must be queried separately.
			// https://bugs.freedesktop.org/show_bug.cgi?id=96801
			$cmd = wfEscapeShellArg( $wgPdfInfo ) .
				" -enc UTF-8 " . # Report metadata as UTF-8 text...
				" -meta " .      # Report XMP metadata
				wfEscapeShellArg( $this->mFilename );
			$retval = '';
			$resultMeta = wfShellExec( $cmd, $retval );

			$cmd = wfEscapeShellArg( $wgPdfInfo ) .
				" -enc UTF-8 " . # Report metadata as UTF-8 text...
				" -l 9999999 " .  # Report page sizes for all pages
				wfEscapeShellArg( $this->mFilename );
			$retval = '';
			$resultPages = wfShellExec( $cmd, $retval );

			$dump = $resultMeta . $resultPages;
			$data = $this->convertDumpToArray( $dump );
		} else {
			$data = null;
		}

		// Read text layer. A plain truthiness check (as used for $wgPdfInfo)
		// avoids shelling out with an empty command when the setting has been
		// disabled with false or ''.
		if ( $wgPdftoText ) {
			$cmd = wfEscapeShellArg( $wgPdftoText ) . ' '. wfEscapeShellArg( $this->mFilename ) . ' - ';
			wfDebug( __METHOD__.": $cmd\n" );
			$retval = '';
			$txt = wfShellExec( $cmd, $retval );
			if ( (int)$retval === 0 ) {
				$txt = str_replace( "\r\n", "\n", $txt );
				$pages = explode( "\f", $txt );
				foreach ( $pages as $page => $pageText ) {
					// Get rid of invalid UTF-8, strip control characters
					// Note we need to do this per page, as \f page feed would be stripped.
					$pages[$page] = Validator::cleanUp( $pageText );
				}
				// $data is null or false when no pdfinfo data is available;
				// start from an explicit array rather than relying on
				// implicit conversion when writing the 'text' key.
				if ( !is_array( $data ) ) {
					$data = [];
				}
				$data['text'] = $pages;
			}
		}
		return $data;
	}

	/**
	 * Parse the "Key: value" lines printed by pdfinfo into an array.
	 *
	 * "Page N size" / "Page N rot" lines are stored under
	 * $data['pages'][N]['Page size'|'Page rot']; everything following a
	 * "Metadata:" line is collected verbatim under 'xmp' before the result
	 * is handed to postProcessDump().
	 *
	 * @param string $dump Raw tool output
	 * @return array|bool Parsed and post-processed data, false for empty input
	 */
	protected function convertDumpToArray( $dump ) {
		if ( strval( $dump ) === '' ) {
			return false;
		}

		$lines = explode( "\n", $dump );
		$data = [];

		// Metadata is always the last item, and spans multiple lines.
		$inMetadata = false;

		// Basically this loop will go through each line, splitting key value
		// pairs on the colon, until it gets to a "Metadata:\n" at which point
		// it will gather all remaining lines into the xmp key.
		foreach ( $lines as $line ) {
			if ( $inMetadata ) {
				// Handle XMP differently due to difference in line break
				$data['xmp'] .= "\n$line";
				continue;
			}

			$bits = explode( ':', $line, 2 );
			if ( count( $bits ) < 2 ) {
				// Not a key/value line
				continue;
			}

			$key = trim( $bits[0] );
			if ( $key === 'Metadata' ) {
				$inMetadata = true;
				$data['xmp'] = '';
				continue;
			}

			$value = trim( $bits[1] );
			$matches = [];
			// "Page xx rot" will be in poppler 0.20's pdfinfo output
			// See https://bugs.freedesktop.org/show_bug.cgi?id=41867
			if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
				$pageKey = $matches[2] === 'size' ? 'Page size' : 'Page rot';
				$data['pages'][$matches[1]][$pageKey] = $value;
			} else {
				$data[$key] = $value;
			}
		}

		return $this->postProcessDump( $data );
	}

	/**
	 * Postprocess the metadata (convert xmp into useful form, etc)
	 *
	 * This is used to generate the metadata table at the bottom
	 * of the image description page.
	 *
	 * The 'xmp' key is removed from the returned array and a
	 * 'mergedMetadata' key holding the combined result is added.
	 *
	 * @param array $data Metadata as built by convertDumpToArray()
	 * @return array Post-processed metadata
	 */
	protected function postProcessDump( array $data ) {
		$meta = new BitmapMetadataHandler();
		$items = [];
		foreach ( $data as $key => $val ) {
			// Cast the key: PHP turns numeric-string array keys into integers,
			// and a loose switch comparison of integer 0 against the labels
			// below would match the first case on PHP 7.
			switch ( (string)$key ) {
				case 'Title':
					$items['ObjectName'] = $val;
					break;
				case 'Subject':
					$items['ImageDescription'] = $val;
					break;
				case 'Keywords':
					// Sometimes we have empty keywords. This seems
					// to be a product of how pdfinfo deals with keywords
					// with spaces in them. Filter such empty keywords
					// (only empty strings, so a literal "0" keyword is kept)
					// and reindex so the result is a plain list.
					$keyList = array_values( array_filter(
						explode( ' ', $val ),
						static function ( $keyword ) {
							return $keyword !== '';
						}
					) );
					if ( count( $keyList ) > 0 ) {
						$items['Keywords'] = $keyList;
					}
					break;
				case 'Author':
					$items['Artist'] = $val;
					break;
				case 'Creator':
					// Program used to create file.
					// Different from program used to convert to pdf.
					$items['Software'] = $val;
					break;
				case 'Producer':
					// Conversion program
					$items['pdf-Producer'] = $val;
					break;
				case 'ModTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						// 'if' is just paranoia
						$items['DateTime'] = $timestamp;
					}
					break;
				case 'CreationTime':
					$timestamp = wfTimestamp( TS_EXIF, $val );
					if ( $timestamp ) {
						$items['DateTimeDigitized'] = $timestamp;
					}
					break;
				// These last two (version and encryption) I was unsure
				// if we should include in the table, since they aren't
				// all that useful to editors. I leaned on the side
				// of including. However not including if file
				// is optimized/linearized since that is really useless
				// to an editor.
				case 'PDF version':
					$items['pdf-Version'] = $val;
					break;
				case 'Encrypted':
					// @todo: The value isn't i18n-ised. The appropriate
					// place to do that is in FormatMetadata.php;
					// a hook should be added there.
					// For reference, if encrypted this field's value looks like:
					// "yes (print:yes copy:no change:no addNotes:no)"
					$items['pdf-Encrypted'] = $val;
					break;
				// Note 'pages' and 'Pages' are different keys (!)
				case 'pages':
					// A pdf document can have multiple sized pages in it.
					// (However 95% of the time, all pages are the same size)
					// get a list of all the unique page sizes in document.
					// This doesn't do anything with rotation as of yet,
					// mostly because I am unsure of what a good way to
					// present that information to the user would be.
					$pageSizes = [];
					foreach ( $val as $page ) {
						if ( isset( $page['Page size'] ) ) {
							$pageSizes[$page['Page size']] = true;
						}
					}

					$pageSizeArray = array_keys( $pageSizes );
					if ( count( $pageSizeArray ) > 0 ) {
						$items['pdf-PageSize'] = $pageSizeArray;
					}
					break;
			}
		}
		$meta->addMetadata( $items, 'native' );

		if ( isset( $data['xmp'] ) && function_exists( 'xml_parser_create_ns' ) ) {
			// func exists verifies that the xml extension required for XMPReader
			// is present (Almost always is present)
			// @todo: This only handles generic xmp properties. Would be improved
			// by handling pdf xmp properties (pdf and pdfx) via XMPInfo hook.
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
			$xmp->parse( $data['xmp'] );
			$xmpRes = $xmp->getResults();
			foreach ( $xmpRes as $type => $xmpSection ) {
				$meta->addMetadata( $xmpSection, $type );
			}
		}
		unset( $data['xmp'] );
		$data['mergedMetadata'] = $meta->getMetadataArray();
		return $data;
	}
}