4 * Copyright © 2007 Xarax <jodeldi@gmx.de>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
22 use MediaWiki\Logger\LoggerFactory;
23 use UtfNormal\Validator;
26 * inspired by djvuimage from Brion Vibber
27 * modified and written by xarax
35 function __construct( $filename ) {
36 $this->mFilename = $filename;
42 public function isValid() {
49 public function getImageSize() {
50 $data = $this->retrieveMetadata();
51 $size = self::getPageSize( $data, 1 );
54 $width = $size['width'];
55 $height = $size['height'];
56 return [ $width, $height, 'Pdf',
57 "width=\"$width\" height=\"$height\"" ];
67 public static function getPageSize( $data, $page ) {
68 global $wgPdfHandlerDpi;
70 if ( isset( $data['pages'][$page]['Page size'] ) ) {
71 $o = $data['pages'][$page]['Page size'];
72 } elseif ( isset( $data['Page size'] ) ) {
73 $o = $data['Page size'];
79 if ( isset( $data['pages'][$page]['Page rot'] ) ) {
80 $r = $data['pages'][$page]['Page rot'];
81 } elseif ( isset( $data['Page rot'] ) ) {
82 $r = $data['Page rot'];
86 $size = explode( 'x', $o, 2 );
89 $width = intval( trim( $size[0] ) / 72 * $wgPdfHandlerDpi );
90 $height = explode( ' ', trim( $size[1] ), 2 );
91 $height = intval( trim( $height[0] ) / 72 * $wgPdfHandlerDpi );
92 if ( ( $r / 90 ) & 1 ) {
93 // Swap width and height for landscape pages
110 * @return array|bool|null
112 public function retrieveMetaData() {
113 global $wgPdfInfo, $wgPdftoText;
116 // Note in poppler 0.26 the -meta and page data options worked together,
117 // but as of poppler 0.48 they must be queried separately.
118 // https://bugs.freedesktop.org/show_bug.cgi?id=96801
119 $cmd = wfEscapeShellArg( $wgPdfInfo ) .
120 " -enc UTF-8 " . # Report metadata as UTF-8 text...
121 " -meta " . # Report XMP metadata
122 wfEscapeShellArg( $this->mFilename );
124 $resultMeta = wfShellExec( $cmd, $retval );
126 $cmd = wfEscapeShellArg( $wgPdfInfo ) .
127 " -enc UTF-8 " . # Report metadata as UTF-8 text...
128 " -l 9999999 " . # Report page sizes for all pages
129 wfEscapeShellArg( $this->mFilename );
131 $resultPages = wfShellExec( $cmd, $retval );
133 $dump = $resultMeta . $resultPages;
134 $data = $this->convertDumpToArray( $dump );
140 if ( isset( $wgPdftoText ) ) {
141 $cmd = wfEscapeShellArg( $wgPdftoText ) . ' '. wfEscapeShellArg( $this->mFilename ) . ' - ';
142 wfDebug( __METHOD__.": $cmd\n" );
144 $txt = wfShellExec( $cmd, $retval );
145 if ( $retval == 0 ) {
146 $txt = str_replace( "\r\n", "\n", $txt );
147 $pages = explode( "\f", $txt );
148 foreach ( $pages as $page => $pageText ) {
149 // Get rid of invalid UTF-8, strip control characters
150 // Note we need to do this per page, as the \f form feed (page separator) would otherwise be stripped.
151 $pages[$page] = Validator::cleanUp( $pageText );
153 $data['text'] = $pages;
160 * @param $dump string
163 protected function convertDumpToArray( $dump ) {
164 if ( strval( $dump ) == '' ) {
168 $lines = explode( "\n", $dump );
171 // Metadata is always the last item, and spans multiple lines.
174 // Basically this loop will go through each line, splitting key value
175 // pairs on the colon, until it gets to a "Metadata:\n" at which point
176 // it will gather all remaining lines into the xmp key.
177 foreach ( $lines as $line ) {
179 // Handle XMP differently due to difference in line breaks
180 $data['xmp'] .= "\n$line";
183 $bits = explode( ':', $line, 2 );
184 if ( count( $bits ) > 1 ) {
185 $key = trim( $bits[0] );
186 if ( $key === 'Metadata' ) {
191 $value = trim( $bits[1] );
193 // "Page xx rot" will be in poppler 0.20's pdfinfo output
194 // See https://bugs.freedesktop.org/show_bug.cgi?id=41867
195 if ( preg_match( '/^Page +(\d+) (size|rot)$/', $key, $matches ) ) {
196 $data['pages'][$matches[1]][$matches[2] == 'size' ? 'Page size' : 'Page rot'] = $value;
198 $data[$key] = $value;
202 $data = $this->postProcessDump( $data );
207 * Postprocess the metadata (convert xmp into useful form, etc)
209 * This is used to generate the metadata table at the bottom
210 * of the image description page.
212 * @param $data Array metadata
213 * @return Array post-processed metadata
215 protected function postProcessDump( array $data ) {
216 $meta = new BitmapMetadataHandler();
218 foreach ( $data as $key => $val ) {
221 $items['ObjectName'] = $val;
224 $items['ImageDescription'] = $val;
227 // Sometimes we have empty keywords. This seems
228 // to be a product of how pdfinfo deals with keywords
229 // with spaces in them. Filter such empty keywords
230 $keyList = array_filter( explode( ' ', $val ) );
231 if ( count( $keyList ) > 0 ) {
232 $items['Keywords'] = $keyList;
236 $items['Artist'] = $val;
239 // Program used to create file.
240 // Different from program used to convert to pdf.
241 $items['Software'] = $val;
244 // Conversion program
245 $items['pdf-Producer'] = $val;
248 $timestamp = wfTimestamp( TS_EXIF, $val );
250 // 'if' is just paranoia
251 $items['DateTime'] = $timestamp;
255 $timestamp = wfTimestamp( TS_EXIF, $val );
257 $items['DateTimeDigitized'] = $timestamp;
260 // These last two (version and encryption) I was unsure
261 // if we should include in the table, since they aren't
262 // all that useful to editors. I leaned on the side
263 // of including. However not including if file
264 // is optimized/linearized since that is really useless
267 $items['pdf-Version'] = $val;
270 // @todo: The value isn't i18n-ised. The appropriate
271 // place to do that is in FormatMetadata.php; we
272 // should add a hook there.
273 // For reference, if encrypted this field's value looks like:
274 // "yes (print:yes copy:no change:no addNotes:no)"
275 $items['pdf-Encrypted'] = $val;
277 // Note 'pages' and 'Pages' are different keys (!)
279 // A pdf document can have multiple sized pages in it.
280 // (However 95% of the time, all pages are the same size)
281 // get a list of all the unique page sizes in document.
282 // This doesn't do anything with rotation as of yet,
283 // mostly because I am unsure of what a good way to
284 // present that information to the user would be.
286 foreach ( $val as $page ) {
287 if ( isset( $page['Page size'] ) ) {
288 $pageSizes[$page['Page size']] = true;
292 $pageSizeArray = array_keys( $pageSizes );
293 if ( count( $pageSizeArray ) > 0 ) {
294 $items['pdf-PageSize'] = $pageSizeArray;
300 $meta->addMetadata( $items, 'native' );
302 if ( isset( $data['xmp'] ) && function_exists( 'xml_parser_create_ns' ) ) {
303 // function_exists() verifies that the xml extension required for XMPReader
304 // is present (it almost always is).
305 // @todo: This only handles generic xmp properties. Would be improved
306 // by handling pdf xmp properties (pdf and pdfx) via XMPInfo hook.
307 $xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ) );
308 $xmp->parse( $data['xmp'] );
309 $xmpRes = $xmp->getResults();
310 foreach ( $xmpRes as $type => $xmpSection ) {
311 $meta->addMetadata( $xmpSection, $type );
314 unset( $data['xmp'] );
315 $data['mergedMetadata'] = $meta->getMetadataArray();