X-Git-Url: https://scripts.mit.edu/gitweb/autoinstallsdev/mediawiki.git/blobdiff_plain/19e297c21b10b1b8a3acad5e73fc71dcb35db44a..6932310fd58ebef145fa01eb76edf7150284d8ea:/includes/media/DjVu.php diff --git a/includes/media/DjVu.php b/includes/media/DjVu.php index cc3f1db5..aae66d37 100644 --- a/includes/media/DjVu.php +++ b/includes/media/DjVu.php @@ -1,39 +1,95 @@ getSize() > static::EXPENSIVE_SIZE_LIMIT; + } + + /** + * @param File $file + * @return bool + */ + public function isMultiPage( $file ) { + return true; + } + + /** + * @return array + */ + public function getParamMap() { + return [ 'img_width' => 'width', 'img_page' => 'page', - ); + ]; } - function validateParam( $name, $value ) { - if ( in_array( $name, array( 'width', 'height', 'page' ) ) ) { + /** + * @param string $name + * @param mixed $value + * @return bool + */ + public function validateParam( $name, $value ) { + if ( $name === 'page' && trim( $value ) !== (string)intval( $value ) ) { + // Extra junk on the end of page, probably actually a caption + // e.g. [[File:Foo.djvu|thumb|Page 3 of the document shows foo]] + return false; + } + if ( in_array( $name, [ 'width', 'height', 'page' ] ) ) { if ( $value <= 0 ) { return false; } else { @@ -44,87 +100,144 @@ class DjVuHandler extends ImageHandler { } } - function makeParamString( $params ) { + /** + * @param array $params + * @return bool|string + */ + public function makeParamString( $params ) { $page = isset( $params['page'] ) ? $params['page'] : 1; if ( !isset( $params['width'] ) ) { return false; } + return "page{$page}-{$params['width']}px"; } - function parseParamString( $str ) { + /** + * @param string $str + * @return array|bool + */ + public function parseParamString( $str ) { $m = false; if ( preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) { - return array( 'width' => $m[2], 'page' => $m[1] ); + return [ 'width' => $m[2], 'page' => $m[1] ]; } else { return false; } } + /** + * @param array $params + * @return array + */ function getScriptParams( $params ) { - return array( + return [ 'width' => $params['width'], 'page' => $params['page'], - ); + ]; } + /** + * @param File $image + * @param string $dstPath + * @param string $dstUrl + * @param array $params + * @param int $flags + * @return MediaTransformError|ThumbnailImage|TransformParameterError + */ function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) { global $wgDjvuRenderer, $wgDjvuPostProcessor; - // Fetch XML and check it, to give a more informative error message than the one which - // normaliseParams will inevitably give. - $xml = $image->getMetadata(); - if ( !$xml ) { - return new MediaTransformError( 'thumbnail_error', @$params['width'], @$params['height'], - wfMsg( 'djvu_no_xml' ) ); - } - if ( !$this->normaliseParams( $image, $params ) ) { return new TransformParameterError( $params ); } $width = $params['width']; $height = $params['height']; - $srcPath = $image->getPath(); $page = $params['page']; - if ( $page > $this->pageCount( $image ) ) { - return new MediaTransformError( 'thumbnail_error', $width, $height, wfMsg( 'djvu_page_error' ) ); - } if ( $flags & self::TRANSFORM_LATER ) { - return new ThumbnailImage( $image, $dstUrl, $width, $height, $dstPath, $page ); + $params = [ + 'width' => $width, + 'height' => $height, + 'page' => $page + ]; + + return new ThumbnailImage( $image, $dstUrl, $dstPath, $params ); + } + + if ( !wfMkdirParents( dirname( $dstPath ), null, __METHOD__ ) ) { + return new MediaTransformError( + 'thumbnail_error', + $width, + $height, + wfMessage( 'thumbnail_dest_directory' ) + ); } - if ( !wfMkdirParents( dirname( $dstPath ) ) ) { - return new MediaTransformError( 'thumbnail_error', $width, $height, wfMsg( 'thumbnail_dest_directory' ) ); + // Get local copy source for shell scripts + // Thumbnail extraction is very inefficient for large files. + // Provide a way to pool count limit the number of downloaders. + if ( $image->getSize() >= 1e7 ) { // 10MB + $work = new PoolCounterWorkViaCallback( 'GetLocalFileCopy', sha1( $image->getName() ), + [ + 'doWork' => function () use ( $image ) { + return $image->getLocalRefPath(); + } + ] + ); + $srcPath = $work->execute(); + } else { + $srcPath = $image->getLocalRefPath(); + } + + if ( $srcPath === false ) { // Failed to get local copy + wfDebugLog( 'thumbnail', + sprintf( 'Thumbnail failed on %s: could not get local copy of "%s"', + wfHostname(), $image->getName() ) ); + + return new MediaTransformError( 'thumbnail_error', + $params['width'], $params['height'], + wfMessage( 'filemissing' ) + ); } # Use a subshell (brackets) to aggregate stderr from both pipeline commands # before redirecting it to the overall stdout. This works in both Linux and Windows XP. - $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " . - wfEscapeShellArg( $srcPath ); + $cmd = '(' . wfEscapeShellArg( + $wgDjvuRenderer, + "-format=ppm", + "-page={$page}", + "-size={$params['physicalWidth']}x{$params['physicalHeight']}", + $srcPath ); if ( $wgDjvuPostProcessor ) { $cmd .= " | {$wgDjvuPostProcessor}"; } - $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1'; - wfProfileIn( 'ddjvu' ); - wfDebug( __METHOD__.": $cmd\n" ); + $cmd .= ' > ' . wfEscapeShellArg( $dstPath ) . ') 2>&1'; + wfDebug( __METHOD__ . ": $cmd\n" ); $retval = ''; $err = wfShellExec( $cmd, $retval ); - wfProfileOut( 'ddjvu' ); $removed = $this->removeBadFile( $dstPath, $retval ); if ( $retval != 0 || $removed ) { - wfDebugLog( 'thumbnail', - sprintf( 'thumbnail failed on %s: error %d "%s" from "%s"', - wfHostname(), $retval, trim($err), $cmd ) ); + $this->logErrorForExternalProcess( $retval, $err, $cmd ); return new MediaTransformError( 'thumbnail_error', $width, $height, $err ); } else { - return new ThumbnailImage( $image, $dstUrl, $width, $height, $dstPath, $page ); + $params = [ + 'width' => $width, + 'height' => $height, + 'page' => $page + ]; + + return new ThumbnailImage( $image, $dstUrl, $dstPath, $params ); } } /** * Cache an instance of DjVuImage in an Image object, return that instance + * + * @param File|FSFile $image + * @param string $path + * @return DjVuImage */ function getDjVuImage( $image, $path ) { if ( !$image ) { @@ -134,52 +247,106 @@ class DjVuHandler extends ImageHandler { } else { $deja = $image->dejaImage; } + return $deja; } + /** + * Get metadata, unserializing it if neccessary. + * + * @param File $file The DjVu file in question + * @return string XML metadata as a string. + * @throws MWException + */ + private function getUnserializedMetadata( File $file ) { + $metadata = $file->getMetadata(); + if ( substr( $metadata, 0, 3 ) === 'dejaMetaTree ) ) { + public function getMetaTree( $image, $gettext = false ) { + if ( $gettext && isset( $image->djvuTextTree ) ) { + return $image->djvuTextTree; + } + if ( !$gettext && isset( $image->dejaMetaTree ) ) { return $image->dejaMetaTree; } - $metadata = $image->getMetadata(); + $metadata = $this->getUnserializedMetadata( $image ); if ( !$this->isMetadataValid( $image, $metadata ) ) { wfDebug( "DjVu XML metadata is invalid or missing, should have been fixed in upgradeRow\n" ); + return false; } - wfProfileIn( __METHOD__ ); - wfSuppressWarnings(); + $trees = $this->extractTreesFromMetadata( $metadata ); + $image->djvuTextTree = $trees['TextTree']; + $image->dejaMetaTree = $trees['MetaTree']; + + if ( $gettext ) { + return $image->djvuTextTree; + } else { + return $image->dejaMetaTree; + } + } + + /** + * Extracts metadata and text trees from metadata XML in string form + * @param string $metadata XML metadata as a string + * @return array + */ + protected function extractTreesFromMetadata( $metadata ) { + MediaWiki\suppressWarnings(); try { // Set to false rather than null to avoid further attempts - $image->dejaMetaTree = false; - $image->djvuTextTree = false; - $tree = new SimpleXMLElement( $metadata ); - if( $tree->getName() == 'mw-djvu' ) { - foreach($tree->children() as $b){ - if( $b->getName() == 'DjVuTxt' ) { - $image->djvuTextTree = $b; - } - else if ( $b->getName() == 'DjVuXML' ) { - $image->dejaMetaTree = $b; + $metaTree = false; + $textTree = false; + $tree = new SimpleXMLElement( $metadata, LIBXML_PARSEHUGE ); + if ( $tree->getName() == 'mw-djvu' ) { + /** @var SimpleXMLElement $b */ + foreach ( $tree->children() as $b ) { + if ( $b->getName() == 'DjVuTxt' ) { + // @todo File::djvuTextTree and File::dejaMetaTree are declared + // dynamically. Add a public File::$data to facilitate this? + $textTree = $b; + } elseif ( $b->getName() == 'DjVuXML' ) { + $metaTree = $b; } } } else { - $image->dejaMetaTree = $tree; + $metaTree = $tree; } - } catch( Exception $e ) { - wfDebug( "Bogus multipage XML metadata on '$image->name'\n" ); - } - wfRestoreWarnings(); - wfProfileOut( __METHOD__ ); - if( $gettext ) { - return $image->djvuTextTree; - } else { - return $image->dejaMetaTree; + } catch ( Exception $e ) { + wfDebug( "Bogus multipage XML metadata\n" ); } + MediaWiki\restoreWarnings(); + + return [ 'MetaTree' => $metaTree, 'TextTree' => $textTree ]; } function getImageSize( $image, $path ) { @@ -193,12 +360,20 @@ class DjVuHandler extends ImageHandler { $magic = MimeMagic::singleton(); $mime = $magic->guessTypesForExtension( $wgDjvuOutputExtension ); } - return array( $wgDjvuOutputExtension, $mime ); + + return [ $wgDjvuOutputExtension, $mime ]; } function getMetadata( $image, $path ) { wfDebug( "Getting DjVu metadata for $path\n" ); - return $this->getDjVuImage( $image, $path )->retrieveMetaData(); + + $xml = $this->getDjVuImage( $image, $path )->retrieveMetaData(); + if ( $xml === false ) { + // Special value so that we don't repetitively try and decode a broken file. + return serialize( [ 'error' => 'Error extracting metadata' ] ); + } else { + return serialize( [ 'xml' => $xml ] ); + } } function getMetadataType( $image ) { @@ -206,48 +381,84 @@ class DjVuHandler extends ImageHandler { } function isMetadataValid( $image, $metadata ) { - return !empty( $metadata ) && $metadata != serialize(array()); + return !empty( $metadata ) && $metadata != serialize( [] ); } - function pageCount( $image ) { - $tree = $this->getMetaTree( $image ); - if ( !$tree ) { - return false; + function pageCount( File $image ) { + $info = $this->getDimensionInfo( $image ); + + return $info ? $info['pageCount'] : false; + } + + function getPageDimensions( File $image, $page ) { + $index = $page - 1; // MW starts pages at 1 + + $info = $this->getDimensionInfo( $image ); + if ( $info && isset( $info['dimensionsByPage'][$index] ) ) { + return $info['dimensionsByPage'][$index]; } - return count( $tree->xpath( '//OBJECT' ) ); + + return false; } - function getPageDimensions( $image, $page ) { - $tree = $this->getMetaTree( $image ); - if ( !$tree ) { + protected function getDimensionInfo( File $file ) { + $cache = ObjectCache::getMainWANInstance(); + return $cache->getWithSetCallback( + $cache->makeKey( 'file-djvu', 'dimensions', $file->getSha1() ), + $cache::TTL_INDEFINITE, + function () use ( $file ) { + $tree = $this->getMetaTree( $file ); + return $this->getDimensionInfoFromMetaTree( $tree ); + }, + [ 'pcTTL' => $cache::TTL_INDEFINITE ] + ); + } + + /** + * Given an XML metadata tree, returns dimension information about the document + * @param bool|SimpleXMLElement $metatree The file's XML metadata tree + * @return bool|array + */ + protected function getDimensionInfoFromMetaTree( $metatree ) { + if ( !$metatree ) { return false; } - $o = $tree->BODY[0]->OBJECT[$page-1]; - if ( $o ) { - return array( - 'width' => intval( $o['width'] ), - 'height' => intval( $o['height'] ) - ); - } else { - return false; + $dimsByPage = []; + $count = count( $metatree->xpath( '//OBJECT' ) ); + for ( $i = 0; $i < $count; $i++ ) { + $o = $metatree->BODY[0]->OBJECT[$i]; + if ( $o ) { + $dimsByPage[$i] = [ + 'width' => (int)$o['width'], + 'height' => (int)$o['height'], + ]; + } else { + $dimsByPage[$i] = false; + } } + + return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ]; } - function getPageText( $image, $page ){ + /** + * @param File $image + * @param int $page Page number to get information for + * @return bool|string Page text or false when no text found. + */ + function getPageText( File $image, $page ) { $tree = $this->getMetaTree( $image, true ); if ( !$tree ) { return false; } - $o = $tree->BODY[0]->PAGE[$page-1]; + $o = $tree->BODY[0]->PAGE[$page - 1]; if ( $o ) { $txt = $o['value']; + return $txt; } else { return false; } - } - }