]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - includes/media/JpegMetadataExtractor.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / includes / media / JpegMetadataExtractor.php
1 <?php
2 /**
3  * Extraction of JPEG image metadata.
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18  * http://www.gnu.org/copyleft/gpl.html
19  *
20  * @file
21  * @ingroup Media
22  */
23
24 /**
25  * Class for reading jpegs and extracting metadata.
26  * see also BitmapMetadataHandler.
27  *
28  * Based somewhat on GIFMetadataExtractor.
29  *
30  * @ingroup Media
31  */
class JpegMetadataExtractor {
        /**
         * Sanity limit on the number of segments scanned in a single file.
         * A jpeg file should never even remotely have that many segments;
         * your average file has about 10.
         */
        const MAX_JPEG_SEGMENTS = 200;

        /**
         * Extract the metadata segments of interest from a jpeg file.
         * Based on GIFMetadataExtractor.
         *
         * We can almost use getimagesize to do this, but gis doesn't support
         * having multiple app1 segments, so it can't extract xmp on files
         * containing both exif and xmp data.
         *
         * The returned array always has the list-valued keys 'XMP_ext', 'COM'
         * and 'PSIR'. The keys 'XMP' (string) and 'byteOrder' ('BE' or 'LE')
         * are only present when a matching APP1 segment was found.
         *
         * Scanning stops at the first EOI or SOS marker. The file handle is
         * closed on every exit path, including when an exception is thrown.
         *
         * @param string $filename Name of jpeg file
         * @return array Array of interesting segments.
         * @throws MWException If given invalid file.
         */
        public static function segmentSplitter( $filename ) {
                $showXMP = XMPReader::isSupported();

                $segmentCount = 0;

                $segments = [
                        'XMP_ext' => [],
                        'COM' => [],
                        'PSIR' => [],
                ];

                if ( !$filename ) {
                        throw new MWException( "No filename specified for " . __METHOD__ );
                }
                if ( !file_exists( $filename ) || is_dir( $filename ) ) {
                        throw new MWException( "Invalid file $filename passed to " . __METHOD__ );
                }

                $fh = fopen( $filename, "rb" );

                if ( !$fh ) {
                        throw new MWException( "Could not open file $filename" );
                }

                try {
                        $buffer = fread( $fh, 2 );
                        if ( $buffer !== "\xFF\xD8" ) {
                                throw new MWException( "Not a jpeg, no SOI" );
                        }
                        while ( !feof( $fh ) ) {
                                $buffer = fread( $fh, 1 );
                                $segmentCount++;
                                if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
                                        // this is just a sanity check
                                        throw new MWException( 'Too many jpeg segments. Aborting' );
                                }
                                while ( $buffer !== "\xFF" && !feof( $fh ) ) {
                                        // In theory JPEG files are not allowed to contain anything between
                                        // the sections, but in practice they sometimes do. It's customary to
                                        // ignore the garbage data. The feof() check is required: at end of
                                        // file fread() never yields 0xFF, so without it a truncated file
                                        // would keep this loop spinning forever.
                                        $buffer = fread( $fh, 1 );
                                }

                                $buffer = fread( $fh, 1 );
                                while ( $buffer === "\xFF" && !feof( $fh ) ) {
                                        // Skip through any 0xFF padding bytes.
                                        $buffer = fread( $fh, 1 );
                                }

                                if ( $buffer === '' || $buffer === false ) {
                                        // Ran out of data before a marker byte could be read;
                                        // fall through to the "unexpected end" error below.
                                        break;
                                }

                                if ( $buffer === "\xFE" ) {
                                        // COM section -- file comment
                                        $comment = self::decodeComment( self::jpegExtractMarker( $fh ) );
                                        if ( $comment !== null ) {
                                                $segments["COM"][] = $comment;
                                        } else {
                                                wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage.\n" );
                                        }
                                } elseif ( $buffer === "\xE1" ) {
                                        // APP1 section (Exif, XMP, and XMP extended)
                                        // XMP is only extracted if XMP support is enabled.
                                        $temp = self::jpegExtractMarker( $fh );
                                        // check what type of app segment this is.
                                        if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                                                $segments["XMP"] = substr( $temp, 29 );
                                        } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
                                                $segments["XMP_ext"][] = substr( $temp, 35 );
                                        } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                                                // Some images (especially flickr images) seem to have this
                                                // malformed identifier. Accept it anyway.
                                                $segments["XMP"] = substr( $temp, 29 );
                                                wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
                                                        . "Using anyways.\n" );
                                        } elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
                                                // Just need to find out what the byte order is.
                                                // This is a II for little Endian, MM for big. Not a unicode BOM.
                                                $byteOrderMarker = substr( $temp, 6, 2 );
                                                if ( $byteOrderMarker === 'MM' ) {
                                                        $segments['byteOrder'] = 'BE';
                                                } elseif ( $byteOrderMarker === 'II' ) {
                                                        $segments['byteOrder'] = 'LE';
                                                } else {
                                                        wfDebug( __METHOD__ . " Invalid byte ordering?!\n" );
                                                }
                                        }
                                } elseif ( $buffer === "\xED" ) {
                                        // APP13 - PSIR. IPTC and some photoshop stuff
                                        $temp = self::jpegExtractMarker( $fh );
                                        if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
                                                $segments["PSIR"][] = $temp;
                                        }
                                } elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
                                        // EOI - end of image or SOS - start of scan.
                                        // Either way we're past any interesting segments.
                                        return $segments;
                                } else {
                                        // segment we don't care about, so skip
                                        $size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
                                        if ( $size['int'] < 2 ) {
                                                throw new MWException( "invalid marker size in jpeg" );
                                        }
                                        // The stored size includes the two size bytes just read.
                                        fseek( $fh, $size['int'] - 2, SEEK_CUR );
                                }
                        }
                } finally {
                        // Release the handle whether we returned normally or threw.
                        fclose( $fh );
                }

                // shouldn't get here for a well-formed file.
                throw new MWException( "Reached end of jpeg file unexpectedly" );
        }

        /**
         * Turn the raw payload of a COM segment into a UTF-8 comment string.
         *
         * The trimmed payload is first checked as UTF-8. If the validator had
         * to alter it, it is assumed to be windows-1252 and converted. A string
         * that still does not validate afterwards is probably binary junk or
         * some really weird encoding, so it is rejected.
         *
         * @param string $raw Payload of the COM segment
         * @return string|null The comment, or null if it should be ignored.
         */
        private static function decodeComment( $raw ) {
                $comment = $verified = trim( $raw );
                // quickIsNFCVerify() rewrites its argument into valid utf-8,
                // thus if nothing changed the input already was utf-8.
                UtfNormal\Validator::quickIsNFCVerify( $verified );
                if ( $verified === $comment ) {
                        return $comment;
                }

                MediaWiki\suppressWarnings();
                $comment = iconv( 'windows-1252', 'UTF-8//IGNORE', $comment );
                MediaWiki\restoreWarnings();
                if ( $comment === false ) {
                        // Conversion failed outright; nothing usable to keep.
                        return null;
                }

                // Try it again on the converted text.
                $verified = $comment;
                UtfNormal\Validator::quickIsNFCVerify( $verified );

                return $verified === $comment ? $comment : null;
        }

        /**
         * Helper function for segmentSplitter.
         *
         * Reads the two byte big-endian segment size at the current position
         * of the handle, then reads and returns the rest of that segment.
         * The stored size counts the two size bytes themselves.
         *
         * @param resource &$fh File handle for JPEG file
         * @throws MWException If the size is below 2 or the file ends early.
         * @return string Data content of segment.
         */
        private static function jpegExtractMarker( &$fh ) {
                $size = wfUnpack( "nint", fread( $fh, 2 ), 2 );
                $payloadLength = $size['int'] - 2;
                if ( $payloadLength < 0 ) {
                        throw new MWException( "invalid marker size in jpeg" );
                }
                if ( $payloadLength === 0 ) {
                        // fread( ..., 0 ) generates a warning
                        return '';
                }
                $segment = fread( $fh, $payloadLength );
                if ( strlen( $segment ) !== $payloadLength ) {
                        throw new MWException( "Segment shorter than expected" );
                }

                return $segment;
        }

        /**
         * This reads the photoshop image resource.
         * Currently it only compares the iptc/iim hash
         * with the stored hash, which is used to determine the precedence
         * of the iptc data. In future it may extract some other info, like
         * url of copyright license.
         *
         * This should generally be called by BitmapMetadataHandler::doApp13()
         *
         * @param string $app13 Photoshop psir app13 block from jpg.
         * @throws MWException (It gets caught next level up though)
         * @return string If the iptc hash is good or not. One of 'iptc-no-hash',
         *   'iptc-good-hash', 'iptc-bad-hash'.
         */
        public static function doPSIR( $app13 ) {
                if ( !$app13 ) {
                        throw new MWException( "No App13 segment given" );
                }
                // First compare hash with real thing
                // 0x404 contains IPTC, 0x425 has hash
                // This is used to determine if the iptc is newer than
                // the xmp data, as xmp programs update the hash,
                // where non-xmp programs don't.

                $offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
                $appLen = strlen( $app13 );
                $realHash = "";
                $recordedHash = "";

                // the +12 is the length of an empty item:
                // 4 (signature) + 2 (id) + 2 (empty padded name) + 4 (data length).
                while ( $offset + 12 <= $appLen ) {
                        // It's supposed to be 8BIM, but apparently sometimes isn't,
                        // esp. in really old jpg's. Such records are skipped over
                        // without being interpreted.
                        $valid = substr( $app13, $offset, 4 ) === '8BIM';
                        $offset += 4;

                        // id is a 2 byte id number which identifies
                        // the piece of info this record contains.
                        $id = substr( $app13, $offset, 2 );
                        $offset += 2;

                        // some record types can contain a name, which
                        // is a pascal string 0-padded to be an even
                        // number of bytes. Most times (and any time
                        // we care) this is empty, making it two null bytes.
                        // We never use the name so skip it. +1 for length byte
                        $lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
                        if ( $lenName % 2 === 1 ) {
                                // pad to even.
                                $lenName++;
                        }
                        $offset += $lenName;

                        // now length of data (unsigned long big endian)
                        $lenData = wfUnpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
                        // PHP can take issue with very large unsigned ints and make them negative.
                        // Which should never ever happen, as this has to be inside a segment
                        // which is limited to a 16 bit number.
                        if ( $lenData['len'] < 0 ) {
                                throw new MWException( "Too big PSIR (" . $lenData['len'] . ')' );
                        }

                        $offset += 4; // 4bytes length field;

                        // this should not happen, but check.
                        if ( $lenData['len'] + $offset > $appLen ) {
                                throw new MWException( "PSIR data too long. (item length=" . $lenData['len']
                                        . "; offset=$offset; total length=$appLen)" );
                        }

                        if ( $valid ) {
                                switch ( $id ) {
                                        case "\x04\x04":
                                                // IPTC block: hash what is actually in the file.
                                                $realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
                                                break;
                                        case "\x04\x25":
                                                // Hash of the IPTC block as recorded by the writing program.
                                                $recordedHash = substr( $app13, $offset, $lenData['len'] );
                                                break;
                                }
                        }

                        // if odd, add 1 to length to account for
                        // null pad byte.
                        if ( $lenData['len'] % 2 === 1 ) {
                                $lenData['len']++;
                        }
                        $offset += $lenData['len'];
                }

                if ( !$realHash || !$recordedHash ) {
                        return 'iptc-no-hash';
                } elseif ( $realHash === $recordedHash ) {
                        return 'iptc-good-hash';
                } else { /*$realHash !== $recordedHash */
                        return 'iptc-bad-hash';
                }
        }
}