]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - includes/media/IPTC.php
MediaWiki 1.30.2-scripts2
[autoinstalls/mediawiki.git] / includes / media / IPTC.php
1 <?php
2 /**
3  * Class for some IPTC functions.
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18  * http://www.gnu.org/copyleft/gpl.html
19  *
20  * @file
21  * @ingroup Media
22  */
23
24 /**
25  * Class for some IPTC functions.
26  *
27  * @ingroup Media
28  */
class IPTC {
	/**
	 * IIM datasets that are copied into the result unchanged (apart from
	 * charset conversion). Keys are the "record#dataset" names produced by
	 * iptcparse(); values are the metadata keys used in the returned array.
	 *
	 * @var string[]
	 */
	private static $simpleTags = [
		'2#120' => 'ImageDescription', // caption. Mapped with exif ImageDescription
		'2#116' => 'Copyright', // Mapped with exif Copyright
		'2#025' => 'Keywords',
		'2#101' => 'CountryDest', // country (shown)
		'2#095' => 'ProvinceOrStateDest', // state/province (shown)
		'2#090' => 'CityDest', // city (shown)
		'2#092' => 'SublocationDest', // sublocation (shown)
		'2#005' => 'ObjectName', // object name/title
		'2#040' => 'SpecialInstructions',
		'2#105' => 'Headline',
		// "Identifies the provider of the objectdata, not necessarily the owner/creator".
		'2#110' => 'Credit',
		// "Identifies the original owner of the intellectual content of the objectdata.
		// This could be an agency, a member of an agency or an individual."
		'2#115' => 'Source',
		'2#007' => 'EditStatus', // lead, correction, etc
		'2#015' => 'iimCategory', // deprecated. max 3 letters in theory, often more
		'2#020' => 'iimSupplementalCategory', // deprecated
		'2#010' => 'Urgency', // 1-8. 1 most, 5 normal, 8 low priority
		// "Identifies objectdata that recurs often and predictably... Example: Euroweather"
		'2#022' => 'FixtureIdentifier',
		// Content location code (iso 3166 + some custom things)
		// ex: TUR (for turkey), XUN (for UN), XSP (outer space)
		// See wikipedia article on iso 3166 and appendix D of iim std.
		'2#026' => 'LocationDestCode',
		'2#027' => 'LocationDest', // full printable name of location of photo
		'2#075' => 'ObjectCycle', // a for morning (am), p for evening, b for both
		// "Indicates the code of the country/primary location where the intellectual
		// property of the objectdata was created". Unclear how this differs from 2#026.
		'2#100' => 'CountryCodeDest',
		// "A code representing the location of original transmission according to
		// practises of the provider."
		'2#103' => 'OriginalTransmissionRef',
		'2#118' => 'Contact',
		// "Identification of the name of the person involved in the writing,
		// editing or correcting the objectdata or caption/abstract."
		'2#122' => 'Writer',
		'2#135' => 'LanguageCode',
	];

	/**
	 * IIM date datasets. Each maps to the dataset holding the matching time
	 * and to the key the combined timestamp is stored under.
	 *
	 * @var array[] date tag => [ time tag, result key ]
	 */
	private static $dateTags = [
		// Date created (not date digitized). Maps to exif DateTimeOriginal
		'2#055' => [ '2#060', 'DateTimeOriginal' ],
		// Date converted to digital representation. Maps to exif DateTimeDigitized
		'2#062' => [ '2#063', 'DateTimeDigitized' ],
		// Date released
		'2#030' => [ '2#035', 'DateTimeReleased' ],
		// Date expires
		'2#037' => [ '2#038', 'DateTimeExpires' ],
	];

	/**
	 * ISO 2022 escape sequences (value of tag 1:90) and the charset name
	 * each one selects. This just goes through the charsets mentioned in
	 * appendix C of the iim standard. \x1b = ESC.
	 *
	 * @var string[]
	 */
	private static $charsetEscapes = [
		"\x1b%G" => 'UTF-8',
		// Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
		"\x1b(B" => 'UTF-8', // ascii
		"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
		"\x1b(A" => 'ISO646-GB', // like ascii, but british.
		"\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finland encoding
		"\x1b(D" => 'ISO-IR-8-2',
		"\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norway encoding
		"\x1b(F" => 'ISO-IR-9-2',
		"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
		"\x1b(I" => "ISO646-IT",
		"\x1b(L" => "ISO646-PT",
		"\x1b(Z" => "ISO646-ES",
		"\x1b([" => "GREEK7-OLD",
		"\x1b(K" => "ISO646-DE",
		"\x1b(N" => "ISO_5427", // cyrillic
		"\x1b(`" => "NS_4551-1", // iso646-NO
		"\x1b(f" => "NF_Z_62-010", // iso646-FR
		"\x1b(g" => "PT2", // iso646-PT2
		"\x1b(h" => "ES2",
		"\x1b(i" => "MSZ_7795.3", // iso646-HU
		"\x1b(w" => "CSA_Z243.4-1985-1",
		"\x1b(x" => "CSA_Z243.4-1985-2",
		"\x1b\$(B" => "JIS_C6226-1983",
		"\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$(B" => "JIS_C6226-1983",
		// The iso-8859-x entries apply at least for the high code characters.
		"\x1b-A" => 'ISO-8859-1',
		"\x1b(@\x1b-A" => 'ISO-8859-1',
		"\x1b(B\x1b-A" => 'ISO-8859-1',
		"\x1b-B" => 'ISO-8859-2',
		"\x1b-C" => 'ISO-8859-3',
		"\x1b-D" => 'ISO-8859-4',
		"\x1b-E" => 'ISO-8859-5',
		"\x1b-F" => 'ISO-8859-6',
		"\x1b-G" => 'ISO-8859-7',
		"\x1b-H" => 'ISO-8859-8',
		"\x1b-I" => 'CSN_369103',
	];

	/**
	 * This takes the results of iptcparse() and puts it into a
	 * form that can be handled by mediawiki. Generally called from
	 * BitmapMetadataHandler::doApp13.
	 *
	 * Purposely does not do 2:125, 2:130, 2:131, 2:47, 2:50, 2:45, 2:42,
	 * 2:8, 2:3, 2:200, 2:201, 2:202 or the audio stuff (2:150 to 2:154).
	 *
	 * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
	 *
	 * @param string $rawData The app13 block from jpeg containing iptc/iim data
	 * @return array IPTC metadata array. Empty if the data cannot be parsed
	 *  or if tag 1:90 names a charset that is not recognised.
	 */
	public static function parse( $rawData ) {
		$parsed = iptcparse( $rawData );
		if ( !is_array( $parsed ) ) {
			return [];
		}

		$c = '';
		// charset info contained in tag 1:90.
		if ( isset( $parsed['1#090'][0] ) ) {
			$c = self::getCharset( $parsed['1#090'][0] );
			if ( $c === false ) {
				// Unknown charset. refuse to parse.
				// note: There is a difference between
				// unknown and no charset specified.
				return [];
			}
			unset( $parsed['1#090'] );
		}

		$data = [];
		foreach ( $parsed as $tag => $val ) {
			if ( isset( $val[0] ) && trim( $val[0] ) === '' ) {
				wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
				continue;
			}

			if ( isset( self::$simpleTags[$tag] ) ) {
				$data[self::$simpleTags[$tag]] = self::convIPTC( $val, $c );
				continue;
			}

			// Date stuff.
			// It doesn't accept incomplete dates even though they are valid
			// according to spec.
			// Should potentially store timezone as well.
			if ( isset( self::$dateTags[$tag] ) ) {
				list( $timeTag, $key ) = self::$dateTags[$tag];
				$time = isset( $parsed[$timeTag] ) ? $parsed[$timeTag] : [];
				$timestamp = self::timeHelper( $val, $time, $c );
				if ( $timestamp ) {
					$data[$key] = $timestamp;
				}
				continue;
			}

			switch ( $tag ) {
				case '2#080': /* byline. Mapped with exif Artist */
					/* merge with byline title (2:85)
					 * like how exif does it with
					 * Title, person. Not sure if this is best
					 * approach since we no longer have the two fields
					 * separate. each byline title entry corresponds to a
					 * specific byline.
					 */
					$bylines = self::convIPTC( $val, $c );
					if ( isset( $parsed['2#085'] ) ) {
						$titles = self::convIPTC( $parsed['2#085'], $c );
					} else {
						$titles = [];
					}

					$titleCount = count( $titles );
					for ( $i = 0; $i < $titleCount; $i++ ) {
						if ( isset( $bylines[$i] ) ) {
							// theoretically this should always be set
							// but doesn't hurt to be careful.
							$bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
						}
					}
					$data['Artist'] = $bylines;
					break;

				case '2#065':
					/* Originating Program.
					 * Combine with Program version (2:70) if present.
					 */
					$software = self::convIPTC( $val, $c );

					if ( count( $software ) !== 1 ) {
						// according to iim standard this cannot have multiple values
						// so if there is more than one, something weird is happening,
						// and we skip it.
						wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
						break;
					}

					if ( isset( $parsed['2#070'] ) ) {
						// if a version is set for the software.
						$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
						unset( $parsed['2#070'] );
						$data['Software'] = [ [ $software[0], $softwareVersion[0] ] ];
					} else {
						$data['Software'] = $software;
					}
					break;

				case '2#000': /* iim version */
					// unlike other tags, this is a 2-byte binary number.
					// technically this is required if there is iptc data
					// but in practise it isn't always there.
					if ( isset( $val[0] ) && strlen( $val[0] ) === 2 ) {
						// if is just to be paranoid.
						$versionValue = ord( substr( $val[0], 0, 1 ) ) * 256;
						$versionValue += ord( substr( $val[0], 1, 1 ) );
						$data['iimVersion'] = $versionValue;
					}
					break;

				case '2#004':
					// IntellectualGenre.
					// first 4 characters are an id code
					// That we're not really interested in.

					// This prop is weird, since it's
					// allowed to have multiple values
					// in iim 4.1, but not in the XMP
					// stuff. We're going to just
					// extract the first value.
					$con = self::convIPTC( $val, $c );
					if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
						wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
						break;
					}
					$data['IntellectualGenre'] = substr( $con[0], 4 );
					break;

				case '2#012':
					// Subject News code - this is a compound field
					// at the moment we only extract the subject news
					// code, which is an 8 digit (ascii) number
					// describing the subject matter of the content.
					$codes = self::convIPTC( $val, $c );
					foreach ( $codes as $ic ) {
						$fields = explode( ':', $ic, 3 );

						if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
							wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
							// Leaves the foreach: codes after an invalid one are not looked at.
							break;
						}
						// Each valid code overwrites the previous one, so the last wins.
						$data['SubjectNewsCode'] = $fields[1];
					}
					break;

				case '2#070':
				case '2#060':
				case '2#063':
				case '2#085':
				case '2#038':
				case '2#035':
					// ignore. Handled together with the tag they belong to.
					break;

				default:
					wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
					break;
			}
		}

		return $data;
	}

	/**
	 * Convert an iptc date and time tags into the exif format
	 *
	 * @todo Potentially this should also capture the timezone offset.
	 * @param array $date The date tag (expected to hold exactly one CCYYMMDD value)
	 * @param array $time The time tag (one HHMMSS±HHMM value), or an empty array
	 *  if only the date is known
	 * @param string $c The charset
	 * @return string|null Date in EXIF format (date part only when no usable
	 *  time tag was given), or null if the input could not be converted.
	 */
	private static function timeHelper( $date, $time, $c ) {
		if ( count( $date ) !== 1 ) {
			// the standard says this should always be 1
			// just double checking.
			return null;
		}
		// convIPTC() does not trim on every code path, and the substr() calls
		// below rely on fixed offsets, so normalise the value here.
		list( $date ) = self::convIPTC( $date, $c );
		$date = trim( $date );

		if ( count( $time ) === 1 ) {
			list( $time ) = self::convIPTC( $time, $c );
			$time = trim( $time );
			$dateOnly = false;
		} else {
			$time = '000000+0000'; // placeholder
			$dateOnly = true;
		}

		// Both patterns are anchored: a match anywhere else in the string
		// would make the fixed-offset substr() calls read the wrong characters.
		if ( !( preg_match( '/^\d{6}[-+]\d{4}$/', $time )
			&& preg_match( '/^\d{8}$/', $date )
			&& substr( $date, 0, 4 ) !== '0000'
			&& substr( $date, 4, 2 ) !== '00'
			&& substr( $date, 6, 2 ) !== '00'
		) ) {
			// something wrong.
			// Note, this rejects some valid dates according to iptc spec
			// for example: the date 00000400 means the photo was taken in
			// April, but the year and day is unknown. We don't process these
			// types of incomplete dates atm.
			wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );

			return null;
		}

		$unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
		if ( $unixTS === false ) {
			wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );

			return null;
		}

		// Offset in seconds, built from the HH and MM digits after the sign.
		$tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
			+ ( intval( substr( $time, 9, 2 ) ) * 60 );

		if ( substr( $time, 6, 1 ) === '-' ) {
			$tz = -$tz;
		}

		// NOTE(review): the offset is added to the parsed time rather than
		// subtracted from it; confirm this is the intended direction.
		$finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
		if ( $finalTimestamp === false ) {
			wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );

			return null;
		}

		// Without a time tag only the first 10 characters (the date) are returned.
		return $dateOnly ? substr( $finalTimestamp, 0, 10 ) : $finalTimestamp;
	}

	/**
	 * Helper function to convert charset for iptc values.
	 * @param string|array $data The iptc string, or an array of them
	 * @param string $charset The charset
	 *
	 * @return string|array Converted value; an array keeps its keys.
	 */
	private static function convIPTC( $data, $charset ) {
		if ( !is_array( $data ) ) {
			return self::convIPTCHelper( $data, $charset );
		}

		// Assign by key instead of iterating by reference, so that no
		// reference into the returned array is left behind.
		foreach ( $data as $key => $value ) {
			$data[$key] = self::convIPTCHelper( $value, $charset );
		}

		return $data;
	}

	/**
	 * Helper function of a helper function to convert charset for iptc values.
	 *
	 * Note that the result is trimmed when a conversion through iconv() took
	 * place, but returned as-is when no charset was given and the input was
	 * already valid utf-8.
	 *
	 * @param string $data The IPTC string
	 * @param string $charset The charset; an empty value means "not specified"
	 *
	 * @return string
	 */
	private static function convIPTCHelper( $data, $charset ) {
		if ( $charset ) {
			MediaWiki\suppressWarnings();
			$data = iconv( $charset, "UTF-8//IGNORE", $data );
			MediaWiki\restoreWarnings();
			if ( $data === false ) {
				$data = "";
				wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
			}
		} else {
			// treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
			// most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
			$oldData = $data;
			UtfNormal\Validator::quickIsNFCVerify( $data ); // make $data valid utf-8
			if ( $data === $oldData ) {
				return $data; // if validation didn't change $data
			} else {
				return self::convIPTCHelper( $oldData, 'Windows-1252' );
			}
		}

		return trim( $data );
	}

	/**
	 * take the value of 1:90 tag and returns a charset
	 *
	 * Warning, this function does not (and is not intended to) detect
	 * all iso 2022 escape codes. In practise, the code for utf-8 is the
	 * only code that seems to have wide use. It does detect that code.
	 *
	 * @param string $tag 1:90 tag.
	 * @return string|bool Charset name, or false if the escape sequence is
	 *  not recognised.
	 */
	public static function getCharset( $tag ) {
		// According to iim standard, charset is defined by the tag 1:90.
		// in which there are iso 2022 escape sequences to specify the character set.
		// the iim standard seems to encourage that all necessary escape sequences are
		// in the 1:90 tag, but says it doesn't have to be.

		// This is in need of more testing probably. This is definitely not complete.
		// however reading the docs of some other iptc software, it appears that most iptc software
		// only recognizes utf-8. If 1:90 tag is not present content is
		// usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.

		// This also won't work if there are more than one escape sequence in the 1:90 tag
		// or if something is put in the G2, or G3 charsets, etc. It will only reliably recognize utf-8.
		if ( isset( self::$charsetEscapes[$tag] ) ) {
			return self::$charsetEscapes[$tag];
		}

		wfDebugLog( 'iptc', __METHOD__ . ': Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
		// at this point just give up and refuse to parse iptc?
		return false;
	}
}