]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - includes/export/XmlDumpWriter.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / includes / export / XmlDumpWriter.php
1 <?php
2 /**
3  * XmlDumpWriter
4  *
5  * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com>
6  * https://www.mediawiki.org/
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License along
19  * with this program; if not, write to the Free Software Foundation, Inc.,
20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21  * http://www.gnu.org/copyleft/gpl.html
22  *
23  * @file
24  */
25
26 /**
27  * @ingroup Dump
28  */
29 class XmlDumpWriter {
30         /**
31          * Opens the XML output stream's root "<mediawiki>" element.
32          * This does not include an xml directive, so is safe to include
33          * as a subelement in a larger XML stream. Namespace and XML Schema
34          * references are included.
35          *
36          * Output will be encoded in UTF-8.
37          *
38          * @return string
39          */
40         function openStream() {
41                 global $wgContLang;
42                 $ver = WikiExporter::schemaVersion();
43                 return Xml::element( 'mediawiki', [
44                         'xmlns'              => "http://www.mediawiki.org/xml/export-$ver/",
45                         'xmlns:xsi'          => "http://www.w3.org/2001/XMLSchema-instance",
46                         /*
47                          * When a new version of the schema is created, it needs staging on mediawiki.org.
48                          * This requires a change in the operations/mediawiki-config git repo.
49                          *
50                          * Create a changeset like https://gerrit.wikimedia.org/r/#/c/149643/ in which
51                          * you copy in the new xsd file.
52                          *
53                          * After it is reviewed, merged and deployed (sync-docroot), the index.html needs purging.
54                          * echo "https://www.mediawiki.org/xml/index.html" | mwscript purgeList.php --wiki=aawiki
55                          */
56                         'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
57                                 "http://www.mediawiki.org/xml/export-$ver.xsd",
58                         'version'            => $ver,
59                         'xml:lang'           => $wgContLang->getHtmlCode() ],
60                         null ) .
61                         "\n" .
62                         $this->siteInfo();
63         }
64
65         /**
66          * @return string
67          */
68         function siteInfo() {
69                 $info = [
70                         $this->sitename(),
71                         $this->dbname(),
72                         $this->homelink(),
73                         $this->generator(),
74                         $this->caseSetting(),
75                         $this->namespaces() ];
76                 return "  <siteinfo>\n    " .
77                         implode( "\n    ", $info ) .
78                         "\n  </siteinfo>\n";
79         }
80
81         /**
82          * @return string
83          */
84         function sitename() {
85                 global $wgSitename;
86                 return Xml::element( 'sitename', [], $wgSitename );
87         }
88
89         /**
90          * @return string
91          */
92         function dbname() {
93                 global $wgDBname;
94                 return Xml::element( 'dbname', [], $wgDBname );
95         }
96
97         /**
98          * @return string
99          */
100         function generator() {
101                 global $wgVersion;
102                 return Xml::element( 'generator', [], "MediaWiki $wgVersion" );
103         }
104
105         /**
106          * @return string
107          */
108         function homelink() {
109                 return Xml::element( 'base', [], Title::newMainPage()->getCanonicalURL() );
110         }
111
112         /**
113          * @return string
114          */
115         function caseSetting() {
116                 global $wgCapitalLinks;
117                 // "case-insensitive" option is reserved for future
118                 $sensitivity = $wgCapitalLinks ? 'first-letter' : 'case-sensitive';
119                 return Xml::element( 'case', [], $sensitivity );
120         }
121
122         /**
123          * @return string
124          */
125         function namespaces() {
126                 global $wgContLang;
127                 $spaces = "<namespaces>\n";
128                 foreach ( $wgContLang->getFormattedNamespaces() as $ns => $title ) {
129                         $spaces .= '      ' .
130                                 Xml::element( 'namespace',
131                                         [
132                                                 'key' => $ns,
133                                                 'case' => MWNamespace::isCapitalized( $ns ) ? 'first-letter' : 'case-sensitive',
134                                         ], $title ) . "\n";
135                 }
136                 $spaces .= "    </namespaces>";
137                 return $spaces;
138         }
139
140         /**
141          * Closes the output stream with the closing root element.
142          * Call when finished dumping things.
143          *
144          * @return string
145          */
146         function closeStream() {
147                 return "</mediawiki>\n";
148         }
149
150         /**
151          * Opens a "<page>" section on the output stream, with data
152          * from the given database row.
153          *
154          * @param object $row
155          * @return string
156          */
157         public function openPage( $row ) {
158                 $out = "  <page>\n";
159                 $title = Title::makeTitle( $row->page_namespace, $row->page_title );
160                 $out .= '    ' . Xml::elementClean( 'title', [], self::canonicalTitle( $title ) ) . "\n";
161                 $out .= '    ' . Xml::element( 'ns', [], strval( $row->page_namespace ) ) . "\n";
162                 $out .= '    ' . Xml::element( 'id', [], strval( $row->page_id ) ) . "\n";
163                 if ( $row->page_is_redirect ) {
164                         $page = WikiPage::factory( $title );
165                         $redirect = $page->getRedirectTarget();
166                         if ( $redirect instanceof Title && $redirect->isValidRedirectTarget() ) {
167                                 $out .= '    ';
168                                 $out .= Xml::element( 'redirect', [ 'title' => self::canonicalTitle( $redirect ) ] );
169                                 $out .= "\n";
170                         }
171                 }
172
173                 if ( $row->page_restrictions != '' ) {
174                         $out .= '    ' . Xml::element( 'restrictions', [],
175                                 strval( $row->page_restrictions ) ) . "\n";
176                 }
177
178                 Hooks::run( 'XmlDumpWriterOpenPage', [ $this, &$out, $row, $title ] );
179
180                 return $out;
181         }
182
183         /**
184          * Closes a "<page>" section on the output stream.
185          *
186          * @access private
187          * @return string
188          */
189         function closePage() {
190                 return "  </page>\n";
191         }
192
193         /**
194          * Dumps a "<revision>" section on the output stream, with
195          * data filled in from the given database row.
196          *
197          * @param object $row
198          * @return string
199          * @access private
200          */
201         function writeRevision( $row ) {
202                 $out = "    <revision>\n";
203                 $out .= "      " . Xml::element( 'id', null, strval( $row->rev_id ) ) . "\n";
204                 if ( isset( $row->rev_parent_id ) && $row->rev_parent_id ) {
205                         $out .= "      " . Xml::element( 'parentid', null, strval( $row->rev_parent_id ) ) . "\n";
206                 }
207
208                 $out .= $this->writeTimestamp( $row->rev_timestamp );
209
210                 if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_USER ) ) {
211                         $out .= "      " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
212                 } else {
213                         $out .= $this->writeContributor( $row->rev_user, $row->rev_user_text );
214                 }
215
216                 if ( isset( $row->rev_minor_edit ) && $row->rev_minor_edit ) {
217                         $out .= "      <minor/>\n";
218                 }
219                 if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_COMMENT ) ) {
220                         $out .= "      " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
221                 } else {
222                         $comment = CommentStore::newKey( 'rev_comment' )->getComment( $row )->text;
223                         if ( $comment != '' ) {
224                                 $out .= "      " . Xml::elementClean( 'comment', [], strval( $comment ) ) . "\n";
225                         }
226                 }
227
228                 if ( isset( $row->rev_content_model ) && !is_null( $row->rev_content_model ) ) {
229                         $content_model = strval( $row->rev_content_model );
230                 } else {
231                         // probably using $wgContentHandlerUseDB = false;
232                         $title = Title::makeTitle( $row->page_namespace, $row->page_title );
233                         $content_model = ContentHandler::getDefaultModelFor( $title );
234                 }
235
236                 $content_handler = ContentHandler::getForModelID( $content_model );
237
238                 if ( isset( $row->rev_content_format ) && !is_null( $row->rev_content_format ) ) {
239                         $content_format = strval( $row->rev_content_format );
240                 } else {
241                         // probably using $wgContentHandlerUseDB = false;
242                         $content_format = $content_handler->getDefaultFormat();
243                 }
244
245                 $out .= "      " . Xml::element( 'model', null, strval( $content_model ) ) . "\n";
246                 $out .= "      " . Xml::element( 'format', null, strval( $content_format ) ) . "\n";
247
248                 $text = '';
249                 if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_TEXT ) ) {
250                         $out .= "      " . Xml::element( 'text', [ 'deleted' => 'deleted' ] ) . "\n";
251                 } elseif ( isset( $row->old_text ) ) {
252                         // Raw text from the database may have invalid chars
253                         $text = strval( Revision::getRevisionText( $row ) );
254                         $text = $content_handler->exportTransform( $text, $content_format );
255                         $out .= "      " . Xml::elementClean( 'text',
256                                 [ 'xml:space' => 'preserve', 'bytes' => intval( $row->rev_len ) ],
257                                 strval( $text ) ) . "\n";
258                 } else {
259                         // Stub output
260                         $out .= "      " . Xml::element( 'text',
261                                 [ 'id' => $row->rev_text_id, 'bytes' => intval( $row->rev_len ) ],
262                                 "" ) . "\n";
263                 }
264
265                 if ( isset( $row->rev_sha1 )
266                         && $row->rev_sha1
267                         && !( $row->rev_deleted & Revision::DELETED_TEXT )
268                 ) {
269                         $out .= "      " . Xml::element( 'sha1', null, strval( $row->rev_sha1 ) ) . "\n";
270                 } else {
271                         $out .= "      <sha1/>\n";
272                 }
273
274                 // Avoid PHP 7.1 warning from passing $this by reference
275                 $writer = $this;
276                 Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text ] );
277
278                 $out .= "    </revision>\n";
279
280                 return $out;
281         }
282
283         /**
284          * Dumps a "<logitem>" section on the output stream, with
285          * data filled in from the given database row.
286          *
287          * @param object $row
288          * @return string
289          * @access private
290          */
291         function writeLogItem( $row ) {
292                 $out = "  <logitem>\n";
293                 $out .= "    " . Xml::element( 'id', null, strval( $row->log_id ) ) . "\n";
294
295                 $out .= $this->writeTimestamp( $row->log_timestamp, "    " );
296
297                 if ( $row->log_deleted & LogPage::DELETED_USER ) {
298                         $out .= "    " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
299                 } else {
300                         $out .= $this->writeContributor( $row->log_user, $row->user_name, "    " );
301                 }
302
303                 if ( $row->log_deleted & LogPage::DELETED_COMMENT ) {
304                         $out .= "    " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
305                 } else {
306                         $comment = CommentStore::newKey( 'log_comment' )->getComment( $row )->text;
307                         if ( $comment != '' ) {
308                                 $out .= "    " . Xml::elementClean( 'comment', null, strval( $comment ) ) . "\n";
309                         }
310                 }
311
312                 $out .= "    " . Xml::element( 'type', null, strval( $row->log_type ) ) . "\n";
313                 $out .= "    " . Xml::element( 'action', null, strval( $row->log_action ) ) . "\n";
314
315                 if ( $row->log_deleted & LogPage::DELETED_ACTION ) {
316                         $out .= "    " . Xml::element( 'text', [ 'deleted' => 'deleted' ] ) . "\n";
317                 } else {
318                         $title = Title::makeTitle( $row->log_namespace, $row->log_title );
319                         $out .= "    " . Xml::elementClean( 'logtitle', null, self::canonicalTitle( $title ) ) . "\n";
320                         $out .= "    " . Xml::elementClean( 'params',
321                                 [ 'xml:space' => 'preserve' ],
322                                 strval( $row->log_params ) ) . "\n";
323                 }
324
325                 $out .= "  </logitem>\n";
326
327                 return $out;
328         }
329
330         /**
331          * @param string $timestamp
332          * @param string $indent Default to six spaces
333          * @return string
334          */
335         function writeTimestamp( $timestamp, $indent = "      " ) {
336                 $ts = wfTimestamp( TS_ISO_8601, $timestamp );
337                 return $indent . Xml::element( 'timestamp', null, $ts ) . "\n";
338         }
339
340         /**
341          * @param int $id
342          * @param string $text
343          * @param string $indent Default to six spaces
344          * @return string
345          */
346         function writeContributor( $id, $text, $indent = "      " ) {
347                 $out = $indent . "<contributor>\n";
348                 if ( $id || !IP::isValid( $text ) ) {
349                         $out .= $indent . "  " . Xml::elementClean( 'username', null, strval( $text ) ) . "\n";
350                         $out .= $indent . "  " . Xml::element( 'id', null, strval( $id ) ) . "\n";
351                 } else {
352                         $out .= $indent . "  " . Xml::elementClean( 'ip', null, strval( $text ) ) . "\n";
353                 }
354                 $out .= $indent . "</contributor>\n";
355                 return $out;
356         }
357
358         /**
359          * Warning! This data is potentially inconsistent. :(
360          * @param object $row
361          * @param bool $dumpContents
362          * @return string
363          */
364         function writeUploads( $row, $dumpContents = false ) {
365                 if ( $row->page_namespace == NS_FILE ) {
366                         $img = wfLocalFile( $row->page_title );
367                         if ( $img && $img->exists() ) {
368                                 $out = '';
369                                 foreach ( array_reverse( $img->getHistory() ) as $ver ) {
370                                         $out .= $this->writeUpload( $ver, $dumpContents );
371                                 }
372                                 $out .= $this->writeUpload( $img, $dumpContents );
373                                 return $out;
374                         }
375                 }
376                 return '';
377         }
378
379         /**
380          * @param File $file
381          * @param bool $dumpContents
382          * @return string
383          */
384         function writeUpload( $file, $dumpContents = false ) {
385                 if ( $file->isOld() ) {
386                         $archiveName = "      " .
387                                 Xml::element( 'archivename', null, $file->getArchiveName() ) . "\n";
388                 } else {
389                         $archiveName = '';
390                 }
391                 if ( $dumpContents ) {
392                         $be = $file->getRepo()->getBackend();
393                         # Dump file as base64
394                         # Uses only XML-safe characters, so does not need escaping
395                         # @todo Too bad this loads the contents into memory (script might swap)
396                         $contents = '      <contents encoding="base64">' .
397                                 chunk_split( base64_encode(
398                                         $be->getFileContents( [ 'src' => $file->getPath() ] ) ) ) .
399                                 "      </contents>\n";
400                 } else {
401                         $contents = '';
402                 }
403                 if ( $file->isDeleted( File::DELETED_COMMENT ) ) {
404                         $comment = Xml::element( 'comment', [ 'deleted' => 'deleted' ] );
405                 } else {
406                         $comment = Xml::elementClean( 'comment', null, strval( $file->getDescription() ) );
407                 }
408                 return "    <upload>\n" .
409                         $this->writeTimestamp( $file->getTimestamp() ) .
410                         $this->writeContributor( $file->getUser( 'id' ), $file->getUser( 'text' ) ) .
411                         "      " . $comment . "\n" .
412                         "      " . Xml::element( 'filename', null, $file->getName() ) . "\n" .
413                         $archiveName .
414                         "      " . Xml::element( 'src', null, $file->getCanonicalUrl() ) . "\n" .
415                         "      " . Xml::element( 'size', null, $file->getSize() ) . "\n" .
416                         "      " . Xml::element( 'sha1base36', null, $file->getSha1() ) . "\n" .
417                         "      " . Xml::element( 'rel', null, $file->getRel() ) . "\n" .
418                         $contents .
419                         "    </upload>\n";
420         }
421
422         /**
423          * Return prefixed text form of title, but using the content language's
424          * canonical namespace. This skips any special-casing such as gendered
425          * user namespaces -- which while useful, are not yet listed in the
426          * XML "<siteinfo>" data so are unsafe in export.
427          *
428          * @param Title $title
429          * @return string
430          * @since 1.18
431          */
432         public static function canonicalTitle( Title $title ) {
433                 if ( $title->isExternal() ) {
434                         return $title->getPrefixedText();
435                 }
436
437                 global $wgContLang;
438                 $prefix = $wgContLang->getFormattedNsText( $title->getNamespace() );
439
440                 // @todo Emit some kind of warning to the user if $title->getNamespace() !==
441                 // NS_MAIN and $prefix === '' (viz. pages in an unregistered namespace)
442
443                 if ( $prefix !== '' ) {
444                         $prefix .= ':';
445                 }
446
447                 return $prefix . $title->getText();
448         }
449 }