]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - maintenance/generateSitemap.php
MediaWiki 1.30.2 renames
[autoinstalls/mediawiki.git] / maintenance / generateSitemap.php
1 <?php
2 /**
3  * Creates a sitemap for the site.
4  *
5  * Copyright © 2005, Ævar Arnfjörð Bjarmason, Jens Frank <jeluf@gmx.de> and
6  * Brion Vibber <brion@pobox.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License along
19  * with this program; if not, write to the Free Software Foundation, Inc.,
20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21  * http://www.gnu.org/copyleft/gpl.html
22  *
23  * @file
24  * @ingroup Maintenance
25  * @see http://www.sitemaps.org/
26  * @see http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
27  */
28
29 require_once __DIR__ . '/Maintenance.php';
30
31 /**
32  * Maintenance script that generates a sitemap for the site.
33  *
34  * @ingroup Maintenance
35  */
class GenerateSitemap extends Maintenance {
	/**
	 * Key into $priorities holding the fallback priority for subject
	 * (non-talk) namespaces that have no explicit entry.
	 */
	const GS_MAIN = -2;

	/**
	 * Key into $priorities holding the fallback priority for talk
	 * namespaces that have no explicit entry.
	 */
	const GS_TALK = -1;

	/**
	 * The maximum amount of urls in a sitemap file
	 *
	 * @link http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * The maximum size of a sitemap file, in bytes
	 *
	 * @link http://www.sitemaps.org/faq.php#faq_sitemap_size
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * The path to prepend to the filename (ends with a directory separator)
	 *
	 * @var string
	 */
	public $fspath;

	/**
	 * The URL path to prepend to filenames in the index;
	 * should resolve to the same directory as $fspath.
	 *
	 * @var string
	 */
	public $urlpath;

	/**
	 * Whether or not to use compression
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Whether or not to leave redirect pages out of the sitemap
	 *
	 * @var bool
	 */
	public $skipRedirects;

	/**
	 * Byte lengths used for the file size estimate, as filled in by
	 * generateLimit(): [0] the opening XML of a sitemap file, [1] a single
	 * entry built from a maximum-length title, [2] the closing XML.
	 *
	 * @var array
	 */
	public $limit = [];

	/**
	 * Key => value entries of namespaces and their priorities
	 *
	 * @var array
	 */
	public $priorities = [];

	/**
	 * A one-dimensional array of namespaces in the wiki
	 *
	 * @var array
	 */
	public $namespaces = [];

	/**
	 * When this sitemap batch was generated (ISO 8601)
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * A database replica DB object
	 *
	 * @var object
	 */
	public $dbr;

	/**
	 * A resource pointing to the sitemap index file
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * A resource pointing to a sitemap file, or false while none is open
	 *
	 * @var resource|bool
	 */
	public $file;

	/**
	 * Identifier to use in filenames, default $wgDBname
	 *
	 * @var string
	 */
	private $identifier;

	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Creates a sitemap for the site' );
		$this->addOption(
			'fspath',
			'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory',
			false,
			true
		);
		$this->addOption(
			'urlpath',
			'The URL path corresponding to --fspath, prepended to filenames in the index; '
				. 'defaults to an empty string',
			false,
			true
		);
		$this->addOption(
			'compress',
			'Compress the sitemap files, can take value yes|no, default yes',
			false,
			true
		);
		$this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );
		$this->addOption(
			'identifier',
			'What site identifier to use for the wiki, defaults to $wgDBname',
			false,
			true
		);
	}

	/**
	 * Read the command line options, prepare the output directory and the
	 * index file, then hand over to main() to write the sitemaps.
	 */
	public function execute() {
		$this->setNamespacePriorities();
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;

		# Create directory if needed
		$fspath = $this->getOption( 'fspath', getcwd() );
		if ( !wfMkdirParents( $fspath, null, __METHOD__ ) ) {
			$this->error( "Can not create directory $fspath.", 1 );
		}

		$this->fspath = realpath( $fspath ) . DIRECTORY_SEPARATOR;
		$this->urlpath = $this->getOption( 'urlpath', "" );
		// A non-empty URL path must end in a slash so filenames can be appended directly
		if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
			$this->urlpath .= '/';
		}
		$this->identifier = $this->getOption( 'identifier', wfWikiID() );
		// Anything other than an explicit "no" keeps compression on
		$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
		$this->skipRedirects = $this->hasOption( 'skip-redirects' );
		$this->dbr = $this->getDB( DB_REPLICA );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
		$this->findex = fopen( $indexPath, 'wb' );
		if ( $this->findex === false ) {
			// Report the failure the same way as the directory check above
			// instead of passing a boolean to fwrite() later on.
			$this->error( "Can not open index file $indexPath for writing.", 1 );
		}
		$this->main();
	}

	/**
	 * Fill $this->priorities with the built-in defaults and then apply the
	 * overrides from $wgSitemapNamespacesPriorities, clamped to 0.0 - 1.0.
	 */
	private function setNamespacePriorities() {
		global $wgSitemapNamespacesPriorities;

		// Custom main namespaces
		$this->priorities[self::GS_MAIN] = '0.5';
		// Custom talk namespaces
		$this->priorities[self::GS_TALK] = '0.1';
		// MediaWiki standard namespaces
		$this->priorities[NS_MAIN] = '1.0';
		$this->priorities[NS_TALK] = '0.1';
		$this->priorities[NS_USER] = '0.5';
		$this->priorities[NS_USER_TALK] = '0.1';
		$this->priorities[NS_PROJECT] = '0.5';
		$this->priorities[NS_PROJECT_TALK] = '0.1';
		$this->priorities[NS_FILE] = '0.5';
		$this->priorities[NS_FILE_TALK] = '0.1';
		$this->priorities[NS_MEDIAWIKI] = '0.0';
		$this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
		$this->priorities[NS_TEMPLATE] = '0.0';
		$this->priorities[NS_TEMPLATE_TALK] = '0.1';
		$this->priorities[NS_HELP] = '0.5';
		$this->priorities[NS_HELP_TALK] = '0.1';
		$this->priorities[NS_CATEGORY] = '0.5';
		$this->priorities[NS_CATEGORY_TALK] = '0.1';

		// Custom priorities; anything that is not an array (false, null, ...)
		// means there is nothing to override.
		if ( is_array( $wgSitemapNamespacesPriorities ) ) {
			foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
				$float = floatval( $priority );
				if ( $float > 1.0 ) {
					$priority = '1.0';
				} elseif ( $float < 0.0 ) {
					$priority = '0.0';
				}
				$this->priorities[$namespace] = $priority;
			}
		}
	}

	/**
	 * Generate a one-dimensional array of existing namespaces
	 *
	 * Uses $wgSitemapNamespaces verbatim when it is an array; otherwise
	 * collects every namespace that occurs in the page table.
	 */
	public function generateNamespaces() {
		// Only generate for specific namespaces if $wgSitemapNamespaces is an array.
		global $wgSitemapNamespaces;
		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;

			return;
		}

		$res = $this->dbr->select( 'page',
			[ 'page_namespace' ],
			[],
			__METHOD__,
			[
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			]
		);

		foreach ( $res as $row ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a given namespace
	 *
	 * @param int $namespace The namespace to get the priority for
	 * @return string
	 */
	public function priority( $namespace ) {
		return isset( $this->priorities[$namespace] )
			? $this->priorities[$namespace]
			: $this->guessPriority( $namespace );
	}

	/**
	 * If the namespace isn't listed on the priority list return the
	 * default priority for the namespace, varies depending on whether it's
	 * a talkpage or not.
	 *
	 * @param int $namespace The namespace to get the priority for
	 * @return string
	 */
	public function guessPriority( $namespace ) {
		return MWNamespace::isSubject( $namespace )
			? $this->priorities[self::GS_MAIN]
			: $this->priorities[self::GS_TALK];
	}

	/**
	 * Return a database result of all the pages in a given namespace
	 *
	 * @param int $namespace Limit the query to this namespace
	 * @return Resource
	 */
	public function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			[
				'page_namespace',
				'page_title',
				'page_touched',
				'page_is_redirect'
			],
			[ 'page_namespace' => $namespace ],
			__METHOD__
		);
	}

	/**
	 * Main loop
	 *
	 * Writes the sitemap index and, per namespace, one or more sitemap
	 * files. A new file is started whenever adding the next page (including
	 * its language variant URLs) would exceed the URL or size limit.
	 */
	public function main() {
		global $wgContLang;

		fwrite( $this->findex, $this->openIndex() );

		// Variant codes that get their own URL, i.e. all except the content
		// language's own code. This does not depend on the page, so it is
		// worked out once.
		$variantCodes = [];
		if ( $wgContLang->hasVariants() ) {
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode != $wgContLang->getCode() ) {
					$variantCodes[] = $vCode;
				}
			}
		}

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$length = $this->limit[0];
			$urlCount = 0; // Number of <url> entries in the current file
			$smcount = 0; // Number of sitemap files started for this namespace
			$priority = $this->priority( $namespace );

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->output( "$namespace ($fns)\n" );
			$skippedRedirects = 0; // Number of redirects skipped for that namespace
			foreach ( $res as $row ) {
				if ( $this->skipRedirects && $row->page_is_redirect ) {
					$skippedRedirects++;
					continue;
				}

				// Build every entry for this page up front (default URL plus
				// one per language variant) so the limits below can account
				// for all of them rather than for a single entry.
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				$entries = [ $this->fileEntry( $title->getCanonicalURL(), $date, $priority ) ];
				foreach ( $variantCodes as $vCode ) {
					$entries[] = $this->fileEntry(
						$title->getCanonicalURL( '', $vCode ),
						$date,
						$priority
					);
				}
				$entryCount = count( $entries );
				$pageXml = implode( '', $entries );
				$pageBytes = strlen( $pageXml );

				// Reserve the worst-case estimate per entry, but never less
				// than what is actually about to be written.
				$reserve = max( $pageBytes, $this->limit[1] * $entryCount );

				if ( $this->file === false
					|| $urlCount + $entryCount > $this->url_limit
					|| $length + $reserve + $this->limit[2] > $this->size_limit
				) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->output( "\t$this->fspath$filename\n" );
					$length = $this->limit[0];
					$urlCount = 0;
				}

				$this->write( $this->file, $pageXml );
				$length += $pageBytes;
				$urlCount += $entryCount;
			}

			if ( $this->skipRedirects && $skippedRedirects > 0 ) {
				$this->output( "  skipped $skippedRedirects redirect(s)\n" );
			}

			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * gzopen() / fopen() wrapper
	 *
	 * @param string $file
	 * @param string $flags
	 * @return resource
	 * @throws MWException If the file could not be opened
	 */
	public function open( $file, $flags ) {
		$resource = $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
		if ( $resource === false ) {
			throw new MWException( __METHOD__
				. " error opening file $file with flags $flags. Check permissions?" );
		}

		return $resource;
	}

	/**
	 * gzwrite() / fwrite() wrapper
	 *
	 * @param resource &$handle
	 * @param string $str
	 * @throws MWException If $handle is a boolean instead of a file handle
	 */
	public function write( &$handle, $str ) {
		if ( $handle === true || $handle === false ) {
			throw new MWException( __METHOD__ . " was passed a boolean as a file handle.\n" );
		}
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * gzclose() / fclose() wrapper
	 *
	 * @param resource &$handle
	 */
	public function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Get a sitemap filename
	 *
	 * @param int $namespace The namespace
	 * @param int $count The count
	 * @return string
	 */
	public function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';

		return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML required to open an XML file
	 *
	 * @return string
	 */
	public function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML schema being used
	 *
	 * @return string
	 */
	public function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Return the XML required to open a sitemap index file
	 *
	 * @return string
	 */
	public function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap indexfile entry
	 *
	 * @param string $filename The filename of the sitemap file
	 * @return string
	 */
	public function indexEntry( $filename ) {
		return
			"\t<sitemap>\n" .
			// The URL path and identifier come from the command line and may
			// contain characters such as ampersands; escape them as
			// fileEntry() does for page URLs.
			"\t\t<loc>" . htmlspecialchars( $this->urlpath . $filename ) . "</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML required to close a sitemap index file
	 *
	 * @return string
	 */
	public function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML required to open a sitemap file
	 *
	 * @return string
	 */
	public function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap entry
	 *
	 * @param string $url An RFC 2396 compliant URL
	 * @param string $date A ISO 8601 date
	 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
	 * @return string
	 */
	public function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			// T36666: $url may contain bad characters such as ampersands.
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML required to close sitemap file
	 *
	 * @return string
	 */
	public function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Populate $this->limit
	 *
	 * Stores the byte length of the opening XML, of one entry built from a
	 * maximum-length title in the given namespace, and of the closing XML.
	 *
	 * @param int $namespace
	 */
	public function generateLimit( $namespace ) {
		// T19961: make a title with the longest possible URL in this namespace
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = [
			strlen( $this->openFile() ),
			strlen( $this->fileEntry(
				$title->getCanonicalURL(),
				wfTimestamp( TS_ISO_8601, wfTimestamp() ),
				$this->priority( $namespace )
			) ),
			strlen( $this->closeFile() )
		];
	}
}
559
// Name of the maintenance class defined in this file.
$maintClass = 'GenerateSitemap';
// Include the file named by the RUN_MAINTENANCE_IF_MAIN constant.
require_once RUN_MAINTENANCE_IF_MAIN;