]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - maintenance/generateSitemap.php
MediaWiki 1.16.4
[autoinstalls/mediawiki.git] / maintenance / generateSitemap.php
1 <?php
// Pseudo-namespace keys used in GenerateSitemap::$priorities. They hold the
// fallback priorities for namespaces that have no explicit entry of their own:
// GS_MAIN for non-talk namespaces, GS_TALK for talk namespaces.
define( 'GS_MAIN', -2 );
define( 'GS_TALK', -1 );
4 /**
5  * Creates a sitemap for the site
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License along
18  * with this program; if not, write to the Free Software Foundation, Inc.,
19  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20  * http://www.gnu.org/copyleft/gpl.html
21  *
22  * @ingroup Maintenance
23  *
24  * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
25  * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
26  * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
27  *
28  * @see http://www.sitemaps.org/
29  * @see http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
30  *
31  * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
32  */
33
34 require_once( dirname(__FILE__) . '/Maintenance.php' );
35
/**
 * Maintenance script that writes one or more sitemap files per namespace
 * plus a sitemap index file that lists them.
 */
class GenerateSitemap extends Maintenance {
	/**
	 * The maximum amount of urls in a sitemap file
	 *
	 * @link http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * The maximum size of a sitemap file, in bytes (before compression)
	 *
	 * @link http://www.sitemaps.org/faq.php#faq_sitemap_size
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * The path to prepend to the filename (always ends with a directory separator)
	 *
	 * @var string
	 */
	public $fspath;

	/**
	 * The path to append to the domain name
	 *
	 * NOTE(review): nothing in this class assigns or reads this property.
	 *
	 * @var string
	 */
	public $path;

	/**
	 * Whether or not to use compression
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Byte lengths populated by generateLimit():
	 * [0] the sitemap file header, [1] a single entry built from a
	 * 255-byte title, [2] the sitemap file footer.
	 *
	 * @var array
	 */
	public $limit = array();

	/**
	 * Key => value entries of namespaces and their priorities
	 *
	 * @var array
	 */
	public $priorities = array();

	/**
	 * A one-dimensional array of namespaces in the wiki
	 *
	 * @var array
	 */
	public $namespaces = array();

	/**
	 * When this sitemap batch was generated
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * A database slave object
	 *
	 * @var object
	 */
	public $dbr;

	/**
	 * A resource pointing to the sitemap index file
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * A resource pointing to the sitemap file currently being written,
	 * or false when no sitemap file is open
	 *
	 * @var resource|bool
	 */
	public $file;

	/**
	 * Constructor: sets the description and registers the command line options
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Creates a sitemap for the site";
		$this->addOption( 'fspath', 'The file system path to save to, e.g. /tmp/sitemap' .
			"\n\t\tdefaults to current directory", false, true );
		$this->addOption( 'compress', 'Compress the sitemap files, can take value yes|no, default yes', false, true );
	}

	/**
	 * Execute: initialise limits, paths and the namespace list, open the
	 * sitemap index file and run the main loop.
	 */
	public function execute() {
		$this->setNamespacePriorities();
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
		// Any value other than the literal string 'no' enables compression
		$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml";
		$this->findex = fopen( $indexPath, 'wb' );
		if ( $this->findex === false ) {
			// Stop here instead of calling fwrite() on false later on
			die( "Can not open sitemap index file $indexPath for writing.\n" );
		}
		$this->main();
	}

	/**
	 * Fill $this->priorities with the fallback priorities (GS_MAIN, GS_TALK)
	 * and the priorities of the standard namespaces.
	 */
	private function setNamespacePriorities() {
		// Custom main namespaces
		$this->priorities[GS_MAIN] = '0.5';
		// Custom talk namespaces
		$this->priorities[GS_TALK] = '0.1';
		// MediaWiki standard namespaces
		$this->priorities[NS_MAIN] = '1.0';
		$this->priorities[NS_TALK] = '0.1';
		$this->priorities[NS_USER] = '0.5';
		$this->priorities[NS_USER_TALK] = '0.1';
		$this->priorities[NS_PROJECT] = '0.5';
		$this->priorities[NS_PROJECT_TALK] = '0.1';
		$this->priorities[NS_FILE] = '0.5';
		$this->priorities[NS_FILE_TALK] = '0.1';
		$this->priorities[NS_MEDIAWIKI] = '0.0';
		$this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
		$this->priorities[NS_TEMPLATE] = '0.0';
		$this->priorities[NS_TEMPLATE_TALK] = '0.1';
		$this->priorities[NS_HELP] = '0.5';
		$this->priorities[NS_HELP_TALK] = '0.1';
		$this->priorities[NS_CATEGORY] = '0.5';
		$this->priorities[NS_CATEGORY_TALK] = '0.1';
	}

	/**
	 * Create directory if it does not exist and return pathname with a trailing slash
	 *
	 * Terminates the script when the directory can neither be created nor resolved.
	 *
	 * @param string $fspath The directory to write the sitemap files to
	 *
	 * @return string|null The resolved directory followed by DIRECTORY_SEPARATOR,
	 *  or null when $fspath is not set
	 */
	private static function init_path( $fspath ) {
		if ( !isset( $fspath ) ) {
			return null;
		}
		# Create directory if needed
		if ( $fspath && !is_dir( $fspath ) ) {
			wfMkdirParents( $fspath ) or die( "Can not create directory $fspath.\n" );
		}

		$resolved = realpath( $fspath );
		if ( $resolved === false ) {
			// false . DIRECTORY_SEPARATOR would silently target the filesystem root
			die( "Can not resolve directory $fspath.\n" );
		}

		return $resolved . DIRECTORY_SEPARATOR;
	}

	/**
	 * Generate a one-dimensional array of existing namespaces
	 *
	 * When $wgSitemapNamespaces is an array it is used as-is; otherwise the
	 * distinct page_namespace values of the page table are collected.
	 */
	public function generateNamespaces() {
		// Only generate for specific namespaces if $wgSitemapNamespaces is an array.
		global $wgSitemapNamespaces;
		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;
			return;
		}

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			__METHOD__,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		foreach ( $res as $row ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a given namespace
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	public function priority( $namespace ) {
		return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace );
	}

	/**
	 * If the namespace isn't listed on the priority list return the
	 * default priority for the namespace, varies depending on whether it's
	 * a talkpage or not.
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	public function guessPriority( $namespace ) {
		return MWNamespace::isMain( $namespace ) ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
	}

	/**
	 * Return a database result of all the pages in a given namespace
	 *
	 * Each row carries page_namespace, page_title and page_touched.
	 *
	 * @param int $namespace Limit the query to this namespace
	 *
	 * @return resource
	 */
	public function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			__METHOD__
		);
	}

	/**
	 * Main loop
	 *
	 * Writes the sitemap index and, for every namespace, as many sitemap
	 * files as are needed to keep each file within $this->url_limit URLs and
	 * $this->size_limit bytes. Language-variant URLs count towards both limits.
	 *
	 * @access public
	 */
	public function main() {
		global $wgContLang;

		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			// The priority only depends on the namespace, so compute it once
			$priority = $this->priority( $namespace );
			$length = $this->limit[0];
			$urlCount = $smcount = 0;

			$fns = $wgContLang->getFormattedNsText( $namespace );
			// Newline-terminated so that namespaces without pages do not run together
			$this->output( "$namespace ($fns)\n" );
			foreach ( $res as $row ) {
				// All entries of a page (default URL plus variants) stay in the same file
				$entries = $this->pageEntries( $row, $priority );
				$pageCount = count( $entries );
				$pageLength = strlen( implode( '', $entries ) );

				$overUrlLimit = $urlCount + $pageCount > $this->url_limit;
				$overSizeLimit = $length + $pageLength + $this->limit[2] > $this->size_limit;
				// $urlCount > 0 keeps us from rotating away from a file that is still empty
				if ( $this->file === false || ( $urlCount > 0 && ( $overUrlLimit || $overSizeLimit ) ) ) {
					$this->finishSitemapFile();
					$this->startSitemapFile( $this->sitemapFilename( $namespace, $smcount++ ) );
					$length = $this->limit[0];
					$urlCount = 0;
				}

				foreach ( $entries as $entry ) {
					$this->write( $this->file, $entry );
				}
				$length += $pageLength;
				$urlCount += $pageCount;
			}
			$this->finishSitemapFile();
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Build the sitemap entries of one page: its default URL followed by one
	 * URL per language variant other than the content language's own code.
	 *
	 * @param object $row A row as returned by getPageRes()
	 * @param string $priority The priority to put into every entry
	 *
	 * @return array List of XML strings as produced by fileEntry()
	 */
	private function pageEntries( $row, $priority ) {
		global $wgContLang;

		$title = Title::makeTitle( $row->page_namespace, $row->page_title );
		$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
		$entries = array( $this->fileEntry( $title->getFullURL(), $date, $priority ) );

		// generate pages for language variants
		if ( $wgContLang->hasVariants() ) {
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode == $wgContLang->getCode() ) {
					continue; // we don't want default variant
				}
				$entries[] = $this->fileEntry( $title->getFullURL( '', $vCode ), $date, $priority );
			}
		}

		return $entries;
	}

	/**
	 * Open a new sitemap file in $this->fspath, write its header and
	 * register it in the sitemap index.
	 *
	 * Terminates the script when the file can not be opened.
	 *
	 * @param string $filename The filename as returned by sitemapFilename()
	 */
	private function startSitemapFile( $filename ) {
		$handle = $this->open( $this->fspath . $filename, 'wb' );
		if ( $handle === false ) {
			die( "Can not open sitemap file {$this->fspath}$filename for writing.\n" );
		}
		$this->file = $handle;
		$this->write( $this->file, $this->openFile() );
		fwrite( $this->findex, $this->indexEntry( $filename ) );
		$this->output( "\t$this->fspath$filename\n" );
	}

	/**
	 * Write the footer to the current sitemap file and close it.
	 * Does nothing when no sitemap file is open.
	 */
	private function finishSitemapFile() {
		if ( $this->file === false ) {
			return;
		}
		$this->write( $this->file, $this->closeFile() );
		$this->close( $this->file );
		$this->file = false;
	}

	/**
	 * gzopen() / fopen() wrapper
	 *
	 * @param string $file The path of the file to open
	 * @param string $flags The mode passed on to gzopen() / fopen()
	 *
	 * @return resource|bool The handle, or false when opening failed
	 */
	public function open( $file, $flags ) {
		return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
	}

	/**
	 * gzwrite() / fwrite() wrapper
	 *
	 * @param resource $handle A handle obtained from open()
	 * @param string $str The data to write
	 */
	public function write( &$handle, $str ) {
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * gzclose() / fclose() wrapper
	 *
	 * @param resource $handle A handle obtained from open()
	 */
	public function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Get a sitemap filename
	 *
	 * @param int $namespace The namespace
	 * @param int $count The count
	 *
	 * @return string
	 */
	public function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-" . wfWikiID() . "-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML required to open an XML file
	 *
	 * @return string
	 */
	public function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML schema being used
	 *
	 * @return string
	 */
	public function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Return the XML required to open a sitemap index file
	 *
	 * @return string
	 */
	public function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap indexfile entry
	 *
	 * NOTE(review): only the bare filename is written into <loc>; the
	 * sitemaps.org protocol describes <loc> as a full URL - confirm whether
	 * a URL prefix should be prepended here.
	 *
	 * @param string $filename The filename of the sitemap file
	 *
	 * @return string
	 */
	public function indexEntry( $filename ) {
		return
			"\t<sitemap>\n" .
			"\t\t<loc>" . htmlspecialchars( $filename ) . "</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML required to close a sitemap index file
	 *
	 * @return string
	 */
	public function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML required to open a sitemap file
	 *
	 * @return string
	 */
	public function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap entry
	 *
	 * @param string $url An RFC 2396 compliant URL
	 * @param string $date A ISO 8601 date
	 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
	 *
	 * @return string
	 */
	public function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			// $url may contain characters such as ampersands that must be
			// entity-escaped to keep the XML well-formed
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML required to close sitemap file
	 *
	 * @return string
	 */
	public function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Populate $this->limit
	 *
	 * Stores the byte lengths of the file header, of one entry built from a
	 * 255-byte title (63 four-byte characters plus one three-byte character)
	 * in the given namespace, and of the file footer.
	 *
	 * @param int $namespace The namespace the sample title is created in
	 */
	public function generateLimit( $namespace ) {
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}
475
// Name the maintenance class defined in this file, then include the file
// that the DO_MAINTENANCE constant points to.
// NOTE(review): DO_MAINTENANCE is presumably defined by Maintenance.php and
// the included script presumably instantiates $maintClass and runs it - confirm.
$maintClass = "GenerateSitemap";
require_once( DO_MAINTENANCE );