]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - maintenance/generateSitemap.php
MediaWiki 1.11.0
[autoinstallsdev/mediawiki.git] / maintenance / generateSitemap.php
1 <?php
// Pseudo-namespace keys for the GenerateSitemap priority table. They are
// negative so they cannot collide with the NS_* keys listed alongside them.
// GS_MAIN: fallback priority key for non-talk namespaces without an own entry.
define( 'GS_MAIN', -2 );
// GS_TALK: fallback priority key for talk namespaces without an own entry.
define( 'GS_TALK', -1 );
4 /**
5  * Creates a Google sitemap for the site
6  *
7  * @addtogroup Maintenance
8  *
9  * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
10  * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
11  * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
12  *
13  * @see http://www.google.com/webmasters/sitemaps/docs/en/about.html
14  * @see http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
15  *
16  * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
17  */
18
class GenerateSitemap {
        /**
         * The maximum number of URLs in a sitemap file
         *
         * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
         *
         * @var int
         */
        var $url_limit;

        /**
         * The maximum size of a sitemap file, in bytes (measured before
         * compression, since it is compared against strlen() totals)
         *
         * @link http://www.google.com/webmasters/sitemaps/docs/en/protocol.html#faq_sitemap_size
         *
         * @var int
         */
        var $size_limit;

        /**
         * The path to prepend to the filename
         *
         * @var string
         */
        var $fspath;

        /**
         * The path to append to the domain name.
         * Not assigned anywhere in this class.
         *
         * @var string
         */
        var $path;

        /**
         * Whether or not to use compression
         *
         * @var bool
         */
        var $compress;

        /**
         * Byte lengths used for size accounting, filled by generateLimit():
         * [0] the sitemap file header, [1] one entry for a 255-byte title,
         * [2] the sitemap file footer
         *
         * @var array
         */
        var $limit = array();

        /**
         * Key => value entries of namespaces and their priorities
         *
         * @var array
         */
        var $priorities = array(
                // Custom main namespaces
                GS_MAIN                 => '0.5',
                // Custom talk namespaces
                GS_TALK                 => '0.1',
                // MediaWiki standard namespaces
                NS_MAIN                 => '1.0',
                NS_TALK                 => '0.1',
                NS_USER                 => '0.5',
                NS_USER_TALK            => '0.1',
                NS_PROJECT              => '0.5',
                NS_PROJECT_TALK         => '0.1',
                NS_IMAGE                => '0.5',
                NS_IMAGE_TALK           => '0.1',
                NS_MEDIAWIKI            => '0.0',
                NS_MEDIAWIKI_TALK       => '0.1',
                NS_TEMPLATE             => '0.0',
                NS_TEMPLATE_TALK        => '0.1',
                NS_HELP                 => '0.5',
                NS_HELP_TALK            => '0.1',
                NS_CATEGORY             => '0.5',
                NS_CATEGORY_TALK        => '0.1',
        );

        /**
         * A one-dimensional array of namespaces in the wiki
         *
         * @var array
         */
        var $namespaces = array();

        /**
         * When this sitemap batch was generated
         *
         * @var string
         */
        var $timestamp;

        /**
         * A database slave object
         *
         * @var object
         */
        var $dbr;

        /**
         * A resource pointing to the sitemap index file
         *
         * @var resource
         */
        var $findex;

        /**
         * A resource pointing to the sitemap file currently being written,
         * or false when none is open
         *
         * @var resource
         */
        var $file;

        /**
         * A resource pointing to php://stderr
         *
         * @var resource
         */
        var $stderr;

        /**
         * Constructor
         *
         * Sets the limits, opens stderr and the sitemap index file, and
         * collects the namespaces that currently contain pages.
         *
         * @param string $fspath The path to prepend to the filenames, used to
         *                     save them somewhere else than in the root directory
         * @param bool $compress Whether to compress the sitemap files
         */
        function GenerateSitemap( $fspath, $compress ) {
                $this->url_limit = 50000;
                // 10 * 2^20 bytes
                $this->size_limit = pow( 2, 20 ) * 10;
                $this->fspath = isset( $fspath ) ? $fspath : '';
                $this->compress = $compress;

                $this->stderr = fopen( 'php://stderr', 'wt' );
                $this->dbr = wfGetDB( DB_SLAVE );
                $this->generateNamespaces();
                $this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );
                $this->findex = fopen( "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml", 'wb' );
        }

        /**
         * Generate a one-dimensional array of existing namespaces
         *
         * Appends every distinct page_namespace found in the page table to
         * $this->namespaces, in ascending order.
         */
        function generateNamespaces() {
                $fname = 'GenerateSitemap::generateNamespaces';

                $res = $this->dbr->select( 'page',
                        array( 'page_namespace' ),
                        array(),
                        $fname,
                        array(
                                'GROUP BY' => 'page_namespace',
                                'ORDER BY' => 'page_namespace',
                        )
                );

                while ( $row = $this->dbr->fetchObject( $res ) ) {
                        $this->namespaces[] = $row->page_namespace;
                }
        }

        /**
         * Get the priority of a given namespace
         *
         * @param int $namespace The namespace to get the priority for
         *
         * @return string
         */
        function priority( $namespace ) {
                if ( isset( $this->priorities[$namespace] ) ) {
                        return $this->priorities[$namespace];
                }
                return $this->guessPriority( $namespace );
        }

        /**
         * If the namespace isn't listed on the priority list return the
         * default priority for the namespace, varies depending on whether it's
         * a talkpage or not.
         *
         * @param int $namespace The namespace to get the priority for
         *
         * @return string
         */
        function guessPriority( $namespace ) {
                // NOTE(review): `namespace` is a reserved word from PHP 5.3 on, so this
                // call only parses on older PHP versions; the class name is
                // project API and is deliberately left untouched here.
                return Namespace::isMain( $namespace ) ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
        }

        /**
         * Return a database resolution of all the pages in a given namespace
         *
         * @param int $namespace Limit the query to this namespace
         *
         * @return resource
         */
        function getPageRes( $namespace ) {
                $fname = 'GenerateSitemap::getPageRes';

                return $this->dbr->select( 'page',
                        array(
                                'page_namespace',
                                'page_title',
                                'page_touched',
                        ),
                        array( 'page_namespace' => $namespace ),
                        $fname
                );
        }

        /**
         * Main loop
         *
         * Writes the sitemap index and, for every namespace, one or more
         * sitemap files. A new sitemap file is started whenever the next
         * entry would push the current one past $url_limit URLs or
         * $size_limit bytes. The limits are checked per written entry, so
         * language-variant URLs count towards them as well.
         *
         * @access public
         */
        function main() {
                global $wgContLang;

                fwrite( $this->findex, $this->openIndex() );

                foreach ( $this->namespaces as $namespace ) {
                        $res = $this->getPageRes( $namespace );
                        $this->file = false;
                        $this->generateLimit( $namespace );
                        $priority = $this->priority( $namespace );
                        // $length: bytes written to the current file (header included);
                        // $urls: entries written to the current file;
                        // $smcount: sitemap files started for this namespace.
                        $length = $this->limit[0];
                        $urls = $smcount = 0;

                        $fns = $wgContLang->getFormattedNsText( $namespace );
                        $this->debug( "$namespace ($fns)" );
                        while ( $row = $this->dbr->fetchObject( $res ) ) {
                                $title = Title::makeTitle( $row->page_namespace, $row->page_title );
                                $date = wfTimestamp( TS_ISO_8601, $row->page_touched );

                                // The page itself, followed by one entry per non-default
                                // language variant.
                                $entries = array( $this->fileEntry( $title->getFullURL(), $date, $priority ) );
                                if ( $wgContLang->hasVariants() ) {
                                        foreach ( $wgContLang->getVariants() as $vCode ) {
                                                if ( $vCode == $wgContLang->getCode() ) {
                                                        continue; // we don't want default variant
                                                }
                                                $entries[] = $this->fileEntry( $title->getFullURL( '', $vCode ), $date, $priority );
                                        }
                                }

                                foreach ( $entries as $entry ) {
                                        $entryLength = strlen( $entry );
                                        // $smcount === 0 means no file has been started for this
                                        // namespace yet. Testing the counter rather than the
                                        // handle avoids re-opening on every entry if open() failed.
                                        if ( $smcount === 0
                                                || $urls >= $this->url_limit
                                                || $length + $entryLength + $this->limit[2] > $this->size_limit
                                        ) {
                                                $this->startSitemapFile( $namespace, $smcount++ );
                                                $length = $this->limit[0];
                                                $urls = 0;
                                        }
                                        $this->write( $this->file, $entry );
                                        $length += $entryLength;
                                        $urls++;
                                }
                        }
                        $this->finishSitemapFile();
                }
                fwrite( $this->findex, $this->closeIndex() );
                fclose( $this->findex );
        }

        /**
         * Close the current sitemap file (if any), open the next one for the
         * namespace, write its header and register it in the sitemap index.
         *
         * @access private
         *
         * @param int $namespace The namespace being written
         * @param int $count Zero-based number of the file within the namespace
         */
        function startSitemapFile( $namespace, $count ) {
                $this->finishSitemapFile();

                $filename = $this->sitemapFilename( $namespace, $count );
                $this->file = $this->open( $this->fspath . $filename, 'wb' );
                $this->write( $this->file, $this->openFile() );
                fwrite( $this->findex, $this->indexEntry( $filename ) );
                $this->debug( "\t$filename" );
        }

        /**
         * Write the footer to the current sitemap file and close it; does
         * nothing when no file is open.
         *
         * @access private
         */
        function finishSitemapFile() {
                if ( $this->file ) {
                        $this->write( $this->file, $this->closeFile() );
                        $this->close( $this->file );
                }
                $this->file = false;
        }

        /**
         * gzopen() / fopen() wrapper
         *
         * @param string $file Path of the file to open
         * @param string $flags Mode string passed through to gzopen() / fopen()
         *
         * @return resource
         */
        function open( $file, $flags ) {
                return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
        }

        /**
         * gzwrite() / fwrite() wrapper
         *
         * @param resource $handle Handle returned by open()
         * @param string $str The data to write
         */
        function write( &$handle, $str ) {
                if ( $this->compress ) {
                        gzwrite( $handle, $str );
                } else {
                        fwrite( $handle, $str );
                }
        }

        /**
         * gzclose() / fclose() wrapper
         *
         * @param resource $handle Handle returned by open()
         */
        function close( &$handle ) {
                if ( $this->compress ) {
                        gzclose( $handle );
                } else {
                        fclose( $handle );
                }
        }

        /**
         * Get a sitemap filename
         *
         * The name carries a ".gz" suffix when compression is enabled.
         *
         * @param int $namespace The namespace
         * @param int $count The count
         *
         * @return string
         */
        function sitemapFilename( $namespace, $count ) {
                $ext = $this->compress ? '.gz' : '';
                return "sitemap-".wfWikiID()."-NS_$namespace-$count.xml$ext";
        }

        /**
         * Return the XML required to open an XML file
         *
         * @static
         *
         * @return string
         */
        function xmlHead() {
                return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
        }

        /**
         * Return the XML schema being used
         *
         * @static
         *
         * @return string
         */
        function xmlSchema() {
                return 'http://www.google.com/schemas/sitemap/0.84';
        }

        /**
         * Return the XML required to open a sitemap index file
         *
         * @return string
         */
        function openIndex() {
                return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
        }

        /**
         * Return the XML for a single sitemap indexfile entry
         *
         * The bare filename is written into <loc>, without scheme, host or
         * directory. NOTE(review): confirm consumers of the index can
         * resolve a relative location.
         *
         * @param string $filename The filename of the sitemap file
         *
         * @return string
         */
        function indexEntry( $filename ) {
                return
                        "\t<sitemap>\n" .
                        "\t\t<loc>$filename</loc>\n" .
                        "\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
                        "\t</sitemap>\n";
        }

        /**
         * Return the XML required to close a sitemap index file
         *
         * @static
         *
         * @return string
         */
        function closeIndex() {
                return "</sitemapindex>\n";
        }

        /**
         * Return the XML required to open a sitemap file
         *
         * @return string
         */
        function openFile() {
                return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
        }

        /**
         * Return the XML for a single sitemap entry
         *
         * The URL is XML-escaped here, so callers pass it raw. This matters
         * for URLs with a query string, where a literal "&" would otherwise
         * make the document malformed.
         *
         * @static
         *
         * @param string $url An RFC 2396 compliant URL, not yet XML-escaped
         * @param string $date An ISO 8601 date
         * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
         *
         * @return string
         */
        function fileEntry( $url, $date, $priority ) {
                // Flags and charset are explicit because htmlspecialchars() defaults
                // differ between PHP versions.
                $loc = htmlspecialchars( $url, ENT_QUOTES, 'UTF-8' );
                return
                        "\t<url>\n" .
                        "\t\t<loc>$loc</loc>\n" .
                        "\t\t<lastmod>$date</lastmod>\n" .
                        "\t\t<priority>$priority</priority>\n" .
                        "\t</url>\n";
        }

        /**
         * Return the XML required to close sitemap file
         *
         * @static
         * @return string
         */
        function closeFile() {
                return "</urlset>\n";
        }

        /**
         * Write a string to stderr followed by a UNIX newline
         *
         * @param string $str The message to write
         */
        function debug( $str ) {
                fwrite( $this->stderr, "$str\n" );
        }

        /**
         * Populate $this->limit
         *
         * Stores the byte length of the sitemap header, of one entry built
         * from a 255-byte title (63 four-byte plus one three-byte UTF-8
         * sequence), and of the sitemap footer.
         *
         * @param int $namespace The namespace the sample title is created in
         */
        function generateLimit( $namespace ) {
                // presumably a worst-case title length once URL-encoded; TODO confirm
                $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

                $this->limit = array(
                        strlen( $this->openFile() ),
                        strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
                        strlen( $this->closeFile() )
                );
        }
}
447
// Command-line entry point: print usage for --help, otherwise load the
// MediaWiki command-line environment and generate the sitemaps.
if ( in_array( '--help', $argv, true ) ) {
        echo <<<EOT
Usage: php generateSitemap.php [options]
        --help                  show this message

        --fspath=<path>         The file system path to save to, e.g /tmp/sitemap/

        --server=<server>       The protocol and server name to use in URLs, e.g.
                http://en.wikipedia.org. This is sometimes necessary because
                server name detection may fail in command line scripts.

        --compress=[yes|no]     compress the sitemap files, default yes

EOT;
        die( -1 );
}

// Options that take a value; must be set before commandLine.inc is loaded,
// which is expected to fill $options.
$optionsWithArgs = array( 'fspath', 'server', 'compress' );
require_once( dirname( __FILE__ ) . '/commandLine.inc' );

if ( isset( $options['server'] ) ) {
        $wgServer = $options['server'];
}

// Explicit isset() checks instead of @-suppression: a missing option yields
// null for the path and leaves compression switched on.
$gsFspath = isset( $options['fspath'] ) ? $options['fspath'] : null;
$gsCompress = !( isset( $options['compress'] ) && $options['compress'] === 'no' );

$gs = new GenerateSitemap( $gsFspath, $gsCompress );
$gs->main();
474