]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - maintenance/refreshLinks.php
MediaWiki 1.30.2 renames
[autoinstalls/mediawiki.git] / maintenance / refreshLinks.php
1 <?php
2 /**
3  * Refresh link tables.
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18  * http://www.gnu.org/copyleft/gpl.html
19  *
20  * @file
21  * @ingroup Maintenance
22  */
23
24 use Wikimedia\Rdbms\IDatabase;
25
26 require_once __DIR__ . '/Maintenance.php';
27
28 /**
29  * Maintenance script to refresh link tables.
30  *
31  * @ingroup Maintenance
32  */
class RefreshLinks extends Maintenance {
	/** Number of processed pages between progress reports and replication waits. */
	const REPORTING_INTERVAL = 100;

	/** @var int|bool Namespace index to restrict the run to, or false for no restriction */
	protected $namespace = false;

	/**
	 * Declare the command line options and arguments of the script and
	 * set the default batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Refresh link tables' );
		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
		$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
		$this->addOption( 'e', 'Last page id to refresh', false, true );
		$this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
			'query, default 100000', false, true );
		$this->addOption( 'namespace', 'Only fix pages in this namespace', false, true );
		$this->addOption( 'category', 'Only fix pages in this category', false, true );
		$this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true );
		$this->addArg( 'start', 'Page_id to start from, default 1', false );
		$this->setBatchSize( 100 );
	}

	/**
	 * Entry point: read the options and dispatch to the matching refresh mode.
	 *
	 * Modes, in order of precedence: --category, --tracking-category,
	 * the regular refresh (followed by an unbounded cleanup of links from
	 * nonexistent pages), and finally --dfn-only (cleanup only, bounded by
	 * the given start/end IDs).
	 */
	public function execute() {
		// Note that there is a difference between not specifying the start
		// and end IDs and using the minimum and maximum values from the page
		// table. In the latter case, deleteLinksFromNonexistent() will not
		// delete entries for nonexistent IDs that fall outside the range.
		$start = (int)$this->getArg( 0 ) ?: null;
		$end = (int)$this->getOption( 'e' ) ?: null;

		// A chunk size below 1 would make deleteLinksFromNonexistent() find the
		// same "next start" forever, so reject it before doing any work.
		$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 );
		if ( $dfnChunkSize < 1 ) {
			$this->error( "dfn-chunk-size must be a positive integer!\n", true );
		}
		// Likewise, the batched loops below only terminate when a batch comes
		// back shorter than the batch size, which never happens for a size of 0.
		if ( $this->mBatchSize < 1 ) {
			$this->error( "Batch size must be a positive integer!\n", true );
		}

		$ns = $this->getOption( 'namespace' );
		$this->namespace = ( $ns === null ) ? false : (int)$ns;

		if ( ( $category = $this->getOption( 'category', false ) ) !== false ) {
			$title = Title::makeTitleSafe( NS_CATEGORY, $category );
			if ( !$title ) {
				$this->error( "'$category' is an invalid category name!\n", true );
			}
			$this->refreshCategory( $title );
		} elseif ( ( $category = $this->getOption( 'tracking-category', false ) ) !== false ) {
			$this->refreshTrackingCategory( $category );
		} elseif ( !$this->hasOption( 'dfn-only' ) ) {
			$new = $this->hasOption( 'new-only' );
			$redir = $this->hasOption( 'redirects-only' );
			$oldRedir = $this->hasOption( 'old-redirects-only' );
			$this->doRefreshLinks( $start, $new, $end, $redir, $oldRedir );
			$this->deleteLinksFromNonexistent( null, null, $this->mBatchSize, $dfnChunkSize );
		} else {
			$this->deleteLinksFromNonexistent( $start, $end, $this->mBatchSize, $dfnChunkSize );
		}
	}

	/**
	 * Build the query condition for the optional namespace restriction.
	 *
	 * @return array Condition on page_namespace, or an empty array when no
	 *  namespace was requested
	 */
	private function namespaceCond() {
		return $this->namespace !== false
			? [ 'page_namespace' => $this->namespace ]
			: [];
	}

	/**
	 * Print a progress line and wait for replication every
	 * REPORTING_INTERVAL items.
	 *
	 * @param int $count Running item count or page ID just reached
	 */
	private function reportProgress( $count ) {
		if ( $count % self::REPORTING_INTERVAL === 0 ) {
			$this->output( "$count\n" );
			wfWaitForSlaves();
		}
	}

	/**
	 * Do the actual link refreshing.
	 * @param int|null $start Page_id to start from
	 * @param bool $newOnly Only do pages with 1 edit
	 * @param int|null $end Page_id to stop at
	 * @param bool $redirectsOnly Only fix redirects
	 * @param bool $oldRedirectsOnly Only fix redirects without redirect entries
	 */
	private function doRefreshLinks( $start, $newOnly = false,
		$end = null, $redirectsOnly = false, $oldRedirectsOnly = false
	) {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		if ( $start === null ) {
			$start = 1;
		}

		// Give extensions a chance to optimize settings
		Hooks::run( 'MaintenanceRefreshLinksInit', [ $this ] );

		$what = $redirectsOnly ? "redirects" : "links";

		if ( $oldRedirectsOnly ) {
			// Pages flagged as redirects that have no row in the redirect table
			$conds = [
				"page_is_redirect=1",
				"rd_from IS NULL",
				self::intervalCond( $dbr, 'page_id', $start, $end ),
			] + $this->namespaceCond();

			$res = $dbr->select(
				[ 'page', 'redirect' ],
				'page_id',
				$conds,
				__METHOD__,
				[],
				[ 'redirect' => [ "LEFT JOIN", "page_id=rd_from" ] ]
			);
			$num = $res->numRows();
			$this->output( "Refreshing $num old redirects from $start...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				$this->reportProgress( ++$i );
				$this->fixRedirect( $row->page_id );
			}
		} elseif ( $newOnly ) {
			$this->output( "Refreshing $what from " );
			$res = $dbr->select( 'page',
				[ 'page_id' ],
				[
					'page_is_new' => 1,
					self::intervalCond( $dbr, 'page_id', $start, $end ),
				] + $this->namespaceCond(),
				__METHOD__
			);
			$num = $res->numRows();
			$this->output( "$num new articles...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				$this->reportProgress( ++$i );
				if ( $redirectsOnly ) {
					$this->fixRedirect( $row->page_id );
				} else {
					self::fixLinksFromArticle( $row->page_id, $this->namespace );
				}
			}
		} else {
			if ( !$end ) {
				// Cover redirect rows whose source page ID is beyond the last
				// existing page as well. Pass the caller name so the queries
				// are attributed to this script.
				$maxPage = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
				$maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', '', __METHOD__ );
				$end = max( $maxPage, $maxRD );
			}
			$this->output( "Refreshing redirects table.\n" );
			$this->output( "Starting from page_id $start of $end.\n" );

			for ( $id = $start; $id <= $end; $id++ ) {
				$this->reportProgress( $id );
				$this->fixRedirect( $id );
			}

			if ( !$redirectsOnly ) {
				$this->output( "Refreshing links tables.\n" );
				$this->output( "Starting from page_id $start of $end.\n" );

				for ( $id = $start; $id <= $end; $id++ ) {
					$this->reportProgress( $id );
					self::fixLinksFromArticle( $id, $this->namespace );
				}
			}
		}
	}

	/**
	 * Update the redirect entry for a given page.
	 *
	 * This method bypasses the "redirect" table to get the redirect target,
	 * and parses the page's content to fetch it. This makes sure that
	 * the redirect target is up to date and valid.
	 * This is particularly useful when modifying namespaces to be sure the
	 * entry in the "redirect" table points to the correct page and not to an
	 * invalid one.
	 *
	 * @param int $id The page ID to check
	 */
	private function fixRedirect( $id ) {
		$page = WikiPage::newFromID( $id );
		$dbw = $this->getDB( DB_MASTER );

		if ( $page === null ) {
			// This page doesn't exist (any more)
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', [ 'rd_from' => $id ],
				__METHOD__ );

			return;
		} elseif ( $this->namespace !== false
			&& !$page->getTitle()->inNamespace( $this->namespace )
		) {
			// Outside the requested namespace: leave it alone
			return;
		}

		$rt = null;
		$content = $page->getContent( Revision::RAW );
		if ( $content !== null ) {
			$rt = $content->getUltimateRedirectTarget();
		}

		if ( $rt === null ) {
			// The page is not a redirect
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', [ 'rd_from' => $id ], __METHOD__ );
			$fieldValue = 0;
		} else {
			$page->insertRedirectEntry( $rt );
			$fieldValue = 1;
		}

		// Update the page table to be sure it is in a consistent state
		$dbw->update( 'page', [ 'page_is_redirect' => $fieldValue ],
			[ 'page_id' => $id ], __METHOD__ );
	}

	/**
	 * Run LinksUpdate for all links on a given page_id
	 *
	 * Does nothing when the page does not exist, is outside the requested
	 * namespace, or has no content.
	 *
	 * @param int $id The page_id
	 * @param int|bool $ns Only fix links if it is in this namespace
	 */
	public static function fixLinksFromArticle( $id, $ns = false ) {
		$page = WikiPage::newFromID( $id );

		// The link cache is cleared for every ID, including nonexistent ones
		LinkCache::singleton()->clear();

		if ( $page === null ) {
			return;
		} elseif ( $ns !== false
			&& !$page->getTitle()->inNamespace( $ns ) ) {
			return;
		}

		$content = $page->getContent( Revision::RAW );
		if ( $content === null ) {
			return;
		}

		$updates = $content->getSecondaryDataUpdates(
			$page->getTitle(), /* $old = */ null, /* $recursive = */ false );
		foreach ( $updates as $update ) {
			// Each update is queued and then run immediately
			DeferredUpdates::addUpdate( $update );
			DeferredUpdates::doUpdates();
		}
	}

	/**
	 * Removes non-existing links from pages from pagelinks, imagelinks,
	 * categorylinks, templatelinks, externallinks, interwikilinks, langlinks and redirect tables.
	 *
	 * The page ID range is walked in chunks holding at most $chunkSize
	 * existing pages each; every chunk is handed to dfnCheckInterval().
	 *
	 * @param int|null $start Page_id to start from
	 * @param int|null $end Page_id to stop at
	 * @param int $batchSize The size of deletion batches
	 * @param int $chunkSize Maximum number of existent IDs to check per query;
	 *  must be at least 1, otherwise the chunk start never advances
	 *
	 * @author Merlijn van Deen <valhallasw@arctus.nl>
	 */
	private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
		$chunkSize = 100000
	) {
		wfWaitForSlaves();
		$this->output( "Deleting illegal entries from the links tables...\n" );
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		do {
			// Find the start of the next chunk. This is based only
			// on existent page_ids.
			$nextStart = $dbr->selectField(
				'page',
				'page_id',
				[ self::intervalCond( $dbr, 'page_id', $start, $end ) ]
				+ $this->namespaceCond(),
				__METHOD__,
				[ 'ORDER BY' => 'page_id', 'OFFSET' => $chunkSize ]
			);

			if ( $nextStart !== false ) {
				// To find the end of the current chunk, subtract one.
				// This will serve to limit the number of rows scanned in
				// dfnCheckInterval(), per query, to at most the sum of
				// the chunk size and deletion batch size.
				$chunkEnd = $nextStart - 1;
			} else {
				// This is the last chunk. Check all page_ids up to $end.
				$chunkEnd = $end;
			}

			$fmtStart = $start !== null ? "[$start" : '(-INF';
			$fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
			$this->output( "  Checking interval $fmtStart, $fmtChunkEnd\n" );
			$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

			$start = $nextStart;

		} while ( $nextStart !== false );
	}

	/**
	 * Delete, in batches, the rows of every links table whose source page ID
	 * lies in the given interval and is not present in the page table.
	 *
	 * @see RefreshLinks::deleteLinksFromNonexistent()
	 * @param int|null $start Page_id to start from
	 * @param int|null $end Page_id to stop at
	 * @param int $batchSize The size of deletion batches; must be at least 1
	 */
	private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
		$dbw = $this->getDB( DB_MASTER );
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		$linksTables = [ // table name => page_id field
			'pagelinks' => 'pl_from',
			'imagelinks' => 'il_from',
			'categorylinks' => 'cl_from',
			'templatelinks' => 'tl_from',
			'externallinks' => 'el_from',
			'iwlinks' => 'iwl_from',
			'langlinks' => 'll_from',
			'redirect' => 'rd_from',
			'page_props' => 'pp_page',
		];

		foreach ( $linksTables as $table => $field ) {
			$this->output( "    $table: 0" );
			$tableStart = $start;
			$counter = 0;
			do {
				$ids = $dbr->selectFieldValues(
					$table,
					$field,
					[
						self::intervalCond( $dbr, $field, $tableStart, $end ),
						"$field NOT IN ({$dbr->selectSQLText( 'page', 'page_id' )})",
					],
					__METHOD__,
					[ 'DISTINCT', 'ORDER BY' => $field, 'LIMIT' => $batchSize ]
				);

				$numIds = count( $ids );
				if ( $numIds ) {
					$counter += $numIds;
					$dbw->delete( $table, [ $field => $ids ], __METHOD__ );
					$this->output( ", $counter" );
					// IDs are sorted, so continue right after the largest one
					$tableStart = $ids[$numIds - 1] + 1;
					wfWaitForSlaves();
				}

				// A short batch means this table has no more orphaned rows in range
			} while ( $numIds >= $batchSize && ( $end === null || $tableStart <= $end ) );

			$this->output( " deleted.\n" );
		}
	}

	/**
	 * Build a SQL expression for a closed interval (i.e. BETWEEN).
	 *
	 * By specifying a null $start or $end, it is also possible to create
	 * half-bounded or unbounded intervals using this function.
	 *
	 * @param IDatabase $db
	 * @param string $var Field name
	 * @param mixed $start First value to include or null
	 * @param mixed $end Last value to include or null
	 * @return string
	 */
	private static function intervalCond( IDatabase $db, $var, $start, $end ) {
		if ( $start === null && $end === null ) {
			return "$var IS NOT NULL";
		} elseif ( $end === null ) {
			return "$var >= {$db->addQuotes( $start )}";
		} elseif ( $start === null ) {
			return "$var <= {$db->addQuotes( $end )}";
		} else {
			return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
		}
	}

	/**
	 * Refreshes links for pages in a tracking category
	 *
	 * @param string $category Category key
	 */
	private function refreshTrackingCategory( $category ) {
		$cats = $this->getPossibleCategories( $category );

		if ( !$cats ) {
			$this->error( "Tracking category '$category' is disabled\n" );
			// Output to stderr but don't bail out,
		}

		foreach ( $cats as $cat ) {
			$this->refreshCategory( $cat );
		}
	}

	/**
	 * Refreshes links for all pages that are members of a category
	 *
	 * Members are fetched in batches of the configured batch size, ordered
	 * by (cl_timestamp, cl_from); each batch resumes after the last row seen.
	 *
	 * @param Title $category
	 */
	private function refreshCategory( Title $category ) {
		$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );

		$dbr = $this->getDB( DB_REPLICA );
		$conds = [
			'page_id=cl_from',
			'cl_to' => $category->getDBkey(),
		];
		if ( $this->namespace !== false ) {
			$conds['page_namespace'] = $this->namespace;
		}

		$i = 0;
		$timestamp = '';
		$lastId = 0;
		do {
			$finalConds = $conds;
			$quotedTimestamp = $dbr->addQuotes( $timestamp );
			$finalConds[] =
				"(cl_timestamp > $quotedTimestamp " .
				"OR (cl_timestamp = $quotedTimestamp AND cl_from > $lastId))";
			$res = $dbr->select( [ 'page', 'categorylinks' ],
				[ 'page_id', 'cl_timestamp' ],
				$finalConds,
				__METHOD__,
				[
					'ORDER BY' => [ 'cl_timestamp', 'cl_from' ],
					'LIMIT' => $this->mBatchSize,
				]
			);

			foreach ( $res as $row ) {
				$this->reportProgress( ++$i );
				// Cast because the value is interpolated into the next query
				$lastId = (int)$row->page_id;
				$timestamp = $row->cl_timestamp;
				self::fixLinksFromArticle( $row->page_id );
			}

			// A short batch means the category has been exhausted
		} while ( $res->numRows() == $this->mBatchSize );
	}

	/**
	 * Returns a list of possible categories for a given tracking category key
	 *
	 * An unknown key is reported through error() with the die flag set, so
	 * the script is expected to stop there rather than return.
	 *
	 * @param string $categoryKey
	 * @return Title[]
	 */
	private function getPossibleCategories( $categoryKey ) {
		$trackingCategories = new TrackingCategories( $this->getConfig() );
		$cats = $trackingCategories->getTrackingCategories();
		if ( isset( $cats[$categoryKey] ) ) {
			return $cats[$categoryKey]['cats'];
		}
		$this->error( "Unknown tracking category {$categoryKey}\n", true );
	}
}
491
492 $maintClass = 'RefreshLinks';
493 require_once RUN_MAINTENANCE_IF_MAIN;