scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blobdiff - maintenance/refreshLinks.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / maintenance / refreshLinks.php
index 144e96c57c36a8a6cb330e1657ff6f7e2f76f961..b099aff44f360c44f0e5f4f08a60cc2460376691 100644 (file)
@@ -1,5 +1,7 @@
 <?php
 /**
+ * Refresh link tables.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  * http://www.gnu.org/copyleft/gpl.html
  *
+ * @file
  * @ingroup Maintenance
  */
 
-require_once( dirname( __FILE__ ) . '/Maintenance.php' );
+use Wikimedia\Rdbms\IDatabase;
+
+require_once __DIR__ . '/Maintenance.php';
 
+/**
+ * Maintenance script to refresh link tables.
+ *
+ * @ingroup Maintenance
+ */
 class RefreshLinks extends Maintenance {
+       const REPORTING_INTERVAL = 100;
+
+       /** @var int|bool */
+       protected $namespace = false;
+
        public function __construct() {
                parent::__construct();
-               $this->mDescription = "Refresh link tables";
+               $this->addDescription( 'Refresh link tables' );
                $this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
                $this->addOption( 'new-only', 'Only affect articles with just a single edit' );
                $this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
                $this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
-               $this->addOption( 'm', 'Maximum replication lag', false, true );
                $this->addOption( 'e', 'Last page id to refresh', false, true );
+               $this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
+                       'query, default 100000', false, true );
+               $this->addOption( 'namespace', 'Only fix pages in this namespace', false, true );
+               $this->addOption( 'category', 'Only fix pages in this category', false, true );
+               $this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true );
                $this->addArg( 'start', 'Page_id to start from, default 1', false );
                $this->setBatchSize( 100 );
        }
 
        public function execute() {
-               $max = $this->getOption( 'm', 0 );
-               if ( !$this->hasOption( 'dfn-only' ) ) {
-                       $start = $this->getArg( 0, 1 );
-                       $new = $this->getOption( 'new-only', false );
-                       $end = $this->getOption( 'e', 0 );
-                       $redir = $this->getOption( 'redirects-only', false );
-                       $oldRedir = $this->getOption( 'old-redirects-only', false );
-                       $this->doRefreshLinks( $start, $new, $max, $end, $redir, $oldRedir );
+               // Note that there is a difference between not specifying the start
+               // and end IDs and using the minimum and maximum values from the page
+               // table. In the latter case, deleteLinksFromNonexistent() will not
+               // delete entries for nonexistent IDs that fall outside the range.
+               $start = (int)$this->getArg( 0 ) ?: null;
+               $end = (int)$this->getOption( 'e' ) ?: null;
+               $dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 );
+               $ns = $this->getOption( 'namespace' );
+               if ( $ns === null ) {
+                       $this->namespace = false;
+               } else {
+                       $this->namespace = (int)$ns;
                }
-               $this->deleteLinksFromNonexistent( $max, $this->mBatchSize );
+               if ( ( $category = $this->getOption( 'category', false ) ) !== false ) {
+                       $title = Title::makeTitleSafe( NS_CATEGORY, $category );
+                       if ( !$title ) {
+                               $this->error( "'$category' is an invalid category name!\n", true );
+                       }
+                       $this->refreshCategory( $title );
+               } elseif ( ( $category = $this->getOption( 'tracking-category', false ) ) !== false ) {
+                       $this->refreshTrackingCategory( $category );
+               } elseif ( !$this->hasOption( 'dfn-only' ) ) {
+                       $new = $this->hasOption( 'new-only' );
+                       $redir = $this->hasOption( 'redirects-only' );
+                       $oldRedir = $this->hasOption( 'old-redirects-only' );
+                       $this->doRefreshLinks( $start, $new, $end, $redir, $oldRedir );
+                       $this->deleteLinksFromNonexistent( null, null, $this->mBatchSize, $dfnChunkSize );
+               } else {
+                       $this->deleteLinksFromNonexistent( $start, $end, $this->mBatchSize, $dfnChunkSize );
+               }
+       }
+
+       private function namespaceCond() {
+               return $this->namespace !== false
+                       ? [ 'page_namespace' => $this->namespace ]
+                       : [];
        }
 
        /**
         * Do the actual link refreshing.
-        * @param $start int Page_id to start from
-        * @param $newOnly bool Only do pages with 1 edit
-        * @param $maxLag int Max DB replication lag
-        * @param $end int Page_id to stop at
-        * @param $redirectsOnly bool Only fix redirects
-        * @param $oldRedirectsOnly bool Only fix redirects without redirect entries
+        * @param int|null $start Page_id to start from
+        * @param bool $newOnly Only do pages with 1 edit
+        * @param int|null $end Page_id to stop at
+        * @param bool $redirectsOnly Only fix redirects
+        * @param bool $oldRedirectsOnly Only fix redirects without redirect entries
         */
-       private function doRefreshLinks( $start, $newOnly = false, $maxLag = false,
-                                               $end = 0, $redirectsOnly = false, $oldRedirectsOnly = false ) {
-               global $wgUser, $wgParser, $wgUseTidy;
-
-               $reportingInterval = 100;
-               $dbr = wfGetDB( DB_SLAVE );
-               $start = intval( $start );
-
-               # Don't generate TeX PNGs (lack of a sensible current directory causes errors anyway)
-               $wgUser->setOption( 'math', MW_MATH_SOURCE );
+       private function doRefreshLinks( $start, $newOnly = false,
+               $end = null, $redirectsOnly = false, $oldRedirectsOnly = false
+       ) {
+               $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
 
-               # Don't generate extension images (e.g. Timeline)
-               if ( method_exists( $wgParser, "clearTagHooks" ) ) {
-                       $wgParser->clearTagHooks();
+               if ( $start === null ) {
+                       $start = 1;
                }
 
-               # Don't use HTML tidy
-               $wgUseTidy = false;
+               // Give extensions a chance to optimize settings
+               Hooks::run( 'MaintenanceRefreshLinksInit', [ $this ] );
 
                $what = $redirectsOnly ? "redirects" : "links";
 
                if ( $oldRedirectsOnly ) {
                        # This entire code path is cut-and-pasted from below.  Hurrah.
-                       $res = $dbr->query(
-                               "SELECT page_id " .
-                               "FROM page " .
-                               "LEFT JOIN redirect ON page_id=rd_from " .
-                               "WHERE page_is_redirect=1 AND rd_from IS NULL AND " .
-                               ( $end == 0 ? "page_id >= $start"
-                                                  : "page_id BETWEEN $start AND $end" ),
-                               __METHOD__
+
+                       $conds = [
+                               "page_is_redirect=1",
+                               "rd_from IS NULL",
+                               self::intervalCond( $dbr, 'page_id', $start, $end ),
+                       ] + $this->namespaceCond();
+
+                       $res = $dbr->select(
+                               [ 'page', 'redirect' ],
+                               'page_id',
+                               $conds,
+                               __METHOD__,
+                               [],
+                               [ 'redirect' => [ "LEFT JOIN", "page_id=rd_from" ] ]
                        );
-                       $num = $dbr->numRows( $res );
+                       $num = $res->numRows();
                        $this->output( "Refreshing $num old redirects from $start...\n" );
 
                        $i = 0;
+
                        foreach ( $res as $row ) {
-                               if ( !( ++$i % $reportingInterval ) ) {
+                               if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
                                        $this->output( "$i\n" );
-                                       wfWaitForSlaves( $maxLag );
+                                       wfWaitForSlaves();
                                }
                                $this->fixRedirect( $row->page_id );
                        }
                } elseif ( $newOnly ) {
                        $this->output( "Refreshing $what from " );
                        $res = $dbr->select( 'page',
-                               array( 'page_id' ),
-                               array(
+                               [ 'page_id' ],
+                               [
                                        'page_is_new' => 1,
-                                       "page_id >= $start" ),
+                                       self::intervalCond( $dbr, 'page_id', $start, $end ),
+                               ] + $this->namespaceCond(),
                                __METHOD__
                        );
-                       $num = $dbr->numRows( $res );
+                       $num = $res->numRows();
                        $this->output( "$num new articles...\n" );
 
                        $i = 0;
                        foreach ( $res as $row ) {
-                               if ( !( ++$i % $reportingInterval ) ) {
+                               if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
                                        $this->output( "$i\n" );
-                                       wfWaitForSlaves( $maxLag );
+                                       wfWaitForSlaves();
                                }
-                               if ( $redirectsOnly )
+                               if ( $redirectsOnly ) {
                                        $this->fixRedirect( $row->page_id );
-                               else
-                                       self::fixLinksFromArticle( $row->page_id );
+                               } else {
+                                       self::fixLinksFromArticle( $row->page_id, $this->namespace );
+                               }
                        }
                } else {
                        if ( !$end ) {
@@ -132,152 +178,316 @@ class RefreshLinks extends Maintenance {
                        $this->output( "Starting from page_id $start of $end.\n" );
 
                        for ( $id = $start; $id <= $end; $id++ ) {
-
-                               if ( !( $id % $reportingInterval ) ) {
+                               if ( !( $id % self::REPORTING_INTERVAL ) ) {
                                        $this->output( "$id\n" );
-                                       wfWaitForSlaves( $maxLag );
+                                       wfWaitForSlaves();
                                }
                                $this->fixRedirect( $id );
                        }
 
                        if ( !$redirectsOnly ) {
-                               $this->output( "Refreshing links table.\n" );
+                               $this->output( "Refreshing links tables.\n" );
                                $this->output( "Starting from page_id $start of $end.\n" );
 
                                for ( $id = $start; $id <= $end; $id++ ) {
-
-                                       if ( !( $id % $reportingInterval ) ) {
+                                       if ( !( $id % self::REPORTING_INTERVAL ) ) {
                                                $this->output( "$id\n" );
-                                               wfWaitForSlaves( $maxLag );
+                                               wfWaitForSlaves();
                                        }
-                                       self::fixLinksFromArticle( $id );
+                                       self::fixLinksFromArticle( $id, $this->namespace );
                                }
                        }
                }
        }
 
        /**
-        * Update the redirect entry for a given page
-        * @param $id int The page_id of the redirect
+        * Update the redirect entry for a given page.
+        *
+        * This method bypasses the "redirect" table to get the redirect target,
+        * and parses the page's content to fetch it. This ensures that
+        * the redirect target is up to date and valid.
+        * This is particularly useful when modifying namespaces to be sure the
+        * entry in the "redirect" table points to the correct page and not to an
+        * invalid one.
+        *
+        * @param int $id The page ID to check
         */
        private function fixRedirect( $id ) {
-               global $wgTitle, $wgArticle;
+               $page = WikiPage::newFromID( $id );
+               $dbw = $this->getDB( DB_MASTER );
 
-               $wgTitle = Title::newFromID( $id );
-               $dbw = wfGetDB( DB_MASTER );
-
-               if ( is_null( $wgTitle ) ) {
+               if ( $page === null ) {
                        // This page doesn't exist (any more)
                        // Delete any redirect table entry for it
-                       $dbw->delete( 'redirect', array( 'rd_from' => $id ),
+                       $dbw->delete( 'redirect', [ 'rd_from' => $id ],
                                __METHOD__ );
+
+                       return;
+               } elseif ( $this->namespace !== false
+                       && !$page->getTitle()->inNamespace( $this->namespace )
+               ) {
                        return;
                }
-               $wgArticle = new Article( $wgTitle );
 
-               $rt = $wgArticle->followRedirect();
+               $rt = null;
+               $content = $page->getContent( Revision::RAW );
+               if ( $content !== null ) {
+                       $rt = $content->getUltimateRedirectTarget();
+               }
 
-               if ( !$rt || !is_object( $rt ) ) {
-                       // $wgTitle is not a redirect
+               if ( $rt === null ) {
+                       // The page is not a redirect
                        // Delete any redirect table entry for it
-                       $dbw->delete( 'redirect', array( 'rd_from' => $id ),
-                               __METHOD__ );
+                       $dbw->delete( 'redirect', [ 'rd_from' => $id ], __METHOD__ );
+                       $fieldValue = 0;
                } else {
-                       $wgArticle->updateRedirectOn( $dbw, $rt );
+                       $page->insertRedirectEntry( $rt );
+                       $fieldValue = 1;
                }
+
+               // Update the page table to be sure it is in a consistent state
+               $dbw->update( 'page', [ 'page_is_redirect' => $fieldValue ],
+                       [ 'page_id' => $id ], __METHOD__ );
        }
 
        /**
         * Run LinksUpdate for all links on a given page_id
-        * @param $id int The page_id
+        * @param int $id The page_id
+        * @param int|bool $ns Only fix links if it is in this namespace
         */
-       public static function fixLinksFromArticle( $id ) {
-               global $wgTitle, $wgParser;
-
-               $wgTitle = Title::newFromID( $id );
-               $dbw = wfGetDB( DB_MASTER );
+       public static function fixLinksFromArticle( $id, $ns = false ) {
+               $page = WikiPage::newFromID( $id );
 
                LinkCache::singleton()->clear();
 
-               if ( is_null( $wgTitle ) ) {
+               if ( $page === null ) {
+                       return;
+               } elseif ( $ns !== false
+                       && !$page->getTitle()->inNamespace( $ns ) ) {
                        return;
                }
-               $dbw->begin();
 
-               $revision = Revision::newFromTitle( $wgTitle );
-               if ( !$revision ) {
+               $content = $page->getContent( Revision::RAW );
+               if ( $content === null ) {
                        return;
                }
 
-               $options = new ParserOptions;
-               $parserOutput = $wgParser->parse( $revision->getText(), $wgTitle, $options, true, true, $revision->getId() );
-               $update = new LinksUpdate( $wgTitle, $parserOutput, false );
-               $update->doUpdate();
-               $dbw->commit();
+               $updates = $content->getSecondaryDataUpdates(
+                       $page->getTitle(), /* $old = */ null, /* $recursive = */ false );
+               foreach ( $updates as $update ) {
+                       DeferredUpdates::addUpdate( $update );
+                       DeferredUpdates::doUpdates();
+               }
        }
 
-       /*
+       /**
         * Removes non-existing links from pages from pagelinks, imagelinks,
-        * categorylinks, templatelinks and externallinks tables.
+        * categorylinks, templatelinks, externallinks, interwikilinks, langlinks and redirect tables.
         *
-        * @param $maxLag
-        * @param $batchSize The size of deletion batches
+        * @param int|null $start Page_id to start from
+        * @param int|null $end Page_id to stop at
+        * @param int $batchSize The size of deletion batches
+        * @param int $chunkSize Maximum number of existent IDs to check per query
         *
         * @author Merlijn van Deen <valhallasw@arctus.nl>
         */
-       private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
-               wfWaitForSlaves( $maxLag );
+       private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
+               $chunkSize = 100000
+       ) {
+               wfWaitForSlaves();
+               $this->output( "Deleting illegal entries from the links tables...\n" );
+               $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
+               do {
+                       // Find the start of the next chunk. This is based only
+                       // on existent page_ids.
+                       $nextStart = $dbr->selectField(
+                               'page',
+                               'page_id',
+                               [ self::intervalCond( $dbr, 'page_id', $start, $end ) ]
+                               + $this->namespaceCond(),
+                               __METHOD__,
+                               [ 'ORDER BY' => 'page_id', 'OFFSET' => $chunkSize ]
+                       );
+
+                       if ( $nextStart !== false ) {
+                               // To find the end of the current chunk, subtract one.
+                               // This will serve to limit the number of rows scanned in
+                               // dfnCheckInterval(), per query, to at most the sum of
+                               // the chunk size and deletion batch size.
+                               $chunkEnd = $nextStart - 1;
+                       } else {
+                               // This is the last chunk. Check all page_ids up to $end.
+                               $chunkEnd = $end;
+                       }
+
+                       $fmtStart = $start !== null ? "[$start" : '(-INF';
+                       $fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
+                       $this->output( "  Checking interval $fmtStart, $fmtChunkEnd\n" );
+                       $this->dfnCheckInterval( $start, $chunkEnd, $batchSize );
 
-               $dbw = wfGetDB( DB_MASTER );
+                       $start = $nextStart;
 
-               $lb = wfGetLBFactory()->newMainLB();
-               $dbr = $lb->getConnection( DB_SLAVE );
-               $dbr->bufferResults( false );
+               } while ( $nextStart !== false );
+       }
 
-               $linksTables = array( // table name => page_id field
+       /**
+        * @see RefreshLinks::deleteLinksFromNonexistent()
+        * @param int|null $start Page_id to start from
+        * @param int|null $end Page_id to stop at
+        * @param int $batchSize The size of deletion batches
+        */
+       private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
+               $dbw = $this->getDB( DB_MASTER );
+               $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
+
+               $linksTables = [ // table name => page_id field
                        'pagelinks' => 'pl_from',
                        'imagelinks' => 'il_from',
                        'categorylinks' => 'cl_from',
                        'templatelinks' => 'tl_from',
                        'externallinks' => 'el_from',
-               );
+                       'iwlinks' => 'iwl_from',
+                       'langlinks' => 'll_from',
+                       'redirect' => 'rd_from',
+                       'page_props' => 'pp_page',
+               ];
 
                foreach ( $linksTables as $table => $field ) {
-                       $this->output( "Retrieving illegal entries from $table... " );
-
-                       // SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL;
-                       $results = $dbr->select( array( $table, 'page' ),
-                                                 $field,
-                                                 array( 'page_id' => null ),
-                                                 __METHOD__,
-                                                 'DISTINCT',
-                                                 array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) )
+                       $this->output( "    $table: 0" );
+                       $tableStart = $start;
+                       $counter = 0;
+                       do {
+                               $ids = $dbr->selectFieldValues(
+                                       $table,
+                                       $field,
+                                       [
+                                               self::intervalCond( $dbr, $field, $tableStart, $end ),
+                                               "$field NOT IN ({$dbr->selectSQLText( 'page', 'page_id' )})",
+                                       ],
+                                       __METHOD__,
+                                       [ 'DISTINCT', 'ORDER BY' => $field, 'LIMIT' => $batchSize ]
+                               );
+
+                               $numIds = count( $ids );
+                               if ( $numIds ) {
+                                       $counter += $numIds;
+                                       $dbw->delete( $table, [ $field => $ids ], __METHOD__ );
+                                       $this->output( ", $counter" );
+                                       $tableStart = $ids[$numIds - 1] + 1;
+                                       wfWaitForSlaves();
+                               }
+
+                       } while ( $numIds >= $batchSize && ( $end === null || $tableStart <= $end ) );
+
+                       $this->output( " deleted.\n" );
+               }
+       }
+
+       /**
+        * Build a SQL expression for a closed interval (i.e. BETWEEN).
+        *
+        * By specifying a null $start or $end, it is also possible to create
+        * half-bounded or unbounded intervals using this function.
+        *
+        * @param IDatabase $db
+        * @param string $var Field name
+        * @param mixed $start First value to include or null
+        * @param mixed $end Last value to include or null
+        * @return string
+        */
+       private static function intervalCond( IDatabase $db, $var, $start, $end ) {
+               if ( $start === null && $end === null ) {
+                       return "$var IS NOT NULL";
+               } elseif ( $end === null ) {
+                       return "$var >= {$db->addQuotes( $start )}";
+               } elseif ( $start === null ) {
+                       return "$var <= {$db->addQuotes( $end )}";
+               } else {
+                       return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
+               }
+       }
+
+       /**
+        * Refreshes links for pages in a tracking category
+        *
+        * @param string $category Category key
+        */
+       private function refreshTrackingCategory( $category ) {
+               $cats = $this->getPossibleCategories( $category );
+
+               if ( !$cats ) {
+                       $this->error( "Tracking category '$category' is disabled\n" );
+                       // Output to stderr but don't bail out.
+               }
+
+               foreach ( $cats as $cat ) {
+                       $this->refreshCategory( $cat );
+               }
+       }
+
+       /**
+        * Refreshes links for pages in a category
+        *
+        * @param Title $category
+        */
+       private function refreshCategory( Title $category ) {
+               $this->output( "Refreshing pages in category '{$category->getText()}'...\n" );
+
+               $dbr = $this->getDB( DB_REPLICA );
+               $conds = [
+                       'page_id=cl_from',
+                       'cl_to' => $category->getDBkey(),
+               ];
+               if ( $this->namespace !== false ) {
+                       $conds['page_namespace'] = $this->namespace;
+               }
+
+               $i = 0;
+               $timestamp = '';
+               $lastId = 0;
+               do {
+                       $finalConds = $conds;
+                       $timestamp = $dbr->addQuotes( $timestamp );
+                       $finalConds [] =
+                               "(cl_timestamp > $timestamp OR (cl_timestamp = $timestamp AND cl_from > $lastId))";
+                       $res = $dbr->select( [ 'page', 'categorylinks' ],
+                               [ 'page_id', 'cl_timestamp' ],
+                               $finalConds,
+                               __METHOD__,
+                               [
+                                       'ORDER BY' => [ 'cl_timestamp', 'cl_from' ],
+                                       'LIMIT' => $this->mBatchSize,
+                               ]
                        );
 
-                       $counter = 0;
-                       $list = array();
-                       $this->output( "0.." );
-
-                       foreach ( $results as $row ) {
-                               $counter++;
-                               $list[] = $row->$field;
-                               if ( ( $counter % $batchSize ) == 0 ) {
-                                       wfWaitForSlaves( 5 );
-                                       $dbw->delete( $table, array( $field => $list ), __METHOD__ );
-
-                                       $this->output( $counter . ".." );
-                                       $list = array();
+                       foreach ( $res as $row ) {
+                               if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
+                                       $this->output( "$i\n" );
+                                       wfWaitForSlaves();
                                }
+                               $lastId = $row->page_id;
+                               $timestamp = $row->cl_timestamp;
+                               self::fixLinksFromArticle( $row->page_id );
                        }
-                       $this->output( $counter );
-                       if ( count( $list ) > 0 ) {
-                               $dbw->delete( $table, array( $field => $list ), __METHOD__ );
-                       }
-                       $this->output( "\n" );
+
+               } while ( $res->numRows() == $this->mBatchSize );
+       }
+
+       /**
+        * Returns a list of possible categories for a given tracking category key
+        *
+        * @param string $categoryKey
+        * @return Title[]
+        */
+       private function getPossibleCategories( $categoryKey ) {
+               $trackingCategories = new TrackingCategories( $this->getConfig() );
+               $cats = $trackingCategories->getTrackingCategories();
+               if ( isset( $cats[$categoryKey] ) ) {
+                       return $cats[$categoryKey]['cats'];
                }
-               $lb->closeAll();
+               $this->error( "Unknown tracking category {$categoryKey}\n", true );
        }
 }
 
 $maintClass = 'RefreshLinks';
-require_once( RUN_MAINTENANCE_IF_MAIN );
+require_once RUN_MAINTENANCE_IF_MAIN;