X-Git-Url: https://scripts.mit.edu/gitweb/autoinstallsdev/mediawiki.git/blobdiff_plain/19e297c21b10b1b8a3acad5e73fc71dcb35db44a..6932310fd58ebef145fa01eb76edf7150284d8ea:/includes/jobqueue/jobs/CategoryMembershipChangeJob.php diff --git a/includes/jobqueue/jobs/CategoryMembershipChangeJob.php b/includes/jobqueue/jobs/CategoryMembershipChangeJob.php new file mode 100644 index 00000000..3907fc65 --- /dev/null +++ b/includes/jobqueue/jobs/CategoryMembershipChangeJob.php @@ -0,0 +1,243 @@ +removeDuplicates = true; + } + + public function run() { + $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); + $lb = $lbFactory->getMainLB(); + $dbw = $lb->getConnection( DB_MASTER ); + + $this->ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ ); + + $page = WikiPage::newFromID( $this->params['pageId'], WikiPage::READ_LATEST ); + if ( !$page ) { + $this->setLastError( "Could not find page #{$this->params['pageId']}" ); + return false; // deleted? + } + + // Use a named lock so that jobs for this page see each others' changes + $lockKey = "CategoryMembershipUpdates:{$page->getId()}"; + $scopedLock = $dbw->getScopedLockAndFlush( $lockKey, __METHOD__, 3 ); + if ( !$scopedLock ) { + $this->setLastError( "Could not acquire lock '$lockKey'" ); + return false; + } + + $dbr = $lb->getConnection( DB_REPLICA, [ 'recentchanges' ] ); + // Wait till the replica DB is caught up so that jobs for this page see each others' changes + if ( !$lb->safeWaitForMasterPos( $dbr ) ) { + $this->setLastError( "Timed out while waiting for replica DB to catch up" ); + return false; + } + // Clear any stale REPEATABLE-READ snapshot + $dbr->flushSnapshot( __METHOD__ ); + + $cutoffUnix = wfTimestamp( TS_UNIX, $this->params['revTimestamp'] ); + // Using ENQUEUE_FUDGE_SEC handles jobs inserted out of revision order due to the delay + // between COMMIT and actual enqueueing of the CategoryMembershipChangeJob job. + $cutoffUnix -= self::ENQUEUE_FUDGE_SEC; + + // Get the newest revision that has a SRC_CATEGORIZE row... + $row = $dbr->selectRow( + [ 'revision', 'recentchanges' ], + [ 'rev_timestamp', 'rev_id' ], + [ + 'rev_page' => $page->getId(), + 'rev_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $cutoffUnix ) ) + ], + __METHOD__, + [ 'ORDER BY' => 'rev_timestamp DESC, rev_id DESC' ], + [ + 'recentchanges' => [ + 'INNER JOIN', + [ + 'rc_this_oldid = rev_id', + 'rc_source' => RecentChange::SRC_CATEGORIZE, + // Allow rc_cur_id or rc_timestamp index usage + 'rc_cur_id = rev_page', + 'rc_timestamp >= rev_timestamp' + ] + ] + ] + ); + // Only consider revisions newer than any such revision + if ( $row ) { + $cutoffUnix = wfTimestamp( TS_UNIX, $row->rev_timestamp ); + $lastRevId = (int)$row->rev_id; + } else { + $lastRevId = 0; + } + + // Find revisions to this page made around and after this revision which lack category + // notifications in recent changes. This lets jobs pick up were the last one left off. + $encCutoff = $dbr->addQuotes( $dbr->timestamp( $cutoffUnix ) ); + $res = $dbr->select( + 'revision', + Revision::selectFields(), + [ + 'rev_page' => $page->getId(), + "rev_timestamp > $encCutoff" . + " OR (rev_timestamp = $encCutoff AND rev_id > $lastRevId)" + ], + __METHOD__, + [ 'ORDER BY' => 'rev_timestamp ASC, rev_id ASC' ] + ); + + // Apply all category updates in revision timestamp order + foreach ( $res as $row ) { + $this->notifyUpdatesForRevision( $lbFactory, $page, Revision::newFromRow( $row ) ); + } + + return true; + } + + /** + * @param LBFactory $lbFactory + * @param WikiPage $page + * @param Revision $newRev + * @throws MWException + */ + protected function notifyUpdatesForRevision( + LBFactory $lbFactory, WikiPage $page, Revision $newRev + ) { + $config = RequestContext::getMain()->getConfig(); + $title = $page->getTitle(); + + // Get the new revision + if ( !$newRev->getContent() ) { + return; // deleted? + } + + // Get the prior revision (the same for null edits) + if ( $newRev->getParentId() ) { + $oldRev = Revision::newFromId( $newRev->getParentId(), Revision::READ_LATEST ); + if ( !$oldRev->getContent() ) { + return; // deleted? + } + } else { + $oldRev = null; + } + + // Parse the new revision and get the categories + $categoryChanges = $this->getExplicitCategoriesChanges( $title, $newRev, $oldRev ); + list( $categoryInserts, $categoryDeletes ) = $categoryChanges; + if ( !$categoryInserts && !$categoryDeletes ) { + return; // nothing to do + } + + $catMembChange = new CategoryMembershipChange( $title, $newRev ); + $catMembChange->checkTemplateLinks(); + + $batchSize = $config->get( 'UpdateRowsPerQuery' ); + $insertCount = 0; + + foreach ( $categoryInserts as $categoryName ) { + $categoryTitle = Title::makeTitle( NS_CATEGORY, $categoryName ); + $catMembChange->triggerCategoryAddedNotification( $categoryTitle ); + if ( $insertCount++ && ( $insertCount % $batchSize ) == 0 ) { + $lbFactory->commitAndWaitForReplication( __METHOD__, $this->ticket ); + } + } + + foreach ( $categoryDeletes as $categoryName ) { + $categoryTitle = Title::makeTitle( NS_CATEGORY, $categoryName ); + $catMembChange->triggerCategoryRemovedNotification( $categoryTitle ); + if ( $insertCount++ && ( $insertCount++ % $batchSize ) == 0 ) { + $lbFactory->commitAndWaitForReplication( __METHOD__, $this->ticket ); + } + } + } + + private function getExplicitCategoriesChanges( + Title $title, Revision $newRev, Revision $oldRev = null + ) { + // Inject the same timestamp for both revision parses to avoid seeing category changes + // due to time-based parser functions. Inject the same page title for the parses too. + // Note that REPEATABLE-READ makes template/file pages appear unchanged between parses. + $parseTimestamp = $newRev->getTimestamp(); + // Parse the old rev and get the categories. Do not use link tables as that + // assumes these updates are perfectly FIFO and that link tables are always + // up to date, neither of which are true. + $oldCategories = $oldRev + ? $this->getCategoriesAtRev( $title, $oldRev, $parseTimestamp ) + : []; + // Parse the new revision and get the categories + $newCategories = $this->getCategoriesAtRev( $title, $newRev, $parseTimestamp ); + + $categoryInserts = array_values( array_diff( $newCategories, $oldCategories ) ); + $categoryDeletes = array_values( array_diff( $oldCategories, $newCategories ) ); + + return [ $categoryInserts, $categoryDeletes ]; + } + + /** + * @param Title $title + * @param Revision $rev + * @param string $parseTimestamp TS_MW + * + * @return string[] category names + */ + private function getCategoriesAtRev( Title $title, Revision $rev, $parseTimestamp ) { + $content = $rev->getContent(); + $options = $content->getContentHandler()->makeParserOptions( 'canonical' ); + $options->setTimestamp( $parseTimestamp ); + // This could possibly use the parser cache if it checked the revision ID, + // but that's more complicated than it's worth. + $output = $content->getParserOutput( $title, $rev->getId(), $options ); + + // array keys will cast numeric category names to ints + // so we need to cast them back to strings to avoid breaking things! + return array_map( 'strval', array_keys( $output->getCategories() ) ); + } + + public function getDeduplicationInfo() { + $info = parent::getDeduplicationInfo(); + unset( $info['params']['revTimestamp'] ); // first job wins + + return $info; + } +}