]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - includes/jobqueue/jobs/CategoryMembershipChangeJob.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / includes / jobqueue / jobs / CategoryMembershipChangeJob.php
1 <?php
2 /**
3  * Job to add recent change entries mentioning category membership changes.
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18  * http://www.gnu.org/copyleft/gpl.html
19  *
20  * @file
21  */
22 use MediaWiki\MediaWikiServices;
23 use Wikimedia\Rdbms\LBFactory;
24
/**
 * Job to add recent change entries mentioning category membership changes
 *
 * Parameters include:
 *   - pageId : page ID
 *   - revTimestamp : timestamp of the triggering revision
 *
 * Category changes will be mentioned for revisions at/after the timestamp for this page
 *
 * @since 1.27
 */
class CategoryMembershipChangeJob extends Job {
	/**
	 * @var int|null Transaction ticket obtained from the LBFactory in run() and
	 *  handed to commitAndWaitForReplication() between notification batches
	 */
	private $ticket;

	/** Seconds subtracted from the triggering revision timestamp to get the scan cutoff */
	const ENQUEUE_FUDGE_SEC = 60;

	/**
	 * @param Title $title
	 * @param array $params Job parameters; run() reads 'pageId' and 'revTimestamp'
	 */
	public function __construct( Title $title, array $params ) {
		parent::__construct( 'categoryMembershipChange', $title, $params );
		// Only need one job per page. Note that ENQUEUE_FUDGE_SEC handles races where an
		// older revision job gets inserted while the newer revision job is de-duplicated.
		$this->removeDuplicates = true;
	}

	/**
	 * Emit category membership notifications for every revision of the page that is
	 * newer than the last revision which already has a SRC_CATEGORIZE recent change.
	 *
	 * @return bool True on success. False (with the last error set) when the page cannot
	 *  be found, the per-page lock cannot be acquired, or the replica DB wait fails.
	 */
	public function run() {
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
		$lb = $lbFactory->getMainLB();
		$dbw = $lb->getConnection( DB_MASTER );

		$this->ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		$page = WikiPage::newFromID( $this->params['pageId'], WikiPage::READ_LATEST );
		if ( !$page ) {
			$this->setLastError( "Could not find page #{$this->params['pageId']}" );
			return false; // deleted?
		}

		// Use a named lock so that jobs for this page see each others' changes
		$lockKey = "CategoryMembershipUpdates:{$page->getId()}";
		$scopedLock = $dbw->getScopedLockAndFlush( $lockKey, __METHOD__, 3 );
		if ( !$scopedLock ) {
			$this->setLastError( "Could not acquire lock '$lockKey'" );
			return false;
		}

		$dbr = $lb->getConnection( DB_REPLICA, [ 'recentchanges' ] );
		// Wait till the replica DB is caught up so that jobs for this page see each others' changes
		if ( !$lb->safeWaitForMasterPos( $dbr ) ) {
			$this->setLastError( "Timed out while waiting for replica DB to catch up" );
			return false;
		}
		// Clear any stale REPEATABLE-READ snapshot
		$dbr->flushSnapshot( __METHOD__ );

		$cutoffUnix = wfTimestamp( TS_UNIX, $this->params['revTimestamp'] );
		// Using ENQUEUE_FUDGE_SEC handles jobs inserted out of revision order due to the delay
		// between COMMIT and actual enqueueing of the CategoryMembershipChangeJob job.
		$cutoffUnix -= self::ENQUEUE_FUDGE_SEC;

		// Get the newest revision that has a SRC_CATEGORIZE row...
		$row = $dbr->selectRow(
			[ 'revision', 'recentchanges' ],
			[ 'rev_timestamp', 'rev_id' ],
			[
				'rev_page' => $page->getId(),
				'rev_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $cutoffUnix ) )
			],
			__METHOD__,
			[ 'ORDER BY' => 'rev_timestamp DESC, rev_id DESC' ],
			[
				'recentchanges' => [
					'INNER JOIN',
					[
						'rc_this_oldid = rev_id',
						'rc_source' => RecentChange::SRC_CATEGORIZE,
						// Allow rc_cur_id or rc_timestamp index usage
						'rc_cur_id = rev_page',
						'rc_timestamp >= rev_timestamp'
					]
				]
			]
		);
		// Only consider revisions newer than any such revision
		if ( $row ) {
			$cutoffUnix = wfTimestamp( TS_UNIX, $row->rev_timestamp );
			$lastRevId = (int)$row->rev_id;
		} else {
			$lastRevId = 0;
		}

		// Find revisions to this page made around and after this revision which lack category
		// notifications in recent changes. This lets jobs pick up where the last one left off.
		$encCutoff = $dbr->addQuotes( $dbr->timestamp( $cutoffUnix ) );
		$res = $dbr->select(
			'revision',
			Revision::selectFields(),
			[
				'rev_page' => $page->getId(),
				"rev_timestamp > $encCutoff" .
					" OR (rev_timestamp = $encCutoff AND rev_id > $lastRevId)"
			],
			__METHOD__,
			[ 'ORDER BY' => 'rev_timestamp ASC, rev_id ASC' ]
		);

		// Apply all category updates in revision timestamp order
		foreach ( $res as $row ) {
			$this->notifyUpdatesForRevision( $lbFactory, $page, Revision::newFromRow( $row ) );
		}

		return true;
	}

	/**
	 * Compare the categories of a revision against those of its parent revision and
	 * trigger an "added"/"removed" notification for each category that differs.
	 *
	 * Returns without doing anything when the content of either revision is
	 * unavailable, when the parent revision cannot be loaded, or when the set of
	 * categories did not change.
	 *
	 * @param LBFactory $lbFactory
	 * @param WikiPage $page
	 * @param Revision $newRev
	 * @throws MWException
	 */
	protected function notifyUpdatesForRevision(
		LBFactory $lbFactory, WikiPage $page, Revision $newRev
	) {
		$config = RequestContext::getMain()->getConfig();
		$title = $page->getTitle();

		// Get the new revision
		if ( !$newRev->getContent() ) {
			return; // deleted?
		}

		// Get the prior revision (the same for null edits)
		if ( $newRev->getParentId() ) {
			$oldRev = Revision::newFromId( $newRev->getParentId(), Revision::READ_LATEST );
			// The parent row itself may be gone, in which case there is no Revision
			// object to call getContent() on; treat that like missing content.
			if ( !$oldRev || !$oldRev->getContent() ) {
				return; // deleted?
			}
		} else {
			$oldRev = null;
		}

		// Parse the new revision and get the categories
		$categoryChanges = $this->getExplicitCategoriesChanges( $title, $newRev, $oldRev );
		list( $categoryInserts, $categoryDeletes ) = $categoryChanges;
		if ( !$categoryInserts && !$categoryDeletes ) {
			return; // nothing to do
		}

		$catMembChange = new CategoryMembershipChange( $title, $newRev );
		$catMembChange->checkTemplateLinks();

		$batchSize = $config->get( 'UpdateRowsPerQuery' );
		// Running count of notifications across both loops below; a commit is issued
		// each time it reaches a multiple of the batch size.
		$insertCount = 0;

		foreach ( $categoryInserts as $categoryName ) {
			$categoryTitle = Title::makeTitle( NS_CATEGORY, $categoryName );
			$catMembChange->triggerCategoryAddedNotification( $categoryTitle );
			if ( $insertCount++ && ( $insertCount % $batchSize ) == 0 ) {
				$lbFactory->commitAndWaitForReplication( __METHOD__, $this->ticket );
			}
		}

		foreach ( $categoryDeletes as $categoryName ) {
			$categoryTitle = Title::makeTitle( NS_CATEGORY, $categoryName );
			$catMembChange->triggerCategoryRemovedNotification( $categoryTitle );
			// Increment exactly once per notification, same as the loop above
			if ( $insertCount++ && ( $insertCount % $batchSize ) == 0 ) {
				$lbFactory->commitAndWaitForReplication( __METHOD__, $this->ticket );
			}
		}
	}

	/**
	 * Work out which categories were added and which were removed between two revisions.
	 *
	 * @param Title $title
	 * @param Revision $newRev
	 * @param Revision|null $oldRev Prior revision; null means "no prior categories"
	 *
	 * @return array[] Two lists of category names: [ inserts, deletes ]
	 */
	private function getExplicitCategoriesChanges(
		Title $title, Revision $newRev, Revision $oldRev = null
	) {
		// Inject the same timestamp for both revision parses to avoid seeing category changes
		// due to time-based parser functions. Inject the same page title for the parses too.
		// Note that REPEATABLE-READ makes template/file pages appear unchanged between parses.
		$parseTimestamp = $newRev->getTimestamp();
		// Parse the old rev and get the categories. Do not use link tables as that
		// assumes these updates are perfectly FIFO and that link tables are always
		// up to date, neither of which are true.
		$oldCategories = $oldRev
			? $this->getCategoriesAtRev( $title, $oldRev, $parseTimestamp )
			: [];
		// Parse the new revision and get the categories
		$newCategories = $this->getCategoriesAtRev( $title, $newRev, $parseTimestamp );

		$categoryInserts = array_values( array_diff( $newCategories, $oldCategories ) );
		$categoryDeletes = array_values( array_diff( $oldCategories, $newCategories ) );

		return [ $categoryInserts, $categoryDeletes ];
	}

	/**
	 * Parse a revision's content and list the categories found in the parser output.
	 *
	 * @param Title $title
	 * @param Revision $rev
	 * @param string $parseTimestamp TS_MW
	 *
	 * @return string[] category names
	 */
	private function getCategoriesAtRev( Title $title, Revision $rev, $parseTimestamp ) {
		$content = $rev->getContent();
		$options = $content->getContentHandler()->makeParserOptions( 'canonical' );
		$options->setTimestamp( $parseTimestamp );
		// This could possibly use the parser cache if it checked the revision ID,
		// but that's more complicated than it's worth.
		$output = $content->getParserOutput( $title, $rev->getId(), $options );

		// array keys will cast numeric category names to ints
		// so we need to cast them back to strings to avoid breaking things!
		return array_map( 'strval', array_keys( $output->getCategories() ) );
	}

	/**
	 * De-duplication info with 'revTimestamp' removed from the params, so that the
	 * revision timestamp plays no part in telling jobs apart.
	 *
	 * @return array
	 */
	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		unset( $info['params']['revTimestamp'] ); // first job wins

		return $info;
	}
}