]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - maintenance/storage/trackBlobs.php
MediaWiki 1.30.2-scripts
[autoinstalls/mediawiki.git] / maintenance / storage / trackBlobs.php
1 <?php
2 /**
3  * Adds blobs from a given external storage cluster to the blob_tracking table.
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18  * http://www.gnu.org/copyleft/gpl.html
19  *
20  * @file
21  * @ingroup Maintenance
22  * @see wfWaitForSlaves()
23  */
24
25 use Wikimedia\Rdbms\DBConnectionError;
26
27 require __DIR__ . '/../commandLine.inc';
28
// Every positional argument names one external storage cluster to scan;
// without at least one there is nothing to do, so print usage and bail out.
if ( count( $args ) === 0 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
		"Adds blobs from a given ES cluster to the blob_tracking table\n" .
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Run every tracking pass over the requested clusters, then report completion.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
39
/**
 * Rebuilds the blob_tracking table (and, when GMP is available, the
 * blob_orphans table) for a set of external storage clusters.
 *
 * The passes, in the order run() executes them:
 *  - checkIntegrity(): refuse to continue if the text or archive tables contain
 *    rows this script is not prepared to handle
 *  - initTrackingTable(): drop and recreate the tracking tables
 *  - trackRevisions(): record blobs referenced through revision rows
 *  - trackOrphanText(): record blobs referenced by text rows with no tracking row yet
 *  - findOrphanBlobs(): record blobs that exist on a cluster but were never referenced
 */
class TrackBlobs {
	/** @var string[] Names of the external storage clusters to scan */
	public $clusters;

	/** @var string|null SQL condition cached by getTextClause() */
	public $textClause;

	/** @var bool|null True when the gmp extension is loaded; left unset otherwise */
	public $doBlobOrphans;

	/** @var array Map of cluster name => GMP bitfield with one bit set per tracked blob ID */
	public $trackedBlobs = [];

	/** @var int Maximum number of rows fetched per SELECT */
	public $batchSize = 1000;

	/** @var int Number of batches between two progress reports */
	public $reportingInterval = 10;

	/**
	 * @param string[] $clusters Names of the external storage clusters to scan
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			// One bitfield per cluster; bit N gets set once blob ID N is seen referenced
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run all passes in order. The orphan blob pass is skipped without GMP.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Abort the script (exit status 1) if the text table holds HistoryBlobStub
	 * objects, or if the archive table holds external storage pointers or
	 * HistoryBlobStub objects.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_REPLICA );

		// Scan for HistoryBlobStub objects in the text table (T22757)

		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (T24624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// The flags of the first offending row tell the two failure modes apart
		if ( strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and recreate them from blob_tracking.sql.
	 *
	 * Each table is checked and dropped on its own, so a half-initialised state
	 * (only one of the two tables present) neither makes the DROP fail nor
	 * leaves a stale table behind for the schema file to collide with.
	 * NOTE(review): this assumes blob_tracking.sql creates both tables -- confirm.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
			if ( $dbw->tableExists( $table, __METHOD__ ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Build, and cache in $this->textClause, the SQL condition that matches
	 * old_text values starting with "DB://<cluster>/" for any configured cluster.
	 *
	 * @return string
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_REPLICA );
			$alternatives = [];
			foreach ( $this->clusters as $cluster ) {
				$alternatives[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $alternatives );
		}

		return $this->textClause;
	}

	/**
	 * Split an external storage pointer of the form DB://<cluster>/<id>[/<hash>]
	 * into its components.
	 *
	 * @param string $text Pointer text, as stored in old_text
	 * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
	 *  'hash' (string, or null when the pointer has no hash part); false when
	 *  $text does not have the expected form
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return [
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// preg_match() leaves the trailing group out of $m when it did not participate
			'hash' => isset( $m[3] ) ? $m[3] : null
		];
	}

	/**
	 *  Scan the revision table for rows stored in the specified clusters
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_REPLICA );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			// Page through matching revisions in rev_id order, resuming after the last row seen
			$res = $dbr->select( [ 'revision', 'text' ],
				[ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ],
				[
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				],
				__METHOD__,
				[
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				]
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so they are not fetched again
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				// Strict comparison: the cluster name is used as an array key below
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = [
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected above
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the replica DB
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_REPLICA );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// The LEFT JOIN combined with "bt_text_id IS NULL" keeps only text rows
			// that do not have a blob_tracking row yet
			$res = $dbr->select( [ 'text', 'blob_tracking' ],
				[ 'old_id', 'old_flags', 'old_text' ],
				[
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				],
				__METHOD__,
				[
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				],
				[ 'blob_tracking' => [ 'LEFT JOIN', 'bt_text_id=old_id' ] ]
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so they are not fetched again
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				// Strict comparison: the cluster name is used as an array key below
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				// Orphan text has no page or revision, so those columns are recorded as 0
				$insertBatch[] = [
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected above
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_REPLICA );
			} catch ( DBConnectionError $e ) {
				// A cluster that cannot be reached is reported and skipped, not fatal
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// The connection may name a non-default blobs table
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					[ 'blob_id' ],
					[ 'blob_id > ' . $extDB->addQuotes( $startId ) ],
					__METHOD__,
					[ 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' ]
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows arrive in blob_id order, so the last one seen is where the next batch resumes
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = [];
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Position of the next set bit at or after $id; -1 once there are none left
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = [
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				];
				// Flush as soon as the batch is full
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = [];
				}

				++$id;
				++$numOrphans;
			}
			// Flush the final, partially filled batch
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}