]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - maintenance/storage/trackBlobs.php
MediaWiki 1.17.0
[autoinstalls/mediawiki.git] / maintenance / storage / trackBlobs.php
1 <?php
2
3 require( dirname( __FILE__ ) . '/../commandLine.inc' );
4
5
// At least one cluster name is required on the command line; otherwise
// print the usage text and stop with a non-zero exit status.
if ( !count( $args ) ) {
        echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
                "Adds blobs from a given ES cluster to the blob_tracking table\n" .
                "Automatically deletes the tracking table and starts from the start again when restarted.\n";

        exit( 1 );
}

// Every command-line argument is handed to the tracker as a cluster name.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
16
17 class TrackBlobs {
18         var $clusters, $textClause;
19         var $doBlobOrphans;
20         var $trackedBlobs = array();
21
22         var $batchSize = 1000;
23         var $reportingInterval = 10;
24
25         function __construct( $clusters ) {
26                 $this->clusters = $clusters;
27                 if ( extension_loaded( 'gmp' ) ) {
28                         $this->doBlobOrphans = true;
29                         foreach ( $clusters as $cluster ) {
30                                 $this->trackedBlobs[$cluster] = gmp_init( 0 );
31                         }
32                 } else {
33                         echo "Warning: the gmp extension is needed to find orphan blobs\n";
34                 }
35         }
36
37         function run() {
38                 $this->checkIntegrity();
39                 $this->initTrackingTable();
40                 $this->trackRevisions();
41                 $this->trackOrphanText();
42                 if ( $this->doBlobOrphans ) {
43                         $this->findOrphanBlobs();
44                 }
45         }
46
47         function checkIntegrity() {
48                 echo "Doing integrity check...\n";
49                 $dbr = wfGetDB( DB_SLAVE );
50
51                 // Scan for HistoryBlobStub objects in the text table (bug 20757)
52
53                 $exists = $dbr->selectField( 'text', 1,
54                         'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
55                         'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
56                         __METHOD__
57                 );
58
59                 if ( $exists ) {
60                         echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
61                                 "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
62                                 "to fix this.\n";
63                         exit( 1 );
64                 }
65
66                 // Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
67                 $flags = $dbr->selectField( 'archive', 'ar_flags',
68                         'ar_flags LIKE \'%external%\' OR (' .
69                         'ar_flags LIKE \'%object%\' ' .
70                         'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
71                         __METHOD__
72                 );
73
74                 if ( strpos( $flags, 'external' ) !== false ) {
75                         echo "Integrity check failed: found external storage pointers in your archive table.\n" .
76                                 "Run normaliseArchiveTable.php to fix this.\n";
77                         exit( 1 );
78                 } elseif ( $flags ) {
79                         echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
80                                 "These objects are probably already broken, continuing would make them\n" .
81                                 "unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
82                         exit( 1 );
83                 }
84
85                 echo "Integrity check OK\n";
86         }
87
88         function initTrackingTable() {
89                 $dbw = wfGetDB( DB_MASTER );
90                 if ( $dbw->tableExists( 'blob_tracking' ) ) {
91                         $dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_tracking' ) );
92                         $dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_orphans' ) );
93                 }
94                 $dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
95         }
96
97         function getTextClause() {
98                 if ( !$this->textClause ) {
99                         $dbr = wfGetDB( DB_SLAVE );
100                         $this->textClause = '';
101                         foreach ( $this->clusters as $cluster ) {
102                                 if ( $this->textClause != '' ) {
103                                         $this->textClause .= ' OR ';
104                                 }
105                                 $this->textClause .= 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
106                         }
107                 }
108                 return $this->textClause;
109         }
110
111         function interpretPointer( $text ) {
112                 if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
113                         return false;
114                 }
115                 return array(
116                         'cluster' => $m[1],
117                         'id' => intval( $m[2] ),
118                         'hash' => isset( $m[3] ) ? $m[3] : null
119                 );
120         }
121
122         /**
123          *  Scan the revision table for rows stored in the specified clusters
124          */
125         function trackRevisions() {
126                 $dbw = wfGetDB( DB_MASTER );
127                 $dbr = wfGetDB( DB_SLAVE );
128
129                 $textClause = $this->getTextClause();
130                 $startId = 0;
131                 $endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
132                 $batchesDone = 0;
133                 $rowsInserted = 0;
134
135                 echo "Finding revisions...\n";
136
137                 while ( true ) {
138                         $res = $dbr->select( array( 'revision', 'text' ),
139                                 array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
140                                 array(
141                                         'rev_id > ' . $dbr->addQuotes( $startId ),
142                                         'rev_text_id=old_id',
143                                         $textClause,
144                                         'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
145                                 ),
146                                 __METHOD__,
147                                 array(
148                                         'ORDER BY' => 'rev_id',
149                                         'LIMIT' => $this->batchSize
150                                 )
151                         );
152                         if ( !$res->numRows() ) {
153                                 break;
154                         }
155
156                         $insertBatch = array();
157                         foreach ( $res as $row ) {
158                                 $startId = $row->rev_id;
159                                 $info = $this->interpretPointer( $row->old_text );
160                                 if ( !$info ) {
161                                         echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
162                                         continue;
163                                 }
164                                 if ( !in_array( $info['cluster'], $this->clusters ) ) {
165                                         echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
166                                         continue;
167                                 }
168                                 $insertBatch[] = array(
169                                         'bt_page' => $row->rev_page,
170                                         'bt_rev_id' => $row->rev_id,
171                                         'bt_text_id' => $row->old_id,
172                                         'bt_cluster' => $info['cluster'],
173                                         'bt_blob_id' => $info['id'],
174                                         'bt_cgz_hash' => $info['hash']
175                                 );
176                                 if ( $this->doBlobOrphans ) {
177                                         gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
178                                 }
179                         }
180                         $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
181                         $rowsInserted += count( $insertBatch );
182
183                         ++$batchesDone;
184                         if ( $batchesDone >= $this->reportingInterval ) {
185                                 $batchesDone = 0;
186                                 echo "$startId / $endId\n";
187                                 wfWaitForSlaves( 5 );
188                         }
189                 }
190                 echo "Found $rowsInserted revisions\n";
191         }
192
193         /**
194          * Scan the text table for orphan text
195          * Orphan text here does not imply DB corruption -- deleted text tracked by the
196          * archive table counts as orphan for our purposes.
197          */
198         function trackOrphanText() {
199                 # Wait until the blob_tracking table is available in the slave
200                 $dbw = wfGetDB( DB_MASTER );
201                 $dbr = wfGetDB( DB_SLAVE );
202                 $pos = $dbw->getMasterPos();
203                 $dbr->masterPosWait( $pos, 100000 );
204
205                 $textClause = $this->getTextClause( $this->clusters );
206                 $startId = 0;
207                 $endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
208                 $rowsInserted = 0;
209                 $batchesDone = 0;
210
211                 echo "Finding orphan text...\n";
212
213                 # Scan the text table for orphan text
214                 while ( true ) {
215                         $res = $dbr->select( array( 'text', 'blob_tracking' ),
216                                 array( 'old_id', 'old_flags', 'old_text' ),
217                                 array(
218                                         'old_id>' . $dbr->addQuotes( $startId ),
219                                         $textClause,
220                                         'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
221                                         'bt_text_id IS NULL'
222                                 ),
223                                 __METHOD__,
224                                 array(
225                                         'ORDER BY' => 'old_id',
226                                         'LIMIT' => $this->batchSize
227                                 ),
228                                 array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
229                         );
230                         $ids = array();
231                         foreach ( $res as $row ) {
232                                 $ids[] = $row->old_id;
233                         }
234
235                         if ( !$res->numRows() ) {
236                                 break;
237                         }
238
239                         $insertBatch = array();
240                         foreach ( $res as $row ) {
241                                 $startId = $row->old_id;
242                                 $info = $this->interpretPointer( $row->old_text );
243                                 if ( !$info ) {
244                                         echo "Invalid DB:// URL in old_id {$row->old_id}\n";
245                                         continue;
246                                 }
247                                 if ( !in_array( $info['cluster'], $this->clusters ) ) {
248                                         echo "Invalid cluster returned in SQL query\n";
249                                         continue;
250                                 }
251
252                                 $insertBatch[] = array(
253                                         'bt_page' => 0,
254                                         'bt_rev_id' => 0,
255                                         'bt_text_id' => $row->old_id,
256                                         'bt_cluster' => $info['cluster'],
257                                         'bt_blob_id' => $info['id'],
258                                         'bt_cgz_hash' => $info['hash']
259                                 );
260                                 if ( $this->doBlobOrphans ) {
261                                         gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
262                                 }
263                         }
264                         $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
265
266                         $rowsInserted += count( $insertBatch );
267                         ++$batchesDone;
268                         if ( $batchesDone >= $this->reportingInterval ) {
269                                 $batchesDone = 0;
270                                 echo "$startId / $endId\n";
271                                 wfWaitForSlaves( 5 );
272                         }
273                 }
274                 echo "Found $rowsInserted orphan text rows\n";
275         }
276
277         /**
278          * Scan the blobs table for rows not registered in blob_tracking (and thus not
279          * registered in the text table).
280          *
281          * Orphan blobs are indicative of DB corruption. They are inaccessible and
282          * should probably be deleted.
283          */
284         function findOrphanBlobs() {
285                 if ( !extension_loaded( 'gmp' ) ) {
286                         echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
287                         return;
288                 }
289
290                 $dbw = wfGetDB( DB_MASTER );
291
292                 foreach ( $this->clusters as $cluster ) {
293                         echo "Searching for orphan blobs in $cluster...\n";
294                         $lb = wfGetLBFactory()->getExternalLB( $cluster );
295                         try {
296                                 $extDB = $lb->getConnection( DB_SLAVE );
297                         } catch ( DBConnectionError $e ) {
298                                 if ( strpos( $e->error, 'Unknown database' ) !== false ) {
299                                         echo "No database on $cluster\n";
300                                 } else {
301                                         echo "Error on $cluster: " . $e->getMessage() . "\n";
302                                 }
303                                 continue;
304                         }
305                         $table = $extDB->getLBInfo( 'blobs table' );
306                         if ( is_null( $table ) ) {
307                                 $table = 'blobs';
308                         }
309                         if ( !$extDB->tableExists( $table ) ) {
310                                 echo "No blobs table on cluster $cluster\n";
311                                 continue;
312                         }
313                         $startId = 0;
314                         $batchesDone = 0;
315                         $actualBlobs = gmp_init( 0 );
316                         $endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );
317
318                         // Build a bitmap of actual blob rows
319                         while ( true ) {
320                                 $res = $extDB->select( $table,
321                                         array( 'blob_id' ),
322                                         array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
323                                         __METHOD__,
324                                         array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
325                                 );
326
327                                 if ( !$res->numRows() ) {
328                                         break;
329                                 }
330
331                                 foreach ( $res as $row ) {
332                                         gmp_setbit( $actualBlobs, $row->blob_id );
333                                 }
334                                 $startId = $row->blob_id;
335
336                                 ++$batchesDone;
337                                 if ( $batchesDone >= $this->reportingInterval ) {
338                                         $batchesDone = 0;
339                                         echo "$startId / $endId\n";
340                                 }
341                         }
342
343                         // Find actual blobs that weren't tracked by the previous passes
344                         // This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
345                         $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );
346
347                         // Traverse the orphan list
348                         $insertBatch = array();
349                         $id = 0;
350                         $numOrphans = 0;
351                         while ( true ) {
352                                 $id = gmp_scan1( $orphans, $id );
353                                 if ( $id == -1 ) {
354                                         break;
355                                 }
356                                 $insertBatch[] = array(
357                                         'bo_cluster' => $cluster,
358                                         'bo_blob_id' => $id
359                                 );
360                                 if ( count( $insertBatch ) > $this->batchSize ) {
361                                         $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
362                                         $insertBatch = array();
363                                 }
364
365                                 ++$id;
366                                 ++$numOrphans;
367                         }
368                         if ( $insertBatch ) {
369                                 $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
370                         }
371                         echo "Found $numOrphans orphan(s) in $cluster\n";
372                 }
373         }
374 }