]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - maintenance/storage/trackBlobs.php
MediaWiki 1.16.0
[autoinstallsdev/mediawiki.git] / maintenance / storage / trackBlobs.php
1 <?php
2
require( dirname( __FILE__ ) .'/../commandLine.inc' );

// Command-line entry point: every positional argument names one external
// storage cluster to scan. With no arguments, print usage and exit non-zero.
if ( count( $args ) >= 1 ) {
	$tracker = new TrackBlobs( $args );
	$tracker->run();
	echo "All done.\n";
} else {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}
16
/**
 * Populates the blob_tracking and blob_orphans tables for a set of external
 * storage clusters.
 *
 * The work is done in three passes (see run()):
 *  1. revisions whose text row is a DB:// pointer into one of the clusters,
 *  2. text rows pointing into the clusters that pass 1 did not register,
 *  3. (GMP only) blob rows on each cluster that no text row points at.
 */
class TrackBlobs {
	/** Cluster names given to the constructor */
	public $clusters;
	/** Lazily-built SQL condition matching old_text pointers into $clusters */
	public $textClause;
	/** True when the gmp extension is loaded and orphan blobs can be searched */
	public $doBlobOrphans;
	/** Map of cluster name => GMP bitfield of blob IDs referenced from the text table */
	public $trackedBlobs = array();

	/** Number of rows fetched per query */
	public $batchSize = 1000;
	/** Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param $clusters Array of external storage cluster names
	 */
	function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			// One empty bitfield per cluster; bits get set as pointers are found
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Recreate the tracking tables and run every pass. The orphan blob pass
	 * is skipped when the gmp extension is unavailable.
	 */
	function run() {
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Drop any tracking tables left over from a previous run and re-create
	 * them from blob_tracking.sql.
	 *
	 * Each table is checked on its own, so a partially initialised database
	 * (only one of the two tables present) does not break the restart.
	 * NOTE(review): blob_tracking.sql presumably creates both tables -- confirm.
	 */
	function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
			}
		}
		$dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition selecting text rows whose old_text starts with
	 * "DB://<cluster>/" for any of the configured clusters. The result is
	 * built on first use and cached in $this->textClause.
	 *
	 * @return String
	 */
	function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$alternatives = array();
			foreach ( $this->clusters as $cluster ) {
				$alternatives[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $alternatives );
		}
		return $this->textClause;
	}

	/**
	 * Parse an external storage pointer of the form
	 * DB://<cluster>/<id> or DB://<cluster>/<id>/<hex hash>.
	 *
	 * @param $text String: the pointer, as stored in old_text
	 * @return Array with keys 'cluster', 'id' (integer) and 'hash' (null when
	 *   the pointer has no hash part), or false if $text is not a valid pointer
	 */
	function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// The hash group is absent from $m when the pointer has no third part
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 *  Scan the revision table for rows stored in the specified clusters
	 */
	function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for progress output
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected above
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 */
	function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for progress output
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// The LEFT JOIN plus "bt_text_id IS NULL" keeps only text rows
			// that have no blob_tracking row yet
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				// Page and revision are recorded as 0 for orphan text
				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected above
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 */
	function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				// A cluster that cannot be reached is reported and skipped
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			// Only used for progress output
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows arrive ordered by blob_id, so the last one seen
					// is where the next batch resumes
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Next set bit at or after $id; -1 means there are none left
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush once the batch holds exactly batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}