scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - maintenance/storage/trackBlobs.php
MediaWiki 1.14.0
[autoinstallsdev/mediawiki.git] / maintenance / storage / trackBlobs.php
1 <?php
2
3 require( dirname( __FILE__ ) .'/../commandLine.inc' );
4
5
// At least one external-storage cluster name is required on the command line.
// $args is expected to be provided by commandLine.inc (required above).
if ( count( $args ) === 0 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
		"Adds blobs from a given ES cluster to the blob_tracking table\n" .
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Hand every cluster name to the tracker and run all passes.
$tracker = new TrackBlobs( $args );
$tracker->trackBlobs();
15
/**
 * Fills the blob_tracking table (and, when the GMP extension is available,
 * the blob_orphans table) for a list of external storage clusters.
 *
 * Three passes are run by trackBlobs():
 *  - trackRevisions():  text rows referenced from the revision table
 *  - trackOrphanText(): text rows not yet present in blob_tracking
 *  - findOrphanBlobs(): rows of each cluster's blobs table that no tracked
 *                       text row points to (requires GMP for the bitmaps)
 */
class TrackBlobs {
	/** Cluster names passed to the constructor */
	public $clusters;

	/** Cached SQL condition built by getTextClause() */
	public $textClause;

	/** True when GMP is loaded and the orphan-blob pass can run */
	public $doBlobOrphans = false;

	/** Map of cluster name => GMP bitmap with one bit set per tracked blob ID */
	public $trackedBlobs = array();

	/** Maximum number of rows selected (and inserted) per batch */
	public $batchSize = 1000;

	/** Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param $clusters Array of external storage cluster names to track
	 */
	function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			// One empty bitmap per cluster; bits are set as pointers are found
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run all tracking passes in order. The tracking tables are dropped and
	 * re-created first, so every run starts from scratch.
	 */
	function trackBlobs() {
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Drop any existing tracking tables and re-create them from
	 * blob_tracking.sql.
	 *
	 * Each table is checked on its own so that a partially created schema
	 * (only one of the two tables present) does not make the DROP fail.
	 * NOTE(review): assumes blob_tracking.sql creates both blob_tracking and
	 * blob_orphans -- confirm against the SQL file.
	 */
	function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}
		$dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition matching text rows whose old_text is a
	 * "DB://<cluster>/..." pointer into one of the configured clusters.
	 * The condition is built once and cached in $this->textClause.
	 *
	 * @return String: "old_text LIKE ... OR old_text LIKE ..."
	 */
	function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$conds = array();
			foreach ( $this->clusters as $cluster ) {
				$conds[] = 'old_text LIKE ' .
					$dbr->addQuotes( $dbr->escapeLike( "DB://$cluster/" ) . '%' );
			}
			$this->textClause = implode( ' OR ', $conds );
		}
		return $this->textClause;
	}

	/**
	 * Split a "DB://<cluster>/<id>[/<hash>]" pointer into its parts.
	 *
	 * @param $text String: contents of text.old_text
	 * @return Array with keys 'cluster' (string), 'id' (integer) and 'hash'
	 *         (hex string, or null when the pointer has no hash part), or
	 *         false if $text is not a valid pointer.
	 */
	function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// Group 3 is the optional hash; it is absent from $m when the
			// pointer has no hash component. (Previously $m[2], the blob ID,
			// was returned here by mistake.)
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Remember that a blob is referenced, for the later orphan-blob pass.
	 * Does nothing when GMP is unavailable.
	 *
	 * @param $info Array as returned by interpretPointer()
	 */
	private function markTracked( $info ) {
		if ( $this->doBlobOrphans ) {
			gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
		}
	}

	/**
	 *  Scan the revision table for rows stored in the specified clusters
	 */
	function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			// Next batch of revisions above the last rev_id already seen
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					"old_flags LIKE '%external%'",
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				$this->markTracked( $info );
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 *
	 * Rows found here are inserted with bt_page and bt_rev_id set to 0.
	 */
	function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// LEFT JOIN + "bt_text_id IS NULL" keeps only text rows that
			// have no blob_tracking row yet
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					"old_flags LIKE '%external%'",
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				$this->markTracked( $info );
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 *
	 * Orphans are written to blob_orphans in batches of at most
	 * $this->batchSize rows.
	 */
	function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				// Skip this cluster but keep going with the others
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( 'blobs', 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( 'blobs',
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, intval( $row->blob_id ) );
					// Rows are ordered by blob_id, so the last one seen is
					// the cursor for the next batch
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list, flushing to the database whenever a
			// full batch has accumulated so a badly corrupted cluster cannot
			// produce one gigantic INSERT
			$insertBatch = array();
			$numOrphans = 0;
			$id = 0;
			while ( true ) {
				// gmp_scan1() returns the index of the next set bit at or
				// after $id, or -1 when there is none
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				++$numOrphans;
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}
				++$id;
			}

			echo "Found $numOrphans orphan(s) in $cluster\n";

			// Insert whatever is left over from the last partial batch
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
		}
	}
}