scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - maintenance/storage/compressOld.php
MediaWiki 1.30.2-scripts
[autoinstalls/mediawiki.git] / maintenance / storage / compressOld.php
1 <?php
2 /**
3  * Compress the text of a wiki.
4  *
5  * Usage:
6  *
7  * Non-wikimedia
8  * php compressOld.php [options...]
9  *
10  * Wikimedia
11  * php compressOld.php <database> [options...]
12  *
13  * Options are:
14  *  -t <type>           set compression type to either:
15  *                          gzip: compress revisions independently
16  *                          concat: concatenate revisions and compress in chunks (default)
17  *  -c <chunk-size>     maximum number of revisions in a concat chunk
18  *  -b <begin-date>     earliest date to check for uncompressed revisions
19  *  -e <end-date>       latest revision date to compress
20  *  -s <startid>        the id to start from (referring to the text table for
21  *                      type gzip, and to the page table for type concat)
22  *  -n <endid>          the page_id to stop at (only when using concat compression type)
23  *  --extdb <cluster>   store specified revisions in an external cluster (untested)
24  *
25  * This program is free software; you can redistribute it and/or modify
26  * it under the terms of the GNU General Public License as published by
27  * the Free Software Foundation; either version 2 of the License, or
28  * (at your option) any later version.
29  *
30  * This program is distributed in the hope that it will be useful,
31  * but WITHOUT ANY WARRANTY; without even the implied warranty of
32  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33  * GNU General Public License for more details.
34  *
35  * You should have received a copy of the GNU General Public License along
36  * with this program; if not, write to the Free Software Foundation, Inc.,
37  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
38  * http://www.gnu.org/copyleft/gpl.html
39  *
40  * @file
41  * @ingroup Maintenance ExternalStorage
42  */
43
44 require_once __DIR__ . '/../Maintenance.php';
45
46 /**
47  * Maintenance script that compresses the text of a wiki.
48  *
49  * @ingroup Maintenance ExternalStorage
50  */
51 class CompressOld extends Maintenance {
52         /**
53          * Option to load each revision individually.
54          */
55         const LS_INDIVIDUAL = 0;
56
57         /**
58          * Option to load revisions in chunks.
59          */
60         const LS_CHUNKED = 1;
61
62         public function __construct() {
63                 parent::__construct();
64                 $this->addDescription( 'Compress the text of a wiki' );
65                 $this->addOption( 'type', 'Set compression type to either: gzip|concat', false, true, 't' );
66                 $this->addOption(
67                         'chunksize',
68                         'Maximum number of revisions in a concat chunk',
69                         false,
70                         true,
71                         'c'
72                 );
73                 $this->addOption(
74                         'begin-date',
75                         'Earliest date to check for uncompressed revisions',
76                         false,
77                         true,
78                         'b'
79                 );
80                 $this->addOption( 'end-date', 'Latest revision date to compress', false, true, 'e' );
81                 $this->addOption(
82                         'startid',
83                         'The id to start from (gzip -> text table, concat -> page table)',
84                         false,
85                         true,
86                         's'
87                 );
88                 $this->addOption(
89                         'extdb',
90                         'Store specified revisions in an external cluster (untested)',
91                         false,
92                         true
93                 );
94                 $this->addOption(
95                         'endid',
96                         'The page_id to stop at (only when using concat compression type)',
97                         false,
98                         true,
99                         'n'
100                 );
101         }
102
103         public function execute() {
104                 global $wgDBname;
105                 if ( !function_exists( "gzdeflate" ) ) {
106                         $this->error( "You must enable zlib support in PHP to compress old revisions!\n" .
107                                 "Please see http://www.php.net/manual/en/ref.zlib.php\n", true );
108                 }
109
110                 $type = $this->getOption( 'type', 'concat' );
111                 $chunkSize = $this->getOption( 'chunksize', 20 );
112                 $startId = $this->getOption( 'startid', 0 );
113                 $beginDate = $this->getOption( 'begin-date', '' );
114                 $endDate = $this->getOption( 'end-date', '' );
115                 $extDB = $this->getOption( 'extdb', '' );
116                 $endId = $this->getOption( 'endid', false );
117
118                 if ( $type != 'concat' && $type != 'gzip' ) {
119                         $this->error( "Type \"{$type}\" not supported" );
120                 }
121
122                 if ( $extDB != '' ) {
123                         $this->output( "Compressing database {$wgDBname} to external cluster {$extDB}\n"
124                                 . str_repeat( '-', 76 ) . "\n\n" );
125                 } else {
126                         $this->output( "Compressing database {$wgDBname}\n"
127                                 . str_repeat( '-', 76 ) . "\n\n" );
128                 }
129
130                 $success = true;
131                 if ( $type == 'concat' ) {
132                         $success = $this->compressWithConcat( $startId, $chunkSize, $beginDate,
133                                 $endDate, $extDB, $endId );
134                 } else {
135                         $this->compressOldPages( $startId, $extDB );
136                 }
137
138                 if ( $success ) {
139                         $this->output( "Done.\n" );
140                 }
141         }
142
143         /**
144          * Fetch the text row-by-row to 'compressPage' function for compression.
145          *
146          * @param int $start
147          * @param string $extdb
148          */
149         private function compressOldPages( $start = 0, $extdb = '' ) {
150                 $chunksize = 50;
151                 $this->output( "Starting from old_id $start...\n" );
152                 $dbw = $this->getDB( DB_MASTER );
153                 do {
154                         $res = $dbw->select(
155                                 'text',
156                                 [ 'old_id', 'old_flags', 'old_text' ],
157                                 "old_id>=$start",
158                                 __METHOD__,
159                                 [ 'ORDER BY' => 'old_id', 'LIMIT' => $chunksize, 'FOR UPDATE' ]
160                         );
161
162                         if ( $res->numRows() == 0 ) {
163                                 break;
164                         }
165
166                         $last = $start;
167
168                         foreach ( $res as $row ) {
169                                 # print "  {$row->old_id} - {$row->old_namespace}:{$row->old_title}\n";
170                                 $this->compressPage( $row, $extdb );
171                                 $last = $row->old_id;
172                         }
173
174                         $start = $last + 1; # Deletion may leave long empty stretches
175                         $this->output( "$start...\n" );
176                 } while ( true );
177         }
178
179         /**
180          * Compress the text in gzip format.
181          *
182          * @param stdClass $row
183          * @param string $extdb
184          * @return bool
185          */
186         private function compressPage( $row, $extdb ) {
187                 if ( false !== strpos( $row->old_flags, 'gzip' )
188                         || false !== strpos( $row->old_flags, 'object' )
189                 ) {
190                         # print "Already compressed row {$row->old_id}\n";
191                         return false;
192                 }
193                 $dbw = $this->getDB( DB_MASTER );
194                 $flags = $row->old_flags ? "{$row->old_flags},gzip" : "gzip";
195                 $compress = gzdeflate( $row->old_text );
196
197                 # Store in external storage if required
198                 if ( $extdb !== '' ) {
199                         $storeObj = new ExternalStoreDB;
200                         $compress = $storeObj->store( $extdb, $compress );
201                         if ( $compress === false ) {
202                                 $this->error( "Unable to store object" );
203
204                                 return false;
205                         }
206                 }
207
208                 # Update text row
209                 $dbw->update( 'text',
210                         [ /* SET */
211                                 'old_flags' => $flags,
212                                 'old_text' => $compress
213                         ], [ /* WHERE */
214                                 'old_id' => $row->old_id
215                         ], __METHOD__,
216                         [ 'LIMIT' => 1 ]
217                 );
218
219                 return true;
220         }
221
222         /**
223          * Compress the text in chunks after concatenating the revisions.
224          *
225          * @param int $startId
226          * @param int $maxChunkSize
227          * @param string $beginDate
228          * @param string $endDate
229          * @param string $extdb
230          * @param bool|int $maxPageId
231          * @return bool
232          */
233         private function compressWithConcat( $startId, $maxChunkSize, $beginDate,
234                 $endDate, $extdb = "", $maxPageId = false
235         ) {
236                 $loadStyle = self::LS_CHUNKED;
237
238                 $dbr = $this->getDB( DB_REPLICA );
239                 $dbw = $this->getDB( DB_MASTER );
240
241                 # Set up external storage
242                 if ( $extdb != '' ) {
243                         $storeObj = new ExternalStoreDB;
244                 }
245
246                 # Get all articles by page_id
247                 if ( !$maxPageId ) {
248                         $maxPageId = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
249                 }
250                 $this->output( "Starting from $startId of $maxPageId\n" );
251                 $pageConds = [];
252
253                 /*
254                 if ( $exclude_ns0 ) {
255                         print "Excluding main namespace\n";
256                         $pageConds[] = 'page_namespace<>0';
257                 }
258                 if ( $queryExtra ) {
259                                         $pageConds[] = $queryExtra;
260                 }
261                  */
262
263                 # For each article, get a list of revisions which fit the criteria
264
265                 # No recompression, use a condition on old_flags
266                 # Don't compress object type entities, because that might produce data loss when
267                 # overwriting bulk storage concat rows. Don't compress external references, because
268                 # the script doesn't yet delete rows from external storage.
269                 $conds = [
270                         'old_flags NOT ' . $dbr->buildLike( $dbr->anyString(), 'object', $dbr->anyString() )
271                         . ' AND old_flags NOT '
272                         . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() )
273                 ];
274
275                 if ( $beginDate ) {
276                         if ( !preg_match( '/^\d{14}$/', $beginDate ) ) {
277                                 $this->error( "Invalid begin date \"$beginDate\"\n" );
278
279                                 return false;
280                         }
281                         $conds[] = "rev_timestamp>'" . $beginDate . "'";
282                 }
283                 if ( $endDate ) {
284                         if ( !preg_match( '/^\d{14}$/', $endDate ) ) {
285                                 $this->error( "Invalid end date \"$endDate\"\n" );
286
287                                 return false;
288                         }
289                         $conds[] = "rev_timestamp<'" . $endDate . "'";
290                 }
291                 if ( $loadStyle == self::LS_CHUNKED ) {
292                         $tables = [ 'revision', 'text' ];
293                         $fields = [ 'rev_id', 'rev_text_id', 'old_flags', 'old_text' ];
294                         $conds[] = 'rev_text_id=old_id';
295                         $revLoadOptions = 'FOR UPDATE';
296                 } else {
297                         $tables = [ 'revision' ];
298                         $fields = [ 'rev_id', 'rev_text_id' ];
299                         $revLoadOptions = [];
300                 }
301
302                 # Don't work with current revisions
303                 # Don't lock the page table for update either -- TS 2006-04-04
304                 # $tables[] = 'page';
305                 # $conds[] = 'page_id=rev_page AND rev_id != page_latest';
306
307                 for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) {
308                         wfWaitForSlaves();
309
310                         # Wake up
311                         $dbr->ping();
312
313                         # Get the page row
314                         $pageRes = $dbr->select( 'page',
315                                 [ 'page_id', 'page_namespace', 'page_title', 'page_latest' ],
316                                 $pageConds + [ 'page_id' => $pageId ], __METHOD__ );
317                         if ( $pageRes->numRows() == 0 ) {
318                                 continue;
319                         }
320                         $pageRow = $dbr->fetchObject( $pageRes );
321
322                         # Display progress
323                         $titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title );
324                         $this->output( "$pageId\t" . $titleObj->getPrefixedDBkey() . " " );
325
326                         # Load revisions
327                         $revRes = $dbw->select( $tables, $fields,
328                                 array_merge( [
329                                         'rev_page' => $pageRow->page_id,
330                                         # Don't operate on the current revision
331                                         # Use < instead of <> in case the current revision has changed
332                                         # since the page select, which wasn't locking
333                                         'rev_id < ' . $pageRow->page_latest
334                                 ], $conds ),
335                                 __METHOD__,
336                                 $revLoadOptions
337                         );
338                         $revs = [];
339                         foreach ( $revRes as $revRow ) {
340                                 $revs[] = $revRow;
341                         }
342
343                         if ( count( $revs ) < 2 ) {
344                                 # No revisions matching, no further processing
345                                 $this->output( "\n" );
346                                 continue;
347                         }
348
349                         # For each chunk
350                         $i = 0;
351                         while ( $i < count( $revs ) ) {
352                                 if ( $i < count( $revs ) - $maxChunkSize ) {
353                                         $thisChunkSize = $maxChunkSize;
354                                 } else {
355                                         $thisChunkSize = count( $revs ) - $i;
356                                 }
357
358                                 $chunk = new ConcatenatedGzipHistoryBlob();
359                                 $stubs = [];
360                                 $this->beginTransaction( $dbw, __METHOD__ );
361                                 $usedChunk = false;
362                                 $primaryOldid = $revs[$i]->rev_text_id;
363
364                                 // @codingStandardsIgnoreStart Ignore avoid function calls in a FOR loop test part warning
365                                 # Get the text of each revision and add it to the object
366                                 for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++ ) {
367                                         // @codingStandardsIgnoreEnd
368                                         $oldid = $revs[$i + $j]->rev_text_id;
369
370                                         # Get text
371                                         if ( $loadStyle == self::LS_INDIVIDUAL ) {
372                                                 $textRow = $dbw->selectRow( 'text',
373                                                         [ 'old_flags', 'old_text' ],
374                                                         [ 'old_id' => $oldid ],
375                                                         __METHOD__,
376                                                         'FOR UPDATE'
377                                                 );
378                                                 $text = Revision::getRevisionText( $textRow );
379                                         } else {
380                                                 $text = Revision::getRevisionText( $revs[$i + $j] );
381                                         }
382
383                                         if ( $text === false ) {
384                                                 $this->error( "\nError, unable to get text in old_id $oldid" );
385                                                 # $dbw->delete( 'old', [ 'old_id' => $oldid ] );
386                                         }
387
388                                         if ( $extdb == "" && $j == 0 ) {
389                                                 $chunk->setText( $text );
390                                                 $this->output( '.' );
391                                         } else {
392                                                 # Don't make a stub if it's going to be longer than the article
393                                                 # Stubs are typically about 100 bytes
394                                                 if ( strlen( $text ) < 120 ) {
395                                                         $stub = false;
396                                                         $this->output( 'x' );
397                                                 } else {
398                                                         $stub = new HistoryBlobStub( $chunk->addItem( $text ) );
399                                                         $stub->setLocation( $primaryOldid );
400                                                         $stub->setReferrer( $oldid );
401                                                         $this->output( '.' );
402                                                         $usedChunk = true;
403                                                 }
404                                                 $stubs[$j] = $stub;
405                                         }
406                                 }
407                                 $thisChunkSize = $j;
408
409                                 # If we couldn't actually use any stubs because the pages were too small, do nothing
410                                 if ( $usedChunk ) {
411                                         if ( $extdb != "" ) {
412                                                 # Move blob objects to External Storage
413                                                 $stored = $storeObj->store( $extdb, serialize( $chunk ) );
414                                                 if ( $stored === false ) {
415                                                         $this->error( "Unable to store object" );
416
417                                                         return false;
418                                                 }
419                                                 # Store External Storage URLs instead of Stub placeholders
420                                                 foreach ( $stubs as $stub ) {
421                                                         if ( $stub === false ) {
422                                                                 continue;
423                                                         }
424                                                         # $stored should provide base path to a BLOB
425                                                         $url = $stored . "/" . $stub->getHash();
426                                                         $dbw->update( 'text',
427                                                                 [ /* SET */
428                                                                         'old_text' => $url,
429                                                                         'old_flags' => 'external,utf-8',
430                                                                 ], [ /* WHERE */
431                                                                         'old_id' => $stub->getReferrer(),
432                                                                 ]
433                                                         );
434                                                 }
435                                         } else {
436                                                 # Store the main object locally
437                                                 $dbw->update( 'text',
438                                                         [ /* SET */
439                                                                 'old_text' => serialize( $chunk ),
440                                                                 'old_flags' => 'object,utf-8',
441                                                         ], [ /* WHERE */
442                                                                 'old_id' => $primaryOldid
443                                                         ]
444                                                 );
445
446                                                 # Store the stub objects
447                                                 for ( $j = 1; $j < $thisChunkSize; $j++ ) {
448                                                         # Skip if not compressing and don't overwrite the first revision
449                                                         if ( $stubs[$j] !== false && $revs[$i + $j]->rev_text_id != $primaryOldid ) {
450                                                                 $dbw->update( 'text',
451                                                                         [ /* SET */
452                                                                                 'old_text' => serialize( $stubs[$j] ),
453                                                                                 'old_flags' => 'object,utf-8',
454                                                                         ], [ /* WHERE */
455                                                                                 'old_id' => $revs[$i + $j]->rev_text_id
456                                                                         ]
457                                                                 );
458                                                         }
459                                                 }
460                                         }
461                                 }
462                                 # Done, next
463                                 $this->output( "/" );
464                                 $this->commitTransaction( $dbw, __METHOD__ );
465                                 $i += $thisChunkSize;
466                                 wfWaitForSlaves();
467                         }
468                         $this->output( "\n" );
469                 }
470
471                 return true;
472         }
473 }
474
# Name the maintenance class defined in this file, then include the file that
# RUN_MAINTENANCE_IF_MAIN points to.
$maintClass = 'CompressOld';
require_once RUN_MAINTENANCE_IF_MAIN;