X-Git-Url: https://scripts.mit.edu/gitweb/autoinstallsdev/mediawiki.git/blobdiff_plain/19e297c21b10b1b8a3acad5e73fc71dcb35db44a..6932310fd58ebef145fa01eb76edf7150284d8ea:/maintenance/updateCollation.php diff --git a/maintenance/updateCollation.php b/maintenance/updateCollation.php index e890bce7..84fc2d20 100644 --- a/maintenance/updateCollation.php +++ b/maintenance/updateCollation.php @@ -1,97 +1,182 @@ mDescription = <<addDescription( <<addOption( 'force', 'Run on all rows, even if the collation is ' . - 'supposed to be up-to-date.' ); - } - - public function syncDBs() { - $lb = wfGetLB(); - // bug 27975 - Don't try to wait for slaves if there are none - // Prevents permission error when getting master position - if ( $lb->getServerCount() > 1 ) { - $dbw = $lb->getConnection( DB_MASTER ); - $pos = $dbw->getMasterPos(); - $lb->waitForAll( $pos ); - } + $this->addOption( 'force', 'Run on all rows, even if the collation is ' . + 'supposed to be up-to-date.', false, false, 'f' ); + $this->addOption( 'previous-collation', 'Set the previous value of ' . + '$wgCategoryCollation here to speed up this script, especially if your ' . + 'categorylinks table is large. This will only update rows with that ' . + 'collation, though, so it may miss out-of-date rows with a different, ' . + 'even older collation.', false, true ); + $this->addOption( 'target-collation', 'Set this to the new collation type to ' . + 'use instead of $wgCategoryCollation. Usually you should not use this, ' . + 'you should just update $wgCategoryCollation in LocalSettings.php.', + false, true ); + $this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' . + 'compile statistics.' ); + $this->addOption( 'verbose-stats', 'Show more statistics.' ); } public function execute() { - global $wgCategoryCollation, $wgMiserMode; + global $wgCategoryCollation; - $dbw = wfGetDB( DB_MASTER ); + $dbw = $this->getDB( DB_MASTER ); + $dbr = $this->getDB( DB_REPLICA ); $force = $this->getOption( 'force' ); + $dryRun = $this->getOption( 'dry-run' ); + $verboseStats = $this->getOption( 'verbose-stats' ); + if ( $this->hasOption( 'target-collation' ) ) { + $collationName = $this->getOption( 'target-collation' ); + $collation = Collation::factory( $collationName ); + } else { + $collationName = $wgCategoryCollation; + $collation = Collation::singleton(); + } + + // Collation sanity check: in some cases the constructor will work, + // but this will raise an exception, breaking all category pages + $collation->getFirstLetter( 'MediaWiki' ); - $options = array( 'LIMIT' => self::BATCH_SIZE ); + // Locally at least, (my local is a rather old version of mysql) + // mysql seems to filesort if there is both an equality + // (but not for an inequality) condition on cl_collation in the + // WHERE and it is also the first item in the ORDER BY. + if ( $this->hasOption( 'previous-collation' ) ) { + $orderBy = 'cl_to, cl_type, cl_from'; + } else { + $orderBy = 'cl_collation, cl_to, cl_type, cl_from'; + } + $options = [ + 'LIMIT' => self::BATCH_SIZE, + 'ORDER BY' => $orderBy, + 'STRAIGHT_JOIN' // per T58041 + ]; if ( $force ) { - $options['ORDER BY'] = 'cl_from, cl_to'; - $collationConds = array(); + $collationConds = []; } else { - $collationConds = array( 0 => - 'cl_collation != ' . $dbw->addQuotes( $wgCategoryCollation ) ); + if ( $this->hasOption( 'previous-collation' ) ) { + $collationConds['cl_collation'] = $this->getOption( 'previous-collation' ); + } else { + $collationConds = [ 0 => + 'cl_collation != ' . $dbw->addQuotes( $collationName ) + ]; + } - if ( !$wgMiserMode ) { - $count = $dbw->selectField( + $count = $dbr->estimateRowCount( + 'categorylinks', + '*', + $collationConds, + __METHOD__ + ); + // Improve estimate if feasible + if ( $count < 1000000 ) { + $count = $dbr->selectField( 'categorylinks', 'COUNT(*)', $collationConds, __METHOD__ ); + } + if ( $count == 0 ) { + $this->output( "Collations up-to-date.\n" ); - if ( $count == 0 ) { - $this->output( "Collations up-to-date.\n" ); - return; - } + return; + } + if ( $dryRun ) { + $this->output( "$count rows would be updated.\n" ); + } else { $this->output( "Fixing collation for $count rows.\n" ); } + wfWaitForSlaves(); } - $count = 0; - $row = false; - $batchConds = array(); + $batchCount = 0; + $batchConds = []; do { - $this->output( 'Processing next ' . self::BATCH_SIZE . ' rows... '); + $this->output( "Selecting next " . self::BATCH_SIZE . " rows..." ); + + // cl_type must be selected as a number for proper paging because + // enums suck. + if ( $dbw->getType() === 'mysql' ) { + $clType = 'cl_type+0 AS "cl_type_numeric"'; + } else { + $clType = 'cl_type'; + } $res = $dbw->select( - array( 'categorylinks', 'page' ), - array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation', - 'cl_sortkey', 'page_namespace', 'page_title' - ), - array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ), + [ 'categorylinks', 'page' ], + [ 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation', + 'cl_sortkey', $clType, + 'page_namespace', 'page_title' + ], + array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ), __METHOD__, $options ); + $this->output( " processing..." ); - $dbw->begin(); + if ( !$dryRun ) { + $this->beginTransaction( $dbw, __METHOD__ ); + } foreach ( $res as $row ) { $title = Title::newFromRow( $row ); if ( !$row->cl_collation ) { # This is an old-style row, so the sortkey needs to be # converted. if ( $row->cl_sortkey == $title->getText() - || $row->cl_sortkey == $title->getPrefixedText() ) { + || $row->cl_sortkey == $title->getPrefixedText() + ) { $prefix = ''; } else { # Custom sortkey, use it as a prefix @@ -109,37 +194,155 @@ TEXT; } else { $type = 'page'; } - $dbw->update( - 'categorylinks', - array( - 'cl_sortkey' => Collation::singleton()->getSortKey( - $title->getCategorySortkey( $prefix ) ), - 'cl_sortkey_prefix' => $prefix, - 'cl_collation' => $wgCategoryCollation, - 'cl_type' => $type, - 'cl_timestamp = cl_timestamp', - ), - array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ), - __METHOD__ - ); - } - $dbw->commit(); + $newSortKey = $collation->getSortKey( + $title->getCategorySortkey( $prefix ) ); + if ( $verboseStats ) { + $this->updateSortKeySizeHistogram( $newSortKey ); + } - if ( $force && $row ) { - $encFrom = $dbw->addQuotes( $row->cl_from ); - $encTo = $dbw->addQuotes( $row->cl_to ); - $batchConds = array( - "(cl_from = $encFrom AND cl_to > $encTo) " . - " OR cl_from > $encFrom" ); + if ( !$dryRun ) { + $dbw->update( + 'categorylinks', + [ + 'cl_sortkey' => $newSortKey, + 'cl_sortkey_prefix' => $prefix, + 'cl_collation' => $collationName, + 'cl_type' => $type, + 'cl_timestamp = cl_timestamp', + ], + [ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ], + __METHOD__ + ); + } + if ( $row ) { + $batchConds = [ $this->getBatchCondition( $row, $dbw ) ]; + } + } + if ( !$dryRun ) { + $this->commitTransaction( $dbw, __METHOD__ ); } $count += $res->numRows(); $this->output( "$count done.\n" ); - - $this->syncDBs(); + + if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) { + $this->output( "Waiting for replica DBs ... " ); + wfWaitForSlaves(); + $this->output( "done\n" ); + } } while ( $res->numRows() == self::BATCH_SIZE ); + + $this->output( "$count rows processed\n" ); + + if ( $verboseStats ) { + $this->output( "\n" ); + $this->showSortKeySizeHistogram(); + } + } + + /** + * Return an SQL expression selecting rows which sort above the given row, + * assuming an ordering of cl_collation, cl_to, cl_type, cl_from + * @param stdClass $row + * @param IDatabase $dbw + * @return string + */ + function getBatchCondition( $row, $dbw ) { + if ( $this->hasOption( 'previous-collation' ) ) { + $fields = [ 'cl_to', 'cl_type', 'cl_from' ]; + } else { + $fields = [ 'cl_collation', 'cl_to', 'cl_type', 'cl_from' ]; + } + $first = true; + $cond = false; + $prefix = false; + foreach ( $fields as $field ) { + if ( $dbw->getType() === 'mysql' && $field === 'cl_type' ) { + // Range conditions with enums are weird in mysql + // This must be a numeric literal, or it won't work. + $encValue = intval( $row->cl_type_numeric ); + } else { + $encValue = $dbw->addQuotes( $row->$field ); + } + $inequality = "$field > $encValue"; + $equality = "$field = $encValue"; + if ( $first ) { + $cond = $inequality; + $prefix = $equality; + $first = false; + } else { + $cond .= " OR ($prefix AND $inequality)"; + $prefix .= " AND $equality"; + } + } + + return $cond; + } + + function updateSortKeySizeHistogram( $key ) { + $length = strlen( $key ); + if ( !isset( $this->sizeHistogram[$length] ) ) { + $this->sizeHistogram[$length] = 0; + } + $this->sizeHistogram[$length]++; + } + + function showSortKeySizeHistogram() { + $maxLength = max( array_keys( $this->sizeHistogram ) ); + if ( $maxLength == 0 ) { + return; + } + $numBins = 20; + $coarseHistogram = array_fill( 0, $numBins, 0 ); + $coarseBoundaries = []; + $boundary = 0; + for ( $i = 0; $i < $numBins - 1; $i++ ) { + $boundary += $maxLength / $numBins; + $coarseBoundaries[$i] = round( $boundary ); + } + $coarseBoundaries[$numBins - 1] = $maxLength + 1; + $raw = ''; + for ( $i = 0; $i <= $maxLength; $i++ ) { + if ( $raw !== '' ) { + $raw .= ', '; + } + if ( !isset( $this->sizeHistogram[$i] ) ) { + $val = 0; + } else { + $val = $this->sizeHistogram[$i]; + } + for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) { + if ( $coarseBoundaries[$coarseIndex] > $i ) { + $coarseHistogram[$coarseIndex] += $val; + break; + } + } + if ( $coarseIndex == $numBins - 1 ) { + $coarseHistogram[$coarseIndex] += $val; + } + $raw .= $val; + } + + $this->output( "Sort key size histogram\nRaw data: $raw\n\n" ); + + $maxBinVal = max( $coarseHistogram ); + $scale = 60 / $maxBinVal; + $prevBoundary = 0; + for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) { + if ( !isset( $coarseHistogram[$coarseIndex] ) ) { + $val = 0; + } else { + $val = $coarseHistogram[$coarseIndex]; + } + $boundary = $coarseBoundaries[$coarseIndex]; + $this->output( sprintf( "%-10s %-10d |%s\n", + $prevBoundary . '-' . ( $boundary - 1 ) . ': ', + $val, + str_repeat( '*', $scale * $val ) ) ); + $prevBoundary = $boundary; + } } } $maintClass = "UpdateCollation"; -require_once( RUN_MAINTENANCE_IF_MAIN ); +require_once RUN_MAINTENANCE_IF_MAIN;