<?php
// Provenance of this copy (gitweb blobdiff of a newly added file):
// X-Git-Url: https://scripts.mit.edu/gitweb/autoinstallsdev/mediawiki.git/blobdiff_plain/19e297c21b10b1b8a3acad5e73fc71dcb35db44a..6932310fd58ebef145fa01eb76edf7150284d8ea:/extensions/SpamBlacklist/maintenance/cleanup.php
// diff --git a/extensions/SpamBlacklist/maintenance/cleanup.php b/extensions/SpamBlacklist/maintenance/cleanup.php
// new file mode 100644, index 00000000..c56f1482, hunk @@ -0,0 +1,129 @@
//
// NOTE(review): the beginning of the file was lost in this copy. The script
// below reads $options, $wgSpamBlacklistSettings and $wgSpamBlacklistFiles and
// uses the SpamBlacklist class without defining any of them, so the bootstrap
// includes that provide them must be restored from upstream before running.

/**
 * Revert a page to its newest revision that matches none of the blacklist
 * regexes. If every revision matches, the page is blanked instead.
 *
 * NOTE(review): the function header was missing from this copy; the name and
 * parameter order are reconstructed from the call site at the bottom of this
 * file and from the variables the body uses. Confirm against upstream.
 *
 * @param Revision $rev Revision to start from; older revisions are walked from here
 * @param string[] $regexes PCRE patterns, as returned by SpamBlacklist::getBlacklists()
 * @param string $match Matched link text; only used in the edit summary
 */
function cleanupArticle( $rev, $regexes, $match ) {
	$title = $rev->getTitle();
	$revId = $rev->getId();

	while ( $rev ) {
		// Fetch the text once per revision rather than once per regex. The cast
		// keeps preg_match() from receiving a non-string should no text come back.
		$revText = (string)ContentHandler::getContentText( $rev->getContent() );

		$isSpam = false;
		foreach ( $regexes as $regex ) {
			if ( preg_match( $regex, $revText ) ) {
				$isSpam = true;
				break;
			}
		}
		if ( !$isSpam ) {
			// Didn't find any spam
			break;
		}

		# Revision::getPrevious can't be used in this way before MW 1.6 (Revision.php 1.26),
		# so step backwards through the history by revision ID instead.
		$revId = $title->getPreviousRevisionID( $revId );
		$rev = $revId ? Revision::newFromTitle( $title, $revId ) : false;
	}

	if ( !$rev ) {
		// Didn't find a non-spammy revision. Deleting the page was judged
		// too scary, so blank it instead.
		print "All revisions are spam, blanking...\n";
		$text = '';
		$comment = "All revisions matched the spam blacklist ($match), blanking";
	} else {
		// Revert to this revision
		$text = ContentHandler::getContentText( $rev->getContent() );
		$comment = "Cleaning up links to $match";
	}

	$wikiPage = new WikiPage( $title );
	$wikiPage->doEditContent( ContentHandler::makeContent( $text, $title ), $comment );
}

// ------------------------------------------------------------------------------

// All edits are attributed to a dedicated account.
$username = 'Spam cleanup script';
if ( method_exists( 'User', 'newSystemUser' ) ) {
	$wgUser = User::newSystemUser( $username, [ 'steal' => true ] );
} else {
	// Fallback for when User::newSystemUser() does not exist.
	$wgUser = User::newFromName( $username );
	if ( $wgUser->idForName() == 0 ) {
		// Create the user
		$status = $wgUser->addToDatabase();
		if ( $status === null || $status->isOK() ) {
			$dbw = wfGetDB( DB_MASTER );
			// NOTE(review): the last argument looks like the caller-name slot of
			// update(); passing the user name there is kept as in the original.
			$dbw->update( 'user', [ 'user_password' => 'nologin' ],
				[ 'user_name' => $username ], $username );
		}
	}
}

// The "n" option switches to report-only mode.
$dryRun = isset( $options['n'] );

$sb = new SpamBlacklist( $wgSpamBlacklistSettings );
if ( $wgSpamBlacklistFiles ) {
	$sb->files = $wgSpamBlacklistFiles;
}
$regexes = $sb->getBlacklists();
if ( !$regexes ) {
	print "Invalid regex, can't clean up spam\n";
	exit( 1 );
}

$dbr = wfGetDB( DB_SLAVE );
// Cast so that an empty page table yields 0 instead of null/false.
$maxID = (int)$dbr->selectField( 'page', 'MAX(page_id)' );
$reportingInterval = 100;

// The regexes are strings, so their size is strlen(); count() is for arrays
// and throws a TypeError on strings under PHP 8.
print "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n";
print "Searching for spam in $maxID pages...\n";
if ( $dryRun ) {
	print "Dry run only\n";
}

for ( $id = 1; $id <= $maxID; $id++ ) {
	if ( $id % $reportingInterval == 0 ) {
		printf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 );
	}

	$revision = Revision::loadFromPageId( $dbr, $id );
	if ( !$revision ) {
		continue;
	}
	$text = ContentHandler::getContentText( $revision->getContent() );
	if ( !$text ) {
		continue;
	}

	foreach ( $regexes as $regex ) {
		if ( !preg_match( $regex, $text, $matches ) ) {
			continue;
		}

		$title = $revision->getTitle();
		$titleText = $title->getPrefixedText();
		if ( $dryRun ) {
			print "\nFound spam in [[$titleText]]\n";
		} else {
			print "\nCleaning up links to {$matches[0]} in [[$titleText]]\n";
			$match = str_replace( 'http://', '', $matches[0] );
			cleanupArticle( $revision, $regexes, $match );
		}

		// One match is enough: cleanupArticle() already checks every regex
		// against the history, and $revision is stale once the page is edited.
		break;
	}
}

// Just for satisfaction. Guard the division: with no pages $maxID is 0.
$processed = $id - 1;
$percentDone = $maxID > 0 ? $processed / $maxID * 100 : 100.0;
printf( "%-8d %-5.2f%%\n", $processed, $percentDone );