4 * An aggressive spam cleanup script.
5 * Searches the database for matching pages, and reverts them to the last non-spammed revision.
6 * If all revisions contain spam, blanks the page (outright deletion is coded but disabled as too destructive)
9 require_once '../../maintenance/commandLine.inc';
10 require_once 'SpamBlacklist_body.php';
13 * Find the latest revision of the article that does not contain spam and revert to it
// Walk backwards through the page's revision history, starting from $rev,
// looking for the most recent revision whose text matches none of the
// blacklist regexes; the page is then re-edited to that clean text (or
// blanked when every revision matches).
//
// @param Revision $rev     Latest revision of the page (known to contain spam)
// @param array    $regexes Compiled blacklist regexes to test each revision against
// @param string   $match   The matched spam fragment (used in the edit summary)
//
// NOTE(review): this view of the file is elided — the enclosing while-loop,
// the if/else between the "delete" and "revert" paths, and several closing
// braces are not visible here, so comments below describe only what the
// visible lines establish.
15 function cleanupArticle( Revision $rev, $regexes, $match ) {
16 $title = $rev->getTitle();
17 $revId = $rev->getId();
// Test the current revision's text against every blacklist regex.
20 foreach ( $regexes as $regex ) {
24 ContentHandler::getContentText( $rev->getContent() )
28 // Didn't find any spam
// Step to the parent revision by ID rather than via Revision::getPrevious
// (see the compatibility note below).
31 # Revision::getPrevious can't be used in this way before MW 1.6 (Revision.php 1.26)
32 # $rev = $rev->getPrevious();
33 $revId = $title->getPreviousRevisionID( $revId );
35 $rev = Revision::newFromTitle( $title, $revId );
// No clean revision exists anywhere in the history.
41 // Didn't find a non-spammy revision, delete the page
// Deletion path — presumably commented out upstream ("too scary" below);
// TODO confirm these lines sit inside a disabled block in the full file.
43 print "All revisions are spam, deleting...\n";
44 $article = new Article( $title );
45 $article->doDeleteArticle( "All revisions matched the spam blacklist" );
// Instead of deleting, the page is blanked (edited to empty text).
47 // Too scary, blank instead
48 print "All revisions are spam, blanking...\n";
50 $comment = "All revisions matched the spam blacklist ($match), blanking";
// A clean revision was found: restore its text verbatim.
52 // Revert to this revision
53 $text = ContentHandler::getContentText( $rev->getContent() );
54 $comment = "Cleaning up links to $match";
// Perform the actual edit (blank or revert) as the current $wgUser.
56 $wikiPage = new WikiPage( $title );
57 $wikiPage->doEditContent( ContentHandler::makeContent( $text, $title ), $comment );
60 // ------------------------------------------------------------------------------
// Main script body: run all edits under a dedicated, locked system account
// so that cleanup edits are attributable in page histories.
62 $username = 'Spam cleanup script';
// MW 1.27+ exposes User::newSystemUser; 'steal' => true reclaims the name
// even if a regular account already holds it.
63 if ( method_exists( 'User', 'newSystemUser' ) ) {
64 $wgUser = User::newSystemUser( $username, [ 'steal' => true ] );
// Fallback for older cores: create the account by hand if missing.
66 $wgUser = User::newFromName( $username );
67 if ( $wgUser->idForName() == 0 ) {
69 $status = $wgUser->addToDatabase();
// addToDatabase() historically returned null; newer cores return a Status.
70 if ( $status === null || $status->isOK() ) {
// Lock the account by storing an unusable password hash.
71 $dbw = wfGetDB( DB_MASTER );
72 $dbw->update( 'user', [ 'user_password' => 'nologin' ],
// NOTE(review): the 4th argument of IDatabase::update() is the calling
// function name (fname); passing $username here looks like a bug —
// __METHOD__ was presumably intended. Verify against the full file.
73 [ 'user_name' => $username ], $username );
// '-n' selects dry-run mode: report matches but make no edits.
78 if ( isset( $options['n'] ) ) {
// Build the blacklist matcher from configured settings and regex files.
84 $sb = new SpamBlacklist( $wgSpamBlacklistSettings );
85 if ( $wgSpamBlacklistFiles ) {
86 $sb->files = $wgSpamBlacklistFiles;
88 $regexes = $sb->getBlacklists();
// getBlacklists() presumably returns false/empty on bad regexes — verify.
90 print "Invalid regex, can't clean up spam\n";
// Scan every page ID from 1 to the current maximum on a replica connection.
// NOTE(review): DB_SLAVE is the pre-1.28 constant (now DB_REPLICA).
94 $dbr = wfGetDB( DB_SLAVE );
95 $maxID = $dbr->selectField( 'page', 'MAX(page_id)' );
96 $reportingInterval = 100;
// NOTE(review): array_map( 'count', ... ) on regex strings yields 1 per
// entry, not a byte length — 'strlen' seems intended given the "bytes"
// wording. Confirm before changing.
98 print "Regexes are " . implode( ', ', array_map( 'count', $regexes ) ) . " bytes\n";
99 print "Searching for spam in $maxID pages...\n";
101 print "Dry run only\n";
// Iterate page IDs directly; gaps (deleted pages) presumably yield no
// revision and are skipped — confirm against the elided lines.
104 for ( $id = 1; $id <= $maxID; $id++ ) {
// Progress line, overwritten in place via \r.
105 if ( $id % $reportingInterval == 0 ) {
106 printf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 );
// Load the latest revision of the page and extract its wikitext.
108 $revision = Revision::loadFromPageId( $dbr, $id );
110 $text = ContentHandler::getContentText( $revision->getContent() );
// Report (and, unless dry-running, clean) the first regex that matches.
112 foreach ( $regexes as $regex ) {
113 if ( preg_match( $regex, $text, $matches ) ) {
114 $title = $revision->getTitle();
115 $titleText = $title->getPrefixedText();
117 print "\nFound spam in [[$titleText]]\n";
119 print "\nCleaning up links to {$matches[0]} in [[$titleText]]\n";
// Strip the scheme so edit summaries don't themselves contain spam links.
120 $match = str_replace( 'http://', '', $matches[0] );
121 cleanupArticle( $revision, $regexes, $match );
// Final 100% progress line.
128 // Just for satisfaction
129 printf( "%-8d %-5.2f%%\n", $id - 1, ( $id - 1 ) / $maxID * 100 );