4 * Base class for different kinds of blacklists
6 abstract class BaseBlacklist {
8 * Array of blacklist sources
15 * Array containing regexes to test against
// Merged regex list; stays false until getBlacklists() fills it in, which is how that method detects "not loaded yet".
19 protected $regexes = false;
22 * Chance of receiving a warning when the filter is hit
// Used by getHttpText() as the upper bound of mt_rand( 0, $warningChance ): a draw of 0 while the warning key is absent triggers a re-fetch of the remote list.
26 public $warningChance = 100;
// Expiry passed when getHttpText() stores the warning key (presumably seconds; confirm against the cache API).
31 public $warningTime = 600;
// Expiry passed when getHttpText() stores the fetched list text (presumably seconds; confirm against the cache API).
36 public $expiryTime = 900;
39 * Array containing blacklists that extend BaseBlacklist
// Registry of type code => class name; getInstance() instantiates from it and addBlacklistType() adds entries.
43 private static $blacklistTypes = [
44 'spam' => 'SpamBlacklist',
45 'email' => 'EmailBlacklist',
49 * Array of blacklist instances
// One lazily created instance per type code, populated by getInstance().
53 private static $instances = [];
/**
 * Constructor.
 *
 * Copies every entry of $settings onto the instance as a property, so the
 * per-type configuration handed over by getInstance() (for example a
 * 'files' list or the warning/expiry values) overrides the class defaults.
 *
 * @param array $settings Map of property name => value
 */
public function __construct( $settings = [] ) {
	foreach ( $settings as $name => $value ) {
		$this->$name = $value;
	}
}
69 * @param bool $preventLog
// Implemented by each concrete blacklist: checks the given $links in the context of page $title.
// NOTE(review): $preventLog presumably suppresses logging of hits, and the return value is not documented in view; confirm against the implementations.
72 abstract public function filter( array $links, Title $title, $preventLog = false );
/**
 * Register a blacklist implementation under a type code.
 *
 * An existing registration for the same code is replaced.
 *
 * @param string $type Code for the blacklist (e.g. 'spam')
 * @param string $class Name of the class implementing it
 */
public static function addBlacklistType( $type, $class ) {
	self::$blacklistTypes[$type] = $class;
}
/**
 * Get the registry of blacklist types defined so far.
 *
 * @return array Map of type code => class name
 */
public static function getBlacklistTypes() {
	$registry = self::$blacklistTypes;

	return $registry;
}
/**
 * Get the shared instance of the blacklist registered under $type,
 * creating it on first use.
 *
 * @param string $type Code for the blacklist
 * @return BaseBlacklist
 * @throws Exception When $type is not a registered blacklist type
 */
public static function getInstance( $type ) {
	global $wgBlacklistSettings;

	// Validation comes first, even when an instance is already cached.
	if ( !isset( self::$blacklistTypes[$type] ) ) {
		throw new Exception( "Invalid blacklist type '$type' passed to " . __METHOD__ );
	}

	if ( isset( self::$instances[$type] ) ) {
		return self::$instances[$type];
	}

	// Make sure a settings entry exists so the lookup below cannot raise a notice.
	if ( !isset( $wgBlacklistSettings[$type] ) ) {
		$wgBlacklistSettings[$type] = [];
	}

	$className = self::$blacklistTypes[$type];
	$instance = new $className( $wgBlacklistSettings[$type] );
	self::$instances[$type] = $instance;

	return $instance;
}
121 * Returns the code for the blacklist implementation
// Implemented by each concrete blacklist. Within this class the returned code is used to build cache keys, the "<type>-blacklist" / "<type>-whitelist" message names and debug log lines.
125 abstract protected function getBlacklistType();
/**
 * Check if the given local page title is a spam regex source.
 *
 * A title counts as a source when either
 *  - it lives in the MediaWiki namespace and is named "<Type>-blacklist" or
 *    "<Type>-whitelist" for one of the registered blacklist types, or
 *  - one of the configured 'files' entries of any type refers to it, as
 *    "DB: <wiki> <prefixed title>" for this wiki or as the URL of its raw view.
 *
 * @param Title $title
 * @return bool
 */
public static function isLocalSource( Title $title ) {
	global $wgDBname, $wgBlacklistSettings;

	if ( $title->getNamespace() == NS_MEDIAWIKI ) {
		$sources = [];
		foreach ( self::$blacklistTypes as $type => $class ) {
			$type = ucfirst( $type );
			$sources[] = "$type-blacklist";
			$sources[] = "$type-whitelist";
		}

		if ( in_array( $title->getDBkey(), $sources, true ) ) {
			return true;
		}
	}

	$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
	$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

	$files = [];
	foreach ( self::$blacklistTypes as $type => $class ) {
		if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
			// Append with array_merge(): the array union operator keeps the
			// left-hand value for every index that already exists, so with
			// plain lists the entries of later types would be dropped.
			$files = array_merge( $files, (array)$wgBlacklistSettings[$type]['files'] );
		}
	}

	foreach ( $files as $fileName ) {
		$matches = [];
		// Same "DB: <wiki> <title>" pattern as buildSharedBlacklists(), so
		// database names containing a hyphen are recognised here as well.
		if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
			if ( $wgDBname === $matches[1] && $matches[2] === $title->getPrefixedDbKey() ) {
				// Local DB fetch of this page...
				return true;
			}
		} elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
			// Raw view of this page
			return true;
		}
	}

	return false;
}
/**
 * Returns the type of blacklist from the given title.
 *
 * The title's DB key is searched for "<Type>-blacklist" or
 * "<Type>-whitelist", where <Type> is a registered type code with its first
 * letter upper-cased by the content language.
 *
 * @todo building a regex for this is pretty overkill
 * @param Title $title
 * @return bool|string Lower-cased type code, or false when nothing matches
 */
public static function getTypeFromTitle( Title $title ) {
	global $wgContLang;

	$types = [];
	foreach ( array_keys( self::$blacklistTypes ) as $type ) {
		// Type codes can be registered at runtime, so quote them before they
		// become part of the pattern.
		$types[] = preg_quote( $wgContLang->ucfirst( $type ), '/' );
	}
	$regex = '/(' . implode( '|', $types ) . ')-(?:blacklist|whitelist)/';

	$m = [];
	if ( preg_match( $regex, $title->getDBkey(), $m ) ) {
		return strtolower( $m[1] );
	}

	return false;
}
/**
 * Fetch local and (possibly cached) remote blacklists.
 *
 * The merged list is kept on the instance, so repeated calls on the same
 * object do not hit the caches again.
 *
 * @return array Set of regular expressions, potentially empty
 */
public function getBlacklists() {
	if ( $this->regexes !== false ) {
		return $this->regexes;
	}

	// Local entries first, then the shared ones, in that order.
	$local = $this->getLocalBlacklists();
	$shared = $this->getSharedBlacklists();
	$this->regexes = array_merge( $local, $shared );

	return $this->regexes;
}
214 * Returns the local blacklist
216 * @return array Regular expressions
// Returns the regexes built from the "<type>-blacklist" message, read through the main WAN cache.
218 public function getLocalBlacklists() {
220 $type = $this->getBlacklistType();
// getWithSetCallback() receives the cache key and a callback; presumably the callback only runs on a cache miss (confirm against the cache API). The TTL argument is not visible in this excerpt.
222 return ObjectCache::getMainWANInstance()->getWithSetCallback(
223 wfMemcKey( 'spamblacklist', $type, 'blacklist-regex' ),
// NOTE(review): $that is presumably an alias of $this captured for the closure; its assignment is not visible here.
225 function () use ( $that, $type ) {
226 return SpamRegexBatch::regexesFromMessage( "{$type}-blacklist", $that );
232 * Returns the (local) whitelist
234 * @return array Regular expressions
// Returns the regexes built from the "<type>-whitelist" message, read through the main WAN cache.
236 public function getWhitelists() {
238 $type = $this->getBlacklistType();
// getWithSetCallback() receives the cache key and a callback; presumably the callback only runs on a cache miss (confirm against the cache API). The TTL argument is not visible in this excerpt.
240 return ObjectCache::getMainWANInstance()->getWithSetCallback(
241 wfMemcKey( 'spamblacklist', $type, 'whitelist-regex' ),
// NOTE(review): $that is presumably an alias of $this captured for the closure; its assignment is not visible here.
243 function () use ( $that, $type ) {
244 return SpamRegexBatch::regexesFromMessage( "{$type}-whitelist", $that );
250 * Fetch (possibly cached) remote blacklists.
// Loads the regexes that come from the configured $this->files sources, going through the main WAN cache.
253 function getSharedBlacklists() {
254 $listType = $this->getBlacklistType();
256 wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." );
// With no sources configured there is nothing to load; what is returned in that case is not visible in this excerpt.
258 if ( count( $this->files ) == 0 ) {
260 wfDebugLog( 'SpamBlacklist', "no files specified\n" );
267 $regexes = ObjectCache::getMainWANInstance()->getWithSetCallback(
268 // This used to be cached per-site, but that could be bad on a shared
269 // server where not all wikis have the same configuration.
270 wfMemcKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ),
// $miss is captured by reference, presumably so the callback can flag that the list was rebuilt rather than read from cache; the lines that set and test it are not visible here.
272 function () use ( $that, &$miss ) {
274 return $that->buildSharedBlacklists();
279 wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
/**
 * Clear all primary blacklist cache keys of this blacklist type: the shared
 * blacklist regexes, the local blacklist regexes and the whitelist regexes.
 *
 * @note this method is currently unused
 */
public function clearCache() {
	$listType = $this->getBlacklistType();
	$cache = ObjectCache::getMainWANInstance();

	// Same keys, in the same order, as used by the loaders in this class.
	foreach ( [ 'shared-blacklist-regex', 'blacklist-regex', 'whitelist-regex' ] as $suffix ) {
		$cache->delete( wfMemcKey( 'spamblacklist', $listType, $suffix ) );
	}

	wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" );
}
/**
 * Build the regex list from every source configured in $this->files.
 *
 * Each entry is handled according to its form:
 *  - "DB: <wiki> <title>" is read with getArticleText()
 *  - an http://, https:// or protocol-relative URL is read with getHttpText()
 *  - anything else is treated as a path and read with file_get_contents()
 *
 * A source that could not be read is logged and skipped.
 *
 * @return array Regular expressions
 */
public function buildSharedBlacklists() {
	$regexes = [];
	$listType = $this->getBlacklistType();

	wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
	foreach ( $this->files as $fileName ) {
		$matches = [];
		if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
			$text = $this->getArticleText( $matches[1], $matches[2] );
		} elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) {
			$text = $this->getHttpText( $fileName );
		} else {
			$text = file_get_contents( $fileName );
			if ( $text !== false ) {
				wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
			}
		}

		if ( !is_string( $text ) ) {
			// A failed fetch yields no text; do not feed a non-string value
			// to the regex builder.
			wfDebugLog( 'SpamBlacklist', "Could not load $listType blacklist source $fileName\n" );
			continue;
		}

		// Build a separate batch of regexes from each source.
		// While in theory we could squeeze a little efficiency
		// out of combining multiple sources in one regex, if
		// there's a bad line in one of them we'll gain more
		// from only having to break that set into smaller pieces.
		$regexes = array_merge(
			$regexes,
			SpamRegexBatch::regexesFromText( $text, $this, $fileName )
		);
	}

	return $regexes;
}
/**
 * Fetch the text of a remote blacklist over HTTP, using $messageMemc as cache.
 *
 * @param string $fileName URL of the remote list
 * @return string|bool List text, or false when it could not be loaded and no
 *  cached copy is available
 */
public function getHttpText( $fileName ) {
	global $wgDBname, $messageMemc;
	$listType = $this->getBlacklistType();

	# To keep requests to a minimum, results are saved into $messageMemc.
	# There are two keys: when the warning key expires, a random thread will
	# refresh the real key. This reduces the chance of multiple requests
	# under high traffic.
	$key = "{$listType}_blacklist_file:$fileName";
	$warningKey = "$wgDBname:{$listType}filewarning:$fileName";
	$httpText = $messageMemc->get( $key );
	$warning = $messageMemc->get( $warningKey );

	# Serve the cached copy unless it is missing, or the warning key is gone
	# and the random draw (0 out of 0..warningChance) selects this request.
	if ( is_string( $httpText ) && ( $warning || mt_rand( 0, $this->warningChance ) ) ) {
		wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
		return $httpText;
	}

	wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
	$fetched = Http::get( $fileName );
	$messageMemc->set( $warningKey, 1, $this->warningTime );

	if ( $fetched === false ) {
		wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
		# Do not overwrite a still-cached copy with the failure; keep serving
		# it and let a later request retry.
		return is_string( $httpText ) ? $httpText : false;
	}

	$messageMemc->set( $key, $fetched, $this->expiryTime );

	return $fetched;
}
360 * Fetch an article from this or another local MediaWiki database.
361 * This is probably *very* fragile and perhaps should not be used.
363 * @param string $wiki
364 * @param string $article
// Reads the latest revision of page $article straight from the database of wiki $wiki and converts its content to text.
367 function getArticleText( $wiki, $article ) {
368 wfDebugLog( 'SpamBlacklist',
369 "Fetching {$this->getBlacklistType()} blacklist from '$article' on '$wiki'...\n" );
// NOTE(review): $title is dereferenced below without a check; presumably Title::newFromText() can fail for invalid text, so confirm callers only pass valid titles.
371 $title = Title::newFromText( $article );
372 // Load all the relevant tables from the correct DB.
373 // This assumes that old_text is the actual text or
374 // that the external store system is at least unified.
375 $row = wfGetDB( DB_SLAVE, [], $wiki )->selectRow(
376 [ 'page', 'revision', 'text' ],
// Field list combines the revision, page and text field sets.
378 Revision::selectFields(),
379 Revision::selectPageFields(),
380 Revision::selectTextFields()
// The title is parsed with this wiki's rules but matched against the other wiki's tables, hence the two assumptions noted inline.
383 'page_namespace' => $title->getNamespace(), // assume NS IDs match
384 'page_title' => $title->getDBkey(), // assume same case rules
385 'rev_id=page_latest',
// When a row was found, its content is turned into text; the no-row branch is not visible in this excerpt.
392 ? ContentHandler::getContentText( Revision::newFromRow( $row )->getContent() )
/**
 * Returns the start of the regex for matches: the opening "/" delimiter
 * followed by a run of zero or more characters from a-z, 0-9, "_", "-"
 * and ".".
 *
 * @return string
 */
public function getRegexStart() {
	$prefix = '/[a-z0-9_\-.]*';

	return $prefix;
}
/**
 * Returns the end of the regex for matches: the closing "/" delimiter plus
 * the pattern modifiers.
 *
 * @param int $batchSize The "S" modifier is added when this is greater than 0
 * @return string
 */
public function getRegexEnd( $batchSize ) {
	if ( $batchSize > 0 ) {
		return '/Sim';
	}

	return '/im';
}
416 * @param Title $title
417 * @param string[] $entries
419 public function warmCachesForFilter( Title $title, array $entries ) {