]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - extensions/SpamBlacklist/BaseBlacklist.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / extensions / SpamBlacklist / BaseBlacklist.php
1 <?php
2
/**
 * Base class for different kinds of blacklists.
 *
 * Keeps a registry that maps blacklist type codes to implementing classes,
 * hands out one shared instance per type, and implements the loading and
 * caching of the regex lists (local on-wiki messages plus "shared" sources
 * read from another wiki DB, over HTTP or from a file) that the concrete
 * blacklists filter against.
 */
abstract class BaseBlacklist {
	/**
	 * Array of blacklist sources. Each entry is one of:
	 *  - "DB: <wiki> <page title>" (article in a local MediaWiki database),
	 *  - an http://, https:// or protocol-relative URL,
	 *  - anything else, which is read with file_get_contents().
	 *
	 * @var array
	 */
	public $files = [];

	/**
	 * Array containing regexes to test against, or false while not loaded yet
	 *
	 * @var bool|array
	 */
	protected $regexes = false;

	/**
	 * Upper bound handed to mt_rand( 0, N ) when deciding whether the current
	 * request refreshes an HTTP source whose warning key has expired; the
	 * refresh happens only when the draw is 0.
	 *
	 * @var int
	 */
	public $warningChance = 100;

	/**
	 * Lifetime, in seconds, of the warning key set after an HTTP fetch attempt
	 *
	 * @var int
	 */
	public $warningTime = 600;

	/**
	 * Lifetime, in seconds, of cached regex lists and cached HTTP source text
	 *
	 * @var int
	 */
	public $expiryTime = 900;

	/**
	 * Array containing blacklists that extend BaseBlacklist
	 * (type code => class name)
	 *
	 * @var array
	 */
	private static $blacklistTypes = [
		'spam' => 'SpamBlacklist',
		'email' => 'EmailBlacklist',
	];

	/**
	 * Array of blacklist instances, keyed by type code
	 *
	 * @var array
	 */
	private static $instances = [];

	/**
	 * Constructor
	 *
	 * Every key of $settings is assigned to the property of the same name.
	 *
	 * @param array $settings
	 */
	public function __construct( $settings = [] ) {
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * @param array $links
	 * @param Title $title
	 * @param bool $preventLog
	 * @return mixed
	 */
	abstract public function filter( array $links, Title $title, $preventLog = false );

	/**
	 * Adds a blacklist class to the registry
	 *
	 * @param string $type Code for the blacklist
	 * @param string $class Name of the implementing class
	 */
	public static function addBlacklistType( $type, $class ) {
		self::$blacklistTypes[$type] = $class;
	}

	/**
	 * Return the array of blacklist types currently defined
	 *
	 * @return array Type code => class name
	 */
	public static function getBlacklistTypes() {
		return self::$blacklistTypes;
	}

	/**
	 * Returns an instance of the given blacklist
	 *
	 * The instance is created on first use from $wgBlacklistSettings[$type]
	 * and reused afterwards.
	 *
	 * @param string $type Code for the blacklist
	 * @return BaseBlacklist
	 * @throws Exception If $type is not a registered blacklist type
	 */
	public static function getInstance( $type ) {
		if ( !isset( self::$blacklistTypes[$type] ) ) {
			throw new Exception( "Invalid blacklist type '$type' passed to " . __METHOD__ );
		}

		if ( !isset( self::$instances[$type] ) ) {
			global $wgBlacklistSettings;

			// Prevent notices
			if ( !isset( $wgBlacklistSettings[$type] ) ) {
				$wgBlacklistSettings[$type] = [];
			}

			$class = self::$blacklistTypes[$type];
			self::$instances[$type] = new $class( $wgBlacklistSettings[$type] );
		}

		return self::$instances[$type];
	}

	/**
	 * Returns the code for the blacklist implementation
	 *
	 * @return string
	 */
	abstract protected function getBlacklistType();

	/**
	 * Check if the given local page title is a spam regex source.
	 *
	 * A page counts as a source when it is one of the
	 * MediaWiki:<Type>-blacklist / MediaWiki:<Type>-whitelist pages of any
	 * registered type, or when any configured 'files' entry points at it
	 * (as "DB: <this wiki> <title>" or as its action=raw URL).
	 *
	 * @param Title $title
	 * @return bool
	 */
	public static function isLocalSource( Title $title ) {
		global $wgDBname, $wgBlacklistSettings;

		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			$sources = [];
			foreach ( self::$blacklistTypes as $type => $class ) {
				$type = ucfirst( $type );
				// Append explicitly. The array union operator (+) keeps the
				// left-hand value for colliding numeric keys, which would
				// silently drop every type after the first one.
				$sources[] = "$type-blacklist";
				$sources[] = "$type-whitelist";
			}

			if ( in_array( $title->getDBkey(), $sources, true ) ) {
				return true;
			}
		}

		$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
		$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

		$files = [];
		foreach ( self::$blacklistTypes as $type => $class ) {
			if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
				// array_merge, not +=: the file lists are numerically indexed,
				// so a union would discard entries at already-used indexes.
				$files = array_merge( $files, $wgBlacklistSettings[$type]['files'] );
			}
		}

		foreach ( $files as $fileName ) {
			$matches = [];
			// Same pattern as buildSharedBlacklists(), so that every DB source
			// that gets loaded (including hyphenated DB names) is recognised.
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				if ( $wgDBname === $matches[1]
					&& $matches[2] === $title->getPrefixedDBkey()
				) {
					// Local DB fetch of this page...
					return true;
				}
			} elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
				// Raw view of this page
				return true;
			}
		}

		return false;
	}

	/**
	 * Returns the type of blacklist from the given title
	 *
	 * @todo building a regex for this is pretty overkill
	 * @param Title $title
	 * @return bool|string Lower-cased type code, or false if the title's DB
	 *  key does not contain "<Type>-blacklist" or "<Type>-whitelist"
	 */
	public static function getTypeFromTitle( Title $title ) {
		global $wgContLang;

		$types = array_map( [ $wgContLang, 'ucfirst' ], array_keys( self::$blacklistTypes ) );
		// Type codes can be registered by other code; quote them so they are
		// matched literally instead of being interpreted as regex syntax.
		$quoted = array_map( function ( $type ) {
			return preg_quote( $type, '/' );
		}, $types );
		$regex = '/(' . implode( '|', $quoted ) . ')-(?:blacklist|whitelist)/';

		if ( preg_match( $regex, $title->getDBkey(), $m ) ) {
			return strtolower( $m[1] );
		}

		return false;
	}

	/**
	 * Fetch local and (possibly cached) remote blacklists.
	 * Will be cached locally across multiple invocations.
	 * @return array set of regular expressions, potentially empty.
	 */
	public function getBlacklists() {
		if ( $this->regexes === false ) {
			$this->regexes = array_merge(
				$this->getLocalBlacklists(),
				$this->getSharedBlacklists() );
		}
		return $this->regexes;
	}

	/**
	 * Returns the local blacklist
	 *
	 * @return array Regular expressions
	 */
	public function getLocalBlacklists() {
		return $this->getCachedMessageRegexes( 'blacklist' );
	}

	/**
	 * Returns the (local) whitelist
	 *
	 * @return array Regular expressions
	 */
	public function getWhitelists() {
		return $this->getCachedMessageRegexes( 'whitelist' );
	}

	/**
	 * Load the regexes built from the "<type>-<kind>" message through the
	 * main WAN cache, rebuilding them with SpamRegexBatch on a cache miss.
	 *
	 * @param string $kind Either 'blacklist' or 'whitelist'
	 * @return array Regular expressions
	 */
	private function getCachedMessageRegexes( $kind ) {
		$type = $this->getBlacklistType();

		return ObjectCache::getMainWANInstance()->getWithSetCallback(
			wfMemcKey( 'spamblacklist', $type, "{$kind}-regex" ),
			$this->expiryTime,
			function () use ( $type, $kind ) {
				return SpamRegexBatch::regexesFromMessage( "{$type}-{$kind}", $this );
			}
		);
	}

	/**
	 * Fetch (possibly cached) remote blacklists.
	 * @return array Regular expressions; empty when no files are configured
	 */
	public function getSharedBlacklists() {
		$listType = $this->getBlacklistType();

		wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." );

		if ( count( $this->files ) == 0 ) {
			# No lists
			wfDebugLog( 'SpamBlacklist', "no files specified\n" );
			return [];
		}

		// Flipped by the callback below, which only runs on a cache miss
		$miss = false;

		$regexes = ObjectCache::getMainWANInstance()->getWithSetCallback(
			// This used to be cached per-site, but that could be bad on a shared
			// server where not all wikis have the same configuration.
			wfMemcKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ),
			$this->expiryTime,
			function () use ( &$miss ) {
				$miss = true;
				return $this->buildSharedBlacklists();
			}
		);

		if ( !$miss ) {
			wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
		}

		return $regexes;
	}

	/**
	 * Clear all primary blacklist cache keys
	 *
	 * @note: this method is unused atm
	 */
	public function clearCache() {
		$listType = $this->getBlacklistType();

		$cache = ObjectCache::getMainWANInstance();
		$cache->delete( wfMemcKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ) );
		$cache->delete( wfMemcKey( 'spamblacklist', $listType, 'blacklist-regex' ) );
		$cache->delete( wfMemcKey( 'spamblacklist', $listType, 'whitelist-regex' ) );

		wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" );
	}

	/**
	 * Read every configured source in $this->files and turn each one into
	 * its own batch of regexes.
	 *
	 * @return array Regular expressions
	 */
	public function buildSharedBlacklists() {
		$regexes = [];
		$listType = $this->getBlacklistType();
		# Load lists
		wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
		foreach ( $this->files as $fileName ) {
			$matches = [];
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				$text = $this->getArticleText( $matches[1], $matches[2] );
			} elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) {
				$text = $this->getHttpText( $fileName );
			} else {
				$text = file_get_contents( $fileName );
				wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
			}

			// All three loaders return false on failure; a failed source has
			// no lines to contribute, so skip it rather than parse `false`.
			if ( !is_string( $text ) ) {
				wfDebugLog( 'SpamBlacklist',
					"Could not load $listType blacklist source $fileName, skipping\n" );
				continue;
			}

			// Build a separate batch of regexes from each source.
			// While in theory we could squeeze a little efficiency
			// out of combining multiple sources in one regex, if
			// there's a bad line in one of them we'll gain more
			// from only having to break that set into smaller pieces.
			$regexes = array_merge( $regexes,
				SpamRegexBatch::regexesFromText( $text, $this, $fileName ) );
		}

		return $regexes;
	}

	/**
	 * Fetch a blacklist source over HTTP, going through $messageMemc.
	 *
	 * @param string $fileName URL of the source
	 * @return string|bool Source text, or false if it could not be fetched
	 *  and no cached copy is available
	 */
	public function getHttpText( $fileName ) {
		global $wgDBname, $messageMemc;
		$listType = $this->getBlacklistType();

		# HTTP request
		# To keep requests to a minimum, we save results into $messageMemc, which is
		# similar to $wgMemc except almost certain to exist. By default, it is stored
		# in the database
		# There are two keys, when the warning key expires, a random thread will refresh
		# the real key. This reduces the chance of multiple requests under high traffic
		# conditions.
		$key = "{$listType}_blacklist_file:$fileName";
		$warningKey = "$wgDBname:{$listType}filewarning:$fileName";
		$httpText = $messageMemc->get( $key );
		$warning = $messageMemc->get( $warningKey );

		if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
			wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
			$fetched = Http::get( $fileName );
			// Set after every attempt, successful or not
			$messageMemc->set( $warningKey, 1, $this->warningTime );

			if ( $fetched === false ) {
				wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
				if ( is_string( $httpText ) ) {
					// An early refresh failed but the cached copy is still
					// valid: keep serving it instead of replacing it with false.
					return $httpText;
				}
			}

			$httpText = $fetched;
			$messageMemc->set( $key, $httpText, $this->expiryTime );
		} else {
			wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
		}
		return $httpText;
	}

	/**
	 * Fetch an article from this or another local MediaWiki database.
	 * This is probably *very* fragile, and shouldn't be used perhaps.
	 *
	 * @param string $wiki
	 * @param string $article
	 * @return string|bool Text of the latest revision, or false when the
	 *  title is invalid or no matching row exists
	 */
	public function getArticleText( $wiki, $article ) {
		wfDebugLog( 'SpamBlacklist',
			"Fetching {$this->getBlacklistType()} blacklist from '$article' on '$wiki'...\n" );

		$title = Title::newFromText( $article );
		if ( !$title ) {
			// A misconfigured source must not take the whole request down
			wfDebugLog( 'SpamBlacklist', "Invalid title '$article' for wiki '$wiki'\n" );
			return false;
		}

		// Load all the relevant tables from the correct DB.
		// This assumes that old_text is the actual text or
		// that the external store system is at least unified.
		$row = wfGetDB( DB_SLAVE, [], $wiki )->selectRow(
			[ 'page', 'revision', 'text' ],
			array_merge(
				Revision::selectFields(),
				Revision::selectPageFields(),
				Revision::selectTextFields()
			),
			[
				'page_namespace' => $title->getNamespace(), // assume NS IDs match
				'page_title' => $title->getDBkey(), // assume same case rules
				'rev_id=page_latest',
				'old_id=rev_text_id'
			],
			__METHOD__
		);

		return $row
			? ContentHandler::getContentText( Revision::newFromRow( $row )->getContent() )
			: false;
	}

	/**
	 * Returns the start of the regex for matches
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/[a-z0-9_\-.]*';
	}

	/**
	 * Returns the end of the regex for matches
	 *
	 * @param int $batchSize The PCRE "S" modifier is added only when this is
	 *  greater than zero
	 * @return string
	 */
	public function getRegexEnd( $batchSize ) {
		return ( $batchSize > 0 ) ? '/Sim' : '/im';
	}

	/**
	 * Hook for subclasses; the base implementation does nothing.
	 *
	 * @param Title $title
	 * @param string[] $entries
	 */
	public function warmCachesForFilter( Title $title, array $entries ) {
		// subclass this
	}
}