]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - includes/search/SearchHighlighter.php
MediaWiki 1.30.2
[autoinstallsdev/mediawiki.git] / includes / search / SearchHighlighter.php
1 <?php
2 /**
3  * Basic search engine highlighting
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, write to the Free Software Foundation, Inc.,
17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18  * http://www.gnu.org/copyleft/gpl.html
19  *
20  * @file
21  * @ingroup Search
22  */
23
24 /**
25  * Highlight bits of wikitext
26  *
27  * @ingroup Search
28  */
29 class SearchHighlighter {
        /**
         * Whether text handed to splitAndAdd() is first run through
         * removeWiki() (wikitext stripping followed by HTML escaping).
         *
         * @var bool
         */
        protected $mCleanWikitext = true;

        /**
         * @warning If you pass false to this constructor, then
         *  the caller is responsible for HTML escaping.
         * @param bool $cleanupWikitext Stored unchanged in $mCleanWikitext;
         *  when true, splitAndAdd() cleans each piece of text with removeWiki()
         */
        function __construct( $cleanupWikitext = true ) {
                $this->mCleanWikitext = $cleanupWikitext;
        }
40
41         /**
42          * Wikitext highlighting when $wgAdvancedSearchHighlighting = true
43          *
44          * @param string $text
45          * @param array $terms Terms to highlight (not html escaped but
46          *   regex escaped via SearchDatabase::regexTerm())
47          * @param int $contextlines
48          * @param int $contextchars
49          * @return string
50          */
51         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
52                 global $wgContLang, $wgSearchHighlightBoundaries;
53
54                 if ( $text == '' ) {
55                         return '';
56                 }
57
58                 // spli text into text + templates/links/tables
59                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
60                 // first capture group is for detecting nested templates/links/tables/references
61                 $endPatterns = [
62                         1 => '/(\{\{)|(\}\})/', // template
63                         2 => '/(\[\[)|(\]\])/', // image
64                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table
65
66                 // @todo FIXME: This should prolly be a hook or something
67                 // instead of hardcoding a class name from the Cite extension
68                 if ( class_exists( 'Cite' ) ) {
69                         $spat .= '|(<ref>)'; // references via cite extension
70                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
71                 }
72                 $spat .= '/';
73                 $textExt = []; // text extracts
74                 $otherExt = []; // other extracts
75                 $start = 0;
76                 $textLen = strlen( $text );
77                 $count = 0; // sequence number to maintain ordering
78                 while ( $start < $textLen ) {
79                         // find start of template/image/table
80                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
81                                 $epat = '';
82                                 foreach ( $matches as $key => $val ) {
83                                         if ( $key > 0 && $val[1] != -1 ) {
84                                                 if ( $key == 2 ) {
85                                                         // see if this is an image link
86                                                         $ns = substr( $val[0], 2, -1 );
87                                                         if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
88                                                                 break;
89                                                         }
90
91                                                 }
92                                                 $epat = $endPatterns[$key];
93                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
94                                                 $start = $val[1];
95                                                 break;
96                                         }
97                                 }
98                                 if ( $epat ) {
99                                         // find end (and detect any nested elements)
100                                         $level = 0;
101                                         $offset = $start + 1;
102                                         $found = false;
103                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
104                                                 if ( array_key_exists( 2, $endMatches ) ) {
105                                                         // found end
106                                                         if ( $level == 0 ) {
107                                                                 $len = strlen( $endMatches[2][0] );
108                                                                 $off = $endMatches[2][1];
109                                                                 $this->splitAndAdd( $otherExt, $count,
110                                                                         substr( $text, $start, $off + $len - $start ) );
111                                                                 $start = $off + $len;
112                                                                 $found = true;
113                                                                 break;
114                                                         } else {
115                                                                 // end of nested element
116                                                                 $level -= 1;
117                                                         }
118                                                 } else {
119                                                         // nested
120                                                         $level += 1;
121                                                 }
122                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
123                                         }
124                                         if ( !$found ) {
125                                                 // couldn't find appropriate closing tag, skip
126                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
127                                                 $start += strlen( $matches[0][0] );
128                                         }
129                                         continue;
130                                 }
131                         }
132                         // else: add as text extract
133                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
134                         break;
135                 }
136
137                 $all = $textExt + $otherExt; // these have disjunct key sets
138
139                 // prepare regexps
140                 foreach ( $terms as $index => $term ) {
141                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
142                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
143                                 $terms[$index] = preg_replace_callback(
144                                         '/./us',
145                                         [ $this, 'caseCallback' ],
146                                         $terms[$index]
147                                 );
148                         } else {
149                                 $terms[$index] = $term;
150                         }
151                 }
152                 $anyterm = implode( '|', $terms );
153                 $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
154                 // @todo FIXME: A hack to scale contextchars, a correct solution
155                 // would be to have contextchars actually be char and not byte
156                 // length, and do proper utf-8 substrings and lengths everywhere,
157                 // but PHP is making that very hard and unclean to implement :(
158                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
159                 $contextchars = intval( $contextchars * $scale );
160
161                 $patPre = "(^|$wgSearchHighlightBoundaries)";
162                 $patPost = "($wgSearchHighlightBoundaries|$)";
163
164                 $pat1 = "/(" . $phrase . ")/ui";
165                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
166
167                 $left = $contextlines;
168
169                 $snippets = [];
170                 $offsets = [];
171
172                 // show beginning only if it contains all words
173                 $first = 0;
174                 $firstText = '';
175                 foreach ( $textExt as $index => $line ) {
176                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
177                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
178                                 $first = $index;
179                                 break;
180                         }
181                 }
182                 if ( $firstText ) {
183                         $succ = true;
184                         // check if first text contains all terms
185                         foreach ( $terms as $term ) {
186                                 if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
187                                         $succ = false;
188                                         break;
189                                 }
190                         }
191                         if ( $succ ) {
192                                 $snippets[$first] = $firstText;
193                                 $offsets[$first] = 0;
194                         }
195                 }
196                 if ( !$snippets ) {
197                         // match whole query on text
198                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
199                         // match whole query on templates/tables/images
200                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
201                         // match any words on text
202                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
203                         // match any words on templates/tables/images
204                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
205
206                         ksort( $snippets );
207                 }
208
209                 // add extra chars to each snippet to make snippets constant size
210                 $extended = [];
211                 if ( count( $snippets ) == 0 ) {
212                         // couldn't find the target words, just show beginning of article
213                         if ( array_key_exists( $first, $all ) ) {
214                                 $targetchars = $contextchars * $contextlines;
215                                 $snippets[$first] = '';
216                                 $offsets[$first] = 0;
217                         }
218                 } else {
219                         // if begin of the article contains the whole phrase, show only that !!
220                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
221                                 && $offsets[$first] < $contextchars * 2 ) {
222                                 $snippets = [ $first => $snippets[$first] ];
223                         }
224
225                         // calc by how much to extend existing snippets
226                         $targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
227                 }
228
229                 foreach ( $snippets as $index => $line ) {
230                         $extended[$index] = $line;
231                         $len = strlen( $line );
232                         if ( $len < $targetchars - 20 ) {
233                                 // complete this line
234                                 if ( $len < strlen( $all[$index] ) ) {
235                                         $extended[$index] = $this->extract(
236                                                 $all[$index],
237                                                 $offsets[$index],
238                                                 $offsets[$index] + $targetchars,
239                                                 $offsets[$index]
240                                         );
241                                         $len = strlen( $extended[$index] );
242                                 }
243
244                                 // add more lines
245                                 $add = $index + 1;
246                                 while ( $len < $targetchars - 20
247                                                 && array_key_exists( $add, $all )
248                                                 && !array_key_exists( $add, $snippets ) ) {
249                                         $offsets[$add] = 0;
250                                         $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
251                                         $extended[$add] = $tt;
252                                         $len += strlen( $tt );
253                                         $add++;
254                                 }
255                         }
256                 }
257
258                 // $snippets = array_map( 'htmlspecialchars', $extended );
259                 $snippets = $extended;
260                 $last = -1;
261                 $extract = '';
262                 foreach ( $snippets as $index => $line ) {
263                         if ( $last == -1 ) {
264                                 $extract .= $line; // first line
265                         } elseif ( $last + 1 == $index
266                                 && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
267                         ) {
268                                 $extract .= " " . $line; // continous lines
269                         } else {
270                                 $extract .= '<b> ... </b>' . $line;
271                         }
272
273                         $last = $index;
274                 }
275                 if ( $extract ) {
276                         $extract .= '<b> ... </b>';
277                 }
278
279                 $processed = [];
280                 foreach ( $terms as $term ) {
281                         if ( !isset( $processed[$term] ) ) {
282                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
283                                 $extract = preg_replace( $pat3,
284                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
285                                 $processed[$term] = true;
286                         }
287                 }
288
289                 return $extract;
290         }
291
292         /**
293          * Split text into lines and add it to extracts array
294          *
295          * @param array &$extracts Index -> $line
296          * @param int &$count
297          * @param string $text
298          */
299         function splitAndAdd( &$extracts, &$count, $text ) {
300                 $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
301                 foreach ( $split as $line ) {
302                         $tt = trim( $line );
303                         if ( $tt ) {
304                                 $extracts[$count++] = $tt;
305                         }
306                 }
307         }
308
309         /**
310          * Do manual case conversion for non-ascii chars
311          *
312          * @param array $matches
313          * @return string
314          */
315         function caseCallback( $matches ) {
316                 global $wgContLang;
317                 if ( strlen( $matches[0] ) > 1 ) {
318                         return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
319                 } else {
320                         return $matches[0];
321                 }
322         }
323
324         /**
325          * Extract part of the text from start to end, but by
326          * not chopping up words
327          * @param string $text
328          * @param int $start
329          * @param int $end
330          * @param int &$posStart (out) actual start position
331          * @param int &$posEnd (out) actual end position
332          * @return string
333          */
334         function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
335                 if ( $start != 0 ) {
336                         $start = $this->position( $text, $start, 1 );
337                 }
338                 if ( $end >= strlen( $text ) ) {
339                         $end = strlen( $text );
340                 } else {
341                         $end = $this->position( $text, $end );
342                 }
343
344                 if ( !is_null( $posStart ) ) {
345                         $posStart = $start;
346                 }
347                 if ( !is_null( $posEnd ) ) {
348                         $posEnd = $end;
349                 }
350
351                 if ( $end > $start ) {
352                         return substr( $text, $start, $end - $start );
353                 } else {
354                         return '';
355                 }
356         }
357
358         /**
359          * Find a nonletter near a point (index) in the text
360          *
361          * @param string $text
362          * @param int $point
363          * @param int $offset Offset to found index
364          * @return int Nearest nonletter index, or beginning of utf8 char if none
365          */
366         function position( $text, $point, $offset = 0 ) {
367                 $tolerance = 10;
368                 $s = max( 0, $point - $tolerance );
369                 $l = min( strlen( $text ), $point + $tolerance ) - $s;
370                 $m = [];
371
372                 if ( preg_match(
373                         '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
374                         substr( $text, $s, $l ),
375                         $m,
376                         PREG_OFFSET_CAPTURE
377                 ) ) {
378                         return $m[0][1] + $s + $offset;
379                 } else {
380                         // check if point is on a valid first UTF8 char
381                         $char = ord( $text[$point] );
382                         while ( $char >= 0x80 && $char < 0xc0 ) {
383                                 // skip trailing bytes
384                                 $point++;
385                                 if ( $point >= strlen( $text ) ) {
386                                         return strlen( $text );
387                                 }
388                                 $char = ord( $text[$point] );
389                         }
390
391                         return $point;
392
393                 }
394         }
395
396         /**
397          * Search extracts for a pattern, and return snippets
398          *
399          * @param string $pattern Regexp for matching lines
400          * @param array $extracts Extracts to search
401          * @param int &$linesleft Number of extracts to make
402          * @param int &$contextchars Length of snippet
403          * @param array &$out Map for highlighted snippets
404          * @param array &$offsets Map of starting points of snippets
405          * @protected
406          */
407         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
408                 if ( $linesleft == 0 ) {
409                         return; // nothing to do
410                 }
411                 foreach ( $extracts as $index => $line ) {
412                         if ( array_key_exists( $index, $out ) ) {
413                                 continue; // this line already highlighted
414                         }
415
416                         $m = [];
417                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
418                                 continue;
419                         }
420
421                         $offset = $m[0][1];
422                         $len = strlen( $m[0][0] );
423                         if ( $offset + $len < $contextchars ) {
424                                 $begin = 0;
425                         } elseif ( $len > $contextchars ) {
426                                 $begin = $offset;
427                         } else {
428                                 $begin = $offset + intval( ( $len - $contextchars ) / 2 );
429                         }
430
431                         $end = $begin + $contextchars;
432
433                         $posBegin = $begin;
434                         // basic snippet from this line
435                         $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
436                         $offsets[$index] = $posBegin;
437                         $linesleft--;
438                         if ( $linesleft == 0 ) {
439                                 return;
440                         }
441                 }
442         }
443
444         /**
445          * Basic wikitext removal
446          * @protected
447          * @param string $text
448          * @return mixed
449          */
450         function removeWiki( $text ) {
451                 $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
452                 $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
453                 $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
454                 $text = preg_replace_callback(
455                         "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
456                         [ $this, 'linkReplace' ],
457                         $text
458                 );
459                 $text = preg_replace( "/<\/?[^>]+>/", "", $text );
460                 $text = preg_replace( "/'''''/", "", $text );
461                 $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
462                 $text = preg_replace( "/''/", "", $text );
463
464                 // Note, the previous /<\/?[^>]+>/ is insufficient
465                 // for XSS safety as the HTML tag can span multiple
466                 // search results (T144845).
467                 $text = Sanitizer::escapeHtmlAllowEntities( $text );
468                 return $text;
469         }
470
471         /**
472          * callback to replace [[target|caption]] kind of links, if
473          * the target is category or image, leave it
474          *
475          * @param array $matches
476          * @return string
477          */
478         function linkReplace( $matches ) {
479                 $colon = strpos( $matches[1], ':' );
480                 if ( $colon === false ) {
481                         return $matches[2]; // replace with caption
482                 }
483                 global $wgContLang;
484                 $ns = substr( $matches[1], 0, $colon );
485                 $index = $wgContLang->getNsIndex( $ns );
486                 if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
487                         return $matches[0]; // return the whole thing
488                 } else {
489                         return $matches[2];
490                 }
491         }
492
493         /**
494          * Simple & fast snippet extraction, but gives completely unrelevant
495          * snippets
496          *
497          * Used when $wgAdvancedSearchHighlighting is false.
498          *
499          * @param string $text
500          * @param array $terms Escaped for regex by SearchDatabase::regexTerm()
501          * @param int $contextlines
502          * @param int $contextchars
503          * @return string
504          */
505         public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
506                 global $wgContLang;
507
508                 $lines = explode( "\n", $text );
509
510                 $terms = implode( '|', $terms );
511                 $max = intval( $contextchars ) + 1;
512                 $pat1 = "/(.*)($terms)(.{0,$max})/i";
513
514                 $lineno = 0;
515
516                 $extract = "";
517                 foreach ( $lines as $line ) {
518                         if ( 0 == $contextlines ) {
519                                 break;
520                         }
521                         ++$lineno;
522                         $m = [];
523                         if ( !preg_match( $pat1, $line, $m ) ) {
524                                 continue;
525                         }
526                         --$contextlines;
527                         // truncate function changes ... to relevant i18n message.
528                         $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );
529
530                         if ( count( $m ) < 3 ) {
531                                 $post = '';
532                         } else {
533                                 $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
534                         }
535
536                         $found = $m[2];
537
538                         $line = htmlspecialchars( $pre . $found . $post );
539                         $pat2 = '/(' . $terms . ")/i";
540                         $line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );
541
542                         $extract .= "${line}\n";
543                 }
544
545                 return $extract;
546         }
547
548         /**
549          * Returns the first few lines of the text
550          *
551          * @param string $text
552          * @param int $contextlines Max number of returned lines
553          * @param int $contextchars Average number of characters per line
554          * @return string
555          */
556         public function highlightNone( $text, $contextlines, $contextchars ) {
557                 $match = [];
558                 $text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
559                 $text = str_replace( "\n\n", "\n", $text ); // remove empty lines
560                 preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );
561
562                 // Trim and limit to max number of chars
563                 $text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) );
564                 return str_replace( "\n", '<br>', $text );
565         }
566 }