]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - includes/diff/WikiDiff3.php
MediaWiki 1.17.0
[autoinstalls/mediawiki.git] / includes / diff / WikiDiff3.php
1 <?php
2 /**
3  * New version of the difference engine
4  *
5  * Copyright © 2008 Guy Van den Broeck <guy@guyvdb.eu>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20  * http://www.gnu.org/copyleft/gpl.html
21  *
22  * @file
23  * @ingroup DifferenceEngine
24  */
25
26 /**
27  * This diff implementation is mainly lifted from the LCS algorithm of the Eclipse project which
28  * in turn is based on Myers' "An O(ND) difference algorithm and its variations"
29  * (http://citeseer.ist.psu.edu/myers86ond.html) with range compression (see Wu et al.'s
30  * "An O(NP) Sequence Comparison Algorithm").
31  *
32  * This implementation supports an upper bound on the excution time.
33  *
34  * Complexity: O((M + N)D) worst case time, O(M + N + D^2) expected time, O(M + N) space
35  *
36  * @author Guy Van den Broeck
37  * @ingroup DifferenceEngine
38  */
39 class WikiDiff3 {
40
41         // Input variables
42         private $from;
43         private $to;
44         private $m;
45         private $n;
46
47         private $tooLong;
48         private $powLimit;
49
50         // State variables
51         private $maxDifferences;
52         private $lcsLengthCorrectedForHeuristic = false;
53
54         // Output variables
55         public $length;
56         public $removed;
57         public $added;
58         public $heuristicUsed;
59
60         function __construct( $tooLong = 2000000, $powLimit = 1.45 ) {
61                 $this->tooLong = $tooLong;
62                 $this->powLimit = $powLimit;
63         }
64
65         public function diff( /*array*/ $from, /*array*/ $to ) {
66                 // remember initial lengths
67                 $m = sizeof( $from );
68                 $n = count( $to );
69
70                 $this->heuristicUsed = false;
71
72                 // output
73                 $removed = $m > 0 ? array_fill( 0, $m, true ) : array();
74                 $added = $n > 0 ? array_fill( 0, $n, true ) : array();
75
76                 // reduce the complexity for the next step (intentionally done twice)
77                 // remove common tokens at the start
78                 $i = 0;
79                 while ( $i < $m && $i < $n && $from[$i] === $to[$i] ) {
80                         $removed[$i] = $added[$i] = false;
81                         unset( $from[$i], $to[$i] );
82                         ++$i;
83                 }
84
85                 // remove common tokens at the end
86                 $j = 1;
87                 while ( $i + $j <= $m && $i + $j <= $n && $from[$m - $j] === $to[$n - $j] ) {
88                         $removed[$m - $j] = $added[$n - $j] = false;
89                         unset( $from[$m - $j], $to[$n - $j] );
90                         ++$j;
91                 }
92
93                 $this->from = $newFromIndex = $this->to = $newToIndex = array();
94
95                 // remove tokens not in both sequences
96                 $shared = array();
97                 foreach ( $from as $key ) {
98                         $shared[$key] = false;
99                 }
100
101                 foreach ( $to as $index => &$el ) {
102                         if ( array_key_exists( $el, $shared ) ) {
103                                 // keep it
104                                 $this->to[] = $el;
105                                 $shared[$el] = true;
106                                 $newToIndex[] = $index;
107                         }
108                 }
109                 foreach ( $from as $index => &$el ) {
110                         if ( $shared[$el] ) {
111                                 // keep it
112                                 $this->from[] = $el;
113                                 $newFromIndex[] = $index;
114                         }
115                 }
116
117                 unset( $shared, $from, $to );
118
119                 $this->m = count( $this->from );
120                 $this->n = count( $this->to );
121
122                 $this->removed = $this->m > 0 ? array_fill( 0, $this->m, true ) : array();
123                 $this->added = $this->n > 0 ? array_fill( 0, $this->n, true ) : array();
124
125                 if ( $this->m == 0 || $this->n == 0 ) {
126                         $this->length = 0;
127                 } else {
128                         $this->maxDifferences = ceil( ( $this->m + $this->n ) / 2.0 );
129                         if ( $this->m * $this->n > $this->tooLong ) {
130                                 // limit complexity to D^POW_LIMIT for long sequences
131                                 $this->maxDifferences = floor( pow( $this->maxDifferences, $this->powLimit - 1.0 ) );
132                                 wfDebug( "Limiting max number of differences to $this->maxDifferences\n" );
133                         }
134
135                         /*
136                          * The common prefixes and suffixes are always part of some LCS, include
137                          * them now to reduce our search space
138                          */
139                         $max = min( $this->m, $this->n );
140                         for ( $forwardBound = 0; $forwardBound < $max
141                                         && $this->from[$forwardBound] === $this->to[$forwardBound];
142                                         ++$forwardBound ) {
143                                 $this->removed[$forwardBound] = $this->added[$forwardBound] = false;
144                         }
145
146                         $backBoundL1 = $this->m - 1;
147                         $backBoundL2 = $this->n - 1;
148
149                         while ( $backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound
150                                         && $this->from[$backBoundL1] === $this->to[$backBoundL2] ) {
151                                 $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false;
152                         }
153
154                         $temp = array_fill( 0, $this->m + $this->n + 1, 0 );
155                         $V = array( $temp, $temp );
156                         $snake = array( 0, 0, 0 );
157
158                         $this->length = $forwardBound + $this->m - $backBoundL1 - 1
159                                 + $this->lcs_rec( $forwardBound, $backBoundL1,
160                                 $forwardBound, $backBoundL2, $V, $snake );
161                 }
162
163                 $this->m = $m;
164                 $this->n = $n;
165
166                 $this->length += $i + $j - 1;
167
168                 foreach ( $this->removed as $key => &$removed_elem ) {
169                         if ( !$removed_elem ) {
170                                 $removed[$newFromIndex[$key]] = false;
171                         }
172                 }
173                 foreach ( $this->added as $key => &$added_elem ) {
174                         if ( !$added_elem ) {
175                                 $added[$newToIndex[$key]] = false;
176                         }
177                 }
178                 $this->removed = $removed;
179                 $this->added = $added;
180         }
181
182         function diff_range( $from_lines, $to_lines ) {
183                 // Diff and store locally
184                 $this->diff( $from_lines, $to_lines );
185                 unset( $from_lines, $to_lines );
186
187                 $ranges = array();
188                 $xi = $yi = 0;
189                 while ( $xi < $this->m || $yi < $this->n ) {
190                         // Matching "snake".
191                         while ( $xi < $this->m && $yi < $this->n
192                                         && !$this->removed[$xi]
193                                         && !$this->added[$yi] ) {
194                                 ++$xi;
195                                 ++$yi;
196                         }
197                         // Find deletes & adds.
198                         $xstart = $xi;
199                         while ( $xi < $this->m && $this->removed[$xi] ) {
200                                 ++$xi;
201                         }
202
203                         $ystart = $yi;
204                         while ( $yi < $this->n && $this->added[$yi] ) {
205                                 ++$yi;
206                         }
207
208                         if ( $xi > $xstart || $yi > $ystart ) {
209                                 $ranges[] = new RangeDifference( $xstart, $xi,
210                                                                 $ystart, $yi );
211                         }
212                 }
213                 return $ranges;
214         }
215
216         private function lcs_rec( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) {
217                 // check that both sequences are non-empty
218                 if ( $bottoml1 > $topl1 || $bottoml2 > $topl2 ) {
219                         return 0;
220                 }
221
222                 $d = $this->find_middle_snake( $bottoml1, $topl1, $bottoml2,
223                                                         $topl2, $V, $snake );
224
225                 // need to store these so we don't lose them when they're
226                 // overwritten by the recursion
227                 $len = $snake[2];
228                 $startx = $snake[0];
229                 $starty = $snake[1];
230
231                 // the middle snake is part of the LCS, store it
232                 for ( $i = 0; $i < $len; ++$i ) {
233                         $this->removed[$startx + $i] = $this->added[$starty + $i] = false;
234                 }
235
236                 if ( $d > 1 ) {
237                         return $len
238                         + $this->lcs_rec( $bottoml1, $startx - 1, $bottoml2,
239                                                         $starty - 1, $V, $snake )
240                         + $this->lcs_rec( $startx + $len, $topl1, $starty + $len,
241                                                         $topl2, $V, $snake );
242                 } else if ( $d == 1 ) {
243                         /*
244                          * In this case the sequences differ by exactly 1 line. We have
245                          * already saved all the lines after the difference in the for loop
246                          * above, now we need to save all the lines before the difference.
247                          */
248                         $max = min( $startx - $bottoml1, $starty - $bottoml2 );
249                         for ( $i = 0; $i < $max; ++$i ) {
250                                 $this->removed[$bottoml1 + $i] =
251                                         $this->added[$bottoml2 + $i] = false;
252                         }
253                         return $max + $len;
254                 }
255                 return $len;
256         }
257
258         private function find_middle_snake( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) {
259                 $from = &$this->from;
260                 $to = &$this->to;
261                 $V0 = &$V[0];
262                 $V1 = &$V[1];
263                 $snake0 = &$snake[0];
264                 $snake1 = &$snake[1];
265                 $snake2 = &$snake[2];
266                 $bottoml1_min_1 = $bottoml1 -1;
267                 $bottoml2_min_1 = $bottoml2 -1;
268                 $N = $topl1 - $bottoml1_min_1;
269                 $M = $topl2 - $bottoml2_min_1;
270                 $delta = $N - $M;
271                 $maxabsx = $N + $bottoml1;
272                 $maxabsy = $M + $bottoml2;
273                 $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) );
274
275                 // value_to_add_forward: a 0 or 1 that we add to the start
276                 // offset to make it odd/even
277                 if ( ( $M & 1 ) == 1 ) {
278                         $value_to_add_forward = 1;
279                 } else {
280                         $value_to_add_forward = 0;
281                 }
282
283                 if ( ( $N & 1 ) == 1 ) {
284                         $value_to_add_backward = 1;
285                 } else {
286                         $value_to_add_backward = 0;
287                 }
288
289                 $start_forward = -$M;
290                 $end_forward = $N;
291                 $start_backward = -$N;
292                 $end_backward = $M;
293
294                 $limit_min_1 = $limit - 1;
295                 $limit_plus_1 = $limit + 1;
296
297                 $V0[$limit_plus_1] = 0;
298                 $V1[$limit_min_1] = $N;
299                 $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) );
300
301                 if ( ( $delta & 1 ) == 1 ) {
302                         for ( $d = 0; $d <= $limit; ++$d ) {
303                                 $start_diag = max( $value_to_add_forward + $start_forward, -$d );
304                                 $end_diag = min( $end_forward, $d );
305                                 $value_to_add_forward = 1 - $value_to_add_forward;
306
307                                 // compute forward furthest reaching paths
308                                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
309                                         if ( $k == -$d || ( $k < $d
310                                                         && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) ) {
311                                                 $x = $V0[$limit_plus_1 + $k];
312                                         } else {
313                                                 $x = $V0[$limit_min_1 + $k] + 1;
314                                         }
315
316                                         $absx = $snake0 = $x + $bottoml1;
317                                         $absy = $snake1 = $x - $k + $bottoml2;
318
319                                         while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) {
320                                                 ++$absx;
321                                                 ++$absy;
322                                         }
323                                         $x = $absx -$bottoml1;
324
325                                         $snake2 = $absx -$snake0;
326                                         $V0[$limit + $k] = $x;
327                                         if ( $k >= $delta - $d + 1 && $k <= $delta + $d - 1
328                                                         && $x >= $V1[$limit + $k - $delta] ) {
329                                                 return 2 * $d - 1;
330                                         }
331
332                                         // check to see if we can cut down the diagonal range
333                                         if ( $x >= $N && $end_forward > $k - 1 ) {
334                                                 $end_forward = $k - 1;
335                                         } else if ( $absy - $bottoml2 >= $M ) {
336                                                 $start_forward = $k + 1;
337                                                 $value_to_add_forward = 0;
338                                         }
339                                 }
340
341                                 $start_diag = max( $value_to_add_backward + $start_backward, -$d );
342                                 $end_diag = min( $end_backward, $d );
343                                 $value_to_add_backward = 1 - $value_to_add_backward;
344
345                                 // compute backward furthest reaching paths
346                                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
347                                         if ( $k == $d
348                                         || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) ) {
349                                                 $x = $V1[$limit_min_1 + $k];
350                                         } else {
351                                                 $x = $V1[$limit_plus_1 + $k] - 1;
352                                         }
353
354                                         $y = $x - $k - $delta;
355
356                                         $snake2 = 0;
357                                         while ( $x > 0 && $y > 0
358                                         && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] ) {
359                                                 --$x;
360                                                 --$y;
361                                                 ++$snake2;
362                                         }
363                                         $V1[$limit + $k] = $x;
364
365                                         // check to see if we can cut down our diagonal range
366                                         if ( $x <= 0 ) {
367                                                 $start_backward = $k + 1;
368                                                 $value_to_add_backward = 0;
369                                         } else if ( $y <= 0 && $end_backward > $k - 1 ) {
370                                                 $end_backward = $k - 1;
371                                         }
372                                 }
373                         }
374                 } else {
375                         for ( $d = 0; $d <= $limit; ++$d ) {
376                                 $start_diag = max( $value_to_add_forward + $start_forward, -$d );
377                                 $end_diag = min( $end_forward, $d );
378                                 $value_to_add_forward = 1 - $value_to_add_forward;
379
380                                 // compute forward furthest reaching paths
381                                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
382                                         if ( $k == -$d
383                                         || ( $k < $d && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) ) {
384                                                 $x = $V0[$limit_plus_1 + $k];
385                                         } else {
386                                                 $x = $V0[$limit_min_1 + $k] + 1;
387                                         }
388
389                                         $absx = $snake0 = $x + $bottoml1;
390                                         $absy = $snake1 = $x - $k + $bottoml2;
391
392                                         while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) {
393                                                 ++$absx;
394                                                 ++$absy;
395                                         }
396                                         $x = $absx -$bottoml1;
397                                         $snake2 = $absx -$snake0;
398                                         $V0[$limit + $k] = $x;
399
400                                         // check to see if we can cut down the diagonal range
401                                         if ( $x >= $N && $end_forward > $k - 1 ) {
402                                                 $end_forward = $k - 1;
403                                         } else if ( $absy -$bottoml2 >= $M ) {
404                                                 $start_forward = $k + 1;
405                                                 $value_to_add_forward = 0;
406                                         }
407                                 }
408
409                                 $start_diag = max( $value_to_add_backward + $start_backward, -$d );
410                                 $end_diag = min( $end_backward, $d );
411                                 $value_to_add_backward = 1 - $value_to_add_backward;
412
413                                 // compute backward furthest reaching paths
414                                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
415                                         if ( $k == $d
416                                         || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) ) {
417                                                 $x = $V1[$limit_min_1 + $k];
418                                         } else {
419                                                 $x = $V1[$limit_plus_1 + $k] - 1;
420                                         }
421
422                                         $y = $x - $k - $delta;
423
424                                         $snake2 = 0;
425                                         while ( $x > 0 && $y > 0
426                                                         && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] ) {
427                                                 --$x;
428                                                 --$y;
429                                                 ++$snake2;
430                                         }
431                                         $V1[$limit + $k] = $x;
432
433                                         if ( $k >= -$delta - $d && $k <= $d - $delta
434                                                         && $x <= $V0[$limit + $k + $delta] ) {
435                                                 $snake0 = $bottoml1 + $x;
436                                                 $snake1 = $bottoml2 + $y;
437                                                 return 2 * $d;
438                                         }
439
440                                         // check to see if we can cut down our diagonal range
441                                         if ( $x <= 0 ) {
442                                                 $start_backward = $k + 1;
443                                                 $value_to_add_backward = 0;
444                                         } else if ( $y <= 0 && $end_backward > $k - 1 ) {
445                                                 $end_backward = $k - 1;
446                                         }
447                                 }
448                         }
449                 }
450                 /*
451                  * computing the true LCS is too expensive, instead find the diagonal
452                  * with the most progress and pretend a midle snake of length 0 occurs
453                  * there.
454                  */
455
456                 $most_progress = self::findMostProgress( $M, $N, $limit, $V );
457
458                 $snake0 = $bottoml1 + $most_progress[0];
459                 $snake1 = $bottoml2 + $most_progress[1];
460                 $snake2 = 0;
461                 wfDebug( "Computing the LCS is too expensive. Using a heuristic.\n" );
462                 $this->heuristicUsed = true;
463                 return 5; /*
464                 * HACK: since we didn't really finish the LCS computation
465                 * we don't really know the length of the SES. We don't do
466                 * anything with the result anyway, unless it's <=1. We know
467                 * for a fact SES > 1 so 5 is as good a number as any to
468                 * return here
469                 */
470         }
471
472         private static function findMostProgress( $M, $N, $limit, $V ) {
473                 $delta = $N - $M;
474
475                 if ( ( $M & 1 ) == ( $limit & 1 ) ) {
476                         $forward_start_diag = max( -$M, -$limit );
477                 } else {
478                         $forward_start_diag = max( 1 - $M, -$limit );
479                 }
480
481                 $forward_end_diag = min( $N, $limit );
482
483                 if ( ( $N & 1 ) == ( $limit & 1 ) ) {
484                         $backward_start_diag = max( -$N, -$limit );
485                 } else {
486                         $backward_start_diag = max( 1 - $N, -$limit );
487                 }
488
489                 $backward_end_diag = -min( $M, $limit );
490
491                 $temp = array( 0, 0, 0 );
492
493
494                 $max_progress = array_fill( 0, ceil( max( $forward_end_diag - $forward_start_diag,
495                                 $backward_end_diag - $backward_start_diag ) / 2 ), $temp );
496                 $num_progress = 0; // the 1st entry is current, it is initialized
497                 // with 0s
498
499                 // first search the forward diagonals
500                 for ( $k = $forward_start_diag; $k <= $forward_end_diag; $k += 2 ) {
501                         $x = $V[0][$limit + $k];
502                         $y = $x - $k;
503                         if ( $x > $N || $y > $M ) {
504                                 continue;
505                         }
506
507                         $progress = $x + $y;
508                         if ( $progress > $max_progress[0][2] ) {
509                                 $num_progress = 0;
510                                 $max_progress[0][0] = $x;
511                                 $max_progress[0][1] = $y;
512                                 $max_progress[0][2] = $progress;
513                         } else if ( $progress == $max_progress[0][2] ) {
514                                 ++$num_progress;
515                                 $max_progress[$num_progress][0] = $x;
516                                 $max_progress[$num_progress][1] = $y;
517                                 $max_progress[$num_progress][2] = $progress;
518                         }
519                 }
520
521                 $max_progress_forward = true; // initially the maximum
522                 // progress is in the forward
523                 // direction
524
525                 // now search the backward diagonals
526                 for ( $k = $backward_start_diag; $k <= $backward_end_diag; $k += 2 ) {
527                         $x = $V[1][$limit + $k];
528                         $y = $x - $k - $delta;
529                         if ( $x < 0 || $y < 0 ) {
530                                 continue;
531                         }
532
533                         $progress = $N - $x + $M - $y;
534                         if ( $progress > $max_progress[0][2] ) {
535                                 $num_progress = 0;
536                                 $max_progress_forward = false;
537                                 $max_progress[0][0] = $x;
538                                 $max_progress[0][1] = $y;
539                                 $max_progress[0][2] = $progress;
540                         } else if ( $progress == $max_progress[0][2] && !$max_progress_forward ) {
541                                 ++$num_progress;
542                                 $max_progress[$num_progress][0] = $x;
543                                 $max_progress[$num_progress][1] = $y;
544                                 $max_progress[$num_progress][2] = $progress;
545                         }
546                 }
547
548                 // return the middle diagonal with maximal progress.
549                 return $max_progress[floor( $num_progress / 2 )];
550         }
551
552         public function getLcsLength() {
553                 if ( $this->heuristicUsed && !$this->lcsLengthCorrectedForHeuristic ) {
554                         $this->lcsLengthCorrectedForHeuristic = true;
555                         $this->length = $this->m -array_sum( $this->added );
556                 }
557                 return $this->length;
558         }
559
560 }
561
562 /**
563  * Alternative representation of a set of changes, by the index
564  * ranges that are changed.
565  *
566  * @ingroup DifferenceEngine
567  */
568 class RangeDifference {
569
570         public $leftstart;
571         public $leftend;
572         public $leftlength;
573
574         public $rightstart;
575         public $rightend;
576         public $rightlength;
577
578         function __construct( $leftstart, $leftend, $rightstart, $rightend ) {
579                 $this->leftstart = $leftstart;
580                 $this->leftend = $leftend;
581                 $this->leftlength = $leftend - $leftstart;
582                 $this->rightstart = $rightstart;
583                 $this->rightend = $rightend;
584                 $this->rightlength = $rightend - $rightstart;
585         }
586 }