]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - includes/diff/WikiDiff.php
MediaWiki 1.17.0
[autoinstalls/mediawiki.git] / includes / diff / WikiDiff.php
1 <?php
2 /**
3  * A PHP diff engine for phpwiki. (Taken from phpwiki-1.3.3)
4  *
5  * Copyright © 2000, 2001 Geoffrey T. Dairiki <dairiki@dairiki.org>
6  * You may copy this code freely under the conditions of the GPL.
7  *
8  * @file
9  * @ingroup DifferenceEngine
10  * @defgroup DifferenceEngine DifferenceEngine
11  */
12
13 /**
14  * @todo document
15  * @private
16  * @ingroup DifferenceEngine
17  */
18 class _DiffOp {
19         var $type;
20         var $orig;
21         var $closing;
22
23         function reverse() {
24                 trigger_error( 'pure virtual', E_USER_ERROR );
25         }
26
27         function norig() {
28                 return $this->orig ? sizeof( $this->orig ) : 0;
29         }
30
31         function nclosing() {
32                 return $this->closing ? sizeof( $this->closing ) : 0;
33         }
34 }
35
36 /**
37  * @todo document
38  * @private
39  * @ingroup DifferenceEngine
40  */
41 class _DiffOp_Copy extends _DiffOp {
42         var $type = 'copy';
43
44         function __construct ( $orig, $closing = false ) {
45                 if ( !is_array( $closing ) )
46                 $closing = $orig;
47                 $this->orig = $orig;
48                 $this->closing = $closing;
49         }
50
51         function reverse() {
52                 return new _DiffOp_Copy( $this->closing, $this->orig );
53         }
54 }
55
56 /**
57  * @todo document
58  * @private
59  * @ingroup DifferenceEngine
60  */
61 class _DiffOp_Delete extends _DiffOp {
62         var $type = 'delete';
63
64         function __construct ( $lines ) {
65                 $this->orig = $lines;
66                 $this->closing = false;
67         }
68
69         function reverse() {
70                 return new _DiffOp_Add( $this->orig );
71         }
72 }
73
74 /**
75  * @todo document
76  * @private
77  * @ingroup DifferenceEngine
78  */
79 class _DiffOp_Add extends _DiffOp {
80         var $type = 'add';
81
82         function __construct ( $lines ) {
83                 $this->closing = $lines;
84                 $this->orig = false;
85         }
86
87         function reverse() {
88                 return new _DiffOp_Delete( $this->closing );
89         }
90 }
91
92 /**
93  * @todo document
94  * @private
95  * @ingroup DifferenceEngine
96  */
97 class _DiffOp_Change extends _DiffOp {
98         var $type = 'change';
99
100         function __construct ( $orig, $closing ) {
101                 $this->orig = $orig;
102                 $this->closing = $closing;
103         }
104
105         function reverse() {
106                 return new _DiffOp_Change( $this->closing, $this->orig );
107         }
108 }
109
110 /**
111  * Class used internally by Diff to actually compute the diffs.
112  *
113  * The algorithm used here is mostly lifted from the perl module
114  * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
115  *       http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
116  *
117  * More ideas are taken from:
118  *       http://www.ics.uci.edu/~eppstein/161/960229.html
119  *
120  * Some ideas are (and a bit of code) are from from analyze.c, from GNU
121  * diffutils-2.7, which can be found at:
122  *       ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
123  *
124  * closingly, some ideas (subdivision by NCHUNKS > 2, and some optimizations)
125  * are my own.
126  *
127  * Line length limits for robustness added by Tim Starling, 2005-08-31
128  * Alternative implementation added by Guy Van den Broeck, 2008-07-30
129  *
130  * @author Geoffrey T. Dairiki, Tim Starling, Guy Van den Broeck
131  * @private
132  * @ingroup DifferenceEngine
133  */
134 class _DiffEngine {
135
136         const MAX_XREF_LENGTH =  10000;
137
138         function diff ( $from_lines, $to_lines ) {
139                 wfProfileIn( __METHOD__ );
140
141                 // Diff and store locally
142                 $this->diff_local( $from_lines, $to_lines );
143
144                 // Merge edits when possible
145                 $this->_shift_boundaries( $from_lines, $this->xchanged, $this->ychanged );
146                 $this->_shift_boundaries( $to_lines, $this->ychanged, $this->xchanged );
147
148                 // Compute the edit operations.
149                 $n_from = sizeof( $from_lines );
150                 $n_to = sizeof( $to_lines );
151
152                 $edits = array();
153                 $xi = $yi = 0;
154                 while ( $xi < $n_from || $yi < $n_to ) {
155                         assert( $yi < $n_to || $this->xchanged[$xi] );
156                         assert( $xi < $n_from || $this->ychanged[$yi] );
157
158                         // Skip matching "snake".
159                         $copy = array();
160                         while ( $xi < $n_from && $yi < $n_to
161                         && !$this->xchanged[$xi] && !$this->ychanged[$yi] ) {
162                                 $copy[] = $from_lines[$xi++];
163                                 ++$yi;
164                         }
165                         if ( $copy )
166                         $edits[] = new _DiffOp_Copy( $copy );
167
168                         // Find deletes & adds.
169                         $delete = array();
170                         while ( $xi < $n_from && $this->xchanged[$xi] )
171                         $delete[] = $from_lines[$xi++];
172
173                         $add = array();
174                         while ( $yi < $n_to && $this->ychanged[$yi] )
175                         $add[] = $to_lines[$yi++];
176
177                         if ( $delete && $add )
178                         $edits[] = new _DiffOp_Change( $delete, $add );
179                         elseif ( $delete )
180                         $edits[] = new _DiffOp_Delete( $delete );
181                         elseif ( $add )
182                         $edits[] = new _DiffOp_Add( $add );
183                 }
184                 wfProfileOut( __METHOD__ );
185                 return $edits;
186         }
187
188         function diff_local ( $from_lines, $to_lines ) {
189                 global $wgExternalDiffEngine;
190                 wfProfileIn( __METHOD__ );
191
192                 if ( $wgExternalDiffEngine == 'wikidiff3' ) {
193                         // wikidiff3
194                         $wikidiff3 = new WikiDiff3();
195                         $wikidiff3->diff( $from_lines, $to_lines );
196                         $this->xchanged = $wikidiff3->removed;
197                         $this->ychanged = $wikidiff3->added;
198                         unset( $wikidiff3 );
199                 } else {
200                         // old diff
201                         $n_from = sizeof( $from_lines );
202                         $n_to = sizeof( $to_lines );
203                         $this->xchanged = $this->ychanged = array();
204                         $this->xv = $this->yv = array();
205                         $this->xind = $this->yind = array();
206                         unset( $this->seq );
207                         unset( $this->in_seq );
208                         unset( $this->lcs );
209
210                         // Skip leading common lines.
211                         for ( $skip = 0; $skip < $n_from && $skip < $n_to; $skip++ ) {
212                                 if ( $from_lines[$skip] !== $to_lines[$skip] )
213                                 break;
214                                 $this->xchanged[$skip] = $this->ychanged[$skip] = false;
215                         }
216                         // Skip trailing common lines.
217                         $xi = $n_from; $yi = $n_to;
218                         for ( $endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++ ) {
219                                 if ( $from_lines[$xi] !== $to_lines[$yi] )
220                                 break;
221                                 $this->xchanged[$xi] = $this->ychanged[$yi] = false;
222                         }
223
224                         // Ignore lines which do not exist in both files.
225                         for ( $xi = $skip; $xi < $n_from - $endskip; $xi++ ) {
226                                 $xhash[$this->_line_hash( $from_lines[$xi] )] = 1;
227                         }
228
229                         for ( $yi = $skip; $yi < $n_to - $endskip; $yi++ ) {
230                                 $line = $to_lines[$yi];
231                                 if ( ( $this->ychanged[$yi] = empty( $xhash[$this->_line_hash( $line )] ) ) )
232                                 continue;
233                                 $yhash[$this->_line_hash( $line )] = 1;
234                                 $this->yv[] = $line;
235                                 $this->yind[] = $yi;
236                         }
237                         for ( $xi = $skip; $xi < $n_from - $endskip; $xi++ ) {
238                                 $line = $from_lines[$xi];
239                                 if ( ( $this->xchanged[$xi] = empty( $yhash[$this->_line_hash( $line )] ) ) )
240                                 continue;
241                                 $this->xv[] = $line;
242                                 $this->xind[] = $xi;
243                         }
244
245                         // Find the LCS.
246                         $this->_compareseq( 0, sizeof( $this->xv ), 0, sizeof( $this->yv ) );
247                 }
248                 wfProfileOut( __METHOD__ );
249         }
250
251         /**
252          * Returns the whole line if it's small enough, or the MD5 hash otherwise
253          */
254         function _line_hash( $line ) {
255                 if ( strlen( $line ) > self::MAX_XREF_LENGTH ) {
256                         return md5( $line );
257                 } else {
258                         return $line;
259                 }
260         }
261
262         /* Divide the Largest Common Subsequence (LCS) of the sequences
263          * [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally
264          * sized segments.
265          *
266          * Returns (LCS, PTS).  LCS is the length of the LCS. PTS is an
267          * array of NCHUNKS+1 (X, Y) indexes giving the diving points between
268          * sub sequences.  The first sub-sequence is contained in [X0, X1),
269          * [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on.  Note
270          * that (X0, Y0) == (XOFF, YOFF) and
271          * (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
272          *
273          * This function assumes that the first lines of the specified portions
274          * of the two files do not match, and likewise that the last lines do not
275          * match.  The caller must trim matching lines from the beginning and end
276          * of the portions it is going to specify.
277          */
278         function _diag ( $xoff, $xlim, $yoff, $ylim, $nchunks ) {
279                 $flip = false;
280
281                 if ( $xlim - $xoff > $ylim - $yoff ) {
282                         // Things seems faster (I'm not sure I understand why)
283                         // when the shortest sequence in X.
284                         $flip = true;
285                         list ( $xoff, $xlim, $yoff, $ylim )
286                         = array( $yoff, $ylim, $xoff, $xlim );
287                 }
288
289                 if ( $flip )
290                 for ( $i = $ylim - 1; $i >= $yoff; $i-- )
291                 $ymatches[$this->xv[$i]][] = $i;
292                 else
293                 for ( $i = $ylim - 1; $i >= $yoff; $i-- )
294                 $ymatches[$this->yv[$i]][] = $i;
295
296                 $this->lcs = 0;
297                 $this->seq[0] = $yoff - 1;
298                 $this->in_seq = array();
299                 $ymids[0] = array();
300
301                 $numer = $xlim - $xoff + $nchunks - 1;
302                 $x = $xoff;
303                 for ( $chunk = 0; $chunk < $nchunks; $chunk++ ) {
304                         if ( $chunk > 0 )
305                         for ( $i = 0; $i <= $this->lcs; $i++ )
306                         $ymids[$i][$chunk -1] = $this->seq[$i];
307
308                         $x1 = $xoff + (int)( ( $numer + ( $xlim -$xoff ) * $chunk ) / $nchunks );
309                         for ( ; $x < $x1; $x++ ) {
310                                 $line = $flip ? $this->yv[$x] : $this->xv[$x];
311                                 if ( empty( $ymatches[$line] ) ) {
312                                         continue;
313                                 }
314                                 $matches = $ymatches[$line];
315                                 reset( $matches );
316                                 while ( list( , $y ) = each( $matches ) ) {
317                                         if ( empty( $this->in_seq[$y] ) ) {
318                                                 $k = $this->_lcs_pos( $y );
319                                                 assert( $k > 0 );
320                                                 $ymids[$k] = $ymids[$k -1];
321                                                 break;
322                                         }
323                                 }
324                                 while ( list ( , $y ) = each( $matches ) ) {
325                                         if ( $y > $this->seq[$k -1] ) {
326                                                 assert( $y < $this->seq[$k] );
327                                                 // Optimization: this is a common case:
328                                                 //      next match is just replacing previous match.
329                                                 $this->in_seq[$this->seq[$k]] = false;
330                                                 $this->seq[$k] = $y;
331                                                 $this->in_seq[$y] = 1;
332                                         } else if ( empty( $this->in_seq[$y] ) ) {
333                                                 $k = $this->_lcs_pos( $y );
334                                                 assert( $k > 0 );
335                                                 $ymids[$k] = $ymids[$k -1];
336                                         }
337                                 }
338                         }
339                 }
340
341                 $seps[] = $flip ? array( $yoff, $xoff ) : array( $xoff, $yoff );
342                 $ymid = $ymids[$this->lcs];
343                 for ( $n = 0; $n < $nchunks - 1; $n++ ) {
344                         $x1 = $xoff + (int)( ( $numer + ( $xlim - $xoff ) * $n ) / $nchunks );
345                         $y1 = $ymid[$n] + 1;
346                         $seps[] = $flip ? array( $y1, $x1 ) : array( $x1, $y1 );
347                 }
348                 $seps[] = $flip ? array( $ylim, $xlim ) : array( $xlim, $ylim );
349
350                 return array( $this->lcs, $seps );
351         }
352
353         function _lcs_pos ( $ypos ) {
354                 $end = $this->lcs;
355                 if ( $end == 0 || $ypos > $this->seq[$end] ) {
356                         $this->seq[++$this->lcs] = $ypos;
357                         $this->in_seq[$ypos] = 1;
358                         return $this->lcs;
359                 }
360
361                 $beg = 1;
362                 while ( $beg < $end ) {
363                         $mid = (int)( ( $beg + $end ) / 2 );
364                         if ( $ypos > $this->seq[$mid] )
365                         $beg = $mid + 1;
366                         else
367                         $end = $mid;
368                 }
369
370                 assert( $ypos != $this->seq[$end] );
371
372                 $this->in_seq[$this->seq[$end]] = false;
373                 $this->seq[$end] = $ypos;
374                 $this->in_seq[$ypos] = 1;
375                 return $end;
376         }
377
378         /* Find LCS of two sequences.
379          *
380          * The results are recorded in the vectors $this->{x,y}changed[], by
381          * storing a 1 in the element for each line that is an insertion
382          * or deletion (ie. is not in the LCS).
383          *
384          * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
385          *
386          * Note that XLIM, YLIM are exclusive bounds.
387          * All line numbers are origin-0 and discarded lines are not counted.
388          */
389         function _compareseq ( $xoff, $xlim, $yoff, $ylim ) {
390                 // Slide down the bottom initial diagonal.
391                 while ( $xoff < $xlim && $yoff < $ylim
392                 && $this->xv[$xoff] == $this->yv[$yoff] ) {
393                         ++$xoff;
394                         ++$yoff;
395                 }
396
397                 // Slide up the top initial diagonal.
398                 while ( $xlim > $xoff && $ylim > $yoff
399                 && $this->xv[$xlim - 1] == $this->yv[$ylim - 1] ) {
400                         --$xlim;
401                         --$ylim;
402                 }
403
404                 if ( $xoff == $xlim || $yoff == $ylim )
405                 $lcs = 0;
406                 else {
407                         // This is ad hoc but seems to work well.
408                         // $nchunks = sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5);
409                         // $nchunks = max(2,min(8,(int)$nchunks));
410                         $nchunks = min( 7, $xlim - $xoff, $ylim - $yoff ) + 1;
411                         list ( $lcs, $seps )
412                         = $this->_diag( $xoff, $xlim, $yoff, $ylim, $nchunks );
413                 }
414
415                 if ( $lcs == 0 ) {
416                         // X and Y sequences have no common subsequence:
417                         // mark all changed.
418                         while ( $yoff < $ylim )
419                         $this->ychanged[$this->yind[$yoff++]] = 1;
420                         while ( $xoff < $xlim )
421                         $this->xchanged[$this->xind[$xoff++]] = 1;
422                 } else {
423                         // Use the partitions to split this problem into subproblems.
424                         reset( $seps );
425                         $pt1 = $seps[0];
426                         while ( $pt2 = next( $seps ) ) {
427                                 $this->_compareseq ( $pt1[0], $pt2[0], $pt1[1], $pt2[1] );
428                                 $pt1 = $pt2;
429                         }
430                 }
431         }
432
433         /* Adjust inserts/deletes of identical lines to join changes
434          * as much as possible.
435          *
436          * We do something when a run of changed lines include a
437          * line at one end and has an excluded, identical line at the other.
438          * We are free to choose which identical line is included.
439          * `compareseq' usually chooses the one at the beginning,
440          * but usually it is cleaner to consider the following identical line
441          * to be the "change".
442          *
443          * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
444          */
445         function _shift_boundaries ( $lines, &$changed, $other_changed ) {
446                 wfProfileIn( __METHOD__ );
447                 $i = 0;
448                 $j = 0;
449
450                 assert( 'sizeof($lines) == sizeof($changed)' );
451                 $len = sizeof( $lines );
452                 $other_len = sizeof( $other_changed );
453
454                 while ( 1 ) {
455                         /*
456                          * Scan forwards to find beginning of another run of changes.
457                          * Also keep track of the corresponding point in the other file.
458                          *
459                          * Throughout this code, $i and $j are adjusted together so that
460                          * the first $i elements of $changed and the first $j elements
461                          * of $other_changed both contain the same number of zeros
462                          * (unchanged lines).
463                          * Furthermore, $j is always kept so that $j == $other_len or
464                          * $other_changed[$j] == false.
465                          */
466                         while ( $j < $other_len && $other_changed[$j] )
467                         $j++;
468
469                         while ( $i < $len && ! $changed[$i] ) {
470                                 assert( '$j < $other_len && ! $other_changed[$j]' );
471                                 $i++; $j++;
472                                 while ( $j < $other_len && $other_changed[$j] )
473                                 $j++;
474                         }
475
476                         if ( $i == $len )
477                         break;
478
479                         $start = $i;
480
481                         // Find the end of this run of changes.
482                         while ( ++$i < $len && $changed[$i] )
483                         continue;
484
485                         do {
486                                 /*
487                                  * Record the length of this run of changes, so that
488                                  * we can later determine whether the run has grown.
489                                  */
490                                 $runlength = $i - $start;
491
492                                 /*
493                                  * Move the changed region back, so long as the
494                                  * previous unchanged line matches the last changed one.
495                                  * This merges with previous changed regions.
496                                  */
497                                 while ( $start > 0 && $lines[$start - 1] == $lines[$i - 1] ) {
498                                         $changed[--$start] = 1;
499                                         $changed[--$i] = false;
500                                         while ( $start > 0 && $changed[$start - 1] )
501                                         $start--;
502                                         assert( '$j > 0' );
503                                         while ( $other_changed[--$j] )
504                                         continue;
505                                         assert( '$j >= 0 && !$other_changed[$j]' );
506                                 }
507
508                                 /*
509                                  * Set CORRESPONDING to the end of the changed run, at the last
510                                  * point where it corresponds to a changed run in the other file.
511                                  * CORRESPONDING == LEN means no such point has been found.
512                                  */
513                                 $corresponding = $j < $other_len ? $i : $len;
514
515                                 /*
516                                  * Move the changed region forward, so long as the
517                                  * first changed line matches the following unchanged one.
518                                  * This merges with following changed regions.
519                                  * Do this second, so that if there are no merges,
520                                  * the changed region is moved forward as far as possible.
521                                  */
522                                 while ( $i < $len && $lines[$start] == $lines[$i] ) {
523                                         $changed[$start++] = false;
524                                         $changed[$i++] = 1;
525                                         while ( $i < $len && $changed[$i] )
526                                         $i++;
527
528                                         assert( '$j < $other_len && ! $other_changed[$j]' );
529                                         $j++;
530                                         if ( $j < $other_len && $other_changed[$j] ) {
531                                                 $corresponding = $i;
532                                                 while ( $j < $other_len && $other_changed[$j] )
533                                                 $j++;
534                                         }
535                                 }
536                         } while ( $runlength != $i - $start );
537
538                         /*
539                          * If possible, move the fully-merged run of changes
540                          * back to a corresponding run in the other file.
541                          */
542                         while ( $corresponding < $i ) {
543                                 $changed[--$start] = 1;
544                                 $changed[--$i] = 0;
545                                 assert( '$j > 0' );
546                                 while ( $other_changed[--$j] )
547                                 continue;
548                                 assert( '$j >= 0 && !$other_changed[$j]' );
549                         }
550                 }
551                 wfProfileOut( __METHOD__ );
552         }
553 }
554
555 /**
556  * Class representing a 'diff' between two sequences of strings.
557  * @todo document
558  * @private
559  * @ingroup DifferenceEngine
560  */
561 class Diff
562 {
563         var $edits;
564
565         /**
566          * Constructor.
567          * Computes diff between sequences of strings.
568          *
569          * @param $from_lines array An array of strings.
570          *                (Typically these are lines from a file.)
571          * @param $to_lines array An array of strings.
572          */
573         function __construct( $from_lines, $to_lines ) {
574                 $eng = new _DiffEngine;
575                 $this->edits = $eng->diff( $from_lines, $to_lines );
576                 // $this->_check($from_lines, $to_lines);
577         }
578
579         /**
580          * Compute reversed Diff.
581          *
582          * SYNOPSIS:
583          *
584          *      $diff = new Diff($lines1, $lines2);
585          *      $rev = $diff->reverse();
586          * @return object A Diff object representing the inverse of the
587          *                                original diff.
588          */
589         function reverse () {
590                 $rev = $this;
591                 $rev->edits = array();
592                 foreach ( $this->edits as $edit ) {
593                         $rev->edits[] = $edit->reverse();
594                 }
595                 return $rev;
596         }
597
598         /**
599          * Check for empty diff.
600          *
601          * @return bool True iff two sequences were identical.
602          */
603         function isEmpty () {
604                 foreach ( $this->edits as $edit ) {
605                         if ( $edit->type != 'copy' )
606                         return false;
607                 }
608                 return true;
609         }
610
611         /**
612          * Compute the length of the Longest Common Subsequence (LCS).
613          *
614          * This is mostly for diagnostic purposed.
615          *
616          * @return int The length of the LCS.
617          */
618         function lcs () {
619                 $lcs = 0;
620                 foreach ( $this->edits as $edit ) {
621                         if ( $edit->type == 'copy' )
622                         $lcs += sizeof( $edit->orig );
623                 }
624                 return $lcs;
625         }
626
627         /**
628          * Get the original set of lines.
629          *
630          * This reconstructs the $from_lines parameter passed to the
631          * constructor.
632          *
633          * @return array The original sequence of strings.
634          */
635         function orig() {
636                 $lines = array();
637
638                 foreach ( $this->edits as $edit ) {
639                         if ( $edit->orig )
640                         array_splice( $lines, sizeof( $lines ), 0, $edit->orig );
641                 }
642                 return $lines;
643         }
644
645         /**
646          * Get the closing set of lines.
647          *
648          * This reconstructs the $to_lines parameter passed to the
649          * constructor.
650          *
651          * @return array The sequence of strings.
652          */
653         function closing() {
654                 $lines = array();
655
656                 foreach ( $this->edits as $edit ) {
657                         if ( $edit->closing )
658                         array_splice( $lines, sizeof( $lines ), 0, $edit->closing );
659                 }
660                 return $lines;
661         }
662
663         /**
664          * Check a Diff for validity.
665          *
666          * This is here only for debugging purposes.
667          */
668         function _check ( $from_lines, $to_lines ) {
669                 wfProfileIn( __METHOD__ );
670                 if ( serialize( $from_lines ) != serialize( $this->orig() ) )
671                 trigger_error( "Reconstructed original doesn't match", E_USER_ERROR );
672                 if ( serialize( $to_lines ) != serialize( $this->closing() ) )
673                 trigger_error( "Reconstructed closing doesn't match", E_USER_ERROR );
674
675                 $rev = $this->reverse();
676                 if ( serialize( $to_lines ) != serialize( $rev->orig() ) )
677                 trigger_error( "Reversed original doesn't match", E_USER_ERROR );
678                 if ( serialize( $from_lines ) != serialize( $rev->closing() ) )
679                 trigger_error( "Reversed closing doesn't match", E_USER_ERROR );
680
681
682                 $prevtype = 'none';
683                 foreach ( $this->edits as $edit ) {
684                         if ( $prevtype == $edit->type )
685                         trigger_error( "Edit sequence is non-optimal", E_USER_ERROR );
686                         $prevtype = $edit->type;
687                 }
688
689                 $lcs = $this->lcs();
690                 trigger_error( 'Diff okay: LCS = ' . $lcs, E_USER_NOTICE );
691                 wfProfileOut( __METHOD__ );
692         }
693 }
694
695 /**
696  * @todo document, bad name.
697  * @private
698  * @ingroup DifferenceEngine
699  */
700 class MappedDiff extends Diff
701 {
702         /**
703          * Constructor.
704          *
705          * Computes diff between sequences of strings.
706          *
707          * This can be used to compute things like
708          * case-insensitve diffs, or diffs which ignore
709          * changes in white-space.
710          *
711          * @param $from_lines array An array of strings.
712          *      (Typically these are lines from a file.)
713          *
714          * @param $to_lines array An array of strings.
715          *
716          * @param $mapped_from_lines array This array should
717          *      have the same size number of elements as $from_lines.
718          *      The elements in $mapped_from_lines and
719          *      $mapped_to_lines are what is actually compared
720          *      when computing the diff.
721          *
722          * @param $mapped_to_lines array This array should
723          *      have the same number of elements as $to_lines.
724          */
725         function __construct( $from_lines, $to_lines,
726         $mapped_from_lines, $mapped_to_lines ) {
727                 wfProfileIn( __METHOD__ );
728
729                 assert( sizeof( $from_lines ) == sizeof( $mapped_from_lines ) );
730                 assert( sizeof( $to_lines ) == sizeof( $mapped_to_lines ) );
731
732                 parent::__construct( $mapped_from_lines, $mapped_to_lines );
733
734                 $xi = $yi = 0;
735                 for ( $i = 0; $i < sizeof( $this->edits ); $i++ ) {
736                         $orig = &$this->edits[$i]->orig;
737                         if ( is_array( $orig ) ) {
738                                 $orig = array_slice( $from_lines, $xi, sizeof( $orig ) );
739                                 $xi += sizeof( $orig );
740                         }
741
742                         $closing = &$this->edits[$i]->closing;
743                         if ( is_array( $closing ) ) {
744                                 $closing = array_slice( $to_lines, $yi, sizeof( $closing ) );
745                                 $yi += sizeof( $closing );
746                         }
747                 }
748                 wfProfileOut( __METHOD__ );
749         }
750 }
751
752 /**
753  * A class to format Diffs
754  *
755  * This class formats the diff in classic diff format.
756  * It is intended that this class be customized via inheritance,
757  * to obtain fancier outputs.
758  * @todo document
759  * @private
760  * @ingroup DifferenceEngine
761  */
762 class DiffFormatter {
763         /**
764          * Number of leading context "lines" to preserve.
765          *
766          * This should be left at zero for this class, but subclasses
767          * may want to set this to other values.
768          */
769         var $leading_context_lines = 0;
770
771         /**
772          * Number of trailing context "lines" to preserve.
773          *
774          * This should be left at zero for this class, but subclasses
775          * may want to set this to other values.
776          */
777         var $trailing_context_lines = 0;
778
779         /**
780          * Format a diff.
781          *
782          * @param $diff object A Diff object.
783          * @return string The formatted output.
784          */
785         function format( $diff ) {
786                 wfProfileIn( __METHOD__ );
787
788                 $xi = $yi = 1;
789                 $block = false;
790                 $context = array();
791
792                 $nlead = $this->leading_context_lines;
793                 $ntrail = $this->trailing_context_lines;
794
795                 $this->_start_diff();
796
797                 foreach ( $diff->edits as $edit ) {
798                         if ( $edit->type == 'copy' ) {
799                                 if ( is_array( $block ) ) {
800                                         if ( sizeof( $edit->orig ) <= $nlead + $ntrail ) {
801                                                 $block[] = $edit;
802                                         }
803                                         else {
804                                                 if ( $ntrail ) {
805                                                         $context = array_slice( $edit->orig, 0, $ntrail );
806                                                         $block[] = new _DiffOp_Copy( $context );
807                                                 }
808                                                 $this->_block( $x0, $ntrail + $xi - $x0,
809                                                 $y0, $ntrail + $yi - $y0,
810                                                 $block );
811                                                 $block = false;
812                                         }
813                                 }
814                                 $context = $edit->orig;
815                         }
816                         else {
817                                 if ( ! is_array( $block ) ) {
818                                         $context = array_slice( $context, sizeof( $context ) - $nlead );
819                                         $x0 = $xi - sizeof( $context );
820                                         $y0 = $yi - sizeof( $context );
821                                         $block = array();
822                                         if ( $context )
823                                         $block[] = new _DiffOp_Copy( $context );
824                                 }
825                                 $block[] = $edit;
826                         }
827
828                         if ( $edit->orig )
829                         $xi += sizeof( $edit->orig );
830                         if ( $edit->closing )
831                         $yi += sizeof( $edit->closing );
832                 }
833
834                 if ( is_array( $block ) )
835                 $this->_block( $x0, $xi - $x0,
836                 $y0, $yi - $y0,
837                 $block );
838
839                 $end = $this->_end_diff();
840                 wfProfileOut( __METHOD__ );
841                 return $end;
842         }
843
844         function _block( $xbeg, $xlen, $ybeg, $ylen, &$edits ) {
845                 wfProfileIn( __METHOD__ );
846                 $this->_start_block( $this->_block_header( $xbeg, $xlen, $ybeg, $ylen ) );
847                 foreach ( $edits as $edit ) {
848                         if ( $edit->type == 'copy' )
849                         $this->_context( $edit->orig );
850                         elseif ( $edit->type == 'add' )
851                         $this->_added( $edit->closing );
852                         elseif ( $edit->type == 'delete' )
853                         $this->_deleted( $edit->orig );
854                         elseif ( $edit->type == 'change' )
855                         $this->_changed( $edit->orig, $edit->closing );
856                         else
857                         trigger_error( 'Unknown edit type', E_USER_ERROR );
858                 }
859                 $this->_end_block();
860                 wfProfileOut( __METHOD__ );
861         }
862
863         function _start_diff() {
864                 ob_start();
865         }
866
867         function _end_diff() {
868                 $val = ob_get_contents();
869                 ob_end_clean();
870                 return $val;
871         }
872
873         function _block_header( $xbeg, $xlen, $ybeg, $ylen ) {
874                 if ( $xlen > 1 )
875                 $xbeg .= "," . ( $xbeg + $xlen - 1 );
876                 if ( $ylen > 1 )
877                 $ybeg .= "," . ( $ybeg + $ylen - 1 );
878
879                 return $xbeg . ( $xlen ? ( $ylen ? 'c' : 'd' ) : 'a' ) . $ybeg;
880         }
881
882         function _start_block( $header ) {
883                 echo $header . "\n";
884         }
885
886         function _end_block() {
887         }
888
889         function _lines( $lines, $prefix = ' ' ) {
890                 foreach ( $lines as $line )
891                 echo "$prefix $line\n";
892         }
893
894         function _context( $lines ) {
895                 $this->_lines( $lines );
896         }
897
898         function _added( $lines ) {
899                 $this->_lines( $lines, '>' );
900         }
901         function _deleted( $lines ) {
902                 $this->_lines( $lines, '<' );
903         }
904
905         function _changed( $orig, $closing ) {
906                 $this->_deleted( $orig );
907                 echo "---\n";
908                 $this->_added( $closing );
909         }
910 }
911
912 /**
913  * A formatter that outputs unified diffs
914  * @ingroup DifferenceEngine
915  */
916
917 class UnifiedDiffFormatter extends DiffFormatter {
918         var $leading_context_lines = 2;
919         var $trailing_context_lines = 2;
920
921         function _added( $lines ) {
922                 $this->_lines( $lines, '+' );
923         }
924         function _deleted( $lines ) {
925                 $this->_lines( $lines, '-' );
926         }
927         function _changed( $orig, $closing ) {
928                 $this->_deleted( $orig );
929                 $this->_added( $closing );
930         }
931         function _block_header( $xbeg, $xlen, $ybeg, $ylen ) {
932                 return "@@ -$xbeg,$xlen +$ybeg,$ylen @@";
933         }
934 }
935
936 /**
937  * A pseudo-formatter that just passes along the Diff::$edits array
938  * @ingroup DifferenceEngine
939  */
940 class ArrayDiffFormatter extends DiffFormatter {
941         function format( $diff ) {
942                 $oldline = 1;
943                 $newline = 1;
944                 $retval = array();
945                 foreach ( $diff->edits as $edit )
946                 switch( $edit->type ) {
947                         case 'add':
948                                 foreach ( $edit->closing as $l ) {
949                                         $retval[] = array(
950                                                         'action' => 'add',
951                                                         'new' => $l,
952                                                         'newline' => $newline++
953                                         );
954                                 }
955                                 break;
956                         case 'delete':
957                                 foreach ( $edit->orig as $l ) {
958                                         $retval[] = array(
959                                                         'action' => 'delete',
960                                                         'old' => $l,
961                                                         'oldline' => $oldline++,
962                                         );
963                                 }
964                                 break;
965                         case 'change':
966                                 foreach ( $edit->orig as $i => $l ) {
967                                         $retval[] = array(
968                                                         'action' => 'change',
969                                                         'old' => $l,
970                                                         'new' => @$edit->closing[$i],
971                                                         'oldline' => $oldline++,
972                                                         'newline' => $newline++,
973                                         );
974                                 }
975                                 break;
976                         case 'copy':
977                                 $oldline += count( $edit->orig );
978                                 $newline += count( $edit->orig );
979                 }
980                 return $retval;
981         }
982 }
983
984 /**
985  *      Additions by Axel Boldt follow, partly taken from diff.php, phpwiki-1.3.3
986  *
987  */
988
989 define( 'NBSP', '&#160;' ); // iso-8859-x non-breaking space.
990
991 /**
992  * @todo document
993  * @private
994  * @ingroup DifferenceEngine
995  */
996 class _HWLDF_WordAccumulator {
997         function __construct () {
998                 $this->_lines = array();
999                 $this->_line = '';
1000                 $this->_group = '';
1001                 $this->_tag = '';
1002         }
1003
1004         function _flushGroup ( $new_tag ) {
1005                 if ( $this->_group !== '' ) {
1006                         if ( $this->_tag == 'ins' )
1007                         $this->_line .= '<ins class="diffchange diffchange-inline">' .
1008                         htmlspecialchars ( $this->_group ) . '</ins>';
1009                         elseif ( $this->_tag == 'del' )
1010                         $this->_line .= '<del class="diffchange diffchange-inline">' .
1011                         htmlspecialchars ( $this->_group ) . '</del>';
1012                         else
1013                         $this->_line .= htmlspecialchars ( $this->_group );
1014                 }
1015                 $this->_group = '';
1016                 $this->_tag = $new_tag;
1017         }
1018
1019         function _flushLine ( $new_tag ) {
1020                 $this->_flushGroup( $new_tag );
1021                 if ( $this->_line != '' )
1022                 array_push ( $this->_lines, $this->_line );
1023                 else
1024                 # make empty lines visible by inserting an NBSP
1025                 array_push ( $this->_lines, NBSP );
1026                 $this->_line = '';
1027         }
1028
1029         function addWords ( $words, $tag = '' ) {
1030                 if ( $tag != $this->_tag )
1031                 $this->_flushGroup( $tag );
1032
1033                 foreach ( $words as $word ) {
1034                         // new-line should only come as first char of word.
1035                         if ( $word == '' )
1036                         continue;
1037                         if ( $word[0] == "\n" ) {
1038                                 $this->_flushLine( $tag );
1039                                 $word = substr( $word, 1 );
1040                         }
1041                         assert( !strstr( $word, "\n" ) );
1042                         $this->_group .= $word;
1043                 }
1044         }
1045
1046         function getLines() {
1047                 $this->_flushLine( '~done' );
1048                 return $this->_lines;
1049         }
1050 }
1051
1052 /**
1053  * @todo document
1054  * @private
1055  * @ingroup DifferenceEngine
1056  */
1057 class WordLevelDiff extends MappedDiff {
1058         const MAX_LINE_LENGTH = 10000;
1059
1060         function __construct ( $orig_lines, $closing_lines ) {
1061                 wfProfileIn( __METHOD__ );
1062
1063                 list ( $orig_words, $orig_stripped ) = $this->_split( $orig_lines );
1064                 list ( $closing_words, $closing_stripped ) = $this->_split( $closing_lines );
1065
1066                 parent::__construct( $orig_words, $closing_words,
1067                 $orig_stripped, $closing_stripped );
1068                 wfProfileOut( __METHOD__ );
1069         }
1070
1071         function _split( $lines ) {
1072                 wfProfileIn( __METHOD__ );
1073
1074                 $words = array();
1075                 $stripped = array();
1076                 $first = true;
1077                 foreach ( $lines as $line ) {
1078                         # If the line is too long, just pretend the entire line is one big word
1079                         # This prevents resource exhaustion problems
1080                         if ( $first ) {
1081                                 $first = false;
1082                         } else {
1083                                 $words[] = "\n";
1084                                 $stripped[] = "\n";
1085                         }
1086                         if ( strlen( $line ) > self::MAX_LINE_LENGTH ) {
1087                                 $words[] = $line;
1088                                 $stripped[] = $line;
1089                         } else {
1090                                 $m = array();
1091                                 if ( preg_match_all( '/ ( [^\S\n]+ | [0-9_A-Za-z\x80-\xff]+ | . ) (?: (?!< \n) [^\S\n])? /xs',
1092                                 $line, $m ) )
1093                                 {
1094                                         $words = array_merge( $words, $m[0] );
1095                                         $stripped = array_merge( $stripped, $m[1] );
1096                                 }
1097                         }
1098                 }
1099                 wfProfileOut( __METHOD__ );
1100                 return array( $words, $stripped );
1101         }
1102
1103         function orig () {
1104                 wfProfileIn( __METHOD__ );
1105                 $orig = new _HWLDF_WordAccumulator;
1106
1107                 foreach ( $this->edits as $edit ) {
1108                         if ( $edit->type == 'copy' )
1109                         $orig->addWords( $edit->orig );
1110                         elseif ( $edit->orig )
1111                         $orig->addWords( $edit->orig, 'del' );
1112                 }
1113                 $lines = $orig->getLines();
1114                 wfProfileOut( __METHOD__ );
1115                 return $lines;
1116         }
1117
1118         function closing () {
1119                 wfProfileIn( __METHOD__ );
1120                 $closing = new _HWLDF_WordAccumulator;
1121
1122                 foreach ( $this->edits as $edit ) {
1123                         if ( $edit->type == 'copy' )
1124                         $closing->addWords( $edit->closing );
1125                         elseif ( $edit->closing )
1126                         $closing->addWords( $edit->closing, 'ins' );
1127                 }
1128                 $lines = $closing->getLines();
1129                 wfProfileOut( __METHOD__ );
1130                 return $lines;
1131         }
1132 }
1133
1134 /**
1135  * Wikipedia Table style diff formatter.
1136  * @todo document
1137  * @private
1138  * @ingroup DifferenceEngine
1139  */
1140 class TableDiffFormatter extends DiffFormatter {
1141         function __construct() {
1142                 $this->leading_context_lines = 2;
1143                 $this->trailing_context_lines = 2;
1144         }
1145
1146         public static function escapeWhiteSpace( $msg ) {
1147                 $msg = preg_replace( '/^ /m', '&#160; ', $msg );
1148                 $msg = preg_replace( '/ $/m', ' &#160;', $msg );
1149                 $msg = preg_replace( '/  /', '&#160; ', $msg );
1150                 return $msg;
1151         }
1152
1153         function _block_header( $xbeg, $xlen, $ybeg, $ylen ) {
1154                 $r = '<tr><td colspan="2" class="diff-lineno"><!--LINE ' . $xbeg . "--></td>\n" .
1155                   '<td colspan="2" class="diff-lineno"><!--LINE ' . $ybeg . "--></td></tr>\n";
1156                 return $r;
1157         }
1158
1159         function _start_block( $header ) {
1160                 echo $header;
1161         }
1162
1163         function _end_block() {
1164         }
1165
1166         function _lines( $lines, $prefix = ' ', $color = 'white' ) {
1167         }
1168
1169         # HTML-escape parameter before calling this
1170         function addedLine( $line ) {
1171                 return $this->wrapLine( '+', 'diff-addedline', $line );
1172         }
1173
1174         # HTML-escape parameter before calling this
1175         function deletedLine( $line ) {
1176                 return $this->wrapLine( '&minus;', 'diff-deletedline', $line );
1177         }
1178
1179         # HTML-escape parameter before calling this
1180         function contextLine( $line ) {
1181                 return $this->wrapLine( '&#160;', 'diff-context', $line );
1182         }
1183
1184         private function wrapLine( $marker, $class, $line ) {
1185                 if ( $line !== '' ) {
1186                         // The <div> wrapper is needed for 'overflow: auto' style to scroll properly
1187                         $line = Xml::tags( 'div', null, $this->escapeWhiteSpace( $line ) );
1188                 }
1189                 return "<td class='diff-marker'>$marker</td><td class='$class'>$line</td>";
1190         }
1191
1192         function emptyLine() {
1193                 return '<td colspan="2">&#160;</td>';
1194         }
1195
1196         function _added( $lines ) {
1197                 foreach ( $lines as $line ) {
1198                         echo '<tr>' . $this->emptyLine() .
1199                         $this->addedLine( '<ins class="diffchange">' .
1200                         htmlspecialchars ( $line ) . '</ins>' ) . "</tr>\n";
1201                 }
1202         }
1203
1204         function _deleted( $lines ) {
1205                 foreach ( $lines as $line ) {
1206                         echo '<tr>' . $this->deletedLine( '<del class="diffchange">' .
1207                         htmlspecialchars ( $line ) . '</del>' ) .
1208                         $this->emptyLine() . "</tr>\n";
1209                 }
1210         }
1211
1212         function _context( $lines ) {
1213                 foreach ( $lines as $line ) {
1214                         echo '<tr>' .
1215                         $this->contextLine( htmlspecialchars ( $line ) ) .
1216                         $this->contextLine( htmlspecialchars ( $line ) ) . "</tr>\n";
1217                 }
1218         }
1219
1220         function _changed( $orig, $closing ) {
1221                 wfProfileIn( __METHOD__ );
1222
1223                 $diff = new WordLevelDiff( $orig, $closing );
1224                 $del = $diff->orig();
1225                 $add = $diff->closing();
1226
1227                 # Notice that WordLevelDiff returns HTML-escaped output.
1228                 # Hence, we will be calling addedLine/deletedLine without HTML-escaping.
1229
1230                 while ( $line = array_shift( $del ) ) {
1231                         $aline = array_shift( $add );
1232                         echo '<tr>' . $this->deletedLine( $line ) .
1233                         $this->addedLine( $aline ) . "</tr>\n";
1234                 }
1235                 foreach ( $add as $line ) {     # If any leftovers
1236                         echo '<tr>' . $this->emptyLine() .
1237                         $this->addedLine( $line ) . "</tr>\n";
1238                 }
1239                 wfProfileOut( __METHOD__ );
1240         }
1241 }