File Coverage

lib/Algorithm/Diff.pm
Criterion Covered Total %
statement 314 359 87.5
branch 107 136 78.7
condition 73 94 77.7
subroutine 42 52 80.8
pod 9 9 100.0
total 545 650 83.8


line stmt bran cond sub pod time code
1             package Algorithm::Diff;
2             # Skip to first "=head" line for documentation.
3 2     2   26 use strict;
  2         17  
  2         29  
4              
5 2     2   1486 use integer; # see below in _replaceNextLargerWith() for mod to make
  2         19  
  2         33  
6             # if you don't use this
7 2     2   33 use vars qw( $VERSION @EXPORT_OK );
  2         17  
  2         28  
8             $VERSION = 1.19_02;
9             # ^ ^^ ^^-- Incremented at will
10             # | \+----- Incremented for non-trivial changes to features
11             # \-------- Incremented for fundamental changes
12             require Exporter;
13             *import    = \&Exporter::import;
14             @EXPORT_OK = qw(
15             prepare LCS LCSidx LCS_length
16             diff sdiff compact_diff
17             traverse_sequences traverse_balanced
18             );
19              
20             # McIlroy-Hunt diff algorithm
21             # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
22             # by Ned Konz, perl@bike-nomad.com
23             # Updates by Tye McQueen, http://perlmonks.org/?node=tye
24              
25             # Create a hash that maps each element of $aCollection to the set of
26             # positions it occupies in $aCollection, restricted to the elements
27             # within the range of indexes specified by $start and $end.
28             # The fourth parameter is a subroutine reference that will be called to
29             # generate a string to use as a key.
30             # Additional parameters, if any, will be passed to this subroutine.
31             #
32             # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
33              
34             sub _withPositionsOfInInterval
35             {
36 36     36   298     my $aCollection = shift; # array ref
37 36         297     my $start = shift;
38 36         322     my $end = shift;
39 36         300     my $keyGen = shift;
40 36         375     my %d;
41 36         281     my $index;
42                 for ( $index = $start ; $index <= $end ; $index++ )
43                 {
44 154         1455         my $element = $aCollection->[$index];
45 154         1740         my $key = &$keyGen( $element, @_ );
46 154 100       3295         if ( exists( $d{$key} ) )
47                     {
48 1         9             unshift ( @{ $d{$key} }, $index );
  1         15  
49                     }
50                     else
51                     {
52 153         2355             $d{$key} = [$index];
53                     }
54 36         285     }
55 36 50       454     return wantarray ? %d : \%d;
56             }
57              
58             # Find the place at which aValue would normally be inserted into the
59             # array. If that place is already occupied by aValue, do nothing, and
60             # return undef. If the place does not exist (i.e., it is off the end of
61             # the array), add it to the end, otherwise replace the element at that
62             # point with aValue. It is assumed that the array's values are numeric.
63             # This is where the bulk (75%) of the time is spent in this module, so
64             # try to make it fast!
65              
66             sub _replaceNextLargerWith
67             {
68 55     55   470     my ( $array, $aValue, $high ) = @_;
69 55   50     639     $high ||= $#$array;
70              
71             # off the end?
72 55 50 66     722     if ( $high == -1 || $aValue > $array->[-1] )
73                 {
74 55         475         push ( @$array, $aValue );
75 55         654         return $high + 1;
76                 }
77              
78             # binary search for insertion point...
79 0         0     my $low = 0;
80 0         0     my $index;
81 0         0     my $found;
82 0         0     while ( $low <= $high )
83                 {
84 0         0         $index = ( $high + $low ) / 2;
85              
86             # $index = int(( $high + $low ) / 2); # without 'use integer'
87 0         0         $found = $array->[$index];
88              
89 0 0       0         if ( $aValue == $found )
    0          
90                     {
91 0         0             return undef;
92                     }
93                     elsif ( $aValue > $found )
94                     {
95 0         0             $low = $index + 1;
96                     }
97                     else
98                     {
99 0         0             $high = $index - 1;
100                     }
101                 }
102              
103             # now insertion point is in $low.
104 0         0     $array->[$low] = $aValue; # overwrite next larger
105 0         0     return $low;
106             }
107              
108             # This method computes the longest common subsequence in $a and $b.
109              
110             # Result is array or ref, whose contents is such that
111             # $a->[ $i ] == $b->[ $result[ $i ] ]
112             # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
113              
114             # An additional argument may be passed; this is a hash or key generating
115             # function that should return a string that uniquely identifies the given
116             # element. It should be the case that if the key is the same, the elements
117             # will compare the same. If this parameter is undef or missing, the key
118             # will be the element as a string.
119              
120             # By default, comparisons will use "eq" and elements will be turned into keys
121             # using the default stringizing operator '""'.
122              
123             # Additional parameters, if any, will be passed to the key generation
124             # routine.
125              
126             sub _longestCommonSubsequence
127             {
128 36     36   419     my $a = shift; # array ref or hash ref
129 36         324     my $b = shift; # array ref or hash ref
130 36         302     my $counting = shift; # scalar
131 36         298     my $keyGen = shift; # code ref
132 36         293     my $compare; # code ref
133              
134 36 50       455     if ( ref($a) eq 'HASH' )
135                 { # prepared hash must be in $b
136 0         0         my $tmp = $b;
137 0         0         $b = $a;
138 0         0         $a = $tmp;
139                 }
140              
141             # Check for bogus (non-ref) argument values
142 36 50 33     545     if ( !ref($a) || !ref($b) )
143                 {
144 0         0         my @callerInfo = caller(1);
145 0         0         die 'error: must pass array or hash references to ' . $callerInfo[3];
146                 }
147              
148             # set up code refs
149             # Note that these are optimized.
150 36 50       344     if ( !defined($keyGen) ) # optimize for strings
151                 {
152 36     283   306         $keyGen = sub { $_[0] };
  283         2896  
153 36     81   357         $compare = sub { my ( $a, $b ) = @_; $a eq $b };
  81         818  
  81         1064  
154                 }
155                 else
156                 {
157                     $compare = sub {
158 0     0   0             my $a = shift;
159 0         0             my $b = shift;
160 0         0             &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
161 0         0         };
162                 }
163              
164 36         411     my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
165 36         381     my ( $prunedCount, $bMatches ) = ( 0, {} );
166              
167 36 50       377     if ( ref($b) eq 'HASH' ) # was $bMatches prepared for us?
168                 {
169 0         0         $bMatches = $b;
170                 }
171                 else
172                 {
173 36         476         my ( $bStart, $bFinish ) = ( 0, $#$b );
174              
175             # First we prune off any common elements at the beginning
176 36   100     694         while ( $aStart <= $aFinish
      100        
177                         and $bStart <= $bFinish
178                         and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
179                     {
180 13         129             $matchVector->[ $aStart++ ] = $bStart++;
181 13         459             $prunedCount++;
182                     }
183              
184             # now the end
185 36   100     589         while ( $aStart <= $aFinish
      100        
186                         and $bStart <= $bFinish
187                         and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
188                     {
189 16         161             $matchVector->[ $aFinish-- ] = $bFinish--;
190 16         213             $prunedCount++;
191                     }
192              
193             # Now compute the equivalence classes of positions of elements
194                     $bMatches =
195 36         360           _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
196                 }
197 36         334     my $thresh = [];
198 36         313     my $links = [];
199              
200 36         318     my ( $i, $ai, $j, $k );
201                 for ( $i = $aStart ; $i <= $aFinish ; $i++ )
202                 {
203 129         1807         $ai = &$keyGen( $a->[$i], @_ );
204 129 100       1371         if ( exists( $bMatches->{$ai} ) )
205                     {
206 55         429             $k = 0;
207 55         425             for $j ( @{ $bMatches->{$ai} } )
  55         548  
208                         {
209              
210             # optimization: most of the time this will be true
211 55 50 33     640                 if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
      33        
212                             {
213 0         0                     $thresh->[$k] = $j;
214                             }
215                             else
216                             {
217 55         514                     $k = _replaceNextLargerWith( $thresh, $j, $k );
218                             }
219              
220             # oddly, it's faster to always test this (CPU cache?).
221 55 50       1019                 if ( defined($k) )
222                             {
223 55 100       1014                     $links->[$k] =
224                                   [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
225                             }
226                         }
227                     }
228 36         291     }
229              
230 36 100       473     if (@$thresh)
    50          
231                 {
232 14 50       151         return $prunedCount + @$thresh if $counting;
233                     for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
234                     {
235 55         627             $matchVector->[ $link->[1] ] = $link->[2];
236 14         152         }
237                 }
238                 elsif ($counting)
239                 {
240 0         0         return $prunedCount;
241                 }
242              
243 36 100       641     return wantarray ? @$matchVector : $matchVector;
244             }
245              
246             sub traverse_sequences
247             {
248 3     3 1 1078     my $a = shift; # array ref
249 3         26     my $b = shift; # array ref
250 3   50     31     my $callbacks = shift || {};
251 3         27     my $keyGen = shift;
252 3   50 0   33     my $matchCallback = $callbacks->{'MATCH'} || sub { };
  0         0  
253 3   50 0   32     my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
  0         0  
254 3         28     my $finishedACallback = $callbacks->{'A_FINISHED'};
255 3   50 0   33     my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
  0         0  
256 3         27     my $finishedBCallback = $callbacks->{'B_FINISHED'};
257 3         31     my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
258              
259             # Process all the lines in @$matchVector
260 3         29     my $lastA = $#$a;
261 3         28     my $lastB = $#$b;
262 3         23     my $bi = 0;
263 3         24     my $ai;
264              
265                 for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
266                 {
267 24         383         my $bLine = $matchVector->[$ai];
268 24 100       230         if ( defined($bLine) ) # matched
269                     {
270 18         261             &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
271 18         413             &$matchCallback( $ai, $bi++, @_ );
272                     }
273                     else
274                     {
275 6         85             &$discardACallback( $ai, $bi, @_ );
276                     }
277 3         26     }
278              
279             # The last entry (if any) processed was a match.
280             # $ai and $bi point just past the last matching lines in their sequences.
281              
282 3   100     63     while ( $ai <= $lastA or $bi <= $lastB )
283                 {
284              
285             # last A?
286 9 100 66     145         if ( $ai == $lastA + 1 and $bi <= $lastB )
287                     {
288 3 100       29             if ( defined($finishedACallback) )
289                         {
290 1         12                 &$finishedACallback( $lastA, @_ );
291 1         16                 $finishedACallback = undef;
292                         }
293                         else
294                         {
295 2         85                 &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
296                         }
297                     }
298              
299             # last B?
300 9 50 66     105         if ( $bi == $lastB + 1 and $ai <= $lastA )
301                     {
302 0 0       0             if ( defined($finishedBCallback) )
303                         {
304 0         0                 &$finishedBCallback( $lastB, @_ );
305 0         0                 $finishedBCallback = undef;
306                         }
307                         else
308                         {
309 0         0                 &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
310                         }
311                     }
312              
313 9 100       94         &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
314 9 100       132         &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
315                 }
316              
317 3         50     return 1;
318             }
319              
320             sub traverse_balanced
321             {
322 20     20 1 1535     my $a = shift; # array ref
323 20         213     my $b = shift; # array ref
324 20   50     194     my $callbacks = shift || {};
325 20         162     my $keyGen = shift;
326 20   50 0   201     my $matchCallback = $callbacks->{'MATCH'} || sub { };
  0         0  
327 20   50 0   190     my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
  0         0  
328 20   50 0   192     my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
  0         0  
329 20         165     my $changeCallback = $callbacks->{'CHANGE'};
330 20         195     my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
331              
332             # Process all the lines in match vector
333 20         187     my $lastA = $#$a;
334 20         171     my $lastB = $#$b;
335 20         163     my $bi = 0;
336 20         334     my $ai = 0;
337 20         160     my $ma = -1;
338 20         156     my $mb;
339              
340 20         155     while (1)
341                 {
342              
343             # Find next match indices $ma and $mb
344 52   100     489         do {
345 71         971             $ma++;
346                     } while(
347                             $ma <= $#$matchVector
348                         && !defined $matchVector->[$ma]
349                     );
350              
351 52 100       529         last if $ma > $#$matchVector; # end of matchVector?
352 32         273         $mb = $matchVector->[$ma];
353              
354             # Proceed with discard a/b or change events until
355             # next match
356 32   100     359         while ( $ai < $ma || $bi < $mb )
357                     {
358              
359 38 100 100     567             if ( $ai < $ma && $bi < $mb )
    100          
360                         {
361              
362             # Change
363 10 100       86                 if ( defined $changeCallback )
364                             {
365 9         91                     &$changeCallback( $ai++, $bi++, @_ );
366                             }
367                             else
368                             {
369 1         13                     &$discardACallback( $ai++, $bi, @_ );
370 1         20                     &$discardBCallback( $ai, $bi++, @_ );
371                             }
372                         }
373                         elsif ( $ai < $ma )
374                         {
375 9         88                 &$discardACallback( $ai++, $bi, @_ );
376                         }
377                         else
378                         {
379              
380             # $bi < $mb
381 19         186                 &$discardBCallback( $ai, $bi++, @_ );
382                         }
383                     }
384              
385             # Match
386 32         373         &$matchCallback( $ai++, $bi++, @_ );
387                 }
388              
389 20   100     233     while ( $ai <= $lastA || $bi <= $lastB )
390                 {
391 22 100 100     393         if ( $ai <= $lastA && $bi <= $lastB )
    100          
392                     {
393              
394             # Change
395 10 50       126             if ( defined $changeCallback )
396                         {
397 10         100                 &$changeCallback( $ai++, $bi++, @_ );
398                         }
399                         else
400                         {
401 0         0                 &$discardACallback( $ai++, $bi, @_ );
402 0         0                 &$discardBCallback( $ai, $bi++, @_ );
403                         }
404                     }
405                     elsif ( $ai <= $lastA )
406                     {
407 7         68             &$discardACallback( $ai++, $bi, @_ );
408                     }
409                     else
410                     {
411              
412             # $bi <= $lastB
413 5         48             &$discardBCallback( $ai, $bi++, @_ );
414                     }
415                 }
416              
417 20         258     return 1;
418             }
419              
420             sub prepare
421             {
422 0     0 1 0     my $a = shift; # array ref
423 0         0     my $keyGen = shift; # code ref
424              
425             # set up code ref
426 0 0   0   0     $keyGen = sub { $_[0] } unless defined($keyGen);
  0         0  
427              
428 0         0     return scalar _withPositionsOfInInterval( $a, 0, $#$a, $keyGen, @_ );
429             }
430              
431             sub LCS
432             {
433 1     1 1 617     my $a = shift; # array ref
434 1         11     my $b = shift; # array ref or hash ref
435 1         10     my $matchVector = _longestCommonSubsequence( $a, $b, 0, @_ );
436 1         10     my @retval;
437 1         9     my $i;
438                 for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
439                 {
440 8 100       78         if ( defined( $matchVector->[$i] ) )
441                     {
442 6         76             push ( @retval, $a->[$i] );
443                     }
444 1         9     }
445 1 50       22     return wantarray ? @retval : \@retval;
446             }
447              
448             sub LCS_length
449             {
450 0     0 1 0     my $a = shift; # array ref
451 0         0     my $b = shift; # array ref or hash ref
452 0         0     return _longestCommonSubsequence( $a, $b, 1, @_ );
453             }
454              
455             sub LCSidx
456             {
457 11     11 1 100     my $a= shift @_;
458 11         97     my $b= shift @_;
459 11         117     my $match= _longestCommonSubsequence( $a, $b, 0, @_ );
460 11         169     my @am= grep defined $match->[$_], 0..$#$match;
461 11         94     my @bm= @{$match}[@am];
  11         109  
462 11         139     return \@am, \@bm;
463             }
464              
465             sub compact_diff
466             {
467 11     11 1 104     my $a= shift @_;
468 11         101     my $b= shift @_;
469 11         119     my( $am, $bm )= LCSidx( $a, $b, @_ );
470 11         96     my @cdiff;
471 11         97     my( $ai, $bi )= ( 0, 0 );
472 11         124     push @cdiff, $ai, $bi;
473 11         88     while( 1 ) {
474 22   100     360         while( @$am && $ai == $am->[0] && $bi == $bm->[0] ) {
      100        
475 22         179             shift @$am;
476 22         181             shift @$bm;
477 22         1130             ++$ai, ++$bi;
478                     }
479 22         208         push @cdiff, $ai, $bi;
480 22 100       223         last if ! @$am;
481 11         92         $ai = $am->[0];
482 11         142         $bi = $bm->[0];
483 11         105         push @cdiff, $ai, $bi;
484                 }
485 11 100 100     153     push @cdiff, 0+@$a, 0+@$b
486                     if $ai < @$a || $bi < @$b;
487 11 50       137     return wantarray ? @cdiff : \@cdiff;
488             }
489              
490             sub diff
491             {
492 1     1 1 121     my $a = shift; # array ref
493 1         10     my $b = shift; # array ref
494 1         9     my $retval = [];
495 1         10     my $hunk = [];
496                 my $discard = sub {
497 4     4   50         push @$hunk, [ '-', $_[0], $a->[ $_[0] ] ];
498 1         18     };
499                 my $add = sub {
500 6     6   85         push @$hunk, [ '+', $_[1], $b->[ $_[1] ] ];
501 1         14     };
502                 my $match = sub {
503 7 100   7   71         push @$retval, $hunk
504                         if 0 < @$hunk;
505 7         81         $hunk = []
506 1         13     };
507 1         53     traverse_sequences( $a, $b,
508                     { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
509 1         40     &$match();
510 1 50       21     return wantarray ? @$retval : $retval;
511             }
512              
513             sub sdiff
514             {
515 12     12 1 1135     my $a = shift; # array ref
516 12         267     my $b = shift; # array ref
517 12         104     my $retval = [];
518 12     13   181     my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
  13         195  
519 12     24   323     my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
  24         396  
520                 my $change = sub {
521 8     8   126         push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
522 12         153     };
523                 my $match = sub {
524 23     23   282         push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
525 12         208     };
526 12         176     traverse_balanced(
527                     $a,
528                     $b,
529                     {
530                         MATCH => $match,
531                         DISCARD_A => $discard,
532                         DISCARD_B => $add,
533                         CHANGE => $change,
534                     },
535                     @_
536                 );
537 12 50       360     return wantarray ? @$retval : $retval;
538             }
539              
540             ########################################
541             my $Root= __PACKAGE__;
542             package Algorithm::Diff::_impl;
543 2     2   46 use strict;
  2         18  
  2         30  
544              
545             sub _Idx() { 0 } # $me->[_Idx]: Ref to array of hunk indices
546             # 1 # $me->[1]: Ref to first sequence
547             # 2 # $me->[2]: Ref to second sequence
548             sub _End() { 3 } # $me->[_End]: Diff between forward and reverse pos
549             sub _Same() { 4 } # $me->[_Same]: 1 if pos 1 contains unchanged items
550             sub _Base() { 5 } # $me->[_Base]: Added to range's min and max
551             sub _Pos() { 6 } # $me->[_Pos]: Which hunk is currently selected
552             sub _Off() { 7 } # $me->[_Off]: Offset into _Idx for current position
553             sub _Min() { -2 } # Added to _Off to get min instead of max+1
554              
555             sub Die
556             {
557 230     230   3165     require Carp;
558 230         3570     Carp::confess( @_ );
559             }
560              
561             sub _ChkPos
562             {
563 1052     1052   61922     my( $me )= @_;
564 1052 100       13172     return if $me->[_Pos];
565 110         1614     my $meth= ( caller(1) )[3];
566 110         1412     Die( "Called $meth on 'reset' object" );
567             }
568              
569             sub _ChkSeq
570             {
571 650     650   7031     my( $me, $seq )= @_;
572 650 100 100     12247     return $seq + $me->[_Off]
573                     if 1 == $seq || 2 == $seq;
574 110         6131     my $meth= ( caller(1) )[3];
575 110         3019     Die( "$meth: Invalid sequence number ($seq); must be 1 or 2" );
576             }
577              
578             sub getObjPkg
579             {
580 13     13   123     my( $us )= @_;
581 13 50       122     return ref $us if ref $us;
582 13         176     return $us . "::_obj";
583             }
584              
585             sub new
586             {
587 11     11   2106     my( $us, $seq1, $seq2, $opts ) = @_;
588 11         97     my @args;
589 11         152     for( $opts->{keyGen} ) {
590 11 50       165         push @args, $_ if $_;
591                 }
592 11         212     for( $opts->{keyGenArgs} ) {
593 11 50       178         push @args, @$_ if $_;
594                 }
595 11         130     my $cdif= Algorithm::Diff::compact_diff( $seq1, $seq2, @args );
596 11         97     my $same= 1;
597 11 100 66     136     if( 0 == $cdif->[2] && 0 == $cdif->[3] ) {
598 8         64         $same= 0;
599 8         77         splice @$cdif, 0, 2;
600                 }
601 11         107     my @obj= ( $cdif, $seq1, $seq2 );
602 11         98     $obj[_End] = (1+@$cdif)/2;
603 11         97     $obj[_Same] = $same;
604 11         91     $obj[_Base] = 0;
605 11         128     my $me = bless \@obj, $us->getObjPkg();
606 11         122     $me->Reset( 0 );
607 11         139     return $me;
608             }
609              
610             sub Reset
611             {
612 330     330   5224     my( $me, $pos )= @_;
613 330   100     8222     $pos= int( $pos || 0 );
614 330 100       3780     $pos += $me->[_End]
615                     if $pos < 0;
616 330 100 100     5104     $pos= 0
617                     if $pos < 0 || $me->[_End] <= $pos;
618 330   100     4031     $me->[_Pos]= $pos || !1;
619 330         5127     $me->[_Off]= 2*$pos - 1;
620 330         4026     return $me;
621             }
622              
623             sub Base
624             {
625 162     162   1817     my( $me, $base )= @_;
626 162         1387     my $oldBase= $me->[_Base];
627 162 100       1791     $me->[_Base]= 0+$base if defined $base;
628 162         3064     return $oldBase;
629             }
630              
631             sub Copy
632             {
633 54     54   497     my( $me, $pos, $base )= @_;
634 54         1468     my @obj= @$me;
635 54         616     my $you= bless \@obj, ref($me);
636 54 100       513     $you->Reset( $pos ) if defined $pos;
637 54         776     $you->Base( $base );
638 54         613     return $you;
639             }
640              
641             sub Next {
642 347     347   6245     my( $me, $steps )= @_;
643 347 100       15577     $steps= 1 if ! defined $steps;
644 347 100       3388     if( $steps ) {
645 242         2244         my $pos= $me->[_Pos];
646 242         3892         my $new= $pos + $steps;
647 242 100 100     2880         $new= 0 if $pos && $new < 0;
648 242         2373         $me->Reset( $new )
649                 }
650 347         14517     return $me->[_Pos];
651             }
652              
653             sub Prev {
654 109     109   1270     my( $me, $steps )= @_;
655 109 100       1130     $steps= 1 if ! defined $steps;
656 109         1127     my $pos= $me->Next(-$steps);
657 109 100       1049     $pos -= $me->[_End] if $pos;
658 109         1308     return $pos;
659             }
660              
661             sub Diff {
662 81     81   808     my( $me )= @_;
663 81         2191     $me->_ChkPos();
664 70 100       811     return 0 if $me->[_Same] == ( 1 & $me->[_Pos] );
665 50         491     my $ret= 0;
666 50         446     my $off= $me->[_Off];
667 50         439     for my $seq ( 1, 2 ) {
668 100 100       1215         $ret |= $seq
669                         if $me->[_Idx][ $off + $seq + _Min ]
670                         < $me->[_Idx][ $off + $seq ];
671                 }
672 50         580     return $ret;
673             }
674              
675             sub Min {
676 91     91   2195     my( $me, $seq, $base )= @_;
677 91         1587     $me->_ChkPos();
678 80         2821     my $off= $me->_ChkSeq($seq);
679 50 100       503     $base= $me->[_Base] if !defined $base;
680 50         4727     return $base + $me->[_Idx][ $off + _Min ];
681             }
682              
683             sub Max {
684 91     91   2810     my( $me, $seq, $base )= @_;
685 91         894     $me->_ChkPos();
686 80         1010     my $off= $me->_ChkSeq($seq);
687 50 100       464     $base= $me->[_Base] if !defined $base;
688 50         779     return $base + $me->[_Idx][ $off ] -1;
689             }
690              
691             sub Range {
692 295     295   4943     my( $me, $seq, $base )= @_;
693 295         3054     $me->_ChkPos();
694 284         4254     my $off = $me->_ChkSeq($seq);
695 254 100       2738     if( !wantarray ) {
696 104         1519         return $me->[_Idx][ $off ]
697                         - $me->[_Idx][ $off + _Min ];
698                 }
699 150 100       1417     $base= $me->[_Base] if !defined $base;
700 150         2780     return ( $base + $me->[_Idx][ $off + _Min ] )
701                     .. ( $base + $me->[_Idx][ $off ] - 1 );
702             }
703              
704             sub Items {
705 217     217   4364     my( $me, $seq )= @_;
706 217         2417     $me->_ChkPos();
707 206         2074     my $off = $me->_ChkSeq($seq);
708 186 100       1860     if( !wantarray ) {
709 48         593         return $me->[_Idx][ $off ]
710                         - $me->[_Idx][ $off + _Min ];
711                 }
712                 return
713 138         1738         @{$me->[$seq]}[
  138         2290  
714                             $me->[_Idx][ $off + _Min ]
715                         .. ( $me->[_Idx][ $off ] - 1 )
716                     ];
717             }
718              
719             sub Same {
720 156     156   2751     my( $me )= @_;
721 156         3861     $me->_ChkPos();
722 112 50       1615     return wantarray ? () : 0
    100          
723                     if $me->[_Same] != ( 1 & $me->[_Pos] );
724 62         766     return $me->Items(1);
725             }
726              
727             my %getName;
728             BEGIN {
729 2     2   95     %getName= (
730                     same => \&Same,
731                     diff => \&Diff,
732                     base => \&Base,
733                     min => \&Min,
734                     max => \&Max,
735                     range=> \&Range,
736                     items=> \&Items, # same thing
737                 );
738             }
739              
740             sub Get
741             {
742 121     121   3619     my $me= shift @_;
743 121         1196     $me->_ChkPos();
744 110         897     my @value;
745 110         1123     for my $arg ( @_ ) {
746 130         2164         for my $word ( split ' ', $arg ) {
747 160         1296             my $meth;
748 160 50 33     3799             if( $word !~ /^(-?\d+)?([a-zA-Z]+)([12])?$/
749                             || not $meth= $getName{ lc $2 }
750                         ) {
751 0         0                 Die( $Root, ", Get: Invalid request ($word)" );
752                         }
753 160         10398             my( $base, $name, $seq )= ( $1, $2, $3 );
754 160 100       1964             push @value, scalar(
755                             4 == length($name)
756                                 ? $meth->( $me )
757                                 : $meth->( $me, $seq, $base )
758                         );
759                     }
760                 }
761 80 100       889     if( wantarray ) {
    100          
762 30         520         return @value;
763                 } elsif( 1 == @value ) {
764 40         568         return $value[0];
765                 }
766 10         108     Die( 0+@value, " values requested from ",
767                     $Root, "'s Get in scalar context" );
768             }
769              
770              
771             my $Obj= getObjPkg($Root);
772 2     2   34 no strict 'refs';
  2         18  
  2         30  
773              
774             for my $meth ( qw( new getObjPkg ) ) {
775                 *{$Root."::".$meth} = \&{$meth};
776                 *{$Obj ."::".$meth} = \&{$meth};
777             }
778             for my $meth ( qw(
779             Next Prev Reset Copy Base Diff
780             Same Items Range Min Max Get
781             _ChkPos _ChkSeq
782             )  ) {
783                 *{$Obj."::".$meth} = \&{$meth};
784             }
785              
786             1;
787             __END__
788            
789             =head1 NAME
790            
791             Algorithm::Diff - Compute `intelligent' differences between two files / lists
792            
793             =head1 SYNOPSIS
794            
795             require Algorithm::Diff;
796            
797             # This example produces traditional 'diff' output:
798            
799             my $diff = Algorithm::Diff->new( \@seq1, \@seq2 );
800            
801             $diff->Base( 1 ); # Return line numbers, not indices
802             while( $diff->Next() ) {
803             next if $diff->Same();
804             my $sep = '';
805             if( ! $diff->Items(2) ) {
806             printf "%d,%dd%d\n",
807             $diff->Get(qw( Min1 Max1 Max2 ));
808             } elsif( ! $diff->Items(1) ) {
809             printf "%da%d,%d\n",
810             $diff->Get(qw( Max1 Min2 Max2 ));
811             } else {
812             $sep = "---\n";
813             printf "%d,%dc%d,%d\n",
814             $diff->Get(qw( Min1 Max1 Min2 Max2 ));
815             }
816             print "< $_" for $diff->Items(1);
817             print $sep;
818             print "> $_" for $diff->Items(2);
819             }
820            
821            
822             # Alternate interfaces:
823            
824             use Algorithm::Diff qw(
825             LCS LCS_length LCSidx
826             diff sdiff compact_diff
827             traverse_sequences traverse_balanced );
828            
829             @lcs = LCS( \@seq1, \@seq2 );
830             $lcsref = LCS( \@seq1, \@seq2 );
831             $count = LCS_length( \@seq1, \@seq2 );
832            
833             ( $seq1idxref, $seq2idxref ) = LCSidx( \@seq1, \@seq2 );
834            
835            
836             # Complicated interfaces:
837            
838             @diffs = diff( \@seq1, \@seq2 );
839            
840             @sdiffs = sdiff( \@seq1, \@seq2 );
841            
842             @cdiffs = compact_diff( \@seq1, \@seq2 );
843            
844             traverse_sequences(
845             \@seq1,
846             \@seq2,
847             { MATCH => \&callback1,
848             DISCARD_A => \&callback2,
849             DISCARD_B => \&callback3,
850             },
851             \&key_generator,
852             @extra_args,
853             );
854            
855             traverse_balanced(
856             \@seq1,
857             \@seq2,
858             { MATCH => \&callback1,
859             DISCARD_A => \&callback2,
860             DISCARD_B => \&callback3,
861             CHANGE => \&callback4,
862             },
863             \&key_generator,
864             @extra_args,
865             );
866            
867            
868             =head1 INTRODUCTION
869            
870             (by Mark-Jason Dominus)
871            
872             I once read an article written by the authors of C<diff>; they said
873             that they worked very hard on the algorithm until they found the
874             right one.
875            
876             I think what they ended up using (and I hope someone will correct me,
877             because I am not very confident about this) was the `longest common
878             subsequence' method. In the LCS problem, you have two sequences of
879             items:
880            
881             a b c d f g h j q z
882            
883             a b c d e f g i j k r x y z
884            
885             and you want to find the longest sequence of items that is present in
886             both original sequences in the same order. That is, you want to find
887             a new sequence I<S> which can be obtained from the first sequence by
888             deleting some items, and from the secend sequence by deleting other
889             items. You also want I<S> to be as long as possible. In this case I<S>
890             is
891            
892             a b c d f g j z
893            
894             From there it's only a small step to get diff-like output:
895            
896             e h i k q r x y
897             + - + + - + + +
898            
899             This module solves the LCS problem. It also includes a canned function
900             to generate C<diff>-like output.
901            
902             It might seem from the example above that the LCS of two sequences is
903             always pretty obvious, but that's not always the case, especially when
904             the two sequences have many repeated elements. For example, consider
905            
906             a x b y c z p d q
907             a b c a x b y c z
908            
909             A naive approach might start by matching up the C<a> and C<b> that
910             appear at the beginning of each sequence, like this:
911            
912             a x b y c z p d q
913             a b c a b y c z
914            
915             This finds the common subsequence C<a b c z>. But actually, the LCS
916             is C<a x b y c z>:
917            
918             a x b y c z p d q
919             a b c a x b y c z
920            
921             or
922            
923             a x b y c z p d q
924             a b c a x b y c z
925            
926             =head1 USAGE
927            
928             (See also the README file and several example
929             scripts include with this module.)
930            
931             This module now provides an object-oriented interface that uses less
932             memory and is easier to use than most of the previous procedural
933             interfaces. It also still provides several exportable functions. We'll
934             deal with these in ascending order of difficulty: C<LCS>,
935             C<LCS_length>, C<LCSidx>, OO interface, C<prepare>, C<diff>, C<sdiff>,
936             C<traverse_sequences>, and C<traverse_balanced>.
937            
938             =head2 C<LCS>
939            
940             Given references to two lists of items, LCS returns an array containing
941             their longest common subsequence. In scalar context, it returns a
942             reference to such a list.
943            
944             @lcs = LCS( \@seq1, \@seq2 );
945             $lcsref = LCS( \@seq1, \@seq2 );
946            
947             C<LCS> may be passed an optional third parameter; this is a CODE
948             reference to a key generation function. See L</KEY GENERATION
949             FUNCTIONS>.
950            
951             @lcs = LCS( \@seq1, \@seq2, \&keyGen, @args );
952             $lcsref = LCS( \@seq1, \@seq2, \&keyGen, @args );
953            
954             Additional parameters, if any, will be passed to the key generation
955             routine.
956            
957             =head2 C<LCS_length>
958            
959             This is just like C<LCS> except it only returns the length of the
960             longest common subsequence. This provides a performance gain of about
961             9% compared to C<LCS>.
962            
963             =head2 C<LCSidx>
964            
965             Like C<LCS> except it returns references to two arrays. The first array
966             contains the indices into @seq1 where the LCS items are located. The
967             second array contains the indices into @seq2 where the LCS items are located.
968            
969             Therefore, the following three lists will contain the same values:
970            
971             my( $idx1, $idx2 ) = LCSidx( \@seq1, \@seq2 );
972             my @list1 = @seq1[ @$idx1 ];
973             my @list2 = @seq2[ @$idx2 ];
974             my @list3 = LCS( \@seq1, \@seq2 );
975            
976             =head2 C<new>
977            
978             $diff = Algorithm::Diffs->new( \@seq1, \@seq2 );
979             $diff = Algorithm::Diffs->new( \@seq1, \@seq2, \%opts );
980            
981             C<new> computes the smallest set of additions and deletions necessary
982             to turn the first sequence into the second and compactly records them
983             in the object.
984            
985             You use the object to iterate over I<hunks>, where each hunk represents
986             a contiguous section of items which should be added, deleted, replaced,
987             or left unchanged.
988            
989             =over 4
990            
991             The following summary of all of the methods looks a lot like Perl code
992             but some of the symbols have different meanings:
993            
994             [ ] Encloses optional arguments
995             : Is followed by the default value for an optional argument
996             | Separates alternate return results
997            
998             Method summary:
999            
1000             $obj = Algorithm::Diff->new( \@seq1, \@seq2, [ \%opts ] );
1001             $pos = $obj->Next( [ $count : 1 ] );
1002             $revPos = $obj->Prev( [ $count : 1 ] );
1003             $obj = $obj->Reset( [ $pos : 0 ] );
1004             $copy = $obj->Copy( [ $pos, [ $newBase ] ] );
1005             $oldBase = $obj->Base( [ $newBase ] );
1006            
1007             Note that all of the following methods C<die> if used on an object that
1008             is "reset" (not currently pointing at any hunk).
1009            
1010             $bits = $obj->Diff( );
1011             @items|$cnt = $obj->Same( );
1012             @items|$cnt = $obj->Items( $seqNum );
1013             @idxs |$cnt = $obj->Range( $seqNum, [ $base ] );
1014             $minIdx = $obj->Min( $seqNum, [ $base ] );
1015             $maxIdx = $obj->Max( $seqNum, [ $base ] );
1016             @values = $obj->Get( @names );
1017            
1018             Passing in C<undef> for an optional argument is always treated the same
1019             as if no argument were passed in.
1020            
1021             =item C<Next>
1022            
1023             $pos = $diff->Next(); # Move forward 1 hunk
1024             $pos = $diff->Next( 2 ); # Move forward 2 hunks
1025             $pos = $diff->Next(-5); # Move backward 5 hunks
1026            
1027             C<Next> moves the object to point at the next hunk. The object starts
1028             out "reset", which means it isn't pointing at any hunk. If the object
1029             is reset, then C<Next()> moves to the first hunk.
1030            
1031             C<Next> returns a true value iff the move didn't go past the last hunk.
1032             So C<Next(0)> will return true iff the object is not reset.
1033            
1034             Actually, C<Next> returns the object's new position, which is a number
1035             between 1 and the number of hunks (inclusive), or returns a false value.
1036            
1037             =item C<Prev>
1038            
1039             C<Prev($N)> is almost identical to C<Next(-$N)>; it moves to the $Nth
1040             previous hunk. On a 'reset' object, C<Prev()> [and C<Next(-1)>] move
1041             to the last hunk.
1042            
1043             The position returned by C<Prev> is relative to the I<end> of the
1044             hunks; -1 for the last hunk, -2 for the second-to-last, etc.
1045            
1046             =item C<Reset>
1047            
1048             $diff->Reset(); # Reset the object's position
1049             $diff->Reset($pos); # Move to the specified hunk
1050             $diff->Reset(1); # Move to the first hunk
1051             $diff->Reset(-1); # Move to the last hunk
1052            
1053             C<Reset> returns the object, so, for example, you could use
1054             C<< $diff->Reset()->Next(-1) >> to get the number of hunks.
1055            
1056             =item C<Copy>
1057            
1058             $copy = $diff->Copy( $newPos, $newBase );
1059            
1060             C<Copy> returns a copy of the object. The copy and the orignal object
1061             share most of their data, so making copies takes very little memory.
1062             The copy maintains its own position (separate from the original), which
1063             is the main purpose of copies. It also maintains its own base.
1064            
1065             By default, the copy's position starts out the same as the original
1066             object's position. But C<Copy> takes an optional first argument to set the
1067             new position, so the following three snippets are equivalent:
1068            
1069             $copy = $diff->Copy($pos);
1070            
1071             $copy = $diff->Copy();
1072             $copy->Reset($pos);
1073            
1074             $copy = $diff->Copy()->Reset($pos);
1075            
1076             C<Copy> takes an optional second argument to set the base for
1077             the copy. If you wish to change the base of the copy but leave
1078             the position the same as in the original, here are two
1079             equivalent ways:
1080            
1081             $copy = $diff->Copy();
1082             $copy->Base( 0 );
1083            
1084             $copy = $diff->Copy(undef,0);
1085            
1086             Here are two equivalent way to get a "reset" copy:
1087            
1088             $copy = $diff->Copy(0);
1089            
1090             $copy = $diff->Copy()->Reset();
1091            
1092             =item C<Diff>
1093            
1094             $bits = $obj->Diff();
1095            
1096             C<Diff> returns a true value iff the current hunk contains items that are
1097             different between the two sequences. It actually returns one of the
1098             follow 4 values:
1099            
1100             =over 4
1101            
1102             =item 3
1103            
1104             C<3==(1|2)>. This hunk contains items from @seq1 and the items
1105             from @seq2 that should replace them. Both sequence 1 and 2
1106             contain changed items so both the 1 and 2 bits are set.
1107            
1108             =item 2
1109            
1110             This hunk only contains items from @seq2 that should be inserted (not
1111             items from @seq1). Only sequence 2 contains changed items so only the 2
1112             bit is set.
1113            
1114             =item 1
1115            
1116             This hunk only contains items from @seq1 that should be deleted (not
1117             items from @seq2). Only sequence 1 contains changed items so only the 1
1118             bit is set.
1119            
1120             =item 0
1121            
1122             This means that the items in this hunk are the same in both sequences.
1123             Neither sequence 1 nor 2 contain changed items so neither the 1 nor the
1124             2 bits are set.
1125            
1126             =back
1127            
1128             =item C<Same>
1129            
1130             C<Same> returns a true value iff the current hunk contains items that
1131             are the same in both sequences. It actually returns the list of items
1132             if they are the same or an emty list if they aren't. In a scalar
1133             context, it returns the size of the list.
1134            
1135             =item C<Items>
1136            
1137             $count = $diff->Items(2);
1138             @items = $diff->Items($seqNum);
1139            
1140             C<Items> returns the (number of) items from the specified sequence that
1141             are part of the current hunk.
1142            
1143             If the current hunk contains only insertions, then
1144             C<< $diff->Items(1) >> will return an empty list (0 in a scalar conext).
1145             If the current hunk contains only deletions, then C<< $diff->Items(2) >>
1146             will return an empty list (0 in a scalar conext).
1147            
1148             If the hunk contains replacements, then both C<< $diff->Items(1) >> and
1149             C<< $diff->Items(2) >> will return different, non-empty lists.
1150            
1151             Otherwise, the hunk contains identical items and all of the following
1152             will return the same lists:
1153            
1154             @items = $diff->Items(1);
1155             @items = $diff->Items(2);
1156             @items = $diff->Same();
1157            
1158             =item C<Range>
1159            
1160             $count = $diff->Range( $seqNum );
1161             @indices = $diff->Range( $seqNum );
1162             @indices = $diff->Range( $seqNum, $base );
1163            
1164             C<Range> is like C<Items> except that it returns a list of I<indices> to
1165             the items rather than the items themselves. By default, the index of
1166             the first item (in each sequence) is 0 but this can be changed by
1167             calling the C<Base> method. So, by default, the following two snippets
1168             return the same lists:
1169            
1170             @list = $diff->Items(2);
1171             @list = @seq2[ $diff->Range(2) ];
1172            
1173             You can also specify the base to use as the second argument. So the
1174             following two snippets I<always> return the same lists:
1175            
1176             @list = $diff->Items(1);
1177             @list = @seq1[ $diff->Range(1,0) ];
1178            
1179             =item C<Base>
1180            
1181             $curBase = $diff->Base();
1182             $oldBase = $diff->Base($newBase);
1183            
1184             C<Base> sets and/or returns the current base (usually 0 or 1) that is
1185             used when you request range information. The base defaults to 0 so
1186             that range information is returned as array indices. You can set the
1187             base to 1 if you want to report traditional line numbers instead.
1188            
1189             =item C<Min>
1190            
1191             $min1 = $diff->Min(1);
1192             $min = $diff->Min( $seqNum, $base );
1193            
1194             C<Min> returns the first value that C<Range> would return (given the
1195             same arguments) or returns C<undef> if C<Range> would return an empty
1196             list.
1197            
1198             =item C<Max>
1199            
1200             C<Max> returns the last value that C<Range> would return or C<undef>.
1201            
1202             =item C<Get>
1203            
1204             ( $n, $x, $r ) = $diff->Get(qw( min1 max1 range1 ));
1205             @values = $diff->Get(qw( 0min2 1max2 range2 same base ));
1206            
1207             C<Get> returns one or more scalar values. You pass in a list of the
1208             names of the values you want returned. Each name must match one of the
1209             following regexes:
1210            
1211             /^(-?\d+)?(min|max)[12]$/i
1212             /^(range[12]|same|diff|base)$/i
1213            
1214             The 1 or 2 after a name says which sequence you want the information
1215             for (and where allowed, it is required). The optional number before
1216             "min" or "max" is the base to use. So the following equalities hold:
1217            
1218             $diff->Get('min1') == $diff->Min(1)
1219             $diff->Get('0min2') == $diff->Min(2,0)
1220            
1221             Using C<Get> in a scalar context when you've passed in more than one
1222             name is a fatal error (C<die> is called).
1223            
1224             =back
1225            
1226             =head2 C<prepare>
1227            
1228             Given a reference to a list of items, C<prepare> returns a reference
1229             to a hash which can be used when comparing this sequence to other
1230             sequences with C<LCS> or C<LCS_length>.
1231            
1232             $prep = prepare( \@seq1 );
1233             for $i ( 0 .. 10_000 )
1234             {
1235             @lcs = LCS( $prep, $seq[$i] );
1236             # do something useful with @lcs
1237             }
1238            
1239             C<prepare> may be passed an optional third parameter; this is a CODE
1240             reference to a key generation function. See L</KEY GENERATION
1241             FUNCTIONS>.
1242            
1243             $prep = prepare( \@seq1, \&keyGen );
1244             for $i ( 0 .. 10_000 )
1245             {
1246             @lcs = LCS( $seq[$i], $prep, \&keyGen );
1247             # do something useful with @lcs
1248             }
1249            
1250             Using C<prepare> provides a performance gain of about 50% when calling LCS
1251             many times compared with not preparing.
1252            
1253             =head2 C<diff>
1254            
1255             @diffs = diff( \@seq1, \@seq2 );
1256             $diffs_ref = diff( \@seq1, \@seq2 );
1257            
1258             C<diff> computes the smallest set of additions and deletions necessary
1259             to turn the first sequence into the second, and returns a description
1260             of these changes. The description is a list of I<hunks>; each hunk
1261             represents a contiguous section of items which should be added,
1262             deleted, or replaced. (Hunks containing unchanged items are not
1263             included.)
1264            
1265             The return value of C<diff> is a list of hunks, or, in scalar context, a
1266             reference to such a list. If there are no differences, the list will be
1267             empty.
1268            
1269             Here is an example. Calling C<diff> for the following two sequences:
1270            
1271             a b c e h j l m n p
1272             b c d e f j k l m r s t
1273            
1274             would produce the following list:
1275            
1276             (
1277             [ [ '-', 0, 'a' ] ],
1278            
1279             [ [ '+', 2, 'd' ] ],
1280            
1281             [ [ '-', 4, 'h' ],
1282             [ '+', 4, 'f' ] ],
1283            
1284             [ [ '+', 6, 'k' ] ],
1285            
1286             [ [ '-', 8, 'n' ],
1287             [ '-', 9, 'p' ],
1288             [ '+', 9, 'r' ],
1289             [ '+', 10, 's' ],
1290             [ '+', 11, 't' ] ],
1291             )
1292            
1293             There are five hunks here. The first hunk says that the C<a> at
1294             position 0 of the first sequence should be deleted (C<->). The second
1295             hunk says that the C<d> at position 2 of the second sequence should
1296             be inserted (C<+>). The third hunk says that the C<h> at position 4
1297             of the first sequence should be removed and replaced with the C<f>
1298             from position 4 of the second sequence. And so on.
1299            
1300             C<diff> may be passed an optional third parameter; this is a CODE
1301             reference to a key generation function. See L</KEY GENERATION
1302             FUNCTIONS>.
1303            
1304             Additional parameters, if any, will be passed to the key generation
1305             routine.
1306            
1307             =head2 C<sdiff>
1308            
1309             @sdiffs = sdiff( \@seq1, \@seq2 );
1310             $sdiffs_ref = sdiff( \@seq1, \@seq2 );
1311            
1312             C<sdiff> computes all necessary components to show two sequences
1313             and their minimized differences side by side, just like the
1314             Unix-utility I<sdiff> does:
1315            
1316             same same
1317             before | after
1318             old < -
1319             - > new
1320            
1321             It returns a list of array refs, each pointing to an array of
1322             display instructions. In scalar context it returns a reference
1323             to such a list. If there are no differences, the list will have one
1324             entry per item, each indicating that the item was unchanged.
1325            
1326             Display instructions consist of three elements: A modifier indicator
1327             (C<+>: Element added, C<->: Element removed, C<u>: Element unmodified,
1328             C<c>: Element changed) and the value of the old and new elements, to
1329             be displayed side-by-side.
1330            
1331             An C<sdiff> of the following two sequences:
1332            
1333             a b c e h j l m n p
1334             b c d e f j k l m r s t
1335            
1336             results in
1337            
1338             ( [ '-', 'a', '' ],
1339             [ 'u', 'b', 'b' ],
1340             [ 'u', 'c', 'c' ],
1341             [ '+', '', 'd' ],
1342             [ 'u', 'e', 'e' ],
1343             [ 'c', 'h', 'f' ],
1344             [ 'u', 'j', 'j' ],
1345             [ '+', '', 'k' ],
1346             [ 'u', 'l', 'l' ],
1347             [ 'u', 'm', 'm' ],
1348             [ 'c', 'n', 'r' ],
1349             [ 'c', 'p', 's' ],
1350             [ '+', '', 't' ],
1351             )
1352            
1353             C<sdiff> may be passed an optional third parameter; this is a CODE
1354             reference to a key generation function. See L</KEY GENERATION
1355             FUNCTIONS>.
1356            
1357             Additional parameters, if any, will be passed to the key generation
1358             routine.
1359            
1360             =head2 C<compact_diff>
1361            
1362             C<compact_diff> is much like C<sdiff> except it returns a much more
1363             compact description consisting of just one flat list of indices. An
1364             example helps explain the format:
1365            
1366             my @a = qw( a b c e h j l m n p );
1367             my @b = qw( b c d e f j k l m r s t );
1368             @cdiff = compact_diff( \@a, \@b );
1369             # Returns:
1370             # @a @b @a @b
1371             # start start values values
1372             ( 0, 0, # =
1373             0, 0, # a !
1374             1, 0, # b c = b c
1375             3, 2, # ! d
1376             3, 3, # e = e
1377             4, 4, # f ! h
1378             5, 5, # j = j
1379             6, 6, # ! k
1380             6, 7, # l m = l m
1381             8, 9, # n p ! r s t
1382             10, 12, #
1383             );
1384            
1385             The 0th, 2nd, 4th, etc. entries are all indices into @seq1 (@a in the
1386             above example) indicating where a hunk begins. The 1st, 3rd, 5th, etc.
1387             entries are all indices into @seq2 (@b in the above example) indicating
1388             where the same hunk begins.
1389            
1390             So each pair of indices (except the last pair) describes where a hunk
1391             begins (in each sequence). Since each hunk must end at the item just
1392             before the item that starts the next hunk, the next pair of indices can
1393             be used to determine where the hunk ends.
1394            
1395             So, the first 4 entries (0..3) describe the first hunk. Entries 0 and 1
1396             describe where the first hunk begins (and so are always both 0).
1397             Entries 2 and 3 describe where the next hunk begins, so subtracting 1
1398             from each tells us where the first hunk ends. That is, the first hunk
1399             contains items C<$diff[0]> through C<$diff[2] - 1> of the first sequence
1400             and contains items C<$diff[1]> through C<$diff[3] - 1> of the second
1401             sequence.
1402            
1403             In other words, the first hunk consists of the following two lists of items:
1404            
1405             # 1st pair 2nd pair
1406             # of indices of indices
1407             @list1 = @a[ $cdiff[0] .. $cdiff[2]-1 ];
1408             @list2 = @b[ $cdiff[1] .. $cdiff[3]-1 ];
1409             # Hunk start Hunk end
1410            
1411             Note that the hunks will always alternate between those that are part of
1412             the LCS (those that contain unchanged items) and those that contain
1413             changes. This means that all we need to be told is whether the first
1414             hunk is a 'same' or 'diff' hunk and we can determine which of the other
1415             hunks contain 'same' items or 'diff' items.
1416            
1417             By convention, we always make the first hunk contain unchanged items.
1418             So the 1st, 3rd, 5th, etc. hunks (all odd-numbered hunks if you start
1419             counting from 1) all contain unchanged items. And the 2nd, 4th, 6th,
1420             etc. hunks (all even-numbered hunks if you start counting from 1) all
1421             contain changed items.
1422            
1423             Since @a and @b don't begin with the same value, the first hunk in our
1424             example is empty (otherwise we'd violate the above convention). Note
1425             that the first 4 index values in our example are all zero. Plug these
1426             values into our previous code block and we get:
1427            
1428             @hunk1a = @a[ 0 .. 0-1 ];
1429             @hunk1b = @b[ 0 .. 0-1 ];
1430            
1431             And C<0..-1> returns the empty list.
1432            
1433             Move down one pair of indices (2..5) and we get the offset ranges for
1434             the second hunk, which contains changed items.
1435            
1436             Since C<@diff[2..5]> contains (0,0,1,0) in our example, the second hunk
1437             consists of these two lists of items:
1438            
1439             @hunk2a = @a[ $cdiff[2] .. $cdiff[4]-1 ];
1440             @hunk2b = @b[ $cdiff[3] .. $cdiff[5]-1 ];
1441             # or
1442             @hunk2a = @a[ 0 .. 1-1 ];
1443             @hunk2b = @b[ 0 .. 0-1 ];
1444             # or
1445             @hunk2a = @a[ 0 .. 0 ];
1446             @hunk2b = @b[ 0 .. -1 ];
1447             # or
1448             @hunk2a = ( 'a' );
1449             @hunk2b = ( );
1450            
1451             That is, we would delete item 0 ('a') from @a.
1452            
1453             Since C<@diff[4..7]> contains (1,0,3,2) in our example, the third hunk
1454             consists of these two lists of items:
1455            
1456             @hunk3a = @a[ $cdiff[4] .. $cdiff[6]-1 ];
1457             @hunk3a = @b[ $cdiff[5] .. $cdiff[7]-1 ];
1458             # or
1459             @hunk3a = @a[ 1 .. 3-1 ];
1460             @hunk3a = @b[ 0 .. 2-1 ];
1461             # or
1462             @hunk3a = @a[ 1 .. 2 ];
1463             @hunk3a = @b[ 0 .. 1 ];
1464             # or
1465             @hunk3a = qw( b c );
1466             @hunk3a = qw( b c );
1467            
1468             Note that this third hunk contains unchanged items as our convention demands.
1469            
1470             You can continue this process until you reach the last two indices,
1471             which will always be the number of items in each sequence. This is
1472             required so that subtracting one from each will give you the indices to
1473             the last items in each sequence.
1474            
1475             =head2 C<traverse_sequences>
1476            
1477             C<traverse_sequences> used to be the most general facility provided by
1478             this module (the new OO interface is more powerful and much easier to
1479             use).
1480            
1481             Imagine that there are two arrows. Arrow A points to an element of
1482             sequence A, and arrow B points to an element of the sequence B.
1483             Initially, the arrows point to the first elements of the respective
1484             sequences. C<traverse_sequences> will advance the arrows through the
1485             sequences one element at a time, calling an appropriate user-specified
1486             callback function before each advance. It willadvance the arrows in
1487             such a way that if there are equal elements C<$A[$i]> and C<$B[$j]>
1488             which are equal and which are part of the LCS, there will be some moment
1489             during the execution of C<traverse_sequences> when arrow A is pointing
1490             to C<$A[$i]> and arrow B is pointing to C<$B[$j]>. When this happens,
1491             C<traverse_sequences> will call the C<MATCH> callback function and then
1492             it will advance both arrows.
1493            
1494             Otherwise, one of the arrows is pointing to an element of its sequence
1495             that is not part of the LCS. C<traverse_sequences> will advance that