File Coverage

lib/Algorithm/Diff.pm
Criterion Covered Total %
statement 314 359 87.5
branch 107 136 78.7
condition 73 94 77.7
subroutine 42 52 80.8
pod 9 9 100.0
total 545 650 83.8


line stmt bran cond sub pod time code
1             package Algorithm::Diff;
2             # Skip to first "=head" line for documentation.
3 2     2   26 use strict;
  2         17  
  2         29  
4              
5 2     2   1486 use integer; # see below in _replaceNextLargerWith() for mod to make
  2         19  
  2         33  
6             # if you don't use this
7 2     2   33 use vars qw( $VERSION @EXPORT_OK );
  2         17  
  2         28  
8             $VERSION = 1.19_02;
9             # ^ ^^ ^^-- Incremented at will
10             # | \+----- Incremented for non-trivial changes to features
11             # \-------- Incremented for fundamental changes
12             require Exporter;
13             *import    = \&Exporter::import;
14             @EXPORT_OK = qw(
15             prepare LCS LCSidx LCS_length
16             diff sdiff compact_diff
17             traverse_sequences traverse_balanced
18             );
19              
20             # McIlroy-Hunt diff algorithm
21             # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
22             # by Ned Konz, perl@bike-nomad.com
23             # Updates by Tye McQueen, http://perlmonks.org/?node=tye
24              
25             # Create a hash that maps each element of $aCollection to the set of
26             # positions it occupies in $aCollection, restricted to the elements
27             # within the range of indexes specified by $start and $end.
28             # The fourth parameter is a subroutine reference that will be called to
29             # generate a string to use as a key.
30             # Additional parameters, if any, will be passed to this subroutine.
31             #
32             # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
33              
34             sub _withPositionsOfInInterval
35             {
36 36     36   298     my $aCollection = shift; # array ref
37 36         297     my $start = shift;
38 36         322     my $end = shift;
39 36         300     my $keyGen = shift;
40 36         375     my %d;
41 36         281     my $index;
42                 for ( $index = $start ; $index <= $end ; $index++ )
43                 {
44 154         1455         my $element = $aCollection->[$index];
45 154         1740         my $key = &$keyGen( $element, @_ );
46 154 100       3295         if ( exists( $d{$key} ) )
47                     {
48 1         9             unshift ( @{ $d{$key} }, $index );
  1         15  
49                     }
50                     else
51                     {
52 153         2355             $d{$key} = [$index];
53                     }
54 36         285     }
55 36 50       454     return wantarray ? %d : \%d;
56             }
57              
58             # Find the place at which aValue would normally be inserted into the
59             # array. If that place is already occupied by aValue, do nothing, and
60             # return undef. If the place does not exist (i.e., it is off the end of
61             # the array), add it to the end, otherwise replace the element at that
62             # point with aValue. It is assumed that the array's values are numeric.
63             # This is where the bulk (75%) of the time is spent in this module, so
64             # try to make it fast!
65              
66             sub _replaceNextLargerWith
67             {
68 55     55   470     my ( $array, $aValue, $high ) = @_;
69 55   50     639     $high ||= $#$array;
70              
71             # off the end?
72 55 50 66     722     if ( $high == -1 || $aValue > $array->[-1] )
73                 {
74 55         475         push ( @$array, $aValue );
75 55         654         return $high + 1;
76                 }
77              
78             # binary search for insertion point...
79 0         0     my $low = 0;
80 0         0     my $index;
81 0         0     my $found;
82 0         0     while ( $low <= $high )
83                 {
84 0         0         $index = ( $high + $low ) / 2;
85              
86             # $index = int(( $high + $low ) / 2); # without 'use integer'
87 0         0         $found = $array->[$index];
88              
89 0 0       0         if ( $aValue == $found )
    0          
90                     {
91 0         0             return undef;
92                     }
93                     elsif ( $aValue > $found )
94                     {
95 0         0             $low = $index + 1;
96                     }
97                     else
98                     {
99 0         0             $high = $index - 1;
100                     }
101                 }
102              
103             # now insertion point is in $low.
104 0         0     $array->[$low] = $aValue; # overwrite next larger
105 0         0     return $low;
106             }
107              
108             # This method computes the longest common subsequence in $a and $b.
109              
110             # Result is array or ref, whose contents is such that
111             # $a->[ $i ] == $b->[ $result[ $i ] ]
112             # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
113              
114             # An additional argument may be passed; this is a hash or key generating
115             # function that should return a string that uniquely identifies the given
116             # element. It should be the case that if the key is the same, the elements
117             # will compare the same. If this parameter is undef or missing, the key
118             # will be the element as a string.
119              
120             # By default, comparisons will use "eq" and elements will be turned into keys
121             # using the default stringizing operator '""'.
122              
123             # Additional parameters, if any, will be passed to the key generation
124             # routine.
125              
126             sub _longestCommonSubsequence
127             {
128 36     36   419     my $a = shift; # array ref or hash ref
129 36         324     my $b = shift; # array ref or hash ref
130 36         302     my $counting = shift; # scalar
131 36         298     my $keyGen = shift; # code ref
132 36         293     my $compare; # code ref
133              
134 36 50       455     if ( ref($a) eq 'HASH' )
135                 { # prepared hash must be in $b
136 0         0         my $tmp = $b;
137 0         0         $b = $a;
138 0         0         $a = $tmp;
139                 }
140              
141             # Check for bogus (non-ref) argument values
142 36 50 33     545     if ( !ref($a) || !ref($b) )
143                 {
144 0         0         my @callerInfo = caller(1);
145 0         0         die 'error: must pass array or hash references to ' . $callerInfo[3];
146                 }
147              
148             # set up code refs
149             # Note that these are optimized.
150 36 50       344     if ( !defined($keyGen) ) # optimize for strings
151                 {
152 36     283   306         $keyGen = sub { $_[0] };
  283         2896  
153 36     81   357         $compare = sub { my ( $a, $b ) = @_; $a eq $b };
  81         818  
  81         1064  
154                 }
155                 else
156                 {
157                     $compare = sub {
158 0     0   0             my $a = shift;
159 0         0             my $b = shift;
160 0         0             &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
161 0         0         };
162                 }
163              
164 36         411     my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
165 36         381     my ( $prunedCount, $bMatches ) = ( 0, {} );
166              
167 36 50       377     if ( ref($b) eq 'HASH' ) # was $bMatches prepared for us?
168                 {
169 0         0         $bMatches = $b;
170                 }
171                 else
172                 {
173 36         476         my ( $bStart, $bFinish ) = ( 0, $#$b );
174              
175             # First we prune off any common elements at the beginning
176 36   100     694         while ( $aStart <= $aFinish
      100        
177                         and $bStart <= $bFinish
178                         and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
179                     {
180 13         129             $matchVector->[ $aStart++ ] = $bStart++;
181 13         459             $prunedCount++;
182                     }
183              
184             # now the end
185 36   100     589         while ( $aStart <= $aFinish
      100        
186                         and $bStart <= $bFinish
187                         and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
188                     {
189 16         161             $matchVector->[ $aFinish-- ] = $bFinish--;
190 16         213             $prunedCount++;
191                     }
192              
193             # Now compute the equivalence classes of positions of elements
194                     $bMatches =
195 36         360           _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
196                 }
197 36         334     my $thresh = [];
198 36         313     my $links = [];
199              
200 36         318     my ( $i, $ai, $j, $k );
201                 for ( $i = $aStart ; $i <= $aFinish ; $i++ )
202                 {
203 129         1807         $ai = &$keyGen( $a->[$i], @_ );
204 129 100       1371         if ( exists( $bMatches->{$ai} ) )
205                     {
206 55         429             $k = 0;
207 55         425             for $j ( @{ $bMatches->{$ai} } )
  55         548  
208                         {
209              
210             # optimization: most of the time this will be true
211 55 50 33     640                 if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
      33        
212                             {
213 0         0                     $thresh->[$k] = $j;
214                             }
215                             else
216                             {
217 55         514                     $k = _replaceNextLargerWith( $thresh, $j, $k );
218                             }
219              
220             # oddly, it's faster to always test this (CPU cache?).
221 55 50       1019                 if ( defined($k) )
222                             {
223 55 100       1014                     $links->[$k] =
224                                   [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
225                             }
226                         }
227                     }
228 36         291     }
229              
230 36 100       473     if (@$thresh)
    50          
231                 {
232 14 50       151         return $prunedCount + @$thresh if $counting;
233                     for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
234                     {
235 55         627             $matchVector->[ $link->[1] ] = $link->[2];
236 14         152         }
237                 }
238                 elsif ($counting)
239                 {
240 0         0         return $prunedCount;
241                 }
242              
243 36 100       641     return wantarray ? @$matchVector : $matchVector;
244             }
245              
246             sub traverse_sequences
247             {
248 3     3 1 1078     my $a = shift; # array ref
249 3         26     my $b = shift; # array ref
250 3   50     31     my $callbacks = shift || {};
251 3         27     my $keyGen = shift;
252 3   50 0   33     my $matchCallback = $callbacks->{'MATCH'} || sub { };
  0         0  
253 3   50 0   32     my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
  0         0  
254 3         28     my $finishedACallback = $callbacks->{'A_FINISHED'};
255 3   50 0   33     my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
  0         0  
256 3         27     my $finishedBCallback = $callbacks->{'B_FINISHED'};
257 3         31     my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
258              
259             # Process all the lines in @$matchVector
260 3         29     my $lastA = $#$a;
261 3         28     my $lastB = $#$b;
262 3         23     my $bi = 0;
263 3         24     my $ai;
264              
265                 for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
266                 {
267 24         383         my $bLine = $matchVector->[$ai];
268 24 100       230         if ( defined($bLine) ) # matched
269                     {
270 18         261             &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
271 18         413             &$matchCallback( $ai, $bi++, @_ );
272                     }
273                     else
274                     {
275 6         85             &$discardACallback( $ai, $bi, @_ );
276                     }
277 3         26     }
278              
279             # The last entry (if any) processed was a match.
280             # $ai and $bi point just past the last matching lines in their sequences.
281              
282 3   100     63     while ( $ai <= $lastA or $bi <= $lastB )
283                 {
284              
285             # last A?
286 9 100 66     145         if ( $ai == $lastA + 1 and $bi <= $lastB )
287                     {
288 3 100       29             if ( defined($finishedACallback) )
289                         {
290 1         12                 &$finishedACallback( $lastA, @_ );
291 1         16                 $finishedACallback = undef;
292                         }
293                         else
294                         {
295 2         85                 &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
296                         }
297                     }
298              
299             # last B?
300 9 50 66     105         if ( $bi == $lastB + 1 and $ai <= $lastA )
301                     {
302 0 0       0             if ( defined($finishedBCallback) )
303                         {
304 0         0                 &$finishedBCallback( $lastB, @_ );
305 0         0                 $finishedBCallback = undef;
306                         }
307                         else
308                         {
309 0         0                 &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
310                         }
311                     }
312              
313 9 100       94         &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
314 9 100       132         &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
315                 }
316              
317 3         50     return 1;
318             }
319              
320             sub traverse_balanced