bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
1 |
#!/usr/bin/env python
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
2 |
# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
|
3 |
#
|
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
4 |
# This program is free software; you can redistribute it and/or modify
|
5 |
# it under the terms of the GNU General Public License as published by
|
|
6 |
# the Free Software Foundation; either version 2 of the License, or
|
|
7 |
# (at your option) any later version.
|
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
8 |
#
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
9 |
# This program is distributed in the hope that it will be useful,
|
10 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12 |
# GNU General Public License for more details.
|
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
13 |
#
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
14 |
# You should have received a copy of the GNU General Public License
|
15 |
# along with this program; if not, write to the Free Software
|
|
16 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
17 |
||
18 |
||
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
19 |
from bisect import bisect |
20 |
from copy import copy |
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
21 |
import difflib |
|
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
22 |
import os |
23 |
import sys |
|
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
24 |
import time |
25 |
||
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
26 |
|
|
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
27 |
__all__ = ['SequenceMatcher', 'unified_diff', 'unified_diff_files'] |
|
1185.81.9
by John Arbash Meinel
Added (failing) tests for cdv.recurse_matches with common sections, |
28 |
|
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
29 |
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
30 |
def unique_lcs(a, b): |
31 |
"""Find the longest common subset for unique lines. |
|
32 |
||
33 |
:param a: An indexable object (such as string or list of strings)
|
|
34 |
:param b: Another indexable object (such as string or list of strings)
|
|
35 |
:return: A list of tuples, one for each line which is matched.
|
|
36 |
[(line_in_a, line_in_b), ...]
|
|
37 |
||
38 |
This only matches lines which are unique on both sides.
|
|
39 |
This helps prevent common lines from over influencing match
|
|
40 |
results.
|
|
41 |
The longest common subset uses the Patience Sorting algorithm:
|
|
42 |
http://en.wikipedia.org/wiki/Patience_sorting
|
|
43 |
"""
|
|
44 |
# set index[line in a] = position of line in a unless
|
|
45 |
# unless a is a duplicate, in which case it's set to None
|
|
46 |
index = {} |
|
47 |
for i in xrange(len(a)): |
|
48 |
line = a[i] |
|
49 |
if line in index: |
|
50 |
index[line] = None |
|
51 |
else: |
|
52 |
index[line]= i |
|
53 |
# make btoa[i] = position of line i in a, unless
|
|
54 |
# that line doesn't occur exactly once in both,
|
|
55 |
# in which case it's set to None
|
|
56 |
btoa = [None] * len(b) |
|
57 |
index2 = {} |
|
58 |
for pos, line in enumerate(b): |
|
59 |
next = index.get(line) |
|
60 |
if next is not None: |
|
61 |
if line in index2: |
|
62 |
# unset the previous mapping, which we now know to
|
|
63 |
# be invalid because the line isn't unique
|
|
64 |
btoa[index2[line]] = None |
|
65 |
del index[line] |
|
66 |
else: |
|
67 |
index2[line] = pos |
|
68 |
btoa[pos] = next |
|
69 |
# this is the Patience sorting algorithm
|
|
70 |
# see http://en.wikipedia.org/wiki/Patience_sorting
|
|
71 |
backpointers = [None] * len(b) |
|
72 |
stacks = [] |
|
73 |
lasts = [] |
|
74 |
k = 0 |
|
75 |
for bpos, apos in enumerate(btoa): |
|
76 |
if apos is None: |
|
77 |
continue
|
|
78 |
# as an optimization, check if the next line comes at the end,
|
|
79 |
# because it usually does
|
|
80 |
if stacks and stacks[-1] < apos: |
|
81 |
k = len(stacks) |
|
82 |
# as an optimization, check if the next line comes right after
|
|
83 |
# the previous line, because usually it does
|
|
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
84 |
elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or |
85 |
stacks[k+1] > apos): |
|
|
1185.81.24
by Aaron Bentley
Reoganize patience-related code |
86 |
k += 1 |
87 |
else: |
|
88 |
k = bisect(stacks, apos) |
|
89 |
if k > 0: |
|
90 |
backpointers[bpos] = lasts[k-1] |
|
91 |
if k < len(stacks): |
|
92 |
stacks[k] = apos |
|
93 |
lasts[k] = bpos |
|
94 |
else: |
|
95 |
stacks.append(apos) |
|
96 |
lasts.append(bpos) |
|
97 |
if len(lasts) == 0: |
|
98 |
return [] |
|
99 |
result = [] |
|
100 |
k = lasts[-1] |
|
101 |
while k is not None: |
|
102 |
result.append((btoa[k], k)) |
|
103 |
k = backpointers[k] |
|
104 |
result.reverse() |
|
105 |
return result |
|
106 |
||
107 |
||
108 |
def recurse_matches(a, b, ahi, bhi, answer, maxrecursion): |
|
109 |
"""Find all of the matching text in the lines of a and b. |
|
110 |
||
111 |
:param a: A sequence
|
|
112 |
:param b: Another sequence
|
|
113 |
:param ahi: The maximum length of a to check, typically len(a)
|
|
114 |
:param bhi: The maximum length of b to check, typically len(b)
|
|
115 |
:param answer: The return array. Will be filled with tuples
|
|
116 |
indicating [(line_in_a), (line_in_b)]
|
|
117 |
:param maxrecursion: The maximum depth to recurse.
|
|
118 |
Must be a positive integer.
|
|
119 |
:return: None, the return value is in the parameter answer, which
|
|
120 |
should be a list
|
|
121 |
||
122 |
"""
|
|
123 |
oldlen = len(answer) |
|
124 |
if maxrecursion < 0: |
|
125 |
# this will never happen normally, this check is to prevent DOS attacks
|
|
126 |
return
|
|
127 |
oldlength = len(answer) |
|
128 |
if len(answer) == 0: |
|
129 |
alo, blo = 0, 0 |
|
130 |
else: |
|
131 |
alo, blo = answer[-1] |
|
132 |
alo += 1 |
|
133 |
blo += 1 |
|
134 |
if alo == ahi or blo == bhi: |
|
135 |
return
|
|
136 |
for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]): |
|
137 |
# recurse between lines which are unique in each file and match
|
|
138 |
apos += alo |
|
139 |
bpos += blo |
|
140 |
recurse_matches(a, b, apos, bpos, answer, maxrecursion - 1) |
|
141 |
answer.append((apos, bpos)) |
|
142 |
if len(answer) > oldlength: |
|
143 |
# find matches between the last match and the end
|
|
144 |
recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1) |
|
145 |
elif a[alo] == b[blo]: |
|
146 |
# find matching lines at the very beginning
|
|
147 |
while alo < ahi and blo < bhi and a[alo] == b[blo]: |
|
148 |
answer.append((alo, blo)) |
|
149 |
alo += 1 |
|
150 |
blo += 1 |
|
151 |
recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1) |
|
152 |
elif a[ahi - 1] == b[bhi - 1]: |
|
153 |
# find matching lines at the very end
|
|
154 |
nahi = ahi - 1 |
|
155 |
nbhi = bhi - 1 |
|
156 |
while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]: |
|
157 |
nahi -= 1 |
|
158 |
nbhi -= 1 |
|
159 |
recurse_matches(a, b, nahi, nbhi, answer, maxrecursion - 1) |
|
160 |
for i in xrange(ahi - nahi): |
|
161 |
answer.append((nahi + i, nbhi + i)) |
|
162 |
||
163 |
||
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
164 |
class SequenceMatcher(difflib.SequenceMatcher): |
|
1185.81.5
by John Arbash Meinel
Fix up SequenceMatcher, add comments to nofrillsprecisemerge |
165 |
"""Compare a pair of sequences using longest common subset.""" |
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
166 |
|
|
1185.81.5
by John Arbash Meinel
Fix up SequenceMatcher, add comments to nofrillsprecisemerge |
167 |
def __init__(self, isjunk=None, a='', b=''): |
168 |
if isjunk is not None: |
|
169 |
raise NotImplementedError('Currently we do not support' |
|
170 |
' isjunk for sequence matching') |
|
171 |
difflib.SequenceMatcher.__init__(self, isjunk, a, b) |
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
172 |
|
|
1185.81.11
by John Arbash Meinel
Found some edge cases that weren't being matched. |
173 |
def _check_with_diff(self, alo, ahi, blo, bhi, answer): |
174 |
"""Use the original diff algorithm on an unmatched section. |
|
175 |
||
176 |
This will check to make sure the range is worth checking,
|
|
177 |
before doing any work.
|
|
178 |
||
179 |
:param alo: The last line that actually matched
|
|
180 |
:param ahi: The next line that actually matches
|
|
181 |
:param blo: Same as alo, only for the 'b' set
|
|
182 |
:param bhi: Same as ahi
|
|
183 |
:param answer: An array which will have the new ranges appended to it
|
|
184 |
:return: None
|
|
185 |
"""
|
|
186 |
# WORKAROUND
|
|
187 |
# recurse_matches has an implementation design
|
|
188 |
# which does not match non-unique lines in the
|
|
189 |
# if they do not touch matching unique lines
|
|
190 |
# so we rerun the regular diff algorithm
|
|
191 |
# if find a large enough chunk.
|
|
192 |
||
193 |
# recurse_matches already looked at the direct
|
|
194 |
# neighbors, so we only need to run if there is
|
|
195 |
# enough space to do so
|
|
196 |
if ahi - alo > 2 and bhi - blo > 2: |
|
|
1185.81.16
by John Arbash Meinel
Added tests, and an assert check to make sure ranges are always increasing. |
197 |
a = self.a[alo+1:ahi-1] |
|
1185.81.11
by John Arbash Meinel
Found some edge cases that weren't being matched. |
198 |
b = self.b[blo+1:bhi-1] |
199 |
m = difflib.SequenceMatcher(None, a, b) |
|
200 |
new_blocks = m.get_matching_blocks() |
|
201 |
# difflib always adds a final match
|
|
202 |
new_blocks.pop() |
|
203 |
for blk in new_blocks: |
|
204 |
answer.append((blk[0]+alo+1, |
|
205 |
blk[1]+blo+1, |
|
206 |
blk[2])) |
|
207 |
||
|
1185.81.4
by John Arbash Meinel
moved the logic deeper into difflib. |
208 |
def __helper(self, alo, ahi, blo, bhi, answer): |
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
209 |
matches = [] |
|
1185.81.4
by John Arbash Meinel
moved the logic deeper into difflib. |
210 |
a = self.a[alo:ahi] |
211 |
b = self.b[blo:bhi] |
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
212 |
recurse_matches(a, b, len(a), len(b), matches, 10) |
213 |
# Matches now has individual line pairs of
|
|
214 |
# line A matches line B, at the given offsets
|
|
215 |
||
216 |
start_a = start_b = None |
|
217 |
length = 0 |
|
218 |
for i_a, i_b in matches: |
|
219 |
if (start_a is not None |
|
220 |
and (i_a == start_a + length) |
|
221 |
and (i_b == start_b + length)): |
|
222 |
length += 1 |
|
223 |
else: |
|
224 |
# New block
|
|
|
1185.81.11
by John Arbash Meinel
Found some edge cases that weren't being matched. |
225 |
if start_a is None: |
226 |
# We need to check from 0,0 until the current match
|
|
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
227 |
self._check_with_diff(alo-1, i_a+alo, blo-1, i_b+blo, |
228 |
answer) |
|
|
1185.81.11
by John Arbash Meinel
Found some edge cases that weren't being matched. |
229 |
else: |
|
1185.81.4
by John Arbash Meinel
moved the logic deeper into difflib. |
230 |
answer.append((start_a+alo, start_b+blo, length)) |
|
1185.81.11
by John Arbash Meinel
Found some edge cases that weren't being matched. |
231 |
self._check_with_diff(start_a+alo+length, i_a+alo, |
232 |
start_b+blo+length, i_b+blo, |
|
233 |
answer) |
|
|
1185.81.10
by John Arbash Meinel
Added some more test cases. |
234 |
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
235 |
start_a = i_a |
236 |
start_b = i_b |
|
237 |
length = 1 |
|
238 |
||
239 |
if length != 0: |
|
|
1185.81.11
by John Arbash Meinel
Found some edge cases that weren't being matched. |
240 |
answer.append((start_a+alo, start_b+blo, length)) |
241 |
self._check_with_diff(start_a+alo+length, ahi+1, |
|
242 |
start_b+blo+length, bhi+1, |
|
243 |
answer) |
|
244 |
if not matches: |
|
245 |
# Nothing matched, so we need to send the complete text
|
|
246 |
self._check_with_diff(alo-1, ahi+1, blo-1, bhi+1, answer) |
|
|
1185.81.1
by John Arbash Meinel
Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib. |
247 |
|
|
1185.81.16
by John Arbash Meinel
Added tests, and an assert check to make sure ranges are always increasing. |
248 |
# For consistency sake, make sure all matches are only increasing
|
249 |
if __debug__: |
|
250 |
next_a = -1 |
|
251 |
next_b = -1 |
|
252 |
for a,b,match_len in answer: |
|
253 |
assert a >= next_a, 'Non increasing matches for a' |
|
254 |
assert b >= next_b, 'Not increasing matches for b' |
|
255 |
next_a = a + match_len |
|
256 |
next_b = b + match_len |
|
257 |
||
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
258 |
|
|
1185.81.8
by John Arbash Meinel
Updating unified_diff to take a factory, using the new diff algorithm in the code. |
259 |
# This is a version of unified_diff which only adds a factory parameter
|
260 |
# so that you can override the default SequenceMatcher
|
|
261 |
# this has been submitted as a patch to python
|
|
262 |
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', |
|
263 |
tofiledate='', n=3, lineterm='\n', |
|
264 |
sequencematcher=None): |
|
265 |
r""" |
|
266 |
Compare two sequences of lines; generate the delta as a unified diff.
|
|
267 |
||
268 |
Unified diffs are a compact way of showing line changes and a few
|
|
269 |
lines of context. The number of context lines is set by 'n' which
|
|
270 |
defaults to three.
|
|
271 |
||
272 |
By default, the diff control lines (those with ---, +++, or @@) are
|
|
273 |
created with a trailing newline. This is helpful so that inputs
|
|
274 |
created from file.readlines() result in diffs that are suitable for
|
|
275 |
file.writelines() since both the inputs and outputs have trailing
|
|
276 |
newlines.
|
|
277 |
||
278 |
For inputs that do not have trailing newlines, set the lineterm
|
|
279 |
argument to "" so that the output will be uniformly newline free.
|
|
280 |
||
281 |
The unidiff format normally has a header for filenames and modification
|
|
282 |
times. Any or all of these may be specified using strings for
|
|
283 |
'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. The modification
|
|
284 |
times are normally expressed in the format returned by time.ctime().
|
|
285 |
||
286 |
Example:
|
|
287 |
||
288 |
>>> for line in unified_diff('one two three four'.split(),
|
|
289 |
... 'zero one tree four'.split(), 'Original', 'Current',
|
|
290 |
... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
|
|
291 |
... lineterm=''):
|
|
292 |
... print line
|
|
293 |
--- Original Sat Jan 26 23:30:50 1991
|
|
294 |
+++ Current Fri Jun 06 10:20:52 2003
|
|
295 |
@@ -1,4 +1,4 @@
|
|
296 |
+zero
|
|
297 |
one
|
|
298 |
-two
|
|
299 |
-three
|
|
300 |
+tree
|
|
301 |
four
|
|
302 |
"""
|
|
303 |
if sequencematcher is None: |
|
304 |
sequencematcher = difflib.SequenceMatcher |
|
305 |
||
306 |
started = False |
|
307 |
for group in sequencematcher(None,a,b).get_grouped_opcodes(n): |
|
308 |
if not started: |
|
309 |
yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm) |
|
310 |
yield '+++ %s %s%s' % (tofile, tofiledate, lineterm) |
|
311 |
started = True |
|
312 |
i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4] |
|
313 |
yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm) |
|
314 |
for tag, i1, i2, j1, j2 in group: |
|
315 |
if tag == 'equal': |
|
316 |
for line in a[i1:i2]: |
|
317 |
yield ' ' + line |
|
318 |
continue
|
|
319 |
if tag == 'replace' or tag == 'delete': |
|
320 |
for line in a[i1:i2]: |
|
321 |
yield '-' + line |
|
322 |
if tag == 'replace' or tag == 'insert': |
|
323 |
for line in b[j1:j2]: |
|
324 |
yield '+' + line |
|
325 |
||
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
326 |
|
|
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
327 |
def unified_diff_files(a, b, sequencematcher=None): |
328 |
"""Generate the diff for two files. |
|
329 |
"""
|
|
330 |
# Should this actually be an error?
|
|
331 |
if a == b: |
|
332 |
return [] |
|
333 |
if a == '-': |
|
334 |
file_a = sys.stdin |
|
335 |
time_a = time.time() |
|
336 |
else: |
|
337 |
file_a = open(a, 'rb') |
|
338 |
time_a = os.stat(a).st_mtime |
|
339 |
||
340 |
if b == '-': |
|
341 |
file_b = sys.stdin |
|
342 |
time_b = time.time() |
|
343 |
else: |
|
344 |
file_b = open(b, 'rb') |
|
345 |
time_b = os.stat(b).st_mtime |
|
346 |
||
347 |
# TODO: Include fromfiledate and tofiledate
|
|
348 |
return unified_diff(file_a.readlines(), file_b.readlines(), |
|
349 |
fromfile=a, tofile=b, |
|
350 |
sequencematcher=sequencematcher) |
|
351 |
||
|
1185.81.29
by Aaron Bentley
Fix style issues and duplicated tests |
352 |
|
|
1185.81.14
by John Arbash Meinel
Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces |
353 |
def main(args): |
354 |
import optparse |
|
355 |
p = optparse.OptionParser(usage='%prog [options] file_a file_b' |
|
356 |
'\nFiles can be "-" to read from stdin') |
|
357 |
p.add_option('--cdv', dest='matcher', action='store_const', const='cdv', |
|
358 |
default='cdv', help='Use the cdv difference algorithm') |
|
359 |
p.add_option('--difflib', dest='matcher', action='store_const', const='difflib', |
|
360 |
default='cdv', help='Use python\'s difflib algorithm') |
|
361 |
||
362 |
algorithms = {'cdv':SequenceMatcher, 'difflib':difflib.SequenceMatcher} |
|
363 |
||
364 |
(opts, args) = p.parse_args(args) |
|
365 |
matcher = algorithms[opts.matcher] |
|
366 |
||
367 |
if len(args) != 2: |
|
368 |
print 'You must supply 2 filenames to diff' |
|
369 |
return -1 |
|
370 |
||
371 |
for line in unified_diff_files(args[0], args[1], sequencematcher=matcher): |
|
372 |
sys.stdout.write(line) |
|
373 |
||
374 |
if __name__ == '__main__': |
|
375 |
sys.exit(main(sys.argv[1:])) |