#!/usr/bin/env python
# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA


from bisect import bisect
import difflib

from bzrlib.trace import mutter


__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files']
def unique_lcs_py(a, b):
    """Find the longest common subset for unique lines.

    :param a: An indexable object (such as string or list of strings)
    :param b: Another indexable object (such as string or list of strings)
    :return: A list of tuples, one for each line which is matched.
            [(line_in_a, line_in_b), ...]

    This only matches lines which are unique on both sides.
    This helps prevent common lines from over influencing match
    results.
    The longest common subset uses the Patience Sorting algorithm:
    http://en.wikipedia.org/wiki/Patience_sorting
    """
    # set index[line in a] = position of line in a unless
    # a is a duplicate, in which case it's set to None
    index = {}
    for pos, line in enumerate(a):
        if line in index:
            # seen more than once: not unique in a, never a candidate
            index[line] = None
        else:
            index[line] = pos
    # make btoa[i] = position of line i in a, unless
    # that line doesn't occur exactly once in both,
    # in which case it's set to None
    btoa = [None] * len(b)
    index2 = {}
    for pos, line in enumerate(b):
        apos = index.get(line)
        if apos is not None:
            if line in index2:
                # unset the previous mapping, which we now know to
                # be invalid because the line isn't unique
                btoa[index2[line]] = None
                del index[line]
            else:
                index2[line] = pos
                btoa[pos] = apos
    # this is the Patience sorting algorithm
    # see http://en.wikipedia.org/wiki/Patience_sorting
    backpointers = [None] * len(b)
    stacks = []
    lasts = []
    k = 0
    for bpos, apos in enumerate(btoa):
        if apos is None:
            continue
        # as an optimization, check if the next line comes at the end,
        # because it usually does
        if stacks and stacks[-1] < apos:
            k = len(stacks)
        # as an optimization, check if the next line comes right after
        # the previous line, because usually it does
        elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
                                              stacks[k+1] > apos):
            k += 1
        else:
            k = bisect(stacks, apos)
        if k > 0:
            backpointers[bpos] = lasts[k-1]
        if k < len(stacks):
            stacks[k] = apos
            lasts[k] = bpos
        else:
            stacks.append(apos)
            lasts.append(bpos)
    if len(lasts) == 0:
        return []
    # walk the backpointer chain from the top of the tallest stack to
    # recover the longest increasing subsequence, then reverse it
    result = []
    k = lasts[-1]
    while k is not None:
        result.append((btoa[k], k))
        k = backpointers[k]
    result.reverse()
    return result
def recurse_matches_py(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
    """Find all of the matching text in the lines of a and b.

    :param a: A sequence
    :param b: Another sequence
    :param alo: The start location of a to check, typically 0
    :param blo: The start location of b to check, typically 0
    :param ahi: The maximum length of a to check, typically len(a)
    :param bhi: The maximum length of b to check, typically len(b)
    :param answer: The return array. Will be filled with tuples
                   indicating [(line_in_a, line_in_b)]
    :param maxrecursion: The maximum depth to recurse.
                         Must be a positive integer.
    :return: None, the return value is in the parameter answer, which
             should be a list
    """
    if maxrecursion < 0:
        mutter('max recursion depth reached')
        # this will never happen normally, this check is to prevent DOS
        # attacks
        return
    oldlength = len(answer)
    if alo == ahi or blo == bhi:
        return
    last_a_pos = alo-1
    last_b_pos = blo-1
    for apos, bpos in unique_lcs_py(a[alo:ahi], b[blo:bhi]):
        # recurse between lines which are unique in each file and match
        apos += alo
        bpos += blo
        # Most of the time, you will have a sequence of similar entries
        if last_a_pos+1 != apos or last_b_pos+1 != bpos:
            recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                               apos, bpos, answer, maxrecursion - 1)
        last_a_pos = apos
        last_b_pos = bpos
        answer.append((apos, bpos))
    if len(answer) > oldlength:
        # find matches between the last match and the end
        recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[alo] == b[blo]:
        # find matching lines at the very beginning
        while alo < ahi and blo < bhi and a[alo] == b[blo]:
            answer.append((alo, blo))
            alo += 1
            blo += 1
        recurse_matches_py(a, b, alo, blo,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[ahi - 1] == b[bhi - 1]:
        # find matching lines at the very end
        nahi = ahi - 1
        nbhi = bhi - 1
        while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
            nahi -= 1
            nbhi -= 1
        recurse_matches_py(a, b, last_a_pos+1, last_b_pos+1,
                           nahi, nbhi, answer, maxrecursion - 1)
        for i in range(ahi - nahi):
            answer.append((nahi + i, nbhi + i))
def _collapse_sequences(matches):  | 
169  | 
"""Find sequences of lines.  | 
|
170  | 
||
171  | 
    Given a sequence of [(line_in_a, line_in_b),]
 | 
|
172  | 
    find regions where they both increment at the same time
 | 
|
173  | 
    """
 | 
|
174  | 
answer = []  | 
|
175  | 
start_a = start_b = None  | 
|
176  | 
length = 0  | 
|
177  | 
for i_a, i_b in matches:  | 
|
178  | 
if (start_a is not None  | 
|
| 
3565.3.1
by Robert Collins
 * The generic fetch code now uses two attributes on Repository objects  | 
179  | 
and (i_a == start_a + length)  | 
| 
1711.2.21
by John Arbash Meinel
 Cleanup patiencediff, remove the use of difflib.SequenceMatcher.  | 
180  | 
and (i_b == start_b + length)):  | 
181  | 
length += 1  | 
|
182  | 
else:  | 
|
183  | 
if start_a is not None:  | 
|
184  | 
answer.append((start_a, start_b, length))  | 
|
185  | 
start_a = i_a  | 
|
186  | 
start_b = i_b  | 
|
187  | 
length = 1  | 
|
188  | 
||
189  | 
if length != 0:  | 
|
190  | 
answer.append((start_a, start_b, length))  | 
|
191  | 
||
192  | 
return answer  | 
|
193  | 
||
194  | 
||
195  | 
def _check_consistency(answer):  | 
|
196  | 
    # For consistency sake, make sure all matches are only increasing
 | 
|
197  | 
next_a = -1  | 
|
198  | 
next_b = -1  | 
|
| 
3376.2.8
by Martin Pool
 Some review cleanups for assertion removal  | 
199  | 
for (a, b, match_len) in answer:  | 
| 
3376.2.4
by Martin Pool
 Remove every assert statement from bzrlib!  | 
200  | 
if a < next_a:  | 
201  | 
raise ValueError('Non increasing matches for a')  | 
|
202  | 
if b < next_b:  | 
|
| 
3376.2.8
by Martin Pool
 Some review cleanups for assertion removal  | 
203  | 
raise ValueError('Non increasing matches for b')  | 
| 
1711.2.21
by John Arbash Meinel
 Cleanup patiencediff, remove the use of difflib.SequenceMatcher.  | 
204  | 
next_a = a + match_len  | 
205  | 
next_b = b + match_len  | 
|
206  | 
||
207  | 
||
| 
2781.1.1
by Martin Pool
 merge cpatiencediff from Lukas  | 
208  | 
class PatienceSequenceMatcher_py(difflib.SequenceMatcher):
    """Compare a pair of sequences using longest common subset."""

    # When True (and running with __debug__), verify the computed blocks
    # are monotonically increasing before returning them.
    _do_check_consistency = True

    def __init__(self, isjunk=None, a='', b=''):
        if isjunk is not None:
            raise NotImplementedError('Currently we do not support'
                                      ' isjunk for sequence matching')
        difflib.SequenceMatcher.__init__(self, isjunk, a, b)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """
        # jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
        # implementation which uses __helper. 2.4.3 got rid of helper for
        # doing it inline with a queue.
        # We should consider doing the same for recurse_matches

        # Serve the cached answer when one has already been computed.
        if self.matching_blocks is not None:
            return self.matching_blocks

        pairs = []
        recurse_matches_py(self.a, self.b, 0, 0,
                           len(self.a), len(self.b), pairs, 10)
        # pairs now holds individual (line_in_a, line_in_b) offsets;
        # collapse consecutive runs into (i, j, n) triples and add the
        # terminating dummy triple difflib callers expect.
        blocks = _collapse_sequences(pairs)
        blocks.append((len(self.a), len(self.b), 0))
        self.matching_blocks = blocks
        if __debug__ and PatienceSequenceMatcher_py._do_check_consistency:
            _check_consistency(blocks)

        return blocks