# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
from __future__ import absolute_import

import difflib
from bisect import bisect

from .trace import mutter
def unique_lcs_py(a, b):
    """Find the longest common subset for unique lines.

    :param a: An indexable object (such as string or list of strings)
    :param b: Another indexable object (such as string or list of strings)
    :return: A list of tuples, one for each line which is matched.
            [(line_in_a, line_in_b), ...]

    This only matches lines which are unique on both sides.
    This helps prevent common lines from over influencing match
    results.
    The longest common subset uses the Patience Sorting algorithm:
    http://en.wikipedia.org/wiki/Patience_sorting
    """
    # set index[line in a] = position of line in a unless
    # a is a duplicate, in which case it's set to None
    index = {}
    for i, line in enumerate(a):
        if line in index:
            index[line] = None
        else:
            index[line] = i
    # make btoa[i] = position of line i in a, unless
    # that line doesn't occur exactly once in both,
    # in which case it's set to None
    btoa = [None] * len(b)
    index2 = {}
    for pos, line in enumerate(b):
        next_pos = index.get(line)
        if next_pos is not None:
            if line in index2:
                # unset the previous mapping, which we now know to
                # be invalid because the line isn't unique
                btoa[index2[line]] = None
                del index[line]
            else:
                index2[line] = pos
                btoa[pos] = next_pos
    # this is the Patience sorting algorithm
    # see http://en.wikipedia.org/wiki/Patience_sorting
    backpointers = [None] * len(b)
    stacks = []
    lasts = []
    k = 0
    for bpos, apos in enumerate(btoa):
        if apos is None:
            continue
        # as an optimization, check if the next line comes at the end,
        # because it usually does
        if stacks and stacks[-1] < apos:
            k = len(stacks)
        # as an optimization, check if the next line comes right after
        # the previous line, because usually it does
        elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
                                              stacks[k + 1] > apos):
            k += 1
        else:
            k = bisect(stacks, apos)
        if k > 0:
            backpointers[bpos] = lasts[k - 1]
        if k < len(stacks):
            stacks[k] = apos
            lasts[k] = bpos
        else:
            stacks.append(apos)
            lasts.append(bpos)
    if len(lasts) == 0:
        # no unique matching lines at all
        return []
    # walk the backpointers from the top of the longest stack to
    # recover the longest increasing subsequence, then reverse it
    result = []
    k = lasts[-1]
    while k is not None:
        result.append((btoa[k], k))
        k = backpointers[k]
    result.reverse()
    return result
def recurse_matches_py(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
    """Find all of the matching text in the lines of a and b.

    :param a: A sequence
    :param b: Another sequence
    :param alo: The start location of a to check, typically 0
    :param blo: The start location of b to check, typically 0
    :param ahi: The maximum length of a to check, typically len(a)
    :param bhi: The maximum length of b to check, typically len(b)
    :param answer: The return array. Will be filled with tuples
                   indicating [(line_in_a, line_in_b)]
    :param maxrecursion: The maximum depth to recurse.
                         Must be a positive integer.
    :return: None, the return value is in the parameter answer, which
             should be a list
    """
    if maxrecursion < 0:
        mutter('max recursion depth reached')
        # this will never happen normally, this check is to prevent DOS attacks
        return
    oldlength = len(answer)
    if alo == ahi or blo == bhi:
        # one of the ranges is empty: nothing can match
        return
    last_a_pos = alo - 1
    last_b_pos = blo - 1
    for apos, bpos in unique_lcs_py(a[alo:ahi], b[blo:bhi]):
        # recurse between lines which are unique in each file and match
        apos += alo
        bpos += blo
        # Most of the time, you will have a sequence of similar entries
        if last_a_pos + 1 != apos or last_b_pos + 1 != bpos:
            recurse_matches_py(
                a, b, last_a_pos + 1, last_b_pos + 1, apos, bpos, answer,
                maxrecursion - 1)
        last_a_pos = apos
        last_b_pos = bpos
        answer.append((apos, bpos))
    if len(answer) > oldlength:
        # find matches between the last match and the end
        recurse_matches_py(a, b, last_a_pos + 1, last_b_pos + 1,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[alo] == b[blo]:
        # find matching lines at the very beginning
        while alo < ahi and blo < bhi and a[alo] == b[blo]:
            answer.append((alo, blo))
            alo += 1
            blo += 1
        recurse_matches_py(a, b, alo, blo,
                           ahi, bhi, answer, maxrecursion - 1)
    elif a[ahi - 1] == b[bhi - 1]:
        # find matching lines at the very end
        nahi = ahi - 1
        nbhi = bhi - 1
        while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
            nahi -= 1
            nbhi -= 1
        recurse_matches_py(a, b, last_a_pos + 1, last_b_pos + 1,
                           nahi, nbhi, answer, maxrecursion - 1)
        for i in range(ahi - nahi):
            answer.append((nahi + i, nbhi + i))
def _collapse_sequences(matches):
167
"""Find sequences of lines.
169
Given a sequence of [(line_in_a, line_in_b),]
170
find regions where they both increment at the same time
173
start_a = start_b = None
175
for i_a, i_b in matches:
176
if (start_a is not None
177
and (i_a == start_a + length)
178
and (i_b == start_b + length)):
181
if start_a is not None:
182
answer.append((start_a, start_b, length))
188
answer.append((start_a, start_b, length))
193
def _check_consistency(answer):
194
# For consistency sake, make sure all matches are only increasing
197
for (a, b, match_len) in answer:
199
raise ValueError('Non increasing matches for a')
201
raise ValueError('Non increasing matches for b')
202
next_a = a + match_len
203
next_b = b + match_len
206
class PatienceSequenceMatcher_py(difflib.SequenceMatcher):
    """Compare a pair of sequences using longest common subset."""

    # When True, every computed block list is validated with
    # _check_consistency; can be disabled for speed.
    _do_check_consistency = True

    def __init__(self, isjunk=None, a='', b=''):
        if isjunk is not None:
            raise NotImplementedError('Currently we do not support'
                                      ' isjunk for sequence matching')
        difflib.SequenceMatcher.__init__(self, isjunk, a, b)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """
        # jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
        # implementation which uses __helper. 2.4.3 got rid of helper for
        # doing it inline with a queue.
        # We should consider doing the same for recurse_matches

        if self.matching_blocks is not None:
            return self.matching_blocks

        matches = []
        recurse_matches_py(self.a, self.b, 0, 0,
                           len(self.a), len(self.b), matches, 10)
        # Matches now has individual line pairs of
        # line A matches line B, at the given offsets
        self.matching_blocks = _collapse_sequences(matches)
        self.matching_blocks.append((len(self.a), len(self.b), 0))
        if PatienceSequenceMatcher_py._do_check_consistency:
            _check_consistency(self.matching_blocks)

        return self.matching_blocks