1
from sets import Set as set
3
from bisect import bisect
6
"""Find the longest common subset for unique lines.
8
:param a: An indexable object (such as string or list of strings)
9
:param b: Another indexable object (such as string or list of strings)
10
:return: A list of tuples, one for each line which is matched.
11
[(line_in_a, line_in_b), ...]
13
This only matches lines which are unique on both sides.
14
This helps prevent common lines from over influencing match
16
The longest common subset uses the Patience Sorting algorithm:
17
http://en.wikipedia.org/wiki/Patience_sorting
19
# set index[line in a] = position of line in a unless
20
# unless a is a duplicate, in which case it's set to None
22
for i in xrange(len(a)):
28
# make btoa[i] = position of line i in a, unless
29
# that line doesn't occur exactly once in both,
30
# in which case it's set to None
31
btoa = [None] * len(b)
33
for pos, line in enumerate(b):
34
next = index.get(line)
37
# unset the previous mapping, which we now know to
38
# be invalid because the line isn't unique
39
btoa[index2[line]] = None
44
# this is the Patience sorting algorithm
45
# see http://en.wikipedia.org/wiki/Patience_sorting
46
backpointers = [None] * len(b)
50
for bpos, apos in enumerate(btoa):
53
# as an optimization, check if the next line comes at the end,
54
# because it usually does
55
if stacks and stacks[-1] < apos:
57
# as an optimization, check if the next line comes right after
58
# the previous line, because usually it does
59
elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or stacks[k+1] > apos):
62
k = bisect(stacks, apos)
64
backpointers[bpos] = lasts[k-1]
76
result.append((btoa[k], k))
81
assert unique_lcs('', '') == []
82
assert unique_lcs('a', 'a') == [(0, 0)]
83
assert unique_lcs('a', 'b') == []
84
assert unique_lcs('ab', 'ab') == [(0, 0), (1, 1)]
85
assert unique_lcs('abcde', 'cdeab') == [(2, 0), (3, 1), (4, 2)]
86
assert unique_lcs('cdeab', 'abcde') == [(0, 2), (1, 3), (2, 4)]
87
assert unique_lcs('abXde', 'abYde') == [(0, 0), (1, 1), (3, 3), (4, 4)]
88
assert unique_lcs('acbac', 'abc') == [(2, 1)]
90
def recurse_matches(a, b, ahi, bhi, answer, maxrecursion):
91
"""Find all of the matching text in the lines of a and b.
94
:param b: Another sequence
95
:param ahi: The maximum length of a to check, typically len(a)
96
:param bhi: The maximum length of b to check, typically len(b)
97
:param answer: The return array. Will be filled with tuples
98
indicating [(line_in_a), (line_in_b)]
99
:param maxrecursion: The maximum depth to recurse.
100
Must be a positive integer.
101
:return: None, the return value is in the parameter answer, which
107
# this will never happen normally, this check is to prevent DOS attacks
109
oldlength = len(answer)
113
alo, blo = answer[-1]
116
if alo == ahi or blo == bhi:
118
for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):
119
# recurse between lines which are unique in each file and match
122
recurse_matches(a, b, apos, bpos, answer, maxrecursion - 1)
123
answer.append((apos, bpos))
124
if len(answer) > oldlength:
125
# find matches between the last match and the end
126
recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)
127
elif a[alo] == b[blo]:
128
# find matching lines at the very beginning
129
while alo < ahi and blo < bhi and a[alo] == b[blo]:
130
answer.append((alo, blo))
133
recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)
134
elif a[ahi - 1] == b[bhi - 1]:
135
# find matching lines at the very end
138
while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
141
recurse_matches(a, b, nahi, nbhi, answer, maxrecursion - 1)
142
for i in xrange(ahi - nahi):
143
answer.append((nahi + i, nbhi + i))
146
recurse_matches(['a', None, 'b', None, 'c'], ['a', 'a', 'b', 'c', 'c'], 5, 5, a1, 10)
147
assert a1 == [(0, 0), (2, 2), (4, 4)]
149
recurse_matches(['a', 'c', 'b', 'a', 'c'], ['a', 'b', 'c'], 5, 3, a2, 10)
150
assert a2 == [(0, 0), (2, 1), (4, 2)]
153
recurse_matches(['a', 'B', 'c', 'c', 'D', 'e'], ['a', 'b', 'c', 'c', 'd', 'e'], 6, 6, a3, 10)
154
# FIXME: recurse_matches won't match non-unique lines, surrounded by bogus text
155
# This is what it should be
156
#assert a2 == [(0,0), (2,2), (3,3), (5,5)]
157
# This is what it currently gives:
158
assert a3 == [(0,0), (5,5)]