/brz/remove-bazaar : contents of bzrlib/patiencediff.py at revision 2279.6.1

: (revision 2279.6.1)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	1	#!/usr/bin/env python
1185.81.24 by Aaron Bentley Reoganize patience-related code	2	# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
1185.81.24 by Aaron Bentley Reoganize patience-related code	3	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 2 of the License, or
	7	# (at your option) any later version.
1185.81.24 by Aaron Bentley Reoganize patience-related code	8	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
1185.81.24 by Aaron Bentley Reoganize patience-related code	13	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	14	# You should have received a copy of the GNU General Public License
	15	# along with this program; if not, write to the Free Software
	16	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	17
	18
1185.81.24 by Aaron Bentley Reoganize patience-related code	19	from bisect import bisect
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	20	import difflib
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	21	import os
	22	import sys
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	23	import time
	24
1711.2.12 by John Arbash Meinel Make a mention when the maximum recursion length is reached.	25	from bzrlib.trace import mutter
	26
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	27
1711.2.11 by John Arbash Meinel Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher	28	__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files']
1185.81.9 by John Arbash Meinel Added (failing) tests for cdv.recurse_matches with common sections,	29
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	30
def unique_lcs(a, b):
    """Find the longest common subset for unique lines.

    :param a: An indexable object (such as string or list of strings)
    :param b: Another indexable object (such as string or list of strings)
    :return: A list of tuples, one for each line which is matched.
            [(line_in_a, line_in_b), ...]

    This only matches lines which are unique on both sides.
    This helps prevent common lines from over influencing match
    results.
    The longest common subset uses the Patience Sorting algorithm:
    http://en.wikipedia.org/wiki/Patience_sorting
    """
    # index[line] = position of line in a, unless the line occurs more
    # than once in a, in which case it is set to None
    index = {}
    for i, line in enumerate(a):
        if line in index:
            index[line] = None
        else:
            index[line] = i
    # btoa[pos] = position in a of the line at b[pos], unless that line
    # doesn't occur exactly once in both, in which case it stays None
    btoa = [None] * len(b)
    index2 = {}
    for pos, line in enumerate(b):
        a_position = index.get(line)
        if a_position is not None:
            if line in index2:
                # unset the previous mapping, which we now know to
                # be invalid because the line isn't unique
                btoa[index2[line]] = None
                # dropping the entry makes any further occurrence in b
                # look up as None and be skipped
                del index[line]
            else:
                index2[line] = pos
                btoa[pos] = a_position
    # this is the Patience sorting algorithm
    # see http://en.wikipedia.org/wiki/Patience_sorting
    #   stacks[k] = a-position currently on top of pile k (kept sorted)
    #   lasts[k]  = b-position of the entry on top of pile k
    #   backpointers[bpos] = b-position of the top of the previous pile at
    #                        the time bpos was placed
    backpointers = [None] * len(b)
    stacks = []
    lasts = []
    k = 0
    for bpos, apos in enumerate(btoa):
        if apos is None:
            continue
        # as an optimization, check if the next line comes at the end,
        # because it usually does
        if stacks and stacks[-1] < apos:
            k = len(stacks)
        # as an optimization, check if the next line comes right after
        # the previous line, because usually it does
        elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
                                              stacks[k + 1] > apos):
            k += 1
        else:
            k = bisect(stacks, apos)
        if k > 0:
            backpointers[bpos] = lasts[k - 1]
        if k < len(stacks):
            stacks[k] = apos
            lasts[k] = bpos
        else:
            stacks.append(apos)
            lasts.append(bpos)
    if not lasts:
        return []
    # walk the backpointer chain from the top of the last pile, then
    # reverse it so the pairs come out in increasing order
    result = []
    k = lasts[-1]
    while k is not None:
        result.append((btoa[k], k))
        k = backpointers[k]
    result.reverse()
    return result
	107
	108
def recurse_matches(a, b, alo, blo, ahi, bhi, answer, maxrecursion):
    """Find all of the matching text in the lines of a and b.

    :param a: A sequence
    :param b: Another sequence
    :param alo: The start location of a to check, typically 0
    :param blo: The start location of b to check, typically 0
    :param ahi: The maximum length of a to check, typically len(a)
    :param bhi: The maximum length of b to check, typically len(b)
    :param answer: The return array. Will be filled with tuples
                   indicating [(line_in_a, line_in_b)]
    :param maxrecursion: The maximum depth to recurse.
                         Every nested call is passed maxrecursion - 1;
                         once the value is negative the call logs a
                         message and returns without adding matches.
    :return: None, the return value is in the parameter answer, which
             should be a list

    """
    if maxrecursion < 0:
        mutter('max recursion depth reached')
        # this will never happen normally, this check is to prevent DOS attacks
        return
    oldlength = len(answer)
    if alo == ahi or blo == bhi:
        return
    last_a_pos = alo - 1
    last_b_pos = blo - 1
    for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):
        # recurse between lines which are unique in each file and match
        # (unique_lcs saw slices, so shift back to absolute positions)
        apos += alo
        bpos += blo
        # Most of the time, you will have a sequence of similar entries;
        # only recurse when there is a gap since the previous match
        if last_a_pos + 1 != apos or last_b_pos + 1 != bpos:
            recurse_matches(a, b, last_a_pos + 1, last_b_pos + 1,
                            apos, bpos, answer, maxrecursion - 1)
        last_a_pos = apos
        last_b_pos = bpos
        answer.append((apos, bpos))
    if len(answer) > oldlength:
        # find matches between the last match and the end
        recurse_matches(a, b, last_a_pos + 1, last_b_pos + 1,
                        ahi, bhi, answer, maxrecursion - 1)
    elif a[alo] == b[blo]:
        # find matching lines at the very beginning
        while alo < ahi and blo < bhi and a[alo] == b[blo]:
            answer.append((alo, blo))
            alo += 1
            blo += 1
        recurse_matches(a, b, alo, blo,
                        ahi, bhi, answer, maxrecursion - 1)
    elif a[ahi - 1] == b[bhi - 1]:
        # find matching lines at the very end
        nahi = ahi - 1
        nbhi = bhi - 1
        while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
            nahi -= 1
            nbhi -= 1
        # the region before the common tail is handled first so that
        # answer stays in increasing order
        recurse_matches(a, b, last_a_pos + 1, last_b_pos + 1,
                        nahi, nbhi, answer, maxrecursion - 1)
        answer.extend((nahi + i, nbhi + i) for i in range(ahi - nahi))
	169
	170
1711.2.21 by John Arbash Meinel Cleanup patiencediff, remove the use of difflib.SequenceMatcher.	171	def _collapse_sequences(matches):
	172	"""Find sequences of lines.
	173
	174	Given a sequence of [(line_in_a, line_in_b),]
	175	find regions where they both increment at the same time
	176	"""
	177	answer = []
	178	start_a = start_b = None
	179	length = 0
	180	for i_a, i_b in matches:
	181	if (start_a is not None
	182	and (i_a == start_a + length)
	183	and (i_b == start_b + length)):
	184	length += 1
	185	else:
	186	if start_a is not None:
	187	answer.append((start_a, start_b, length))
	188	start_a = i_a
	189	start_b = i_b
	190	length = 1
	191
	192	if length != 0:
	193	answer.append((start_a, start_b, length))
	194
	195	return answer
	196
	197
	198	def _check_consistency(answer):
	199	# For consistency sake, make sure all matches are only increasing
	200	next_a = -1
	201	next_b = -1
	202	for a,b,match_len in answer:
	203	assert a >= next_a, 'Non increasing matches for a'
	204	assert b >= next_b, 'Not increasing matches for b'
	205	next_a = a + match_len
	206	next_b = b + match_len
	207
	208
class PatienceSequenceMatcher(difflib.SequenceMatcher):
    """Compare a pair of sequences using longest common subset."""

    # When true (and asserts are enabled) freshly computed matching
    # blocks are passed through _check_consistency.
    _do_check_consistency = True

    def __init__(self, isjunk=None, a='', b=''):
        """Create a matcher for sequences a and b.

        :param isjunk: Must be None; junk filtering is not implemented.
        :param a: The first sequence.
        :param b: The second sequence.
        :raises NotImplementedError: if isjunk is not None.
        """
        if isjunk is not None:
            raise NotImplementedError('Currently we do not support'
                                      ' isjunk for sequence matching')
        difflib.SequenceMatcher.__init__(self, isjunk, a, b)

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        The list is stored on self.matching_blocks and returned as-is by
        later calls.

        >>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """
        # jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
        # implementation which uses __helper. 2.4.3 got rid of helper for
        # doing it inline with a queue.
        # We should consider doing the same for recurse_matches

        cached = self.matching_blocks
        if cached is not None:
            return cached

        # line_pairs collects individual (line_in_a, line_in_b) matches
        line_pairs = []
        recurse_matches(self.a, self.b, 0, 0,
                        len(self.a), len(self.b), line_pairs, 10)
        blocks = _collapse_sequences(line_pairs)
        self.matching_blocks = blocks
        # terminating sentinel block
        blocks.append((len(self.a), len(self.b), 0))
        if PatienceSequenceMatcher._do_check_consistency:
            if __debug__:
                _check_consistency(blocks)

        return blocks
1185.81.16 by John Arbash Meinel Added tests, and an assert check to make sure ranges are always increasing.	254
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	255
# This is difflib's unified_diff with one extra parameter, a factory that
# lets the caller replace the default SequenceMatcher.
# The change has been submitted as a patch to python.
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n',
                 sequencematcher=None):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The modification
    times are normally expressed in the format returned by time.ctime().

    'sequencematcher' is a callable invoked as sequencematcher(None, a, b);
    the object it returns must provide get_grouped_opcodes(n).  When it is
    None, difflib.SequenceMatcher is used.

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
    ...             lineterm=''):
    ...     print(line)
    --- Original Sat Jan 26 23:30:50 1991
    +++ Current Fri Jun 06 10:20:52 2003
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """
    matcher_factory = sequencematcher
    if matcher_factory is None:
        matcher_factory = difflib.SequenceMatcher

    header_pending = True
    for hunk in matcher_factory(None, a, b).get_grouped_opcodes(n):
        # the file header is emitted once, just before the first hunk
        if header_pending:
            yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
            header_pending = False
        first_op = hunk[0]
        last_op = hunk[-1]
        a_start, a_end = first_op[1], last_op[2]
        b_start, b_end = first_op[3], last_op[4]
        yield "@@ -%d,%d +%d,%d @@%s" % (a_start + 1, a_end - a_start,
                                         b_start + 1, b_end - b_start,
                                         lineterm)
        for tag, i1, i2, j1, j2 in hunk:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
            else:
                # 'replace' emits both the removed and the added lines
                if tag in ('replace', 'delete'):
                    for line in a[i1:i2]:
                        yield '-' + line
                if tag in ('replace', 'insert'):
                    for line in b[j1:j2]:
                        yield '+' + line
322
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	323
def _read_lines(path):
    """Return the lines of the file at path, or of stdin when path is '-'."""
    if path == '-':
        return sys.stdin.readlines()
    source = open(path, 'rb')
    try:
        return source.readlines()
    finally:
        # close the handle even when reading fails; stdin is never closed
        source.close()


def unified_diff_files(a, b, sequencematcher=None):
    """Generate the diff for two files.

    :param a: Path of the original file, or '-' to read it from stdin.
    :param b: Path of the modified file, or '-' to read it from stdin.
    :param sequencematcher: Passed through to unified_diff unchanged.
    :return: An empty list when a and b are the same name, otherwise the
             generator returned by unified_diff.  Both inputs are read
             in full, and any file opened here is closed, before the
             generator is returned.
    """
    # Should this actually be an error?
    if a == b:
        return []
    lines_a = _read_lines(a)
    lines_b = _read_lines(b)

    # TODO: Include fromfiledate and tofiledate
    return unified_diff(lines_a, lines_b,
                        fromfile=a, tofile=b,
                        sequencematcher=sequencematcher)
	348
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	349
def main(args):
    """Diff two files named on the command line and write the result to stdout.

    :param args: The command-line arguments, without the program name.
    :return: -1 when the number of filenames is not exactly two,
             otherwise None.
    """
    import optparse

    # maps the value stored by the command-line switches to a matcher class
    matchers = {'patience': PatienceSequenceMatcher,
                'difflib': difflib.SequenceMatcher}

    parser = optparse.OptionParser(
        usage='%prog [options] file_a file_b'
              '\nFiles can be "-" to read from stdin')
    parser.add_option('--patience', dest='matcher', action='store_const',
                      const='patience', default='patience',
                      help='Use the patience difference algorithm')
    parser.add_option('--difflib', dest='matcher', action='store_const',
                      const='difflib', default='patience',
                      help="Use python's difflib algorithm")

    (opts, args) = parser.parse_args(args)
    matcher = matchers[opts.matcher]

    if len(args) != 2:
        sys.stdout.write('You must supply 2 filenames to diff\n')
        return -1

    sys.stdout.writelines(
        unified_diff_files(args[0], args[1], sequencematcher=matcher))
	370
	371	if __name__ == '__main__':
	372	sys.exit(main(sys.argv[1:]))