/brz/remove-bazaar : contents of bzrlib/patiencediff.py at revision 1711.2.14

: (revision 1711.2.14)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	1	#!/usr/bin/env python
1185.81.24 by Aaron Bentley Reoganize patience-related code	2	# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
1185.81.24 by Aaron Bentley Reoganize patience-related code	3	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 2 of the License, or
	7	# (at your option) any later version.
1185.81.24 by Aaron Bentley Reoganize patience-related code	8	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
1185.81.24 by Aaron Bentley Reoganize patience-related code	13	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	14	# You should have received a copy of the GNU General Public License
	15	# along with this program; if not, write to the Free Software
	16	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	17
	18
1185.81.24 by Aaron Bentley Reoganize patience-related code	19	from bisect import bisect
1185.81.24 by Aaron Bentley Reoganize patience-related code	20	from copy import copy
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	21	import difflib
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	22	import os
	23	import sys
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	24	import time
	25
1711.2.12 by John Arbash Meinel Make a mention when the maximum recursion length is reached.	26	from bzrlib.trace import mutter
	27
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	28
1711.2.11 by John Arbash Meinel Rename patiencediff.SequenceMatcher => PatienceSequenceMatcher and knit.SequenceMatcher => KnitSequenceMatcher	29	__all__ = ['PatienceSequenceMatcher', 'unified_diff', 'unified_diff_files']
1185.81.9 by John Arbash Meinel Added (failing) tests for cdv.recurse_matches with common sections,	30
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	31
def unique_lcs(a, b):
    """Find the longest common subsequence of lines unique to both inputs.

    :param a: An indexable, iterable sequence of hashable items (such as a
        string or a list of strings)
    :param b: Another such sequence
    :return: A list of tuples, one for each line which is matched.
            [(line_in_a, line_in_b), ...]

    This only matches lines which are unique on both sides.
    This helps prevent common lines from over influencing match
    results.
    The longest common subsequence is found with the Patience Sorting
    algorithm:
    http://en.wikipedia.org/wiki/Patience_sorting
    """
    # index[line] = position of line in a, unless the line occurs more
    # than once in a, in which case it is set to None
    index = {}
    for i, line in enumerate(a):
        if line in index:
            index[line] = None
        else:
            index[line] = i
    # btoa[j] = position in a of the line b[j], unless that line doesn't
    # occur exactly once in both, in which case it stays/becomes None
    btoa = [None] * len(b)
    # index2[line] = position in b where a candidate line was first seen
    index2 = {}
    for pos, line in enumerate(b):
        a_pos = index.get(line)
        if a_pos is not None:
            if line in index2:
                # unset the previous mapping, which we now know to
                # be invalid because the line isn't unique in b
                btoa[index2[line]] = None
                # forget the line so later repeats in b are skipped outright
                del index[line]
            else:
                index2[line] = pos
                btoa[pos] = a_pos
    # this is the Patience sorting algorithm
    # see http://en.wikipedia.org/wiki/Patience_sorting
    # stacks[k] is the a-position currently on top of pile k, lasts[k] the
    # b-position that put it there, and backpointers[bpos] the b-position
    # that was on top of the previous pile when bpos was placed.
    backpointers = [None] * len(b)
    stacks = []
    lasts = []
    k = 0
    for bpos, apos in enumerate(btoa):
        if apos is None:
            continue
        # as an optimization, check if the next line comes at the end,
        # because it usually does
        if stacks and stacks[-1] < apos:
            k = len(stacks)
        # as an optimization, check if the next line comes right after
        # the previous line, because usually it does
        elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
                                              stacks[k + 1] > apos):
            k += 1
        else:
            k = bisect(stacks, apos)
        if k > 0:
            backpointers[bpos] = lasts[k - 1]
        if k < len(stacks):
            stacks[k] = apos
            lasts[k] = bpos
        else:
            stacks.append(apos)
            lasts.append(bpos)
    if not lasts:
        return []
    # walk the backpointer chain from the top of the last pile, then
    # reverse so the matches come out in increasing order
    result = []
    k = lasts[-1]
    while k is not None:
        result.append((btoa[k], k))
        k = backpointers[k]
    result.reverse()
    return result
	108
	109
	110	def recurse_matches(a, b, ahi, bhi, answer, maxrecursion):
	111	"""Find all of the matching text in the lines of a and b.
	112
	113	:param a: A sequence
	114	:param b: Another sequence
	115	:param ahi: The maximum length of a to check, typically len(a)
	116	:param bhi: The maximum length of b to check, typically len(b)
	117	:param answer: The return array. Will be filled with tuples
	118	indicating [(line_in_a), (line_in_b)]
	119	:param maxrecursion: The maximum depth to recurse.
	120	Must be a positive integer.
	121	:return: None, the return value is in the parameter answer, which
	122	should be a list
	123
	124	"""
	125	oldlen = len(answer)
	126	if maxrecursion < 0:
1711.2.12 by John Arbash Meinel Make a mention when the maximum recursion length is reached.	127	mutter('max recursion depth reached')
1185.81.24 by Aaron Bentley Reoganize patience-related code	128	# this will never happen normally, this check is to prevent DOS attacks
	129	return
	130	oldlength = len(answer)
	131	if len(answer) == 0:
	132	alo, blo = 0, 0
	133	else:
	134	alo, blo = answer[-1]
	135	alo += 1
	136	blo += 1
	137	if alo == ahi or blo == bhi:
	138	return
	139	for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):
	140	# recurse between lines which are unique in each file and match
	141	apos += alo
	142	bpos += blo
	143	recurse_matches(a, b, apos, bpos, answer, maxrecursion - 1)
	144	answer.append((apos, bpos))
	145	if len(answer) > oldlength:
	146	# find matches between the last match and the end
	147	recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)
	148	elif a[alo] == b[blo]:
	149	# find matching lines at the very beginning
	150	while alo < ahi and blo < bhi and a[alo] == b[blo]:
	151	answer.append((alo, blo))
	152	alo += 1
	153	blo += 1
	154	recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)
	155	elif a[ahi - 1] == b[bhi - 1]:
	156	# find matching lines at the very end
	157	nahi = ahi - 1
	158	nbhi = bhi - 1
	159	while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
	160	nahi -= 1
	161	nbhi -= 1
	162	recurse_matches(a, b, nahi, nbhi, answer, maxrecursion - 1)
	163	for i in xrange(ahi - nahi):
	164	answer.append((nahi + i, nbhi + i))
	165
	166
class PatienceSequenceMatcher(difflib.SequenceMatcher):
    """difflib.SequenceMatcher variant whose blocks come from recurse_matches."""

    def __init__(self, isjunk=None, a='', b=''):
        """Create the matcher; a non-None ``isjunk`` is rejected.

        :raises NotImplementedError: if ``isjunk`` is not None
        """
        if isjunk is not None:
            raise NotImplementedError(
                'Currently we do not support isjunk for sequence matching')
        difflib.SequenceMatcher.__init__(self, isjunk, a, b)

    def _check_with_diff(self, alo, ahi, blo, bhi, answer):
        """Run plain difflib over an unmatched gap, if the gap is big enough.

        :param alo: The last line that actually matched
        :param ahi: The next line that actually matches
        :param blo: Same as alo, only for the 'b' set
        :param bhi: Same as ahi
        :param answer: A list which receives the (a, b, length) blocks
            difflib finds inside the gap
        :return: None
        """
        # WORKAROUND
        # recurse_matches, by design, does not match non-unique lines
        # unless they touch matching unique lines, so the regular diff
        # algorithm is re-run on any gap that is large enough.

        # recurse_matches already looked at the direct neighbors, so
        # there is nothing to do unless both gaps leave room beyond them.
        if ahi - alo <= 2 or bhi - blo <= 2:
            return
        a_offset = alo + 1
        b_offset = blo + 1
        fallback = difflib.SequenceMatcher(None,
                                           self.a[a_offset:ahi - 1],
                                           self.b[b_offset:bhi - 1])
        # difflib always adds a final zero-length match; leave it out
        found = fallback.get_matching_blocks()[:-1]
        # shift the gap-relative positions back to whole-sequence positions
        answer.extend([(blk[0] + a_offset, blk[1] + b_offset, blk[2])
                       for blk in found])

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = PatienceSequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """
        # jam 20060525 This is the python 2.4.1 difflib get_matching_blocks
        # implementation which uses __helper. 2.4.3 got rid of helper for
        # doing it inline with a queue.
        # We should consider doing the same for recurse_matches

        cached = self.matching_blocks
        if cached is not None:
            return cached
        blocks = []
        # store the list before filling it, it is both cache and result
        self.matching_blocks = blocks
        len_a = len(self.a)
        len_b = len(self.b)
        self.__helper(0, len_a, 0, len_b, blocks)
        blocks.append((len_a, len_b, 0))
        return blocks

    def __helper(self, alo, ahi, blo, bhi, answer):
        """Append the matching blocks of a[alo:ahi] / b[blo:bhi] to answer."""
        pairs = []
        sub_a = self.a[alo:ahi]
        sub_b = self.b[blo:bhi]
        recurse_matches(sub_a, sub_b, len(sub_a), len(sub_b), pairs, 10)
        # pairs now holds individual (line in a, line in b) matches,
        # relative to the start of the two slices

        if not pairs:
            # Nothing matched, so we need to send the complete text
            self._check_with_diff(alo - 1, ahi + 1, blo - 1, bhi + 1, answer)
        else:
            # Merge consecutive pairs into (start_a, start_b, length) runs
            run_a = run_b = None
            run_len = 0
            for pos_a, pos_b in pairs:
                if (run_a is not None
                        and pos_a == run_a + run_len
                        and pos_b == run_b + run_len):
                    # this pair extends the current run
                    run_len += 1
                    continue
                # New block
                if run_a is None:
                    # We need to check from 0,0 until the current match
                    self._check_with_diff(alo - 1, pos_a + alo,
                                          blo - 1, pos_b + blo,
                                          answer)
                else:
                    answer.append((run_a + alo, run_b + blo, run_len))
                    self._check_with_diff(run_a + alo + run_len, pos_a + alo,
                                          run_b + blo + run_len, pos_b + blo,
                                          answer)
                run_a, run_b, run_len = pos_a, pos_b, 1

            # flush the last run, then look at what follows it
            answer.append((run_a + alo, run_b + blo, run_len))
            self._check_with_diff(run_a + alo + run_len, ahi + 1,
                                  run_b + blo + run_len, bhi + 1,
                                  answer)

        # For consistency sake, make sure all matches are only increasing
        if __debug__:
            end_a = end_b = -1
            for blk_a, blk_b, blk_len in answer:
                assert blk_a >= end_a, 'Non increasing matches for a'
                assert blk_b >= end_b, 'Not increasing matches for b'
                end_a = blk_a + blk_len
                end_b = blk_b + blk_len
	287
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	288
1185.81.8 by John Arbash Meinel Updating unified_diff to take a factory, using the new diff algorithm in the code.	289	# This is a version of unified_diff which only adds a factory parameter
	290	# so that you can override the default SequenceMatcher
	291	# this has been submitted as a patch to python
	292	def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
	293	tofiledate='', n=3, lineterm='\n',
	294	sequencematcher=None):
	295	r"""
	296	Compare two sequences of lines; generate the delta as a unified diff.
	297
	298	Unified diffs are a compact way of showing line changes and a few
	299	lines of context. The number of context lines is set by 'n' which
	300	defaults to three.
	301
	302	By default, the diff control lines (those with ---, +++, or @@) are
	303	created with a trailing newline. This is helpful so that inputs
	304	created from file.readlines() result in diffs that are suitable for
	305	file.writelines() since both the inputs and outputs have trailing
	306	newlines.
	307
	308	For inputs that do not have trailing newlines, set the lineterm
	309	argument to "" so that the output will be uniformly newline free.
	310
	311	The unidiff format normally has a header for filenames and modification
	312	times. Any or all of these may be specified using strings for
	313	'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. The modification
	314	times are normally expressed in the format returned by time.ctime().
	315
	316	Example:
	317
	318	>>> for line in unified_diff('one two three four'.split(),
	319	... 'zero one tree four'.split(), 'Original', 'Current',
	320	... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
	321	... lineterm=''):
	322	... print line
	323	--- Original Sat Jan 26 23:30:50 1991
	324	+++ Current Fri Jun 06 10:20:52 2003
	325	@@ -1,4 +1,4 @@
	326	+zero
	327	one
	328	-two
	329	-three
	330	+tree
	331	four
	332	"""
	333	if sequencematcher is None:
	334	sequencematcher = difflib.SequenceMatcher
	335
	336	started = False
	337	for group in sequencematcher(None,a,b).get_grouped_opcodes(n):
	338	if not started:
	339	yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
	340	yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
	341	started = True
	342	i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
	343	yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)
	344	for tag, i1, i2, j1, j2 in group:
	345	if tag == 'equal':
	346	for line in a[i1:i2]:
	347	yield ' ' + line
	348	continue
	349	if tag == 'replace' or tag == 'delete':
	350	for line in a[i1:i2]:
	351	yield '-' + line
	352	if tag == 'replace' or tag == 'insert':
353	for line in b[j1:j2]:
354	yield '+' + line
355
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	356
def _read_lines(path):
    """Return the lines of the file at path; '-' means standard input."""
    if path == '-':
        return sys.stdin.readlines()
    # try/finally (not a with-statement) keeps this Python 2 module free
    # of newer syntax while still guaranteeing the file gets closed.
    f = open(path, 'rb')
    try:
        return f.readlines()
    finally:
        f.close()


def unified_diff_files(a, b, sequencematcher=None):
    """Generate the diff for two files.

    :param a: Path of the original file, or '-' to read standard input
    :param b: Path of the modified file, or '-' to read standard input
    :param sequencematcher: Passed through to unified_diff
    :return: An empty list when a and b are the same name, otherwise the
        result of unified_diff over the lines of the two files. Both
        inputs are read completely, and closed, before this returns.
    """
    # Should this actually be an error?
    if a == b:
        return []

    lines_a = _read_lines(a)
    lines_b = _read_lines(b)

    # TODO: Include fromfiledate and tofiledate
    return unified_diff(lines_a, lines_b,
                        fromfile=a, tofile=b,
                        sequencematcher=sequencematcher)
	381
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	382
def main(args):
    """Command line entry point: print the unified diff of two files.

    :param args: The command line arguments, without the program name.
        Two file names are expected ('-' reads from stdin), optionally
        preceded by --patience (the default) or --difflib to choose the
        matching algorithm.
    :return: -1 if the wrong number of file names was given, otherwise None
    """
    import optparse
    p = optparse.OptionParser(usage='%prog [options] file_a file_b'
                                    '\nFiles can be "-" to read from stdin')
    p.add_option('--patience', dest='matcher', action='store_const',
                 const='patience', default='patience',
                 help='Use the patience difference algorithm')
    p.add_option('--difflib', dest='matcher', action='store_const',
                 const='difflib', default='patience',
                 help='Use python\'s difflib algorithm')

    algorithms = {'patience': PatienceSequenceMatcher,
                  'difflib': difflib.SequenceMatcher}

    (opts, args) = p.parse_args(args)
    matcher = algorithms[opts.matcher]

    if len(args) != 2:
        # Report on stderr so the message cannot be mistaken for (or mixed
        # into) diff output on stdout.
        sys.stderr.write('You must supply 2 filenames to diff\n')
        return -1

    for line in unified_diff_files(args[0], args[1], sequencematcher=matcher):
        sys.stdout.write(line)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))