/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1
# Copyright (C) 2008, 2009 Canonical Ltd
2
#
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
3
# This program is free software; you can redistribute it and/or modify
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
12
#
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
3735.36.3 by John Arbash Meinel
Add the new address for FSF to the new files.
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
16
17
"""Core compression logic for compressing streams of related files."""
18
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
19
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
20
from cStringIO import StringIO
0.25.3 by John Arbash Meinel
Add a encode/decode base128 functions.
21
import struct
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
22
import time
0.17.5 by Robert Collins
nograph tests completely passing.
23
import zlib
0.17.44 by John Arbash Meinel
Use the bit field to allow both lzma groups and zlib groups.
24
try:
25
    import pylzma
26
except ImportError:
27
    pylzma = None
0.17.5 by Robert Collins
nograph tests completely passing.
28
0.17.4 by Robert Collins
Annotate.
29
from bzrlib import (
30
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
31
    debug,
0.17.4 by Robert Collins
Annotate.
32
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
33
    errors,
0.17.4 by Robert Collins
Annotate.
34
    graph as _mod_graph,
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
35
    osutils,
0.17.4 by Robert Collins
Annotate.
36
    pack,
37
    patiencediff,
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
38
    trace,
0.17.4 by Robert Collins
Annotate.
39
    )
40
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
41
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
42
from bzrlib.osutils import (
43
    contains_whitespace,
44
    sha_string,
45
    split_lines,
46
    )
0.17.21 by Robert Collins
Update groupcompress to bzrlib 1.10.
47
from bzrlib.btree_index import BTreeBuilder
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
48
from bzrlib.lru_cache import LRUSizeCache
0.17.9 by Robert Collins
Initial stab at repository format support.
49
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
50
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
51
    adapter_registry,
52
    AbsentContentFactory,
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
53
    ChunkedContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
54
    FulltextContentFactory,
55
    VersionedFiles,
56
    )
57
0.17.44 by John Arbash Meinel
Use the bit field to allow both lzma groups and zlib groups.
58
_USE_LZMA = False and (pylzma is not None)
3735.32.17 by John Arbash Meinel
We now round-trip the wire_bytes.
59
_NO_LABELS = True
0.25.16 by John Arbash Meinel
Make sure we don't inter-pack to GCCHKBig repos.
60
_FAST = False
0.17.2 by Robert Collins
Core proof of concept working.
61
3735.2.162 by John Arbash Meinel
Change GroupCompressor.compress() to return the start_point.
62
# osutils.sha_string('')
63
_null_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
64
65
0.25.3 by John Arbash Meinel
Add a encode/decode base128 functions.
66
def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding.

    Each output byte carries 7 bits of the value, least-significant group
    first; the high bit (0x80) is set on every byte except the last.

    :param val: A non-negative integer to encode.
    :return: A str holding the encoded bytes.
    """
    # Fix: dropped an unused 'count = 0' local; renamed the accumulator so it
    # no longer shadows the 'bytes' builtin.
    encoded = []
    while val >= 0x80:
        encoded.append(chr((val | 0x80) & 0xFF))
        val >>= 7
    encoded.append(chr(val))
    return ''.join(encoded)
75
76
77
def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding.

    :param bytes: A string whose leading bytes hold a base128-encoded int.
    :return: A tuple (value, num_bytes_consumed).
    """
    val = 0
    shift = 0
    pos = 0
    while True:
        group = ord(bytes[pos])
        pos += 1
        if group < 0x80:
            # High bit clear: this is the final (most significant) group.
            val |= group << shift
            return val, pos
        val |= (group & 0x7F) << shift
        shift += 7
91
92
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
93
def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into groupcompress order.

    groupcompress is defined (currently) as reverse-topological order, grouped by
    the key prefix.

    :return: A sorted-list of keys
    """
    # Approximately reverse topological, grouped properly by the file-id
    # portion of the key.
    by_prefix = {}
    for item in parent_map.iteritems():
        key = item[0]
        prefix = '' if isinstance(key, str) or len(key) == 1 else key[0]
        by_prefix.setdefault(prefix, []).append(item)

    present_keys = []
    for prefix in sorted(by_prefix):
        present_keys.extend(reversed(topo_sort(by_prefix[prefix])))
    return present_keys
119
120
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
121
class GroupCompressBlockEntry(object):
    """Track the information about a single object inside a GC group.

    This is generally just the dumb data structure.
    """

    def __init__(self, key, type, sha1, start, length):
        # The label for this content
        self.key = key
        # delta, fulltext, external?
        self.type = type
        # Sha1 of content
        self.sha1 = sha1
        # Byte offset to start of data
        self.start = start
        # Length of content
        self.length = length

    @property
    def end(self):
        # Offset of the first byte just past this entry's data.
        return self.start + self.length

    def __repr__(self):
        fields = (self.key, self.type, self.sha1, self.start, self.length)
        return '%s(%s, %s, %s, %s, %s)' % (
            (self.__class__.__name__,) + fields)
143
3735.32.9 by John Arbash Meinel
Use a 32kB extension, since that is the max window size for zlib.
144
# The max zlib window size is 32kB, so if we set 'max_size' output of the
145
# decompressor to the requested bytes + 32kB, then we should guarantee
146
# num_bytes coming out.
147
_ZLIB_DECOMP_WINDOW = 32*1024
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
148
149
class GroupCompressBlock(object):
    """An object which maintains the internal structure of the compressed data.

    This tracks the meta info (start of text, length, type, etc.)
    """

    # Group Compress Block v1 Zlib
    GCB_HEADER = 'gcb1z\n'
    # Group Compress Block v1 LZMA
    GCB_LZ_HEADER = 'gcb1l\n'

    def __init__(self):
        # map by key? or just order in file?
        self._entries = {}
        # 'zlib' or 'lzma', set from the serialised header by from_bytes()
        self._compressor_name = None
        # Compressed / uncompressed lengths of the label header
        self._z_header_length = None
        self._header_length = None
        # Still-compressed label header bytes (None once consumed)
        self._z_header = None
        # Compressed content bytes, and a partially-driven decompressor when
        # we have only expanded a prefix of the content so far
        self._z_content = None
        self._z_content_decompressor = None
        self._z_content_length = None
        self._content_length = None
        # Fully/partially decompressed content (None until first needed)
        self._content = None

    def __len__(self):
        # Uncompressed size of content plus label header.
        return self._content_length + self._header_length

    def _parse_header(self):
        """Parse the header part of the block."""
        assert self._z_header is not None
        if self._z_header == '':
            # Nothing to process
            self._z_header = None
            return
        if self._compressor_name == 'lzma':
            header = pylzma.decompress(self._z_header)
        else:
            assert self._compressor_name == 'zlib'
            header = zlib.decompress(self._z_header)
        self._z_header = None # We have consumed the header
        lines = header.split('\n')
        del header
        # Records are newline-separated 'key:value' lines; a blank line
        # terminates each record (and a blank record ends the header).
        info_dict = {}
        for line in lines:
            if not line: #End of record
                if not info_dict:
                    break
                self.add_entry(**info_dict)
                info_dict = {}
                continue
            key, value = line.split(':', 1)
            if key == 'key':
                # Keys are \x00-joined tuples; intern the parts since the
                # same fragments recur across many entries.
                value = tuple(map(intern, value.split('\x00')))
            elif key in ('start', 'length'):
                value = int(value)
            elif key == 'type':
                value = intern(value)
            info_dict[key] = value

    def _ensure_content(self, num_bytes=None):
        """Make sure that content has been expanded enough.

        :param num_bytes: Ensure that we have extracted at least num_bytes of
            content. If None, consume everything
        """
        # TODO: If we re-use the same content block at different times during
        #       get_record_stream(), it is possible that the first pass will
        #       get inserted, triggering an extract/_ensure_content() which
        #       will get rid of _z_content. And then the next use of the block
        #       will try to access _z_content (to send it over the wire), and
        #       fail because it is already extracted. Consider never releasing
        #       _z_content because of this.
        if num_bytes is None:
            # NOTE: if _content_length is also None (older serialisation),
            #       num_bytes stays None and we decompress everything below.
            num_bytes = self._content_length
        if self._content_length is not None:
            assert num_bytes <= self._content_length
        if self._content is None:
            assert self._z_content is not None
            if self._z_content == '':
                self._content = ''
            elif self._compressor_name == 'lzma':
                # We don't do partial lzma decomp yet
                self._content = pylzma.decompress(self._z_content)
            else:
                # Start a zlib decompressor
                assert self._compressor_name == 'zlib'
                if num_bytes is None:
                    self._content = zlib.decompress(self._z_content)
                else:
                    self._z_content_decompressor = zlib.decompressobj()
                    # Seed the decompressor with the uncompressed bytes, so
                    # that the rest of the code is simplified
                    self._content = self._z_content_decompressor.decompress(
                        self._z_content, num_bytes + _ZLIB_DECOMP_WINDOW)
                # Any bytes remaining to be decompressed will be in the
                # decompressors 'unconsumed_tail'
        # Do we have enough bytes already?
        if num_bytes is not None and len(self._content) >= num_bytes:
            return
        if num_bytes is None and self._z_content_decompressor is None:
            # We must have already decompressed everything
            return
        # If we got this far, and don't have a decompressor, something is wrong
        assert self._z_content_decompressor is not None
        remaining_decomp = self._z_content_decompressor.unconsumed_tail
        if num_bytes is None:
            if remaining_decomp:
                # We don't know how much is left, but we'll decompress it all
                self._content += self._z_content_decompressor.decompress(
                    remaining_decomp)
                # Note: There what I consider a bug in zlib.decompressobj
                #       If you pass back in the entire unconsumed_tail, only
                #       this time you don't pass a max-size, it doesn't
                #       change the unconsumed_tail back to None/''.
                #       However, we know we are done with the whole stream
                self._z_content_decompressor = None
            self._content_length = len(self._content)
        else:
            # If we have nothing left to decomp, we ran out of decomp bytes
            assert remaining_decomp
            needed_bytes = num_bytes - len(self._content)
            # We always set max_size to 32kB over the minimum needed, so that
            # zlib will give us as much as we really want.
            # TODO: If this isn't good enough, we could make a loop here,
            #       that keeps expanding the request until we get enough
            self._content += self._z_content_decompressor.decompress(
                remaining_decomp, needed_bytes + _ZLIB_DECOMP_WINDOW)
            assert len(self._content) >= num_bytes
            if not self._z_content_decompressor.unconsumed_tail:
                # The stream is finished
                self._z_content_decompressor = None

    def _parse_bytes(self, bytes):
        """Read the various lengths from the header.

        This also populates the various 'compressed' buffers
        (_z_header, _z_content) and the length attributes; it returns None.
        """
        # At present, there are 4 lengths to be read, we have 2 integers for
        # the length of the compressed and uncompressed header, and 2 integers
        # for the compressed and uncompressed content
        # 14 bytes can represent > 1TB, so to avoid checking too far, cap the
        # search to 14 bytes.
        pos = bytes.index('\n', 6, 20)
        self._z_header_length = int(bytes[6:pos])
        pos += 1
        pos2 = bytes.index('\n', pos, pos + 14)
        self._header_length = int(bytes[pos:pos2])
        end_of_z_lengths = pos2
        pos2 += 1
        # Older versions don't have the content lengths, if we want to preserve
        # backwards compatibility, we could try/except over these, and allow
        # them to be skipped
        try:
            pos = bytes.index('\n', pos2, pos2 + 14)
            self._z_content_length = int(bytes[pos2:pos])
            pos += 1
            pos2 = bytes.index('\n', pos, pos + 14)
            self._content_length = int(bytes[pos:pos2])
            pos = pos2 + 1
            assert len(bytes) == (pos + self._z_header_length +
                                  self._z_content_length)
            pos2 = pos + self._z_header_length
            self._z_header = bytes[pos:pos2]
            self._z_content = bytes[pos2:]
            assert len(self._z_content) == self._z_content_length
        except ValueError:
            # This is the older form, which did not encode its content length
            pos = end_of_z_lengths + 1
            pos2 = pos + self._z_header_length
            self._z_header = bytes[pos:pos2]
            self._z_content = bytes[pos2:]
            self._z_content_length = len(self._z_content)

    @classmethod
    def from_bytes(cls, bytes):
        """Construct a GroupCompressBlock from its serialised byte form.

        :param bytes: The complete serialised block, starting with one of
            the GCB_* 6-byte headers.
        :raises ValueError: If the header is not a recognised GCB header.
        :return: A new GroupCompressBlock instance.
        """
        out = cls()
        if bytes[:6] not in (cls.GCB_HEADER, cls.GCB_LZ_HEADER):
            raise ValueError('bytes did not start with %r' % (cls.GCB_HEADER,))
        # Byte 4 of the magic distinguishes the compressor: 'z'lib or 'l'zma.
        if bytes[4] == 'z':
            out._compressor_name = 'zlib'
        elif bytes[4] == 'l':
            out._compressor_name = 'lzma'
        else:
            raise ValueError('unknown compressor: %r' % (bytes,))
        out._parse_bytes(bytes)
        if not _NO_LABELS:
            out._parse_header()
        return out

    def extract(self, key, start, end, sha1=None):
        """Extract the text for a specific key.

        :param key: The label used for this content
        :param sha1: TODO (should we validate only when sha1 is supplied?)
            NOTE(review): sha1 is currently ignored by this implementation.
        :return: The bytes for the content
        """
        if start == end == 0:
            return ''
        self._ensure_content(end)
        # The bytes are 'f' or 'd' for the type, then a variable-length
        # base128 integer for the content size, then the actual content
        # We know that the variable-length integer won't be longer than 5
        # bytes (it takes 5 bytes to encode 2^32)
        c = self._content[start]
        if c == 'f':
            type = 'fulltext'
        else:
            if c != 'd':
                raise ValueError('Unknown content control code: %s'
                                 % (c,))
            type = 'delta'
        # NOTE(review): 'type' is computed above but not used further here.
        content_len, len_len = decode_base128_int(
                            self._content[start + 1:start + 6])
        content_start = start + 1 + len_len
        if end != content_start + content_len:
            raise ValueError('end != len according to field header'
                ' %s != %s' % (end, content_start + content_len))
        content = self._content[content_start:end]
        if c == 'f':
            bytes = content
        elif c == 'd':
            # Deltas are applied against the whole expanded content so far.
            bytes = _groupcompress_pyx.apply_delta(self._content, content)
        return bytes

    def add_entry(self, key, type, sha1, start, length):
        """Add new meta info about an entry.

        :param key: The key for the new content
        :param type: Whether this is a delta or fulltext entry (external?)
        :param sha1: sha1sum of the fulltext of this entry
        :param start: where the encoded bytes start
        :param length: total number of bytes in the encoded form
        :return: The entry?
        :raises ValueError: If an entry with the same key already exists.
        """
        entry = GroupCompressBlockEntry(key, type, sha1, start, length)
        if key in self._entries:
            raise ValueError('Duplicate key found: %s' % (key,))
        self._entries[key] = entry
        return entry

    def set_content(self, content):
        """Set the content of this block."""
        self._content_length = len(content)
        self._content = content
        # Invalidate any previously serialised form; to_bytes() recompresses.
        self._z_content = None
        self._z_header_length = None

    def to_bytes(self):
        """Encode the information into a byte stream."""
        compress = zlib.compress
        if _USE_LZMA:
            compress = pylzma.compress
        # Serialise each entry as 'key:value' lines, blank-line terminated,
        # in sorted-key order (mirrors _parse_header).
        chunks = []
        for key in sorted(self._entries):
            entry = self._entries[key]
            chunk = ('key:%s\n'
                     'sha1:%s\n'
                     'type:%s\n'
                     'start:%s\n'
                     'length:%s\n'
                     '\n'
                     ) % ('\x00'.join(entry.key),
                          entry.sha1,
                          entry.type,
                          entry.start,
                          entry.length,
                          )
            chunks.append(chunk)
        bytes = ''.join(chunks)
        info_len = len(bytes)
        z_header_bytes = compress(bytes)
        del bytes, chunks
        z_header_len = len(z_header_bytes)
        # TODO: we may want to have the header compressed in the same chain
        #       as the data, or we may not, evaulate it
        #       having them compressed together is probably a win for
        #       revisions and the 'inv' portion of chk inventories. As the
        #       label in the header is duplicated in the text.
        #       For chk pages and real bytes, I would guess this is not
        #       true.
        if _NO_LABELS:
            # Label header suppressed entirely; lengths recorded as zero.
            z_header_bytes = ''
            z_header_len = 0
            info_len = 0
        if self._z_content is not None:
            # Content was never expanded (or already recompressed): reuse it.
            content_len = self._content_length
            z_content_len = self._z_content_length
            z_content_bytes = self._z_content
        else:
            assert self._content is not None
            content_len = self._content_length
            z_content_bytes = compress(self._content)
            self._z_content = z_content_bytes
            z_content_len = len(z_content_bytes)
            self._z_content_length = z_content_len
        if _USE_LZMA:
            header = self.GCB_LZ_HEADER
        else:
            header = self.GCB_HEADER
        # Layout: magic, 4 newline-terminated decimal lengths, then the
        # compressed header followed by the compressed content.
        chunks = [header,
                  '%d\n%d\n%d\n%d\n' % (z_header_len, info_len,
                                        z_content_len, content_len)
                 ]
        chunks.append(z_header_bytes)
        chunks.append(z_content_bytes)
        return ''.join(chunks)
456
457
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
458
class _LazyGroupCompressFactory(object):
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
459
    """Yield content from a GroupCompressBlock on demand."""
460
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
461
    def __init__(self, key, parents, manager, start, end, first):
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
462
        """Create a _LazyGroupCompressFactory
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
463
464
        :param key: The key of just this record
465
        :param parents: The parents of this key (possibly None)
466
        :param gc_block: A GroupCompressBlock object
467
        :param start: Offset of the first byte for this record in the
468
            uncompressd content
469
        :param end: Offset of the byte just after the end of this record
470
            (ie, bytes = content[start:end])
471
        :param first: Is this the first Factory for the given block?
472
        """
473
        self.key = key
474
        self.parents = parents
475
        self.sha1 = None
3735.32.15 by John Arbash Meinel
Change the GroupCompressBlock code to allow not recording 'end'.
476
        # Note: This attribute coupled with Manager._factories creates a
477
        #       reference cycle. Perhaps we would rather use a weakref(), or
478
        #       find an appropriate time to release the ref. After the first
479
        #       get_bytes_as call? After Manager.get_record_stream() returns
480
        #       the object?
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
481
        self._manager = manager
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
482
        self._bytes = None
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
483
        self.storage_kind = 'groupcompress-block'
484
        if not first:
485
            self.storage_kind = 'groupcompress-block-ref'
486
        self._first = first
487
        self._start = start
488
        self._end = end
489
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
490
    def __repr__(self):
491
        return '%s(%s, first=%s)' % (self.__class__.__name__,
492
            self.key, self._first)
493
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
494
    def get_bytes_as(self, storage_kind):
495
        if storage_kind == self.storage_kind:
496
            if self._first:
497
                # wire bytes, something...
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
498
                return self._manager._wire_bytes()
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
499
            else:
500
                return ''
501
        if storage_kind in ('fulltext', 'chunked'):
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
502
            if self._bytes is None:
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
503
                # Grab and cache the raw bytes for this entry
504
                # and break the ref-cycle with _manager since we don't need it
505
                # anymore
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
506
                self._manager._prepare_for_extract()
507
                block = self._manager._block
3735.34.2 by John Arbash Meinel
Merge brisbane-core tip, resolve differences.
508
                self._bytes = block.extract(self.key, self._start, self._end)
3735.2.163 by John Arbash Meinel
Merge bzr.dev 4187, and revert the change to fix refcycle issues.
509
                # XXX: It seems the smart fetch extracts inventories and chk
510
                #      pages as fulltexts to find the next chk pages, but then
511
                #      passes them down to be inserted as a
512
                #      groupcompress-block, so this is not safe to do. Perhaps
513
                #      we could just change the storage kind to "fulltext" at
514
                #      that point?
515
                # self._manager = None
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
516
            if storage_kind == 'fulltext':
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
517
                return self._bytes
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
518
            else:
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
519
                return [self._bytes]
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
520
        raise errors.UnavailableRepresentation(self.key, storage_kind,
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
521
                                               self.storage_kind)
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
522
523
3735.32.17 by John Arbash Meinel
We now round-trip the wire_bytes.
524
class _LazyGroupContentManager(object):
    """This manages a group of _LazyGroupCompressFactory objects."""

    def __init__(self, block):
        """Create a manager wrapping the given GroupCompressBlock."""
        self._block = block
        # We need to preserve the ordering
        self._factories = []
        self._last_byte = 0

    def add_factory(self, key, parents, start, end):
        """Add a record (identified by key) located at content[start:end]."""
        first = not self._factories
        # Note that this creates a reference cycle....
        factory = _LazyGroupCompressFactory(key, parents, self,
            start, end, first=first)
        self._last_byte = max(end, self._last_byte)
        self._factories.append(factory)

    def get_record_stream(self):
        """Get a record for all keys added so far."""
        for factory in self._factories:
            yield factory
            # Break the ref-cycle
            factory._bytes = None
            # XXX: this is not safe, the smart fetch code requests the content
            #      as both a 'fulltext', and then later on as a
            #      groupcompress-block. The iter_interesting_nodes code also is
            #      still buffering multiple records and returning them later.
            #      So that code would need to be updated to either re-fetch the
            #      original object, or buffer it somehow.
            # factory._manager = None
        # TODO: Consider setting self._factories = None after the above loop,
        #       as it will break the reference cycle

    def _trim_block(self, last_byte):
        """Create a new GroupCompressBlock, with just some of the content."""
        # None of the factories need to be adjusted, because the content is
        # located in an identical place. Just that some of the unreferenced
        # trailing bytes are stripped
        trace.mutter('stripping trailing bytes from groupcompress block'
                     ' %d => %d', self._block._content_length, last_byte)
        new_block = GroupCompressBlock()
        self._block._ensure_content(last_byte)
        new_block.set_content(self._block._content[:last_byte])
        self._block = new_block

    def _rebuild_block(self):
        """Create a new GroupCompressBlock with only the referenced texts."""
        compressor = GroupCompressor()
        tstart = time.time()
        old_length = self._block._content_length
        end_point = 0
        for factory in self._factories:
            data = factory.get_bytes_as('fulltext')
            (found_sha1, start_point, end_point, kind,
             length) = compressor.compress(factory.key, data, factory.sha1)
            # Now update this factory with the new offsets, etc
            factory.sha1 = found_sha1
            factory._start = start_point
            factory._end = end_point
        self._last_byte = end_point
        new_block = compressor.flush()
        # TODO: Should we check that new_block really *is* smaller than the old
        #       block? It seems hard to come up with a method that it would
        #       expand, since we do full compression again. Perhaps based on a
        #       request that ends up poorly ordered?
        delta = time.time() - tstart
        self._block = new_block
        trace.mutter('creating new compressed block on-the-fly in %.3fs'
                     ' %d bytes => %d bytes', delta, old_length,
                     self._block._content_length)

    def _prepare_for_extract(self):
        """A _LazyGroupCompressFactory is about to extract to fulltext."""
        # We expect that if one child is going to fulltext, all will be. This
        # helps prevent all of them from extracting a small amount at a time.
        # Which in itself isn't terribly expensive, but resizing 2MB 32kB at a
        # time (self._block._content) is a little expensive.
        self._block._ensure_content(self._last_byte)

    def _check_rebuild_block(self):
        """Check to see if our block should be repacked."""
        total_bytes_used = 0
        last_byte_used = 0
        for factory in self._factories:
            total_bytes_used += factory._end - factory._start
            last_byte_used = max(last_byte_used, factory._end)
        # If we are using most of the bytes from the block, we have nothing
        # else to check (currently more than 1/2)
        if total_bytes_used * 2 >= self._block._content_length:
            return
        # Can we just strip off the trailing bytes? If we are going to be
        # transmitting more than 50% of the front of the content, go ahead
        if total_bytes_used * 2 > last_byte_used:
            self._trim_block(last_byte_used)
            return

        # We are using a small amount of the data, and it isn't just packed
        # nicely at the front, so rebuild the content.
        # Note: This would be *nicer* as a strip-data-from-group, rather than
        #       building it up again from scratch
        #       It might be reasonable to consider the fulltext sizes for
        #       different bits when deciding this, too. As you may have a small
        #       fulltext, and a trivial delta, and you are just trading around
        #       for another fulltext. If we do a simple 'prune' you may end up
        #       expanding many deltas into fulltexts, as well.
        #       If we build a cheap enough 'strip', then we could try a strip,
        #       if that expands the content, we then rebuild.
        self._rebuild_block()

    def _wire_bytes(self):
        """Return a byte stream suitable for transmitting over the wire."""
        self._check_rebuild_block()
        # The outer block starts with:
        #   'groupcompress-block\n'
        #   <length of compressed key info>\n
        #   <length of uncompressed info>\n
        #   <length of gc block>\n
        #   <header bytes>
        #   <gc-block>
        lines = ['groupcompress-block\n']
        # The minimal info we need is the key, the start offset, and the
        # parents. The length and type are encoded in the record itself.
        # However, passing in the other bits makes it easier.  The list of
        # keys, and the start offset, the length
        # 1 line key
        # 1 line with parents, '' for ()
        # 1 line for start offset
        # 1 line for end byte
        header_lines = []
        for factory in self._factories:
            key_bytes = '\x00'.join(factory.key)
            parents = factory.parents
            if parents is None:
                parent_bytes = 'None:'
            else:
                parent_bytes = '\t'.join('\x00'.join(key) for key in parents)
            record_header = '%s\n%s\n%d\n%d\n' % (
                key_bytes, parent_bytes, factory._start, factory._end)
            header_lines.append(record_header)
        header_bytes = ''.join(header_lines)
        del header_lines
        header_bytes_len = len(header_bytes)
        z_header_bytes = zlib.compress(header_bytes)
        del header_bytes
        z_header_bytes_len = len(z_header_bytes)
        block_bytes = self._block.to_bytes()
        lines.append('%d\n%d\n%d\n' % (z_header_bytes_len, header_bytes_len,
                                       len(block_bytes)))
        lines.append(z_header_bytes)
        lines.append(block_bytes)
        del z_header_bytes, block_bytes
        return ''.join(lines)

    @classmethod
    def from_bytes(cls, bytes):
        """Parse bytes as generated by _wire_bytes() back into a manager.

        :param bytes: The serialised form: the 4-line outer header, the
            zlib-compressed per-record header, then the gc block itself.
        :return: A _LazyGroupContentManager with one factory per record.
        :raises ValueError: If the storage kind or any declared length does
            not match the supplied bytes.
        """
        # TODO: This does extra string copying, probably better to do it a
        #       different way
        (storage_kind, z_header_len, header_len,
         block_len, rest) = bytes.split('\n', 4)
        del bytes
        if storage_kind != 'groupcompress-block':
            raise ValueError('Unknown storage kind: %s' % (storage_kind,))
        z_header_len = int(z_header_len)
        if len(rest) < z_header_len:
            raise ValueError('Compressed header len shorter than all bytes')
        z_header = rest[:z_header_len]
        header_len = int(header_len)
        header = zlib.decompress(z_header)
        if len(header) != header_len:
            raise ValueError('invalid length for decompressed bytes')
        del z_header
        block_len = int(block_len)
        if len(rest) != z_header_len + block_len:
            raise ValueError('Invalid length for block')
        block_bytes = rest[z_header_len:]
        del rest
        # So now we have a valid GCB, we just need to parse the factories that
        # were sent to us
        header_lines = header.split('\n')
        del header
        last = header_lines.pop()
        if last != '':
            raise ValueError('header lines did not end with a trailing'
                             ' newline')
        if len(header_lines) % 4 != 0:
            raise ValueError('The header was not an even multiple of 4 lines')
        block = GroupCompressBlock.from_bytes(block_bytes)
        del block_bytes
        result = cls(block)
        # Each record contributes 4 header lines: key, parents, start, end.
        for start in xrange(0, len(header_lines), 4):
            # intern()?
            key = tuple(header_lines[start].split('\x00'))
            parents_line = header_lines[start+1]
            if parents_line == 'None:':
                parents = None
            else:
                parents = tuple([tuple(segment.split('\x00'))
                                 for segment in parents_line.split('\t')
                                 if segment])
            start_offset = int(header_lines[start+2])
            end_offset = int(header_lines[start+3])
            result.add_factory(key, parents, start_offset, end_offset)
        return result
730
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
731
3735.32.18 by John Arbash Meinel
We now support generating a network stream.
732
def network_block_to_records(storage_kind, bytes, line_end):
    """Convert serialised groupcompress-block bytes into a record stream.

    :param storage_kind: Must be 'groupcompress-block'.
    :param bytes: The wire bytes, as produced by
        _LazyGroupContentManager._wire_bytes().
    :param line_end: Unused; accepted for signature compatibility with the
        other network record parsers.
    :return: An iterator of _LazyGroupCompressFactory objects, one per record
        in the block.
    :raises ValueError: If storage_kind is not 'groupcompress-block'.
    """
    if storage_kind != 'groupcompress-block':
        raise ValueError('Unknown storage kind: %s' % (storage_kind,))
    manager = _LazyGroupContentManager.from_bytes(bytes)
    return manager.get_record_stream()
737
738
0.17.2 by Robert Collins
Core proof of concept working.
739
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
740
    """Produce a serialised group of compressed texts.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
741
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
742
    It contains code very similar to SequenceMatcher because of having a similar
743
    task. However some key differences apply:
744
     - there is no junk, we want a minimal edit not a human readable diff.
745
     - we don't filter very common lines (because we don't know where a good
746
       range will start, and after the first text we want to be emitting minmal
747
       edits only.
748
     - we chain the left side, not the right side
749
     - we incrementally update the adjacency matrix as new lines are provided.
750
     - we look for matches in all of the left side, so the routine which does
751
       the analagous task of find_longest_match does not need to filter on the
752
       left side.
753
    """
0.17.2 by Robert Collins
Core proof of concept working.
754
3735.32.19 by John Arbash Meinel
Get rid of the 'delta' flag to GroupCompressor. It didn't do anything anyway.
755
    def __init__(self):
        """Create a GroupCompressor."""
        # NOTE: we could seed the output with some sort of GC Start flag, or
        # emit it as part of the output stream, rather than baking it into the
        # compressed bytes.
        self.lines = []             # accumulated output chunks
        self.endpoint = 0           # current offset into the output stream
        self.input_bytes = 0        # total uncompressed bytes fed in so far
        self.num_keys = 0
        # key -> ((start_offset, start_chunk_idx), (end_offset, end_chunk_idx))
        self.labels_deltas = {}
        self._last = None
        self._delta_index = _groupcompress_pyx.DeltaIndex()
        self._block = GroupCompressBlock()
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
768
3735.31.12 by John Arbash Meinel
Push nostore_sha down through the stack.
769
    def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param bytes: The bytes to be compressed
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :param nostore_sha: If the computed sha1 sum matches, we will raise
            ExistingContent rather than adding the text.
        :param soft: Do a 'soft' compression. This means that we require larger
            ranges to match to be considered for a copy command.
        :return: A tuple (sha1, start_point, end_point, kind, length): the
            sha1 of the text, the offsets of this record in the group output,
            the record kind ('fulltext' or 'delta'), and its stored length.
        :seealso VersionedFiles.add_lines:
        """
        if not bytes: # empty, like a dir entry, etc
            if nostore_sha == _null_sha1:
                raise errors.ExistingContent()
            self._block.add_entry(key, type='empty',
                                  sha1=None, start=0,
                                  length=0)
            return _null_sha1, 0, 0, 'fulltext', 0
        # we assume someone knew what they were doing when they passed it in
        if expected_sha is not None:
            sha1 = expected_sha
        else:
            sha1 = osutils.sha_string(bytes)
        if nostore_sha is not None:
            if sha1 == nostore_sha:
                raise errors.ExistingContent()
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        input_len = len(bytes)
        # By having action/label/sha1/len, we can parse the group if the index
        # was ever destroyed, we have the key in 'label', we know the final
        # bytes are valid from sha1, and we know where to find the end of this
        # record because of 'len'. (the delta record itself will store the
        # total length for the expanded record)
        # 'len: %d\n' costs approximately 1% increase in total data
        # Having the labels at all costs us 9-10% increase, 38% increase for
        # inventory pages, and 5.8% increase for text pages
        # new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
        if self._delta_index._source_offset != self.endpoint:
            raise AssertionError('_source_offset != endpoint'
                ' somehow the DeltaIndex got out of sync with'
                ' the output lines')
        # Only accept a delta if it saves at least half the fulltext size.
        max_delta_size = len(bytes) // 2
        delta = self._delta_index.make_delta(bytes, max_delta_size)
        if delta is None:
            kind = 'fulltext'
            enc_length = encode_base128_int(len(bytes))
            len_mini_header = 1 + len(enc_length)
            length = len(bytes) + len_mini_header
            self._delta_index.add_source(bytes, len_mini_header)
            new_chunks = ['f', enc_length, bytes]
        else:
            kind = 'delta'
            enc_length = encode_base128_int(len(delta))
            len_mini_header = 1 + len(enc_length)
            length = len(delta) + len_mini_header
            new_chunks = ['d', enc_length, delta]
            if _FAST:
                self._delta_index._source_offset += length
            else:
                self._delta_index.add_delta_source(delta, len_mini_header)
        self._block.add_entry(key, type=kind, sha1=sha1,
                              start=self.endpoint, length=length)
        start = self.endpoint
        delta_start = (self.endpoint, len(self.lines))
        self.num_keys += 1
        self.output_chunks(new_chunks)
        self.input_bytes += input_len
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        if self._delta_index._source_offset != self.endpoint:
            raise AssertionError('the delta index is out of sync'
                ' with the output lines %s != %s'
                % (self._delta_index._source_offset, self.endpoint))
        return sha1, start, self.endpoint, kind, length
0.17.2 by Robert Collins
Core proof of concept working.
852
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
853
    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        # Range recorded when the text was compressed; [0][1] and [1][1] are
        # the start/end indexes into self.lines for this text's chunks.
        delta_details = self.labels_deltas[key]
        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
        stored_bytes = ''.join(delta_chunks)
        # TODO: Fix this, we shouldn't really be peeking here
        entry = self._block._entries[key]
        if entry.type == 'fulltext':
            # Stored form: 'f' + base128(text length) + raw text bytes.
            if stored_bytes[0] != 'f':
                raise ValueError('Index claimed fulltext, but stored bytes'
                                 ' indicate %s' % (stored_bytes[0],))
            fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
            # Sanity check: label byte + length header + payload must account
            # for every stored byte.
            if fulltext_len + 1 + offset != len(stored_bytes):
                raise ValueError('Index claimed fulltext len, but stored bytes'
                                 ' claim %s != %s'
                                 % (len(stored_bytes),
                                    fulltext_len + 1 + offset))
            bytes = stored_bytes[offset + 1:]
        else:
            if entry.type != 'delta':
                raise ValueError('Unknown entry type: %s' % (entry.type,))
            # XXX: This is inefficient at best
            # The delta may copy from anywhere earlier in the group, so the
            # whole group written so far is the delta source.
            source = ''.join(self.lines)
            # Stored form: 'd' + base128(delta length) + delta bytes.
            if stored_bytes[0] != 'd':
                raise ValueError('Entry type claims delta, bytes claim %s'
                                 % (stored_bytes[0],))
            delta_len, offset = decode_base128_int(stored_bytes[1:10])
            if delta_len + 1 + offset != len(stored_bytes):
                raise ValueError('Index claimed delta len, but stored bytes'
                                 ' claim %s != %s'
                                 % (len(stored_bytes),
                                    delta_len + 1 + offset))
            bytes = _groupcompress_pyx.apply_delta(source,
                                                   stored_bytes[offset + 1:])
        # Verify the reconstructed text against the sha1 recorded at add time.
        bytes_sha1 = sha_string(bytes)
        if entry.sha1 != bytes_sha1:
            raise ValueError('Recorded sha1 != measured %s != %s'
                             % (entry.sha1, bytes_sha1))
        return bytes, entry.sha1
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
896
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
897
    def flush(self):
        """Finish this group, creating a formatted stream."""
        # Concatenate everything accumulated so far and hand it to the
        # block; self.lines is cleared so no further content can be added.
        data = ''.join(self.lines)
        self.lines = None
        self._block.set_content(data)
        return self._block
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
903
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
904
    def output_chunks(self, new_chunks):
        """Output some chunks.

        :param new_chunks: The chunks to output.
        """
        # Remember where this batch starts, so pop_last() can revoke it.
        self._last = (len(self.lines), self.endpoint)
        self.lines.extend(new_chunks)
        self.endpoint += sum(len(chunk) for chunk in new_chunks)
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
914
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
915
    def pop_last(self):
        """Call this if you want to 'revoke' the last compression.

        After this, the data structures will be rolled back, but you cannot do
        more compression.
        """
        trunc_at, old_endpoint = self._last
        # Dropping the delta index is what makes further compression
        # impossible.
        self._delta_index = None
        del self.lines[trunc_at:]
        self.endpoint = old_endpoint
        self._last = None
925
0.17.2 by Robert Collins
Core proof of concept working.
926
    def ratio(self):
        """Return the overall compression ratio."""
        # input bytes seen so far divided by bytes emitted so far.
        return float(self.input_bytes) / self.endpoint
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
929
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
930
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
931
def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests, it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        # One reference list is needed if (and only if) a graph is stored.
        if graph:
            ref_length = 1
        else:
            ref_length = 0
        builder = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(builder, lambda:True, parents=graph,
            add_callback=builder.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, builder, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        # Stashed so cleanup_pack_group() can end the writer and close the
        # stream when the caller is done.
        result.stream = stream
        result.writer = writer
        return result
    return factory
960
961
962
def cleanup_pack_group(versioned_files):
    # Finish the container writer first, then close the underlying stream
    # (the attributes attached by make_pack_factory's factory).
    versioned_files.writer.end()
    versioned_files.stream.close()
965
966
967
class GroupCompressVersionedFiles(VersionedFiles):
968
    """A group-compress based VersionedFiles implementation."""
969
0.17.2 by Robert Collins
Core proof of concept working.
970
    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        # Keys added but not yet flushed to a pack; key -> parents
        # (see extract()/get_record_stream usage) — confirm against callers.
        self._unadded_refs = {}
        # Cache of decompressed groups, bounded at 50MiB.
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
        # Additional VersionedFiles searched, in order, for absent texts.
        self._fallback_vfs = []
0.17.2 by Robert Collins
Core proof of concept working.
983
984
    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. All
            lines except the last must be terminated with \n and contain no
            other \n's; the last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint
            the add routine may error or may succeed - but you will be unable
            to read the data back accurately. (Checking the lines have been
            split correctly is expensive and extremely unlikely to catch bugs
            so it is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so substitute an empty tuple.
            parents = ()
        # double handling for now. Make it work until then.
        num_bytes = sum(len(line) for line in lines)
        record = ChunkedContentFactory(key, parents, None, lines)
        inserted_shas = list(self._insert_record_stream(
            [record], random_id=random_id, nostore_sha=nostore_sha))
        return inserted_shas[0], num_bytes, None
0.17.2 by Robert Collins
Core proof of concept working.
1033
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
1034
    def add_fallback_versioned_files(self, a_versioned_files):
        """Add a source of texts for texts not present in this knit.

        :param a_versioned_files: A VersionedFiles object.
        """
        # Fallbacks are consulted in registration order.
        self._fallback_vfs.append(a_versioned_files)
1040
0.17.4 by Robert Collins
Annotate.
1041
    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            # Walk the whole ancestry of key, accumulating every present
            # ancestor; ghosts reported by the searcher are skipped.
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            # NOTE(review): parents of None appears to mean 'no graph data';
            # annotate the text as a root with no parents — confirm against
            # get_parent_map's contract.
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        # Topological order guarantees each text's parents have been
        # annotated (and cached) before the text itself is processed.
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, chunks, key, None, head_cache))
        # 'key' was rebound by the loop; the requested key heads the searched
        # ancestry, so topological order yields it last and 'key' now refers
        # to it.
        return parent_cache[key]
1070
0.17.5 by Robert Collins
nograph tests completely passing.
1071
    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        # Extracting every stored text as a fulltext forces full validation
        # of the underlying data.
        for record in self.get_record_stream(self.keys(), 'unordered', True):
            record.get_bytes_as('fulltext')
1076
0.17.2 by Robert Collins
Core proof of concept working.
1077
    def _check_add(self, key, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        version_id = key[-1]
        if version_id is not None and contains_whitespace(version_id):
            raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)
1091
0.17.5 by Robert Collins
nograph tests completely passing.
1092
    def get_parent_map(self, keys):
        """Get a map of the graph parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        # Discard the per-source breakdown; only the merged map is wanted.
        parent_map, _ = self._get_parent_map_with_sources(keys)
        return parent_map
1100
1101
    def _get_parent_map_with_sources(self, keys):
1102
        """Get a map of the parents of keys.
1103
1104
        :param keys: The keys to look up parents for.
1105
        :return: A tuple. The first element is a mapping from keys to parents.
1106
            Absent keys are absent from the mapping. The second element is a
1107
            list with the locations each key was found in. The first element
1108
            is the in-this-knit parents, the second the first fallback source,
1109
            and so on.
1110
        """
0.17.5 by Robert Collins
nograph tests completely passing.
1111
        result = {}
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
1112
        sources = [self._index] + self._fallback_vfs
0.17.5 by Robert Collins
nograph tests completely passing.
1113
        source_results = []
1114
        missing = set(keys)
1115
        for source in sources:
1116
            if not missing:
1117
                break
1118
            new_result = source.get_parent_map(missing)
1119
            source_results.append(new_result)
1120
            result.update(new_result)
1121
            missing.difference_update(set(new_result))
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
1122
        return result, source_results
0.17.5 by Robert Collins
nograph tests completely passing.
1123
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
1124
    def _get_block(self, index_memo):
        """Fetch (via the group cache) the block holding an index memo.

        :param index_memo: The first three elements form the 'read memo' that
            locates the raw compressed group and serves as the cache key.
        :return: The (possibly cached) GroupCompressBlock.
        """
        read_memo = index_memo[0:3]
        # get the group:
        try:
            block = self._group_cache[read_memo]
        except KeyError:
            # read the group
            zdata = self._access.get_raw_records([read_memo]).next()
            # decompress - whole thing - this is not a bug, as it
            # permits caching. We might want to store the partially
            # decompressed group and decompress object, so that recent
            # texts are not penalised by big groups.
            block = GroupCompressBlock.from_bytes(zdata)
            self._group_cache[read_memo] = block
        # cheapo debugging:
        # print len(zdata), len(plain)
        # parse - requires split_lines, better to have byte offsets
        # here (but not by much - we only split the region for the
        # recipe, and we often want to end up with lines anyway.
        return block
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
1144
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
1145
    def get_missing_compression_parent_keys(self):
        """Return the keys of missing compression parents.

        Missing compression parents occur when a record stream was missing
        basis texts, or a index was scanned that had missing basis texts.
        """
        # Group-compress deltas never reference texts outside their own
        # group, so at present nothing can be missing.
        return frozenset()
1154
0.17.5 by Robert Collins
nograph tests completely passing.
1155
    def get_record_stream(self, keys, ordering, include_delta_closure):
1156
        """Get a stream of records for keys.
1157
1158
        :param keys: The keys to include.
1159
        :param ordering: Either 'unordered' or 'topological'. A topologically
1160
            sorted stream has compression parents strictly before their
1161
            children.
1162
        :param include_delta_closure: If True then the closure across any
1163
            compression parents will be included (in the opaque data).
1164
        :return: An iterator of ContentFactory objects, each of which is only
1165
            valid until the iterator is advanced.
1166
        """
1167
        # keys might be a generator
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
1168
        orig_keys = list(keys)
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1169
        keys = set(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
1170
        if not keys:
1171
            return
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
1172
        if (not self._index.has_graph
3735.31.14 by John Arbash Meinel
Change the gc-optimal to 'groupcompress'
1173
            and ordering in ('topological', 'groupcompress')):
0.17.5 by Robert Collins
nograph tests completely passing.
1174
            # Cannot topological order when no graph has been stored.
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1175
            # but we allow 'as-requested' or 'unordered'
0.17.5 by Robert Collins
nograph tests completely passing.
1176
            ordering = 'unordered'
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1177
1178
        remaining_keys = keys
1179
        while True:
1180
            try:
1181
                keys = set(remaining_keys)
1182
                for content_factory in self._get_remaining_record_stream(keys,
1183
                        orig_keys, ordering, include_delta_closure):
1184
                    remaining_keys.discard(content_factory.key)
1185
                    yield content_factory
1186
                return
1187
            except errors.RetryWithNewPacks, e:
1188
                self._access.reload_or_raise(e)
1189
1190
    def _find_from_fallback(self, missing):
1191
        """Find whatever keys you can from the fallbacks.
1192
1193
        :param missing: A set of missing keys. This set will be mutated as keys
1194
            are found from a fallback_vfs
1195
        :return: (parent_map, key_to_source_map, source_results)
1196
            parent_map  the overall key => parent_keys
1197
            key_to_source_map   a dict from {key: source}
1198
            source_results      a list of (source: keys)
1199
        """
1200
        parent_map = {}
1201
        key_to_source_map = {}
1202
        source_results = []
1203
        for source in self._fallback_vfs:
1204
            if not missing:
1205
                break
1206
            source_parents = source.get_parent_map(missing)
1207
            parent_map.update(source_parents)
1208
            source_parents = list(source_parents)
1209
            source_results.append((source, source_parents))
1210
            key_to_source_map.update((key, source) for key in source_parents)
1211
            missing.difference_update(source_parents)
1212
        return parent_map, key_to_source_map, source_results
1213
1214
    def _get_ordered_source_keys(self, ordering, parent_map, key_to_source_map):
        """Get the (source, [keys]) list.

        The returned objects should be in the order defined by 'ordering',
        which can weave between different sources.
        :param ordering: Must be one of 'topological' or 'groupcompress'
        :return: List of [(source, [keys])] tuples, such that all keys are in
            the defined order, regardless of source.
        """
        if ordering == 'topological':
            present_keys = topo_sort(parent_map)
        else:
            # ordering == 'groupcompress'
            # XXX: This only optimizes for the target ordering. We may need
            #      to balance that with the time it takes to extract
            #      ordering, by somehow grouping based on
            #      locations[key][0:3]
            present_keys = sort_gc_optimal(parent_map)
        # Batch consecutive keys that resolve to the same source; keys with
        # no entry in key_to_source_map belong to self.
        batched = []
        last_source = None
        bucket = None
        for key in present_keys:
            source = key_to_source_map.get(key, self)
            if source is not last_source:
                bucket = []
                batched.append((source, bucket))
                last_source = source
            bucket.append(key)
        return batched
1242
1243
    def _get_as_requested_source_keys(self, orig_keys, locations, unadded_keys,
1244
                                      key_to_source_map):
1245
        source_keys = []
1246
        current_source = None
1247
        for key in orig_keys:
1248
            if key in locations or key in unadded_keys:
1249
                source = self
1250
            elif key in key_to_source_map:
1251
                source = key_to_source_map[key]
1252
            else: # absent
1253
                continue
1254
            if source is not current_source:
1255
                source_keys.append((source, []))
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1256
                current_source = source
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1257
            source_keys[-1][1].append(key)
1258
        return source_keys
1259
1260
    def _get_io_ordered_source_keys(self, locations, unadded_keys,
1261
                                    source_result):
1262
        def get_group(key):
1263
            # This is the group the bytes are stored in, followed by the
1264
            # location in the group
1265
            return locations[key][0]
1266
        present_keys = sorted(locations.iterkeys(), key=get_group)
1267
        # We don't have an ordering for keys in the in-memory object, but
1268
        # lets process the in-memory ones first.
1269
        present_keys = list(unadded_keys) + present_keys
1270
        # Now grab all of the ones from other sources
1271
        source_keys = [(self, present_keys)]
1272
        source_keys.extend(source_result)
1273
        return source_keys
1274
1275
    def _get_remaining_record_stream(self, keys, orig_keys, ordering,
1276
                                     include_delta_closure):
1277
        """Get a stream of records for keys.
1278
1279
        :param keys: The keys to include.
1280
        :param ordering: one of 'unordered', 'topological', 'groupcompress' or
1281
            'as-requested'
1282
        :param include_delta_closure: If True then the closure across any
1283
            compression parents will be included (in the opaque data).
1284
        :return: An iterator of ContentFactory objects, each of which is only
1285
            valid until the iterator is advanced.
1286
        """
0.17.5 by Robert Collins
nograph tests completely passing.
1287
        # Cheap: iterate
1288
        locations = self._index.get_build_details(keys)
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1289
        unadded_keys = set(self._unadded_refs).intersection(keys)
1290
        missing = keys.difference(locations)
1291
        missing.difference_update(unadded_keys)
1292
        (fallback_parent_map, key_to_source_map,
1293
         source_result) = self._find_from_fallback(missing)
1294
        if ordering in ('topological', 'groupcompress'):
0.17.5 by Robert Collins
nograph tests completely passing.
1295
            # would be better to not globally sort initially but instead
1296
            # start with one key, recurse to its oldest parent, then grab
1297
            # everything in the same group, etc.
1298
            parent_map = dict((key, details[2]) for key, details in
1299
                locations.iteritems())
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1300
            for key in unadded_keys:
1301
                parent_map[key] = self._unadded_refs[key]
1302
            parent_map.update(fallback_parent_map)
1303
            source_keys = self._get_ordered_source_keys(ordering, parent_map,
1304
                                                        key_to_source_map)
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
1305
        elif ordering == 'as-requested':
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1306
            source_keys = self._get_as_requested_source_keys(orig_keys,
1307
                locations, unadded_keys, key_to_source_map)
0.17.5 by Robert Collins
nograph tests completely passing.
1308
        else:
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
1309
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
1310
            # Otherwise we thrash the _group_cache and destroy performance
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1311
            source_keys = self._get_io_ordered_source_keys(locations,
1312
                unadded_keys, source_result)
1313
        for key in missing:
0.17.5 by Robert Collins
nograph tests completely passing.
1314
            yield AbsentContentFactory(key)
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1315
        manager = None
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1316
        last_read_memo = None
3735.32.15 by John Arbash Meinel
Change the GroupCompressBlock code to allow not recording 'end'.
1317
        # TODO: This works fairly well at batching up existing groups into a
1318
        #       streamable format, and possibly allowing for taking one big
1319
        #       group and splitting it when it isn't fully utilized.
1320
        #       However, it doesn't allow us to find under-utilized groups and
1321
        #       combine them into a bigger group on the fly.
1322
        #       (Consider the issue with how chk_map inserts texts
1323
        #       one-at-a-time.) This could be done at insert_record_stream()
1324
        #       time, but it probably would decrease the number of
1325
        #       bytes-on-the-wire for fetch.
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1326
        for source, keys in source_keys:
1327
            if source is self:
1328
                for key in keys:
1329
                    if key in self._unadded_refs:
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1330
                        if manager is not None:
1331
                            for factory in manager.get_record_stream():
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1332
                                yield factory
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1333
                            last_read_memo = manager = None
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1334
                        bytes, sha1 = self._compressor.extract(key)
1335
                        parents = self._unadded_refs[key]
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1336
                        yield FulltextContentFactory(key, parents, sha1, bytes)
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1337
                    else:
1338
                        index_memo, _, parents, (method, _) = locations[key]
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
1339
                        read_memo = index_memo[0:3]
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1340
                        if last_read_memo != read_memo:
1341
                            # We are starting a new block. If we have a
1342
                            # manager, we have found everything that fits for
1343
                            # now, so yield records
1344
                            if manager is not None:
1345
                                for factory in manager.get_record_stream():
1346
                                    yield factory
1347
                            # Now start a new manager
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
1348
                            block = self._get_block(index_memo)
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1349
                            manager = _LazyGroupContentManager(block)
1350
                            last_read_memo = read_memo
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
1351
                        start, end = index_memo[3:5]
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1352
                        manager.add_factory(key, parents, start, end)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
1353
            else:
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1354
                if manager is not None:
1355
                    for factory in manager.get_record_stream():
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1356
                        yield factory
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1357
                    last_read_memo = manager = None
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1358
                for record in source.get_record_stream(keys, ordering,
1359
                                                       include_delta_closure):
1360
                    yield record
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1361
        if manager is not None:
1362
            for factory in manager.get_record_stream():
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1363
                yield factory
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
1364
0.17.5 by Robert Collins
nograph tests completely passing.
1365
    def get_sha1s(self, keys):
1366
        """See VersionedFiles.get_sha1s()."""
1367
        result = {}
1368
        for record in self.get_record_stream(keys, 'unordered', True):
1369
            if record.sha1 != None:
1370
                result[record.key] = record.sha1
1371
            else:
1372
                if record.storage_kind != 'absent':
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1373
                    result[record.key] = sha_string(record.get_bytes_as(
0.17.5 by Robert Collins
nograph tests completely passing.
1374
                        'fulltext'))
1375
        return result
1376
0.17.2 by Robert Collins
Core proof of concept working.
1377
    def insert_record_stream(self, stream):
1378
        """Insert a record stream into this container.
1379
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1380
        :param stream: A stream of records to insert.
0.17.2 by Robert Collins
Core proof of concept working.
1381
        :return: None
1382
        :seealso VersionedFiles.get_record_stream:
1383
        """
0.17.5 by Robert Collins
nograph tests completely passing.
1384
        for _ in self._insert_record_stream(stream):
1385
            pass
0.17.2 by Robert Collins
Core proof of concept working.
1386
3735.32.21 by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al.
1387
    def _insert_record_stream(self, stream, random_id=False, nostore_sha=None,
1388
                              reuse_blocks=True):
0.17.2 by Robert Collins
Core proof of concept working.
1389
        """Internal core to insert a record stream into this container.
1390
1391
        This helper function has a different interface than insert_record_stream
1392
        to allow add_lines to be minimal, but still return the needed data.
1393
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1394
        :param stream: A stream of records to insert.
3735.31.12 by John Arbash Meinel
Push nostore_sha down through the stack.
1395
        :param nostore_sha: If the sha1 of a given text matches nostore_sha,
1396
            raise ExistingContent, rather than committing the new text.
3735.32.21 by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al.
1397
        :param reuse_blocks: If the source is streaming from
1398
            groupcompress-blocks, just insert the blocks as-is, rather than
1399
            expanding the texts and inserting again.
0.17.2 by Robert Collins
Core proof of concept working.
1400
        :return: An iterator over the sha1 of the inserted records.
1401
        :seealso insert_record_stream:
1402
        :seealso add_lines:
1403
        """
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1404
        adapters = {}
0.17.5 by Robert Collins
nograph tests completely passing.
1405
        def get_adapter(adapter_key):
1406
            try:
1407
                return adapters[adapter_key]
1408
            except KeyError:
1409
                adapter_factory = adapter_registry.get(adapter_key)
1410
                adapter = adapter_factory(self)
1411
                adapters[adapter_key] = adapter
1412
                return adapter
0.17.2 by Robert Collins
Core proof of concept working.
1413
        # This will go up to fulltexts for gc to gc fetching, which isn't
1414
        # ideal.
3735.32.19 by John Arbash Meinel
Get rid of the 'delta' flag to GroupCompressor. It didn't do anything anyway.
1415
        self._compressor = GroupCompressor()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
1416
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
1417
        keys_to_add = []
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
1418
        def flush():
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
1419
            bytes = self._compressor.flush().to_bytes()
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
1420
            index, start, length = self._access.add_raw_records(
0.25.7 by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content.
1421
                [(None, len(bytes))], bytes)[0]
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
1422
            nodes = []
1423
            for key, reads, refs in keys_to_add:
1424
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
1425
            self._index.add_records(nodes, random_id=random_id)
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1426
            self._unadded_refs = {}
1427
            del keys_to_add[:]
3735.32.19 by John Arbash Meinel
Get rid of the 'delta' flag to GroupCompressor. It didn't do anything anyway.
1428
            self._compressor = GroupCompressor()
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1429
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
1430
        last_prefix = None
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1431
        last_fulltext_len = None
1432
        max_fulltext_len = 0
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1433
        max_fulltext_prefix = None
3735.32.20 by John Arbash Meinel
groupcompress now copies the blocks exactly as they were given.
1434
        insert_manager = None
1435
        block_start = None
1436
        block_length = None
0.17.2 by Robert Collins
Core proof of concept working.
1437
        for record in stream:
0.17.5 by Robert Collins
nograph tests completely passing.
1438
            # Raise an error when a record is missing.
1439
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1440
                raise errors.RevisionNotPresent(record.key, self)
3735.32.21 by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al.
1441
            if reuse_blocks:
1442
                # If the reuse_blocks flag is set, check to see if we can just
1443
                # copy a groupcompress block as-is.
1444
                if record.storage_kind == 'groupcompress-block':
1445
                    # Insert the raw block into the target repo
1446
                    insert_manager = record._manager
3735.2.163 by John Arbash Meinel
Merge bzr.dev 4187, and revert the change to fix refcycle issues.
1447
                    insert_manager._check_rebuild_block()
3735.32.21 by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al.
1448
                    bytes = record._manager._block.to_bytes()
1449
                    _, start, length = self._access.add_raw_records(
1450
                        [(None, len(bytes))], bytes)[0]
1451
                    del bytes
1452
                    block_start = start
1453
                    block_length = length
1454
                if record.storage_kind in ('groupcompress-block',
1455
                                           'groupcompress-block-ref'):
1456
                    assert insert_manager is not None
1457
                    assert record._manager is insert_manager
1458
                    value = "%d %d %d %d" % (block_start, block_length,
1459
                                             record._start, record._end)
1460
                    nodes = [(record.key, value, (record.parents,))]
1461
                    self._index.add_records(nodes, random_id=random_id)
1462
                    continue
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
1463
            try:
0.23.52 by John Arbash Meinel
Use the max_delta flag.
1464
                bytes = record.get_bytes_as('fulltext')
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
1465
            except errors.UnavailableRepresentation:
0.17.5 by Robert Collins
nograph tests completely passing.
1466
                adapter_key = record.storage_kind, 'fulltext'
1467
                adapter = get_adapter(adapter_key)
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
1468
                bytes = adapter.get_bytes(record)
0.20.13 by John Arbash Meinel
Play around a bit.
1469
            if len(record.key) > 1:
1470
                prefix = record.key[0]
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1471
                soft = (prefix == last_prefix)
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1472
            else:
1473
                prefix = None
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1474
                soft = False
1475
            if max_fulltext_len < len(bytes):
1476
                max_fulltext_len = len(bytes)
1477
                max_fulltext_prefix = prefix
3735.2.162 by John Arbash Meinel
Change GroupCompressor.compress() to return the start_point.
1478
            (found_sha1, start_point, end_point, type,
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1479
             length) = self._compressor.compress(record.key,
3735.31.12 by John Arbash Meinel
Push nostore_sha down through the stack.
1480
                bytes, record.sha1, soft=soft,
1481
                nostore_sha=nostore_sha)
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1482
            # delta_ratio = float(len(bytes)) / length
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1483
            # Check if we want to continue to include that text
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1484
            if (prefix == max_fulltext_prefix
1485
                and end_point < 2 * max_fulltext_len):
1486
                # As long as we are on the same file_id, we will fill at least
1487
                # 2 * max_fulltext_len
1488
                start_new_block = False
1489
            elif end_point > 4*1024*1024:
1490
                start_new_block = True
1491
            elif (prefix is not None and prefix != last_prefix
1492
                  and end_point > 2*1024*1024):
1493
                start_new_block = True
1494
            else:
1495
                start_new_block = False
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1496
            # if type == 'fulltext':
1497
            #     # If this is the first text, we don't do anything
1498
            #     if self._compressor.num_keys > 1:
1499
            #         if prefix is not None and prefix != last_prefix:
1500
            #             # We just inserted a fulltext for a different prefix
1501
            #             # (aka file-id).
1502
            #             if end_point > 512 * 1024:
1503
            #                 start_new_block = True
1504
            #             # TODO: Consider packing several small texts together
1505
            #             #       maybe only flush if end_point > some threshold
1506
            #             # if end_point > 512 * 1024 or len(bytes) <
1507
            #             #     start_new_block = true
1508
            #         else:
1509
            #             # We just added a fulltext, part of the same file-id
1510
            #             if (end_point > 2*1024*1024
1511
            #                 and end_point > 5*max_fulltext_len):
1512
            #                 start_new_block = True
1513
            #     last_fulltext_len = len(bytes)
1514
            # else:
1515
            #     delta_ratio = float(len(bytes)) / length
1516
            #     if delta_ratio < 3: # Not much compression
1517
            #         if end_point > 1*1024*1024:
1518
            #             start_new_block = True
1519
            #     elif delta_ratio < 10: # 10:1 compression
1520
            #         if end_point > 4*1024*1024:
1521
            #             start_new_block = True
1522
            last_prefix = prefix
1523
            if start_new_block:
1524
                self._compressor.pop_last()
1525
                flush()
1526
                max_fulltext_len = len(bytes)
3735.2.162 by John Arbash Meinel
Change GroupCompressor.compress() to return the start_point.
1527
                (found_sha1, start_point, end_point, type,
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1528
                 length) = self._compressor.compress(record.key,
1529
                    bytes, record.sha1)
1530
                last_fulltext_len = length
0.17.26 by Robert Collins
Working better --gc-plain-chk.
1531
            if record.key[-1] is None:
1532
                key = record.key[:-1] + ('sha1:' + found_sha1,)
1533
            else:
1534
                key = record.key
1535
            self._unadded_refs[key] = record.parents
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
1536
            yield found_sha1
3735.2.164 by John Arbash Meinel
Fix a critical bug that caused problems with the index entries.
1537
            keys_to_add.append((key, '%d %d' % (start_point, end_point),
0.17.5 by Robert Collins
nograph tests completely passing.
1538
                (record.parents,)))
0.17.8 by Robert Collins
Flush pending updates at the end of _insert_record_stream
1539
        if len(keys_to_add):
1540
            flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
1541
        self._compressor = None
0.17.5 by Robert Collins
nograph tests completely passing.
1542
1543
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
1544
        """Iterate over the lines in the versioned files from keys.
1545
1546
        This may return lines from other keys. Each item the returned
1547
        iterator yields is a tuple of a line and a text version that that line
1548
        is present in (not introduced in).
1549
1550
        Ordering of results is in whatever order is most suitable for the
1551
        underlying storage format.
1552
1553
        If a progress bar is supplied, it may be used to indicate progress.
1554
        The caller is responsible for cleaning up progress bars (because this
1555
        is an iterator).
1556
1557
        NOTES:
1558
         * Lines are normalised by the underlying store: they will all have \n
1559
           terminators.
1560
         * Lines are returned in arbitrary order.
1561
1562
        :return: An iterator over (line, key).
1563
        """
1564
        if pb is None:
1565
            pb = progress.DummyProgress()
1566
        keys = set(keys)
1567
        total = len(keys)
1568
        # we don't care about inclusions, the caller cares.
1569
        # but we need to setup a list of records to visit.
1570
        # we need key, position, length
1571
        for key_idx, record in enumerate(self.get_record_stream(keys,
1572
            'unordered', True)):
1573
            # XXX: todo - optimise to use less than full texts.
1574
            key = record.key
3735.32.1 by John Arbash Meinel
Fix the VF WalkingContent checks.
1575
            pb.update('Walking content', key_idx, total)
0.17.5 by Robert Collins
nograph tests completely passing.
1576
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1577
                raise errors.RevisionNotPresent(key, self)
0.17.5 by Robert Collins
nograph tests completely passing.
1578
            lines = split_lines(record.get_bytes_as('fulltext'))
1579
            for line in lines:
1580
                yield line, key
3735.32.1 by John Arbash Meinel
Fix the VF WalkingContent checks.
1581
        pb.update('Walking content', total, total)
0.17.5 by Robert Collins
nograph tests completely passing.
1582
1583
    def keys(self):
1584
        """See VersionedFiles.keys."""
1585
        if 'evil' in debug.debug_flags:
1586
            trace.mutter_callsite(2, "keys scales with size of history")
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
1587
        sources = [self._index] + self._fallback_vfs
0.17.5 by Robert Collins
nograph tests completely passing.
1588
        result = set()
1589
        for source in sources:
1590
            result.update(source.keys())
1591
        return result
1592
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1593
1594
class _GCGraphIndex(object):
1595
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
1596
0.17.9 by Robert Collins
Initial stab at repository format support.
1597
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1598
        add_callback=None):
1599
        """Construct a _GCGraphIndex on a graph_index.
1600
1601
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1602
        :param is_locked: A callback, returns True if the index is locked and
1603
            thus usable.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1604
        :param parents: If True, record knits parents, if not do not record
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1605
            parents.
1606
        :param add_callback: If not None, allow additions to the index and call
1607
            this callback with a list of added GraphIndex nodes:
1608
            [(node, value, node_refs), ...]
1609
        """
1610
        self._add_callback = add_callback
1611
        self._graph_index = graph_index
1612
        self._parents = parents
1613
        self.has_graph = parents
1614
        self._is_locked = is_locked
1615
0.17.5 by Robert Collins
nograph tests completely passing.
1616
    def add_records(self, records, random_id=False):
1617
        """Add multiple records to the index.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1618
0.17.5 by Robert Collins
nograph tests completely passing.
1619
        This function does not insert data into the Immutable GraphIndex
1620
        backing the KnitGraphIndex, instead it prepares data for insertion by
1621
        the caller and checks that it is safe to insert then calls
1622
        self._add_callback with the prepared GraphIndex nodes.
1623
1624
        :param records: a list of tuples:
1625
                         (key, options, access_memo, parents).
1626
        :param random_id: If True the ids being added were randomly generated
1627
            and no check for existence will be performed.
1628
        """
1629
        if not self._add_callback:
1630
            raise errors.ReadOnlyError(self)
1631
        # we hope there are no repositories with inconsistent parentage
1632
        # anymore.
1633
1634
        changed = False
1635
        keys = {}
1636
        for (key, value, refs) in records:
1637
            if not self._parents:
1638
                if refs:
1639
                    for ref in refs:
1640
                        if ref:
1641
                            raise KnitCorrupt(self,
1642
                                "attempt to add node with parents "
1643
                                "in parentless index.")
1644
                    refs = ()
1645
                    changed = True
1646
            keys[key] = (value, refs)
1647
        # check for dups
1648
        if not random_id:
1649
            present_nodes = self._get_entries(keys)
1650
            for (index, key, value, node_refs) in present_nodes:
1651
                if node_refs != keys[key][1]:
1652
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
1653
                        ": %s %s" % ((value, node_refs), keys[key]))
1654
                del keys[key]
1655
                changed = True
1656
        if changed:
1657
            result = []
1658
            if self._parents:
1659
                for key, (value, node_refs) in keys.iteritems():
1660
                    result.append((key, value, node_refs))
1661
            else:
1662
                for key, (value, node_refs) in keys.iteritems():
1663
                    result.append((key, value))
1664
            records = result
1665
        self._add_callback(records)
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1666
0.17.5 by Robert Collins
nograph tests completely passing.
1667
    def _check_read(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1668
        """Raise an exception if reads are not permitted."""
0.17.5 by Robert Collins
nograph tests completely passing.
1669
        if not self._is_locked():
1670
            raise errors.ObjectNotLocked(self)
1671
0.17.2 by Robert Collins
Core proof of concept working.
1672
    def _check_write_ok(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1673
        """Raise an exception if writes are not permitted."""
0.17.2 by Robert Collins
Core proof of concept working.
1674
        if not self._is_locked():
1675
            raise errors.ObjectNotLocked(self)
1676
0.17.5 by Robert Collins
nograph tests completely passing.
1677
    def _get_entries(self, keys, check_present=False):
1678
        """Get the entries for keys.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1679
1680
        Note: Callers are responsible for checking that the index is locked
1681
        before calling this method.
1682
0.17.5 by Robert Collins
nograph tests completely passing.
1683
        :param keys: An iterable of index key tuples.
1684
        """
1685
        keys = set(keys)
1686
        found_keys = set()
1687
        if self._parents:
1688
            for node in self._graph_index.iter_entries(keys):
1689
                yield node
1690
                found_keys.add(node[1])
1691
        else:
1692
            # adapt parentless index to the rest of the code.
1693
            for node in self._graph_index.iter_entries(keys):
1694
                yield node[0], node[1], node[2], ()
1695
                found_keys.add(node[1])
1696
        if check_present:
1697
            missing_keys = keys.difference(found_keys)
1698
            if missing_keys:
1699
                raise RevisionNotPresent(missing_keys.pop(), self)
1700
1701
    def get_parent_map(self, keys):
1702
        """Get a map of the parents of keys.
1703
1704
        :param keys: The keys to look up parents for.
1705
        :return: A mapping from keys to parents. Absent keys are absent from
1706
            the mapping.
1707
        """
1708
        self._check_read()
1709
        nodes = self._get_entries(keys)
1710
        result = {}
1711
        if self._parents:
1712
            for node in nodes:
1713
                result[node[1]] = node[3][0]
1714
        else:
1715
            for node in nodes:
1716
                result[node[1]] = None
1717
        return result
1718
1719
    def get_build_details(self, keys):
1720
        """Get the various build details for keys.
1721
1722
        Ghosts are omitted from the result.
1723
1724
        :param keys: An iterable of keys.
1725
        :return: A dict of key:
1726
            (index_memo, compression_parent, parents, record_details).
1727
            index_memo
1728
                opaque structure to pass to read_records to extract the raw
1729
                data
1730
            compression_parent
1731
                Content that this record is built upon, may be None
1732
            parents
1733
                Logical parents of this node
1734
            record_details
1735
                extra information about the content which needs to be passed to
1736
                Factory.parse_record
1737
        """
1738
        self._check_read()
1739
        result = {}
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1740
        entries = self._get_entries(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
1741
        for entry in entries:
1742
            key = entry[1]
1743
            if not self._parents:
1744
                parents = None
1745
            else:
1746
                parents = entry[3][0]
1747
            method = 'group'
1748
            result[key] = (self._node_to_position(entry),
1749
                                  None, parents, (method, None))
1750
        return result
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1751
0.17.5 by Robert Collins
nograph tests completely passing.
1752
    def keys(self):
1753
        """Get all the keys in the collection.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1754
0.17.5 by Robert Collins
nograph tests completely passing.
1755
        The keys are not ordered.
1756
        """
1757
        self._check_read()
1758
        return [node[1] for node in self._graph_index.iter_all_entries()]
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1759
0.17.5 by Robert Collins
nograph tests completely passing.
1760
    def _node_to_position(self, node):
1761
        """Convert an index value to position details."""
1762
        bits = node[2].split(' ')
1763
        # It would be nice not to read the entire gzip.
1764
        start = int(bits[0])
1765
        stop = int(bits[1])
1766
        basis_end = int(bits[2])
1767
        delta_end = int(bits[3])
1768
        return node[0], start, stop, basis_end, delta_end
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1769
1770
1771
try:
3735.31.1 by John Arbash Meinel
Bring the groupcompress plugin into the brisbane-core branch.
1772
    from bzrlib import _groupcompress_pyx
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1773
except ImportError:
1774
    pass