/brz/remove-bazaar : contents of breezy/btree

: (revision 6670.2.1)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

5752.3.8 by John Arbash Meinel Merge bzr.dev 5764 to resolve release-notes (aka NEWS) conflicts	1	# Copyright (C) 2008-2011 Canonical Ltd
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	2	#
	3	# This program is free software; you can redistribute it and/or modify
3641.3.29 by John Arbash Meinel Cleanup the copyright headers	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	16	#
	17
6379.6.7 by Jelmer Vernooij Move importing from future until after doc string, otherwise the doc string will disappear.	18	"""B+Tree indices"""
	19
6379.6.1 by Jelmer Vernooij Import absolute_import in a few places.	20	from __future__ import absolute_import
	21
6624 by Jelmer Vernooĳ Merge Python3 porting work ('py3 pokes')	22	from .lazy_import import lazy_import
5753.2.2 by Jelmer Vernooij Remove some unnecessary imports, clean up lazy imports.	23	lazy_import(globals(), """
5753.2.4 by Jelmer Vernooij Review feedback from John.	24	import bisect
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	25	import math
	26	import tempfile
	27	import zlib
5753.2.2 by Jelmer Vernooij Remove some unnecessary imports, clean up lazy imports.	28	""")
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	29
6624 by Jelmer Vernooĳ Merge Python3 porting work ('py3 pokes')	30	from . import (
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	31	chunk_writer,
	32	debug,
	33	errors,
4208.1.2 by John Arbash Meinel Switch to using a FIFOCache.	34	fifo_cache,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	35	index,
	36	lru_cache,
	37	osutils,
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	38	static_tuple,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	39	trace,
5273.1.7 by Vincent Ladeuil No more use of the get_transport imported symbol, all uses are through	40	transport,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	41	)
6624 by Jelmer Vernooĳ Merge Python3 porting work ('py3 pokes')	42	from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
	43	from .sixish import (
6621.22.2 by Martin Use BytesIO or StringIO from bzrlib.sixish	44	BytesIO,
6634.1.1 by Martin Remove direct use of future_builtins module	45	map,
6651.2.2 by Martin Apply 2to3 xrange fix and fix up with sixish range	46	range,
6656.1.1 by Martin Apply 2to3 dict fixer and clean up resulting mess using view helpers	47	viewitems,
	48	viewkeys,
	49	viewvalues,
6621.22.2 by Martin Use BytesIO or StringIO from bzrlib.sixish	50	)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	51
	52
# First line of the header written at the start of every serialised index.
_BTSIGNATURE = "B+Tree Graph Index 2\n"
# Header option prefix; followed by the comma-separated node count of each row.
_OPTION_ROW_LENGTHS = "row_lengths="
# First line written into every leaf node.
_LEAF_FLAG = "type=leaf\n"
# First line written into every internal node.
_INTERNAL_FLAG = "type=internal\n"
# Prefix of the second line of an internal node; followed by a node count
# taken from the row below it.
_INTERNAL_OFFSET = "offset="

# Number of bytes set aside at the front of the first node of each row. In the
# root row this space holds the index header.
_RESERVED_HEADER_BYTES = 120
# Size in bytes of one node (page) of the index.
_PAGE_SIZE = 4096

# 4K per page: 4MB - 1000 entries
_NODE_CACHE_SIZE = 1000
	64
	65
	66	class _BuilderRow(object):
	67	"""The stored state accumulated while writing out a row in the index.
	68
	69	:ivar spool: A temporary file used to accumulate nodes for this row
	70	in the tree.
	71	:ivar nodes: The count of nodes emitted so far.
	72	"""
	73
	74	def __init__(self):
	75	"""Create a _BuilderRow."""
	76	self.nodes = 0
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	77	self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	78	self.writer = None
	79
	80	def finish_node(self, pad=True):
	81	byte_lines, _, padding = self.writer.finish()
	82	if self.nodes == 0:
6621.22.2 by Martin Use BytesIO or StringIO from bzrlib.sixish	83	self.spool = BytesIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	84	# padded note:
	85	self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	86	elif self.nodes == 1:
	87	# We got bigger than 1 node, switch to a temp file
	88	spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
	89	spool.write(self.spool.getvalue())
	90	self.spool = spool
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	91	skipped_bytes = 0
	92	if not pad and padding:
	93	del byte_lines[-1]
	94	skipped_bytes = padding
	95	self.spool.writelines(byte_lines)
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	96	remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
	97	if remainder != 0:
	98	raise AssertionError("incorrect node length: %d, %d"
	99	% (self.spool.tell(), remainder))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	100	self.nodes += 1
	101	self.writer = None
	102
	103
	104	class _InternalBuilderRow(_BuilderRow):
	105	"""The stored state accumulated while writing out internal rows."""
	106
	107	def finish_node(self, pad=True):
	108	if not pad:
	109	raise AssertionError("Must pad internal nodes only.")
	110	_BuilderRow.finish_node(self)
	111
	112
	113	class _LeafBuilderRow(_BuilderRow):
	114	"""The stored state accumulated while writing out a leaf rows."""
	115
	116
	117	class BTreeBuilder(index.GraphIndexBuilder):
	118	"""A Builder for B+Tree based Graph indices.
	119
	120	The resulting graph has the structure:
	121
	122	_SIGNATURE OPTIONS NODES
	123	_SIGNATURE := 'B+Tree Graph Index 2' NEWLINE
	124	OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH ROW_LENGTHS
	125	REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
	126	KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
	127	LENGTH := 'len=' DIGITS NEWLINE
	128	ROW_LENGTHS := 'row_lengths=' (DIGITS (COMMA DIGITS)*)? NEWLINE
	129	NODES := NODE_COMPRESSED*
	130	NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
	131	NODE_RAW := INTERNAL | LEAF
	132	INTERNAL := INTERNAL_FLAG POINTERS
	133	LEAF := LEAF_FLAG ROWS
	134	KEY_ELEMENT := Not-whitespace-utf8
	135	KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
	136	ROWS := ROW*
	137	ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
	138	ABSENT := 'a'
	139	REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
	140	REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
	141	REFERENCE := KEY
	142	VALUE := no-newline-no-null-bytes
	143	"""
	144
	145	def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
	146	"""See GraphIndexBuilder.__init__.
	147
	148	:param spill_at: Optional parameter controlling the maximum number
	149	of nodes that BTreeBuilder will hold in memory.
	150	"""
	151	index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
	152	key_elements=key_elements)
	153	self._spill_at = spill_at
	154	self._backing_indices = []
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	155	# A map of {key: (node_refs, value)}
	156	self._nodes = {}
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	157	# Indicate it hasn't been built yet
	158	self._nodes_by_key = None
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	159	self._optimize_for_size = False
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	160
	161	def add_node(self, key, value, references=()):
	162	"""Add a node to the index.
	163
	164	If adding the node causes the builder to reach its spill_at threshold,
	165	disk spilling will be triggered.
	166
	167	:param key: The key. keys are non-empty tuples containing
	168	as many whitespace-free utf8 bytestrings as the key length
	169	defined for this index.
	170	:param references: An iterable of iterables of keys. Each is a
	171	reference to another key.
	172	:param value: The value to associate with the key. It may be any
5891.1.3 by Andrew Bennetts Move docstring formatting fixes.	173	bytes as long as it does not contain \\0 or \\n.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	174	"""
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	175	# Ensure that 'key' is a StaticTuple
	176	key = static_tuple.StaticTuple.from_sequence(key).intern()
3644.2.9 by John Arbash Meinel Refactor some code.	177	# we don't care about absent_references
3644.2.9 by John Arbash Meinel Refactor some code.	178	node_refs, _ = self._check_key_ref_value(key, references, value)
3644.2.2 by John Arbash Meinel the new btree index doesn't have 'absent' keys in its _nodes	179	if key in self._nodes:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	180	raise errors.BadIndexDuplicateKey(key, self)
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	181	self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
3644.2.9 by John Arbash Meinel Refactor some code.	182	if self._nodes_by_key is not None and self._key_length > 1:
3644.2.9 by John Arbash Meinel Refactor some code.	183	self._update_nodes_by_key(key, value, node_refs)
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	184	if len(self._nodes) < self._spill_at:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	185	return
3644.2.9 by John Arbash Meinel Refactor some code.	186	self._spill_mem_keys_to_disk()
	187
	188	def _spill_mem_keys_to_disk(self):
	189	"""Write the in memory keys down to disk to cap memory consumption.
	190
	191	If we already have some keys written to disk, we will combine them so
	192	as to preserve the sorted order. The algorithm for combining uses
	193	powers of two. So on the first spill, write all mem nodes into a
	194	single index. On the second spill, combine the mem nodes with the nodes
	195	on disk to create a 2x sized disk index and get rid of the first index.
	196	On the third spill, create a single new disk index, which will contain
	197	the mem nodes, and preserve the existing 2x sized index. On the fourth,
	198	combine mem with the first and second indexes, creating a new one of
	199	size 4x. On the fifth create a single new one, etc.
	200	"""
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	201	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	202	(new_backing_file, size,
	203	backing_pos) = self._spill_mem_keys_and_combine()
	204	else:
	205	new_backing_file, size = self._spill_mem_keys_without_combining()
	206	# Note: The transport here isn't strictly needed, because we will use
	207	# direct access to the new_backing._file object
6083.1.1 by Jelmer Vernooij Use get_transport_from_{url,path} in more places.	208	new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
5273.1.7 by Vincent Ladeuil No more use of the get_transport imported symbol, all uses are through	209	'<temp>', size)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	210	# GC will clean up the file
	211	new_backing._file = new_backing_file
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	212	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	213	if len(self._backing_indices) == backing_pos:
	214	self._backing_indices.append(None)
	215	self._backing_indices[backing_pos] = new_backing
	216	for backing_pos in range(backing_pos):
	217	self._backing_indices[backing_pos] = None
	218	else:
	219	self._backing_indices.append(new_backing)
	220	self._nodes = {}
	221	self._nodes_by_key = None
	222
	223	def _spill_mem_keys_without_combining(self):
	224	return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
	225
	226	def _spill_mem_keys_and_combine(self):
4168.3.4 by John Arbash Meinel Restore the ability to spill, but prepare a flag to disable it.	227	iterators_to_combine = [self._iter_mem_nodes()]
	228	pos = -1
	229	for pos, backing in enumerate(self._backing_indices):
	230	if backing is None:
	231	pos -= 1
	232	break
	233	iterators_to_combine.append(backing.iter_all_entries())
	234	backing_pos = pos + 1
	235	new_backing_file, size = \
	236	self._write_nodes(self._iter_smallest(iterators_to_combine),
	237	allow_optimize=False)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	238	return new_backing_file, size, backing_pos
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	239
	240	def add_nodes(self, nodes):
	241	"""Add nodes to the index.
	242
	243	:param nodes: An iterable of (key, node_refs, value) entries to add.
	244	"""
	245	if self.reference_lists:
	246	for (key, value, node_refs) in nodes:
	247	self.add_node(key, value, node_refs)
	248	else:
	249	for (key, value) in nodes:
	250	self.add_node(key, value)
	251
	252	def _iter_mem_nodes(self):
	253	"""Iterate over the nodes held in memory."""
3644.2.8 by John Arbash Meinel Two quick tweaks.	254	nodes = self._nodes
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	255	if self.reference_lists:
3644.2.8 by John Arbash Meinel Two quick tweaks.	256	for key in sorted(nodes):
	257	references, value = nodes[key]
	258	yield self, key, value, references
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	259	else:
3644.2.8 by John Arbash Meinel Two quick tweaks.	260	for key in sorted(nodes):
	261	references, value = nodes[key]
	262	yield self, key, value
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	263
	264	def _iter_smallest(self, iterators_to_combine):
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	265	if len(iterators_to_combine) == 1:
	266	for value in iterators_to_combine[0]:
	267	yield value
	268	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	269	current_values = []
	270	for iterator in iterators_to_combine:
	271	try:
6634.2.1 by Martin Apply 2to3 next fixer and make compatible	272	current_values.append(next(iterator))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	273	except StopIteration:
	274	current_values.append(None)
	275	last = None
	276	while True:
	277	# Decorate candidates with the value to allow 2.4's min to be used.
	278	candidates = [(item[1][1], item) for item
	279	in enumerate(current_values) if item[1] is not None]
	280	if not len(candidates):
	281	return
	282	selected = min(candidates)
	283	# undecorate back to (pos, node)
	284	selected = selected[1]
	285	if last == selected[1][1]:
	286	raise errors.BadIndexDuplicateKey(last, self)
	287	last = selected[1][1]
	288	# Yield, with self as the index
	289	yield (self,) + selected[1][1:]
	290	pos = selected[0]
	291	try:
6634.2.1 by Martin Apply 2to3 next fixer and make compatible	292	current_values[pos] = next(iterators_to_combine[pos])
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	293	except StopIteration:
	294	current_values[pos] = None
	295
    def _add_key(self, string_key, line, rows, allow_optimize=True):
        """Add a key to the current chunk.

        If the line does not fit into the open leaf node, that node is
        finished, the key is recorded in the internal rows above (adding a
        new root row when none of them has room), and the method calls
        itself again to write the line into a fresh leaf node.

        :param string_key: The key to add.
        :param line: The fully serialised key and value.
        :param rows: The list of builder rows being filled; the last entry
            is the leaf row and the entries before it are internal rows.
            A new root row may be inserted at position 0.
        :param allow_optimize: If set to False, prevent setting the optimize
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        :raises BadIndexKey: If the line does not fit even into a freshly
            opened leaf node.
        """
        new_leaf = False
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            new_leaf = True
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                if internal_row.writer is None:
                    length = _PAGE_SIZE
                    if internal_row.nodes == 0:
                        length -= _RESERVED_HEADER_BYTES # padded
                    if allow_optimize:
                        optimize_for_size = self._optimize_for_size
                    else:
                        optimize_for_size = False
                    internal_row.writer = chunk_writer.ChunkWriter(length, 0,
                        optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET +
                        str(rows[pos + 1].nodes) + "\n")
            # add a new leaf
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES # padded
            # NOTE(review): unlike the internal rows above, the leaf writer
            # uses self._optimize_for_size even when allow_optimize is False
            # -- confirm whether that is intended.
            rows[-1].writer = chunk_writer.ChunkWriter(length,
                optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        # A true result from write() is treated as "the data did not fit".
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise errors.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + "\n"
            new_row = True
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                # doesn't fit then propagate upwards until we find one that
                # it does fit into.
                if row.writer.write(key_line):
                    row.finish_node()
                else:
                    # We've found a node that can handle the pointer.
                    new_row = False
                    break
            # If we reached the current root without being able to mark the
            # division point, then we need a new root:
            if new_row:
                # We need a new row
                if 'index' in debug.debug_flags:
                    trace.mutter('Inserting new global row.')
                new_row = _InternalBuilderRow()
                reserved_bytes = 0
                rows.insert(0, new_row)
                # The first node of the new row is prefixed with the reserved
                # header space, hence the smaller writer size.
                # NOTE(review): this writer also ignores allow_optimize.
                new_row.writer = chunk_writer.ChunkWriter(
                    _PAGE_SIZE - _RESERVED_HEADER_BYTES,
                    reserved_bytes,
                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET +
                    str(rows[1].nodes - 1) + "\n")
                new_row.writer.write(key_line)
            # Room has been made: retry, which opens a fresh leaf node.
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	371
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	372	def _write_nodes(self, node_iterator, allow_optimize=True):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	373	"""Write node_iterator out as a B+Tree.
	374
	375	:param node_iterator: An iterator of sorted nodes. Each node should
	376	match the output given by iter_all_entries.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	377	:param allow_optimize: If set to False, prevent setting the optimize
	378	flag when writing out. This is used by the _spill_mem_keys_to_disk
	379	functionality.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	380	:return: A file handle for a temporary file containing a B+Tree for
	381	the nodes.
	382	"""
	383	# The index rows - rows[0] is the root, rows[1] is the layer under it
	384	# etc.
	385	rows = []
	386	# forward sorted by key. In future we may consider topological sorting,
	387	# at the cost of table scans for direct lookup, or a second index for
	388	# direct lookup
	389	key_count = 0
	390	# A stack with the number of nodes of each size. 0 is the root node
	391	# and must always be 1 (if there are any nodes in the tree).
	392	self.row_lengths = []
	393	# Loop over all nodes adding them to the bottom row
	394	# (rows[-1]). When we finish a chunk in a row,
4031.3.1 by Frank Aspell Fixing various typos	395	# propagate the key that didn't fit (comes after the chunk) to the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	396	# row above, transitively.
	397	for node in node_iterator:
	398	if key_count == 0:
	399	# First key triggers the first row
	400	rows.append(_LeafBuilderRow())
6178.2.4 by Shannon Weyrick raise BadIndexKey instead of skipping	401	key_count += 1
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	402	string_key, line = _btree_serializer._flatten_node(node,
	403	self.reference_lists)
6178.2.4 by Shannon Weyrick raise BadIndexKey instead of skipping	404	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	405	for row in reversed(rows):
6619.3.18 by Jelmer Vernooĳ Run 2to3 idioms fixer.	406	pad = (not isinstance(row, _LeafBuilderRow))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	407	row.finish_node(pad=pad)
	408	lines = [_BTSIGNATURE]
	409	lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
	410	lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
	411	lines.append(_OPTION_LEN + str(key_count) + '\n')
	412	row_lengths = [row.nodes for row in rows]
	413	lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	414	if row_lengths and row_lengths[-1] > 1:
	415	result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
	416	else:
6621.22.2 by Martin Use BytesIO or StringIO from bzrlib.sixish	417	result = BytesIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	418	result.writelines(lines)
	419	position = sum(map(len, lines))
	420	root_row = True
	421	if position > _RESERVED_HEADER_BYTES:
	422	raise AssertionError("Could not fit the header in the"
	423	" reserved space: %d > %d"
	424	% (position, _RESERVED_HEADER_BYTES))
	425	# write the rows out:
	426	for row in rows:
	427	reserved = _RESERVED_HEADER_BYTES # reserved space for first node
	428	row.spool.flush()
	429	row.spool.seek(0)
	430	# copy nodes to the finalised file.
	431	# Special case the first node as it may be prefixed
	432	node = row.spool.read(_PAGE_SIZE)
	433	result.write(node[reserved:])
4771.3.1 by John Arbash Meinel We don't have to pad 'short' records.	434	if len(node) == _PAGE_SIZE:
	435	result.write("\x00" * (reserved - position))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	436	position = 0 # Only the root row actually has an offset
	437	copied_len = osutils.pumpfile(row.spool, result)
	438	if copied_len != (row.nodes - 1) * _PAGE_SIZE:
6619.3.18 by Jelmer Vernooĳ Run 2to3 idioms fixer.	439	if not isinstance(row, _LeafBuilderRow):
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	440	raise AssertionError("Incorrect amount of data copied"
	441	" expected: %d, got: %d"
	442	% ((row.nodes - 1) * _PAGE_SIZE,
	443	copied_len))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	444	result.flush()
	445	size = result.tell()
	446	result.seek(0)
	447	return result, size
	448
	449	def finish(self):
	450	"""Finalise the index.
	451
	452	:return: A file handle for a temporary file containing the nodes added
	453	to the index.
	454	"""
	455	return self._write_nodes(self.iter_all_entries())[0]
	456
	457	def iter_all_entries(self):
	458	"""Iterate over all keys within the index
	459
4343.2.2 by John Arbash Meinel Fix an important doc bug about the api of iter_all_entries()	460	:return: An iterable of (index, key, value, reference_lists). There is
	461	no defined order for the result iteration - it will be in the most
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	462	efficient order for the index (in this case dictionary hash order).
	463	"""
	464	if 'evil' in debug.debug_flags:
	465	trace.mutter_callsite(3,
	466	"iter_all_entries scales with size of history.")
	467	# Doing serial rather than ordered would be faster; but this shouldn't
	468	# be getting called routinely anyway.
3644.2.8 by John Arbash Meinel Two quick tweaks.	469	iterators = [self._iter_mem_nodes()]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	470	for backing in self._backing_indices:
	471	if backing is not None:
	472	iterators.append(backing.iter_all_entries())
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	473	if len(iterators) == 1:
	474	return iterators[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	475	return self._iter_smallest(iterators)
	476
	477	def iter_entries(self, keys):
	478	"""Iterate over keys within the index.
	479
	480	:param keys: An iterable providing the keys to be retrieved.
	481	:return: An iterable of (index, key, value, reference_lists). There is no
	482	defined order for the result iteration - it will be in the most
	483	efficient order for the index (keys iteration order in this case).
	484	"""
	485	keys = set(keys)
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	486	# Note: We don't use keys.intersection() here. If you read the C api,
	487	# set.intersection(other) special cases when other is a set and
	488	# will iterate the smaller of the two and lookup in the other.
	489	# It does not do this for any other type (even dict, unlike
	490	# some other set functions.) Since we expect keys is generally <<
	491	# self._nodes, it is faster to iterate over it in a list
	492	# comprehension
	493	nodes = self._nodes
	494	local_keys = [key for key in keys if key in nodes]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	495	if self.reference_lists:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	496	for key in local_keys:
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	497	node = nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	498	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	499	else:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	500	for key in local_keys:
4789.28.2 by John Arbash Meinel Get rid of the GraphIndexBuilder/BTreeBuilder._keys attribute.	501	node = nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	502	yield self, key, node[1]
3847.2.1 by John Arbash Meinel Shortcut BTreeBuilder.iter_entries when there are no backing indices.	503	# Find things that are in backing indices that have not been handled
	504	# yet.
3847.2.3 by John Arbash Meinel Bring back the shortcut	505	if not self._backing_indices:
3847.2.3 by John Arbash Meinel Bring back the shortcut	506	return # We won't find anything there either
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	507	# Remove all of the keys that we found locally
	508	keys.difference_update(local_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	509	for backing in self._backing_indices:
	510	if backing is None:
	511	continue
	512	if not keys:
	513	return
	514	for node in backing.iter_entries(keys):
	515	keys.remove(node[1])
	516	yield (self,) + node[1:]
	517
	518	def iter_entries_prefix(self, keys):
	519	"""Iterate over keys within the index using prefix matching.
	520
	521	Prefix matching is applied within the tuple of a key, not to within
	522	the bytestring of each key element. e.g. if you have the keys ('foo',
	523	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	524	only the former key is returned.
	525
	526	:param keys: An iterable providing the key prefixes to be retrieved.
	527	Each key prefix takes the form of a tuple the length of a key, but
	528	with the last N elements 'None' rather than a regular bytestring.
	529	The first element cannot be 'None'.
	530	:return: An iterable as per iter_all_entries, but restricted to the
	531	keys with a matching prefix to those supplied. No additional keys
	532	will be returned, and every match that is in the index will be
	533	returned.
	534	"""
	535	keys = set(keys)
	536	if not keys:
	537	return
	538	for backing in self._backing_indices:
	539	if backing is None:
	540	continue
	541	for node in backing.iter_entries_prefix(keys):
	542	yield (self,) + node[1:]
	543	if self._key_length == 1:
	544	for key in keys:
6654.1.1 by Martin Factor out some copycode in iter_entries_prefix implementations	545	index._sanity_check_key(self, key)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	546	try:
	547	node = self._nodes[key]
	548	except KeyError:
	549	continue
	550	if self.reference_lists:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	551	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	552	else:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	553	yield self, key, node[1]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	554	return
6654.1.1 by Martin Factor out some copycode in iter_entries_prefix implementations	555	nodes_by_key = self._get_nodes_by_key()
	556	for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
	557	yield entry
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	558
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	559	def _get_nodes_by_key(self):
	560	if self._nodes_by_key is None:
	561	nodes_by_key = {}
	562	if self.reference_lists:
6656.1.1 by Martin Apply 2to3 dict fixer and clean up resulting mess using view helpers	563	for key, (references, value) in viewitems(self._nodes):
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	564	key_dict = nodes_by_key
	565	for subkey in key[:-1]:
	566	key_dict = key_dict.setdefault(subkey, {})
	567	key_dict[key[-1]] = key, value, references
	568	else:
6656.1.1 by Martin Apply 2to3 dict fixer and clean up resulting mess using view helpers	569	for key, (references, value) in viewitems(self._nodes):
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	570	key_dict = nodes_by_key
	571	for subkey in key[:-1]:
	572	key_dict = key_dict.setdefault(subkey, {})
	573	key_dict[key[-1]] = key, value
	574	self._nodes_by_key = nodes_by_key
	575	return self._nodes_by_key
	576
def key_count(self):
    """Return an estimate of the number of keys in this index.

    The result is the number of in-memory nodes plus the key_count() of
    every backing index that is not None.
    """
    total = len(self._nodes)
    for spilled in self._backing_indices:
        if spilled is not None:
            total += spilled.key_count()
    return total
	584
	585	def validate(self):
	586	"""In memory index's have no known corruption at the moment."""
	587
	588
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	589	class _LeafNode(dict):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	590	"""A leaf node for a serialised B+Tree index."""
	591
5365.5.23 by John Arbash Meinel A __sizeof__ check that ensure we are getting what we are looking for.	592	__slots__ = ('min_key', 'max_key', '_keys')
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	593
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	594	def __init__(self, bytes, key_length, ref_list_length):
	595	"""Parse bytes to create a leaf node object."""
	596	# splitlines mangles the \r delimiters.. don't use it.
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	597	key_list = _btree_serializer._parse_leaf_lines(bytes,
	598	key_length, ref_list_length)
	599	if key_list:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	600	self.min_key = key_list[0][0]
	601	self.max_key = key_list[-1][0]
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	602	else:
	603	self.min_key = self.max_key = None
5365.5.12 by John Arbash Meinel Make _LeafNode inherit from dict (is-a rather than have-a)	604	super(_LeafNode, self).__init__(key_list)
5365.5.23 by John Arbash Meinel A __sizeof__ check that ensure we are getting what we are looking for.	605	self._keys = dict(self)
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	606
	607	def all_items(self):
	608	"""Return a sorted list of (key, (value, refs)) items"""
6619.3.18 by Jelmer Vernooĳ Run 2to3 idioms fixer.	609	items = sorted(self.items())
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	610	return items
	611
	612	def all_keys(self):
	613	"""Return a sorted list of all keys."""
6619.3.18 by Jelmer Vernooĳ Run 2to3 idioms fixer.	614	keys = sorted(self.keys())
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	615	return keys
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	616
	617
	618	class _InternalNode(object):
	619	"""An internal node for a serialised B+Tree index."""
	620
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	621	__slots__ = ('keys', 'offset')
	622
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	623	def __init__(self, bytes):
	624	"""Parse bytes to create an internal node object."""
	625	# splitlines mangles the \r delimiters.. don't use it.
	626	self.keys = self._parse_lines(bytes.split('\n'))
	627
	628	def _parse_lines(self, lines):
	629	nodes = []
	630	self.offset = int(lines[1][7:])
4789.28.1 by John Arbash Meinel Use StaticTuple as part of the builder process.	631	as_st = static_tuple.StaticTuple.from_sequence
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	632	for line in lines[2:]:
	633	if line == '':
	634	break
6631.3.1 by Martin Run 2to3 map fixer and refactor after	635	# GZ 2017-05-24: Used to intern() each chunk of line as well, need
	636	# to recheck performance and perhaps adapt StaticTuple to adjust.
	637	nodes.append(as_st(line.split(b'\0')).intern())
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	638	return nodes
	639
	640
	641	class BTreeGraphIndex(object):
	642	"""Access to nodes via the standard GraphIndex interface for B+Tree's.
	643
	644	Individual nodes are held in a LRU cache. This holds the root node in
	645	memory except when very large walks are done.
	646	"""
	647
def __init__(self, transport, name, size, unlimited_cache=False,
             offset=0):
    """Create a B+Tree index object on the index name.

    :param transport: The transport to read data for the index from.
    :param name: The file name of the index on transport.
    :param size: Optional size of the index in bytes.  Supplying it keeps
        the API compatible with GraphIndex, lets the initial read (of the
        root node header) avoid over-reading even on empty indices, and
        lets small indices be read in full with a single IO.
    :param unlimited_cache: When True, plain dicts are used in place of
        the size-limited caches, so every node read stays cached.
    :param offset: Where the btree index data starts within the file,
        for files in which it does not begin at byte 0.
    """
    self._transport = transport
    self._name = name
    self._size = size
    self._file = None
    # Relies on self._transport having been assigned above.
    self._recommended_pages = self._compute_recommended_pages()
    self._base_offset = offset
    self._leaf_factory = _LeafNode
    # The key => value cache is switched off; the commented-out default
    # would hold at most 100,000 leaf values.
    self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
    if not unlimited_cache:
        self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
        # A FIFO is used here only to guard against blowout.  A 300k
        # record btree has just 3k leaf nodes and 20 internal nodes, and
        # a limit of 100 scales to ~100*100*100 = 1M records.
        self._internal_node_cache = fifo_cache.FIFOCache(100)
    else:
        self._leaf_node_cache = {}
        self._internal_node_cache = {}
    # Nothing has been read from the index yet.
    self._root_node = None
    self._key_count = None
    self._row_lengths = None
    self._row_offsets = None # Start of each row, [-1] is the end
	687
	688	def __eq__(self, other):
	689	"""Equal when self and other were created with the same parameters."""
	690	return (
6619.3.18 by Jelmer Vernooĳ Run 2to3 idioms fixer.	691	isinstance(self, type(other)) and
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	692	self._transport == other._transport and
	693	self._name == other._name and
	694	self._size == other._size)
	695
	696	def __ne__(self, other):
	697	return not self.__eq__(other)
	698
3763.8.12 by John Arbash Meinel Code cleanup.	699	def _get_and_cache_nodes(self, nodes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	700	"""Read nodes and cache them in the lru.
	701
	702	The nodes list supplied is sorted and then read from disk, each node
	703	being inserted it into the _node_cache.
	704
	705	Note: Asking for more nodes than the _node_cache can contain will
	706	result in some of the results being immediately discarded, to prevent
	707	this an assertion is raised if more nodes are asked for than are
	708	cachable.
	709
	710	:return: A dict of {node_pos: node}
	711	"""
	712	found = {}
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	713	start_of_leaves = None
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	714	for node_pos, node in self._read_nodes(sorted(nodes)):
	715	if node_pos == 0: # Special case
	716	self._root_node = node
	717	else:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	718	if start_of_leaves is None:
	719	start_of_leaves = self._row_offsets[-2]
	720	if node_pos < start_of_leaves:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	721	self._internal_node_cache[node_pos] = node
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	722	else:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	723	self._leaf_node_cache[node_pos] = node
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	724	found[node_pos] = node
	725	return found
	726
def _compute_recommended_pages(self):
    """Convert transport's recommended_page_size into btree pages.

    The transport reports its recommended read size in bytes; the result
    is the number of _PAGE_SIZE pages needed to cover that many bytes,
    rounded up.
    """
    recommended_bytes = self._transport.recommended_page_size()
    pages = math.ceil(recommended_bytes / float(_PAGE_SIZE))
    return int(pages)
	737
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	738	def _compute_total_pages_in_index(self):
	739	"""How many pages are in the index.
	740
	741	If we have read the header we will use the value stored there.
	742	Otherwise it will be computed based on the length of the index.
	743	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	744	if self._size is None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	745	raise AssertionError('_compute_total_pages_in_index should not be'
	746	' called when self._size is None')
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	747	if self._root_node is not None:
	748	# This is the number of pages as defined by the header
	749	return self._row_offsets[-1]
	750	# This is the number of pages as defined by the size of the index. They
	751	# should be indentical.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	752	total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
	753	return total_pages
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	754
def _expand_offsets(self, offsets):
    """Find extra pages to download.

    Requests should be big enough (like 64kB for http) that round trips
    are not wasted.  Given the pages already cached and the pages about
    to be downloaded, work out which other pages are worth reading in the
    same request.

    See also doc/developers/btree_index_prefetch.txt for more details.

    :param offsets: The offsets to be read
    :return: A list of offsets to download
    """
    verbose = 'index' in debug.debug_flags
    if verbose:
        trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)

    requested = len(offsets)
    if requested >= self._recommended_pages:
        # Already asking for more than enough, do not add anything.
        if verbose:
            trace.mutter(' not expanding large request (%s >= %s)',
                         requested, self._recommended_pages)
        return offsets
    if self._size is None:
        # Without the size there is no telling where the file ends.
        if verbose:
            trace.mutter(' not expanding without knowing index size')
        return offsets
    total_pages = self._compute_total_pages_in_index()
    cached_offsets = self._get_offsets_to_cached_pages()
    if total_pages - len(cached_offsets) <= self._recommended_pages:
        # Reading recommended_pages would cover the rest of the index, so
        # simply read everything that is not cached yet.
        expanded = [page for page in range(total_pages)
                    if page not in cached_offsets]
        if verbose:
            trace.mutter(' reading all unread pages: %s', expanded)
        return expanded

    if self._root_node is not None:
        tree_depth = len(self._row_lengths)
        if requested == 1 and len(cached_offsets) < tree_depth:
            # Not enough has been read to justify expansion.  Reading just
            # the root node and one leaf node is not worth a bigger
            # request; once at least 2 nodes have been read a search is
            # probably under way and requests start being expanded.
            if verbose:
                trace.mutter(' not expanding on first reads')
            return offsets
        chosen = self._expand_to_neighbors(offsets, cached_offsets,
                                           total_pages)
    else:
        # ATM the first read of the root node of a large index does not
        # pre-read any other pages, because the likelihood of actually
        # reading interesting pages is very low.  See
        # doc/developers/btree_index_prefetch.txt for a discussion, and a
        # possible implementation for when the second layer of the index
        # is guessed to be small.
        chosen = offsets

    chosen = sorted(chosen)
    if verbose:
        trace.mutter('expanded: %s', chosen)
    return chosen
	823
	824	def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
	825	"""Expand requests to neighbors until we have enough pages.
	826
	827	This is called from _expand_offsets after policy has determined that we
	828	want to expand.
	829	We only want to expand requests within a given layer. We cheat a little
	830	bit and assume all requests will be in the same layer. This is true
	831	given the current design, but if it changes this algorithm may perform
	832	oddly.
	833
	834	:param offsets: requested offsets
	835	:param cached_offsets: offsets for pages we currently have cached
	836	:return: A set() of offsets after expansion
	837	"""
	838	final_offsets = set(offsets)
	839	first = end = None
	840	new_tips = set(final_offsets)
	841	while len(final_offsets) < self._recommended_pages and new_tips:
	842	next_tips = set()
	843	for pos in new_tips:
	844	if first is None:
	845	first, end = self._find_layer_first_and_end(pos)
	846	previous = pos - 1
	847	if (previous > 0
	848	and previous not in cached_offsets
	849	and previous not in final_offsets
	850	and previous >= first):
	851	next_tips.add(previous)
	852	after = pos + 1
	853	if (after < total_pages
	854	and after not in cached_offsets
	855	and after not in final_offsets
	856	and after < end):
	857	next_tips.add(after)
	858	# This would keep us from going bigger than
	859	# recommended_pages by only expanding the first offsets.
	860	# However, if we are making a 'wide' request, it is
	861	# reasonable to expand all points equally.
	862	# if len(final_offsets) > recommended_pages:
	863	# break
	864	final_offsets.update(next_tips)
	865	new_tips = next_tips
	866	return final_offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	867
def clear_cache(self):
    """Clear out any cached/memoized values.

    This may be called at any time.  It is generally used once some
    information has been extracted and no further requests to this index
    are expected.
    """
    # self._root_node and self._internal_node_cache are deliberately left
    # alone: neither is expected to be big, and keeping them can save
    # round-trips later.  This may be re-evaluated if InternalNode memory
    # starts to be an issue.
    leaf_cache = self._leaf_node_cache
    leaf_cache.clear()
	880
def external_references(self, ref_list_num):
    """Return the references that point at keys missing from this index.

    The root node is loaded first if it is not present yet.

    :param ref_list_num: Which reference list of each entry to inspect.
    :return: A set of the keys referenced through that list that are not
        themselves keys of any entry.
    :raises ValueError: if the index has no reference list with that
        number.
    """
    if self._root_node is None:
        self._get_root_node()
    available = self.node_ref_lists
    if ref_list_num + 1 > available:
        raise ValueError('No ref list %d, index has %d ref lists'
                         % (ref_list_num, available))
    present = set()
    referenced = set()
    for entry in self.iter_all_entries():
        # Entries are (index, key, value, reference_lists).
        present.add(entry[1])
        referenced.update(entry[3][ref_list_num])
    return referenced - present
	893
3763.8.12 by John Arbash Meinel Code cleanup.	894	def _find_layer_first_and_end(self, offset):
	895	"""Find the start/stop nodes for the layer corresponding to offset.
	896
	897	:return: (first, end)
	898	first is the first node in this layer
	899	end is the first node of the next layer
	900	"""
	901	first = end = 0
	902	for roffset in self._row_offsets:
	903	first = end
	904	end = roffset
	905	if offset < roffset:
	906	break
	907	return first, end
	908
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	909	def _get_offsets_to_cached_pages(self):
3763.8.12 by John Arbash Meinel Code cleanup.	910	"""Determine what nodes we already have cached."""
6656.1.1 by Martin Apply 2to3 dict fixer and clean up resulting mess using view helpers	911	cached_offsets = set(self._internal_node_cache)
	912	# cache may be dict or LRUCache, keys() is the common method
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	913	cached_offsets.update(self._leaf_node_cache.keys())
3763.8.12 by John Arbash Meinel Code cleanup.	914	if self._root_node is not None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	915	cached_offsets.add(0)
	916	return cached_offsets
3763.8.12 by John Arbash Meinel Code cleanup.	917
	918	def _get_root_node(self):
	919	if self._root_node is None:
	920	# We may not have a root node yet
	921	self._get_internal_nodes([0])
	922	return self._root_node
	923
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	924	def _get_nodes(self, cache, node_indexes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	925	found = {}
	926	needed = []
	927	for idx in node_indexes:
	928	if idx == 0 and self._root_node is not None:
	929	found[0] = self._root_node
	930	continue
	931	try:
	932	found[idx] = cache[idx]
	933	except KeyError:
	934	needed.append(idx)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	935	if not needed:
	936	return found
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	937	needed = self._expand_offsets(needed)
3763.8.12 by John Arbash Meinel Code cleanup.	938	found.update(self._get_and_cache_nodes(needed))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	939	return found
	940
	941	def _get_internal_nodes(self, node_indexes):
	942	"""Get a node, from cache or disk.
	943
	944	After getting it, the node will be cached.
	945	"""
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	946	return self._get_nodes(self._internal_node_cache, node_indexes)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	947
3805.4.6 by John Arbash Meinel refactor for clarity.	948	def _cache_leaf_values(self, nodes):
3805.4.6 by John Arbash Meinel refactor for clarity.	949	"""Cache directly from key => value, skipping the btree."""
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	950	if self._leaf_value_cache is not None:
6656.1.1 by Martin Apply 2to3 dict fixer and clean up resulting mess using view helpers	951	for node in viewvalues(nodes):
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	952	for key, value in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	953	if key in self._leaf_value_cache:
	954	# Don't add the rest of the keys, we've seen this node
	955	# before.
	956	break
	957	self._leaf_value_cache[key] = value
3805.4.6 by John Arbash Meinel refactor for clarity.	958
	959	def _get_leaf_nodes(self, node_indexes):
	960	"""Get a bunch of nodes, from cache or disk."""
	961	found = self._get_nodes(self._leaf_node_cache, node_indexes)
	962	self._cache_leaf_values(found)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	963	return found
	964
	965	def iter_all_entries(self):
	966	"""Iterate over all keys within the index.
	967
	968	:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
	969	The former tuple is used when there are no reference lists in the
	970	index, making the API compatible with simple key:value index types.
	971	There is no defined order for the result iteration - it will be in
	972	the most efficient order for the index.
	973	"""
	974	if 'evil' in debug.debug_flags:
	975	trace.mutter_callsite(3,
	976	"iter_all_entries scales with size of history.")
	977	if not self.key_count():
	978	return
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	979	if self._row_offsets[-1] == 1:
	980	# There is only the root node, and we read that via key_count()
	981	if self.node_ref_lists:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	982	for key, (value, refs) in self._root_node.all_items():
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	983	yield (self, key, value, refs)
	984	else:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	985	for key, (value, refs) in self._root_node.all_items():
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	986	yield (self, key, value)
	987	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	988	start_of_leaves = self._row_offsets[-2]
	989	end_of_leaves = self._row_offsets[-1]
6651.2.2 by Martin Apply 2to3 xrange fix and fix up with sixish range	990	needed_offsets = list(range(start_of_leaves, end_of_leaves))
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	991	if needed_offsets == [0]:
	992	# Special case when we only have a root node, as we have already
	993	# read everything
	994	nodes = [(0, self._root_node)]
	995	else:
	996	nodes = self._read_nodes(needed_offsets)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	997	# We iterate strictly in-order so that we can use this function
	998	# for spilling index builds to disk.
	999	if self.node_ref_lists:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1000	for _, node in nodes:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1001	for key, (value, refs) in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1002	yield (self, key, value, refs)
	1003	else:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	1004	for _, node in nodes:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1005	for key, (value, refs) in node.all_items():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1006	yield (self, key, value)
	1007
	1008	@staticmethod
	1009	def _multi_bisect_right(in_keys, fixed_keys):
	1010	"""Find the positions where each 'in_key' would fit in fixed_keys.
	1011
	1012	This is equivalent to doing "bisect_right" on each in_key into
	1013	fixed_keys
	1014
	1015	:param in_keys: A sorted list of keys to match with fixed_keys
	1016	:param fixed_keys: A sorted list of keys to match against
	1017	:return: A list of (integer position, [key list]) tuples.
	1018	"""
	1019	if not in_keys:
	1020	return []
	1021	if not fixed_keys:
	1022	# no pointers in the fixed_keys list, which means everything must
	1023	# fall to the left.
	1024	return [(0, in_keys)]
	1025
	1026	# TODO: Iterating both lists will generally take M + N steps
	1027	# Bisecting each key will generally take M * log2 N steps.
	1028	# If we had an efficient way to compare, we could pick the method
	1029	# based on which has the fewer number of steps.
	1030	# There is also the argument that bisect_right is a compiled
	1031	# function, so there is even more to be gained.
	1032	# iter_steps = len(in_keys) + len(fixed_keys)
	1033	# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
	1034	if len(in_keys) == 1: # Bisect will always be faster for M = 1
5753.2.4 by Jelmer Vernooij Review feedback from John.	1035	return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1036	# elif bisect_steps < iter_steps:
	1037	# offsets = {}
	1038	# for key in in_keys:
	1039	# offsets.setdefault(bisect_right(fixed_keys, key),
	1040	# []).append(key)
	1041	# return [(o, offsets[o]) for o in sorted(offsets)]
	1042	in_keys_iter = iter(in_keys)
	1043	fixed_keys_iter = enumerate(fixed_keys)
6634.2.1 by Martin Apply 2to3 next fixer and make compatible	1044	cur_in_key = next(in_keys_iter)
	1045	cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1046
	1047	class InputDone(Exception): pass
	1048	class FixedDone(Exception): pass
	1049
	1050	output = []
	1051	cur_out = []
	1052
	1053	# TODO: Another possibility is that rather than iterating on each side,
	1054	# we could use a combination of bisecting and iterating. For
	1055	# example, while cur_in_key < fixed_key, bisect to find its
	1056	# point, then iterate all matching keys, then bisect (restricted
	1057	# to only the remainder) for the next one, etc.
	1058	try:
	1059	while True:
	1060	if cur_in_key < cur_fixed_key:
	1061	cur_keys = []
	1062	cur_out = (cur_fixed_offset, cur_keys)
	1063	output.append(cur_out)
	1064	while cur_in_key < cur_fixed_key:
	1065	cur_keys.append(cur_in_key)
	1066	try:
6634.2.1 by Martin Apply 2to3 next fixer and make compatible	1067	cur_in_key = next(in_keys_iter)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1068	except StopIteration:
	1069	raise InputDone
	1070	# At this point cur_in_key must be >= cur_fixed_key
	1071	# step the cur_fixed_key until we pass the cur key, or walk off
	1072	# the end
	1073	while cur_in_key >= cur_fixed_key:
	1074	try:
6634.2.1 by Martin Apply 2to3 next fixer and make compatible	1075	cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1076	except StopIteration:
	1077	raise FixedDone
	1078	except InputDone:
	1079	# We consumed all of the input, nothing more to do
	1080	pass
	1081	except FixedDone:
	1082	# There was some input left, but we consumed all of fixed, so we
	1083	# have to add one more for the tail
	1084	cur_keys = [cur_in_key]
	1085	cur_keys.extend(in_keys_iter)
	1086	cur_out = (len(fixed_keys), cur_keys)
	1087	output.append(cur_out)
	1088	return output
	1089
4593.4.5 by John Arbash Meinel Start adding some tests.	1090	def _walk_through_internal_nodes(self, keys):
	1091	"""Take the given set of keys, and find the corresponding LeafNodes.
	1092
	1093	:param keys: An unsorted iterable of keys to search for
	1094	:return: (nodes, index_and_keys)
	1095	nodes is a dict mapping {index: LeafNode}
	1096	keys_at_index is a list of tuples of [(index, [keys for Leaf])]
	1097	"""
	1098	# 6 seconds spent in miss_torture using the sorted() line.
	1099	# Even with out of order disk IO it seems faster not to sort it when
	1100	# large queries are being made.
	1101	keys_at_index = [(0, sorted(keys))]
	1102
	1103	for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
	1104	node_indexes = [idx for idx, s_keys in keys_at_index]
	1105	nodes = self._get_internal_nodes(node_indexes)
	1106
	1107	next_nodes_and_keys = []
	1108	for node_index, sub_keys in keys_at_index:
	1109	node = nodes[node_index]
	1110	positions = self._multi_bisect_right(sub_keys, node.keys)
	1111	node_offset = next_row_start + node.offset
	1112	next_nodes_and_keys.extend([(node_offset + pos, s_keys)
	1113	for pos, s_keys in positions])
	1114	keys_at_index = next_nodes_and_keys
	1115	# We should now be at the _LeafNodes
	1116	node_indexes = [idx for idx, s_keys in keys_at_index]
	1117
	1118	# TODO: We may not want to always read all the nodes in one
	1119	# big go. Consider setting a max size on this.
	1120	nodes = self._get_leaf_nodes(node_indexes)
	1121	return nodes, keys_at_index
	1122
def iter_entries(self, keys):
    """Iterate over keys within the index.

    Keys that can be answered from self._leaf_value_cache are yielded
    straight from the cache; only the remaining keys are looked up in the
    leaf nodes, so each matching key is yielded exactly once.

    :param keys: An iterable providing the keys to be retrieved.
    :return: An iterable as per iter_all_entries, but restricted to the
        keys supplied. No additional keys will be returned, and every
        key supplied that is in the index will be returned.
    """
    # Sorting the keys was measured at 6 seconds in miss_torture, however
    # the multi-way bisecting done by _walk_through_internal_nodes needs
    # sorted keys anyway, so that is where the sort happens. The multi-way
    # code could be changed not to require sorted order (bisect for the
    # first key, search in-order until a key comes before the current
    # point, bisect for that one, etc.)
    keys = frozenset(keys)
    if not keys:
        return

    if not self.key_count():
        return

    if self._leaf_value_cache is None:
        needed_keys = keys
    else:
        needed_keys = []
        for key in keys:
            cached = self._leaf_value_cache.get(key, None)
            if cached is None:
                # Not cached, it has to be looked up in the leaf nodes.
                needed_keys.append(key)
                continue
            value, refs = cached
            if self.node_ref_lists:
                yield (self, key, value, refs)
            else:
                yield (self, key, value)

    # Only search for what the cache could not answer. Searching for all
    # of 'keys' again here would yield the cached entries a second time.
    if not needed_keys:
        return
    nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
    for node_index, sub_keys in nodes_and_keys:
        if not sub_keys:
            continue
        node = nodes[node_index]
        for next_sub_key in sub_keys:
            if next_sub_key in node:
                value, refs = node[next_sub_key]
                if self.node_ref_lists:
                    yield (self, next_sub_key, value, refs)
                else:
                    yield (self, next_sub_key, value)
	1178
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1179	def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1180	"""Find the parent_map information for the set of keys.
	1181
	1182	This populates the parent_map dict and missing_keys set based on the
	1183	queried keys. It also can fill out an arbitrary number of parents that
	1184	it finds while searching for the supplied keys.
	1185
	1186	It is unlikely that you want to call this directly. See
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1187	"CombinedGraphIndex.find_ancestry()" for a more appropriate API.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1188
	1189	:param keys: A keys whose ancestry we want to return
	1190	Every key will either end up in 'parent_map' or 'missing_keys'.
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1191	:param ref_list_num: This index in the ref_lists is the parents we
	1192	care about.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1193	:param parent_map: {key: parent_keys} for keys that are present in this
	1194	index. This may contain more entries than were in 'keys', that are
	1195	reachable ancestors of the keys requested.
4593.4.5 by John Arbash Meinel Start adding some tests.	1196	:param missing_keys: keys which are known to be missing in this index.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1197	This may include parents that were not directly requested, but we
	1198	were able to determine that they are not present in this index.
	1199	:return: search_keys parents that were found but not queried to know
	1200	if they are missing or present. Callers can re-query this index for
	1201	those keys, and they will be placed into parent_map or missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1202	"""
	1203	if not self.key_count():
	1204	# We use key_count() to trigger reading the root node and
	1205	# determining info about this BTreeGraphIndex
	1206	# If we don't have any keys, then everything is missing
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1207	missing_keys.update(keys)
	1208	return set()
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1209	if ref_list_num >= self.node_ref_lists:
	1210	raise ValueError('No ref list %d, index has %d ref lists'
	1211	% (ref_list_num, self.node_ref_lists))
	1212
	1213	# The main trick we are trying to accomplish is that when we find a
	1214	# key listing its parents, we expect that the parent key is also likely
	1215	# to sit on the same page. Allowing us to expand parents quickly
	1216	# without suffering the full stack of bisecting, etc.
4593.4.5 by John Arbash Meinel Start adding some tests.	1217	nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1218
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1219	# These are parent keys which could not be immediately resolved on the
	1220	# page where the child was present. Note that we may already be
	1221	# searching for that key, and it may actually be present [or known
	1222	# missing] on one of the other pages we are reading.
	1223	# TODO:
	1224	# We could try searching for them in the immediate previous or next
	1225	# page. If they occur "later" we could put them in a pending lookup
	1226	# set, and then for each node we read thereafter we could check to
	1227	# see if they are present.
	1228	# However, we don't know the impact of keeping this list of things
	1229	# that I'm going to search for every node I come across from here on
	1230	# out.
	1231	# It doesn't handle the case when the parent key is missing on a
	1232	# page that we don't read. So we already have to handle being
	1233	# re-entrant for that.
	1234	# Since most keys contain a date string, they are more likely to be
	1235	# found earlier in the file than later, but we would know that right
	1236	# away (key < min_key), and wouldn't keep searching it on every other
	1237	# page that we read.
	1238	# Mostly, it is an idea, one which should be benchmarked.
	1239	parents_not_on_page = set()
	1240
	1241	for node_index, sub_keys in nodes_and_keys:
	1242	if not sub_keys:
	1243	continue
	1244	# sub_keys is all of the keys we are looking for that should exist
	1245	# on this page, if they aren't here, then they won't be found
	1246	node = nodes[node_index]
	1247	parents_to_check = set()
	1248	for next_sub_key in sub_keys:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1249	if next_sub_key not in node:
4593.4.5 by John Arbash Meinel Start adding some tests.	1250	# This one is just not present in the index at all
	1251	missing_keys.add(next_sub_key)
	1252	else:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1253	value, refs = node[next_sub_key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1254	parent_keys = refs[ref_list_num]
	1255	parent_map[next_sub_key] = parent_keys
	1256	parents_to_check.update(parent_keys)
	1257	# Don't look for things we've already found
	1258	parents_to_check = parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1259	# this can be used to test the benefit of having the check loop
	1260	# inlined.
	1261	# parents_not_on_page.update(parents_to_check)
	1262	# continue
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1263	while parents_to_check:
	1264	next_parents_to_check = set()
	1265	for key in parents_to_check:
5365.5.1 by John Arbash Meinel Implement a custom parser for chk btree leaves.	1266	if key in node:
	1267	value, refs = node[key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1268	parent_keys = refs[ref_list_num]
	1269	parent_map[key] = parent_keys
	1270	next_parents_to_check.update(parent_keys)
	1271	else:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1272	# This parent either is genuinely missing, or should be
	1273	# found on another page. Perf test whether it is better
	1274	# to check if this node should fit on this page or not.
	1275	# in the 'everything-in-one-pack' scenario, this not
	1276	# doing the check is 237ms vs 243ms.
	1277	# So slightly better, but I assume the standard 'lots
	1278	# of packs' is going to show a reasonable improvement
	1279	# from the check, because it avoids 'going around
	1280	# again' for everything that is in another index
4593.4.5 by John Arbash Meinel Start adding some tests.	1281	# parents_not_on_page.add(key)
	1282	# Missing for some reason
	1283	if key < node.min_key:
	1284	# in the case of bzr.dev, 3.4k/5.3k misses are
	1285	# 'earlier' misses (65%)
	1286	parents_not_on_page.add(key)
	1287	elif key > node.max_key:
	1288	# This parent key would be present on a different
	1289	# LeafNode
	1290	parents_not_on_page.add(key)
	1291	else:
	1292	# assert key != node.min_key and key != node.max_key
	1293	# If it was going to be present, it would be on
	1294	# this page, so mark it missing.
	1295	missing_keys.add(key)
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1296	parents_to_check = next_parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1297	# Might want to do another .difference() from missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1298	# parents_not_on_page could have been found on a different page, or be
	1299	# known to be missing. So cull out everything that has already been
	1300	# found.
4593.4.5 by John Arbash Meinel Start adding some tests.	1301	search_keys = parents_not_on_page.difference(
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1302	parent_map).difference(missing_keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1303	return search_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1304
def iter_entries_prefix(self, keys):
    """Iterate over keys within the index using prefix matching.

    Matching works on whole elements of the key tuple, never on part of
    the bytestring of an element. e.g. if you have the keys ('foo',
    'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
    only the former key is returned.

    WARNING: Note that this method currently causes a full index parse
    unconditionally (which is reasonably appropriate as it is a means for
    thunking many small indices into one larger one and still supplies
    iter_all_entries at the thunk layer).

    :param keys: An iterable providing the key prefixes to be retrieved.
        Each key prefix takes the form of a tuple the length of a key, but
        with the last N elements 'None' rather than a regular bytestring.
        The first element cannot be 'None'.
    :return: An iterable as per iter_all_entries, but restricted to the
        keys with a matching prefix to those supplied. No additional keys
        will be returned, and every match that is in the index will be
        returned.
    """
    prefixes = sorted(set(keys))
    if not prefixes:
        return
    # The root node has to be read before the key length is known.
    if self._key_count is None:
        self._get_root_node()
    # TODO: only access nodes that can satisfy the prefixes we are looking
    # for. For now, to meet API usage (as this function is not used by
    # current breezy) just suck the entire index and iterate in memory.
    with_refs = self.node_ref_lists

    if self._key_length == 1:
        # Single element keys: a prefix can only be the complete key, so a
        # flat dict lookup is all that is needed.
        by_key = {}
        if with_refs:
            for _, key, value, refs in self.iter_all_entries():
                by_key[key] = value, refs
        else:
            for _, key, value in self.iter_all_entries():
                by_key[key] = value
        for prefix in prefixes:
            index._sanity_check_key(self, prefix)
            try:
                if with_refs:
                    value, node_refs = by_key[prefix]
                    yield self, prefix, value, node_refs
                else:
                    yield self, prefix, by_key[prefix]
            except KeyError:
                pass
        return

    # Multi element keys: for a key of (foo, bar, baz) build
    # nodes_by_key[foo][bar][baz] = (key, value[, refs])
    if with_refs:
        records = [(key, value, refs)
                   for _, key, value, refs in self.iter_all_entries()]
    else:
        records = [(key, value)
                   for _, key, value in self.iter_all_entries()]
    nodes_by_key = {}
    for record in records:
        key = record[0]
        level = nodes_by_key
        for element in key[:-1]:
            level = level.setdefault(element, {})
        level[key[-1]] = record
    for entry in index._iter_entries_prefix(self, nodes_by_key, prefixes):
        yield entry
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1379
	1380	def key_count(self):
	1381	"""Return an estimate of the number of keys in this index.
	1382
	1383	For BTreeGraphIndex the estimate is exact as it is contained in the
	1384	header.
	1385	"""
	1386	if self._key_count is None:
	1387	self._get_root_node()
	1388	return self._key_count
	1389
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1390	def _compute_row_offsets(self):
	1391	"""Fill out the _row_offsets attribute based on _row_lengths."""
	1392	offsets = []
	1393	row_offset = 0
	1394	for row in self._row_lengths:
	1395	offsets.append(row_offset)
	1396	row_offset += row
	1397	offsets.append(row_offset)
	1398	self._row_offsets = offsets
	1399
def _parse_header_from_bytes(self, bytes):
    """Parse the header from a region of bytes.

    Sets node_ref_lists, _key_length, _key_count and _row_lengths from the
    four option lines that follow the signature, then recomputes
    _row_offsets.

    :param bytes: The data to parse.
    :return: An offset, data tuple such as readv yields, for the unparsed
        data. (which may be of length 0).
    :raises BadIndexFormatSignature: if the data does not start with the
        signature of this index type.
    :raises BadIndexOptions: if an option line has the wrong prefix or a
        value that cannot be parsed.
    """
    expected = self._signature()
    signature = bytes[0:len(expected)]
    if not signature == expected:
        raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
    lines = bytes[len(expected):].splitlines()

    def read_option(line_number, prefix, convert):
        # Check the prefix of one header line and convert what follows it.
        option_line = lines[line_number]
        if not option_line.startswith(prefix):
            raise errors.BadIndexOptions(self)
        try:
            return convert(option_line[len(prefix):])
        except ValueError:
            raise errors.BadIndexOptions(self)

    def parse_row_lengths(text):
        # Comma separated integers, empty fields are ignored.
        return [int(length) for length in text.split(',') if length]

    self.node_ref_lists = read_option(0, _OPTION_NODE_REFS, int)
    self._key_length = read_option(1, _OPTION_KEY_ELEMENTS, int)
    self._key_count = read_option(2, _OPTION_LEN, int)
    self._row_lengths = read_option(3, _OPTION_ROW_LENGTHS,
                                    parse_row_lengths)
    self._compute_row_offsets()

    # The processed bytes are the signature plus the four option lines,
    # each of which lost a one byte line terminator to splitlines().
    header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
    return header_end, bytes[header_end:]
	1446
	1447	def _read_nodes(self, nodes):
	1448	"""Read some nodes from disk into the LRU cache.
	1449
	1450	This performs a readv to get the node data into memory, and parses each
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1451	node, then yields it to the caller. The nodes are requested in the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1452	supplied order. If possible doing sort() on the list before requesting
	1453	a read may improve performance.
	1454
	1455	:param nodes: The nodes to read. 0 - first node, 1 - second node etc.
	1456	:return: None
	1457	"""
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1458	# may be the byte string of the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1459	bytes = None
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1460	# list of (offset, length) regions of the file that should, evenually
	1461	# be read in to data_ranges, either from 'bytes' or from the transport
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1462	ranges = []
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1463	base_offset = self._base_offset
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1464	for index in nodes:
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1465	offset = (index * _PAGE_SIZE)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1466	size = _PAGE_SIZE
	1467	if index == 0:
	1468	# Root node - special case
	1469	if self._size:
	1470	size = min(_PAGE_SIZE, self._size)
	1471	else:
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1472	# The only case where we don't know the size, is for very
	1473	# small indexes. So we read the whole thing
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1474	bytes = self._transport.get_bytes(self._name)
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1475	num_bytes = len(bytes)
	1476	self._size = num_bytes - base_offset
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1477	# the whole thing should be parsed out of 'bytes'
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1478	ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
6651.2.2 by Martin Apply 2to3 xrange fix and fix up with sixish range	1479	for start in range(base_offset, num_bytes, _PAGE_SIZE)]
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1480	break
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1481	else:
3763.8.6 by John Arbash Meinel Fix the logic a bit, and add a bit more tweaking opportunities	1482	if offset > self._size:
	1483	raise AssertionError('tried to read past the end'
	1484	' of the file %s > %s'
	1485	% (offset, self._size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1486	size = min(size, self._size - offset)
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1487	ranges.append((base_offset + offset, size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1488	if not ranges:
	1489	return
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1490	elif bytes is not None:
	1491	# already have the whole file
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1492	data_ranges = [(start, bytes[start:start+size])
	1493	for start, size in ranges]
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1494	elif self._file is None:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1495	data_ranges = self._transport.readv(self._name, ranges)
	1496	else:
	1497	data_ranges = []
	1498	for offset, size in ranges:
	1499	self._file.seek(offset)
	1500	data_ranges.append((offset, self._file.read(size)))
	1501	for offset, data in data_ranges:
5074.4.1 by John Arbash Meinel Add an offset flag to BTreeGraphIndex.	1502	offset -= base_offset
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1503	if offset == 0:
	1504	# extract the header
	1505	offset, data = self._parse_header_from_bytes(data)
	1506	if len(data) == 0:
	1507	continue
	1508	bytes = zlib.decompress(data)
	1509	if bytes.startswith(_LEAF_FLAG):
5365.5.18 by John Arbash Meinel Expose the new leaf node factory across the stack.	1510	node = self._leaf_factory(bytes, self._key_length,
	1511	self.node_ref_lists)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1512	elif bytes.startswith(_INTERNAL_FLAG):
	1513	node = _InternalNode(bytes)
	1514	else:
	1515	raise AssertionError("Unknown node type for %r" % bytes)
	1516	yield offset / _PAGE_SIZE, node
	1517
	1518	def _signature(self):
	1519	"""The file signature for this index type."""
	1520	return _BTSIGNATURE
	1521
	1522	def validate(self):
	1523	"""Validate that everything in the index can be accessed."""
	1524	# just read and parse every node.
	1525	self._get_root_node()
	1526	if len(self._row_lengths) > 1:
	1527	start_node = self._row_offsets[1]
	1528	else:
	1529	# We shouldn't be reading anything anyway
	1530	start_node = 1
	1531	node_end = self._row_offsets[-1]
6651.2.2 by Martin Apply 2to3 xrange fix and fix up with sixish range	1532	for node in self._read_nodes(list(range(start_node, node_end))):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1533	pass
	1534
	1535
# Leaf factory for chk indexes. The pure python _LeafNode is the default and
# is replaced by the compiled parser when the extension can be imported.
_gcchk_factory = _LeafNode

try:
    from breezy import _btree_serializer_pyx as _btree_serializer
except ImportError as e:
    # Record the failure and fall back to the pure python serializer.
    osutils.failed_to_load_extension(e)
    from breezy import _btree_serializer_py as _btree_serializer
else:
    _gcchk_factory = _btree_serializer._parse_into_chk