/brz/remove-bazaar : contents of bzrlib/btree

: (revision 4760.2.3)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

3641.3.29 by John Arbash Meinel Cleanup the copyright headers	1	# Copyright (C) 2008 Canonical Ltd
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	2	#
	3	# This program is free software; you can redistribute it and/or modify
3641.3.29 by John Arbash Meinel Cleanup the copyright headers	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
4183.7.1 by Sabin Iacob update FSF mailing address	15	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	16	#
	17
	18	"""B+Tree indices"""
	19
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	20	import cStringIO
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	21	from bisect import bisect_right
	22	import math
	23	import tempfile
	24	import zlib
	25
	26	from bzrlib import (
	27	chunk_writer,
	28	debug,
	29	errors,
4208.1.2 by John Arbash Meinel Switch to using a FIFOCache.	30	fifo_cache,
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	31	index,
	32	lru_cache,
	33	osutils,
	34	trace,
	35	)
	36	from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
	37	from bzrlib.transport import get_transport
	38
	39
# First line of every serialised index written by BTreeBuilder._write_nodes.
_BTSIGNATURE = "B+Tree Graph Index 2\n"
# Prefix of the header line listing the node count of each row, root first.
_OPTION_ROW_LENGTHS = "row_lengths="
# First line written into each new leaf / internal node respectively.
_LEAF_FLAG = "type=leaf\n"
_INTERNAL_FLAG = "type=internal\n"
# Prefix of the second line of an internal node; the builder follows it with
# the node count of the row below at the time the internal node was opened.
_INTERNAL_OFFSET = "offset="

# Bytes set aside at the front of the first node of every row while building,
# so that the index header can be placed ahead of the root node.
_RESERVED_HEADER_BYTES = 120
# Size in bytes of one node (page) in the serialised index.
_PAGE_SIZE = 4096

# 4K per page: 4MB - 1000 entries
_NODE_CACHE_SIZE = 1000
	51
	52
	53	class _BuilderRow(object):
	54	"""The stored state accumulated while writing out a row in the index.
	55
	56	:ivar spool: A temporary file used to accumulate nodes for this row
	57	in the tree.
	58	:ivar nodes: The count of nodes emitted so far.
	59	"""
	60
	61	def __init__(self):
	62	"""Create a _BuilderRow."""
	63	self.nodes = 0
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	64	self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	65	self.writer = None
	66
	67	def finish_node(self, pad=True):
	68	byte_lines, _, padding = self.writer.finish()
	69	if self.nodes == 0:
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	70	self.spool = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	71	# padded note:
	72	self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	73	elif self.nodes == 1:
	74	# We got bigger than 1 node, switch to a temp file
	75	spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
	76	spool.write(self.spool.getvalue())
	77	self.spool = spool
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	78	skipped_bytes = 0
	79	if not pad and padding:
	80	del byte_lines[-1]
	81	skipped_bytes = padding
	82	self.spool.writelines(byte_lines)
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	83	remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
	84	if remainder != 0:
	85	raise AssertionError("incorrect node length: %d, %d"
	86	% (self.spool.tell(), remainder))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	87	self.nodes += 1
	88	self.writer = None
	89
	90
	91	class _InternalBuilderRow(_BuilderRow):
	92	"""The stored state accumulated while writing out internal rows."""
	93
	94	def finish_node(self, pad=True):
	95	if not pad:
	96	raise AssertionError("Must pad internal nodes only.")
	97	_BuilderRow.finish_node(self)
	98
	99
class _LeafBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out the leaf row."""
	102
	103
	104	class BTreeBuilder(index.GraphIndexBuilder):
	105	"""A Builder for B+Tree based Graph indices.
	106
	107	The resulting graph has the structure:
	108
	109	_SIGNATURE OPTIONS NODES
	110	_SIGNATURE := 'B+Tree Graph Index 1' NEWLINE
	111	OPTIONS := REF_LISTS KEY_ELEMENTS LENGTH
	112	REF_LISTS := 'node_ref_lists=' DIGITS NEWLINE
	113	KEY_ELEMENTS := 'key_elements=' DIGITS NEWLINE
	114	LENGTH := 'len=' DIGITS NEWLINE
	115	ROW_LENGTHS := 'row_lengths' DIGITS (COMMA DIGITS)*
	116	NODES := NODE_COMPRESSED*
	117	NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
	118	NODE_RAW := INTERNAL \| LEAF
	119	INTERNAL := INTERNAL_FLAG POINTERS
	120	LEAF := LEAF_FLAG ROWS
	121	KEY_ELEMENT := Not-whitespace-utf8
	122	KEY := KEY_ELEMENT (NULL KEY_ELEMENT)*
	123	ROWS := ROW*
	124	ROW := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
	125	ABSENT := 'a'
	126	REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
	127	REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
	128	REFERENCE := KEY
	129	VALUE := no-newline-no-null-bytes
	130	"""
	131
	132	def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
	133	"""See GraphIndexBuilder.__init__.
	134
	135	:param spill_at: Optional parameter controlling the maximum number
	136	of nodes that BTreeBuilder will hold in memory.
	137	"""
	138	index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
	139	key_elements=key_elements)
	140	self._spill_at = spill_at
	141	self._backing_indices = []
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	142	# A map of {key: (node_refs, value)}
	143	self._nodes = {}
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	144	# Indicate it hasn't been built yet
	145	self._nodes_by_key = None
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	146	self._optimize_for_size = False
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	147
	148	def add_node(self, key, value, references=()):
	149	"""Add a node to the index.
	150
	151	If adding the node causes the builder to reach its spill_at threshold,
	152	disk spilling will be triggered.
	153
	154	:param key: The key. keys are non-empty tuples containing
	155	as many whitespace-free utf8 bytestrings as the key length
	156	defined for this index.
	157	:param references: An iterable of iterables of keys. Each is a
	158	reference to another key.
	159	:param value: The value to associate with the key. It may be any
	160	bytes as long as it does not contain \0 or \n.
	161	"""
3644.2.9 by John Arbash Meinel Refactor some code.	162	# we don't care about absent_references
3644.2.9 by John Arbash Meinel Refactor some code.	163	node_refs, _ = self._check_key_ref_value(key, references, value)
3644.2.2 by John Arbash Meinel the new btree index doesn't have 'absent' keys in its _nodes	164	if key in self._nodes:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	165	raise errors.BadIndexDuplicateKey(key, self)
4679.8.3 by John Arbash Meinel Expose bzrlib.static_tuple.StaticTuple as a thunk	166	# TODO: StaticTuple
3644.2.11 by John Arbash Meinel Document the new form of _nodes and remove an unnecessary cast.	167	self._nodes[key] = (node_refs, value)
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	168	self._keys.add(key)
3644.2.9 by John Arbash Meinel Refactor some code.	169	if self._nodes_by_key is not None and self._key_length > 1:
3644.2.9 by John Arbash Meinel Refactor some code.	170	self._update_nodes_by_key(key, value, node_refs)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	171	if len(self._keys) < self._spill_at:
	172	return
3644.2.9 by John Arbash Meinel Refactor some code.	173	self._spill_mem_keys_to_disk()
	174
	175	def _spill_mem_keys_to_disk(self):
	176	"""Write the in memory keys down to disk to cap memory consumption.
	177
	178	If we already have some keys written to disk, we will combine them so
	179	as to preserve the sorted order. The algorithm for combining uses
	180	powers of two. So on the first spill, write all mem nodes into a
	181	single index. On the second spill, combine the mem nodes with the nodes
	182	on disk to create a 2x sized disk index and get rid of the first index.
	183	On the third spill, create a single new disk index, which will contain
	184	the mem nodes, and preserve the existing 2x sized index. On the fourth,
	185	combine mem with the first and second indexes, creating a new one of
	186	size 4x. On the fifth create a single new one, etc.
	187	"""
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	188	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	189	(new_backing_file, size,
	190	backing_pos) = self._spill_mem_keys_and_combine()
	191	else:
	192	new_backing_file, size = self._spill_mem_keys_without_combining()
	193	# Note: The transport here isn't strictly needed, because we will use
	194	# direct access to the new_backing._file object
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	195	new_backing = BTreeGraphIndex(get_transport('.'), '<temp>', size)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	196	# GC will clean up the file
	197	new_backing._file = new_backing_file
4168.3.6 by John Arbash Meinel Add 'combine_backing_indices' as a flag for GraphIndex.set_optimize().	198	if self._combine_backing_indices:
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	199	if len(self._backing_indices) == backing_pos:
	200	self._backing_indices.append(None)
	201	self._backing_indices[backing_pos] = new_backing
	202	for backing_pos in range(backing_pos):
	203	self._backing_indices[backing_pos] = None
	204	else:
	205	self._backing_indices.append(new_backing)
	206	self._keys = set()
	207	self._nodes = {}
	208	self._nodes_by_key = None
	209
	210	def _spill_mem_keys_without_combining(self):
	211	return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)
	212
	213	def _spill_mem_keys_and_combine(self):
4168.3.4 by John Arbash Meinel Restore the ability to spill, but prepare a flag to disable it.	214	iterators_to_combine = [self._iter_mem_nodes()]
	215	pos = -1
	216	for pos, backing in enumerate(self._backing_indices):
	217	if backing is None:
	218	pos -= 1
	219	break
	220	iterators_to_combine.append(backing.iter_all_entries())
	221	backing_pos = pos + 1
	222	new_backing_file, size = \
	223	self._write_nodes(self._iter_smallest(iterators_to_combine),
	224	allow_optimize=False)
4168.3.5 by John Arbash Meinel Check that setting _combine_spilled_indices has the expected effect.	225	return new_backing_file, size, backing_pos
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	226
	227	def add_nodes(self, nodes):
	228	"""Add nodes to the index.
	229
	230	:param nodes: An iterable of (key, node_refs, value) entries to add.
	231	"""
	232	if self.reference_lists:
	233	for (key, value, node_refs) in nodes:
	234	self.add_node(key, value, node_refs)
	235	else:
	236	for (key, value) in nodes:
	237	self.add_node(key, value)
	238
	239	def _iter_mem_nodes(self):
	240	"""Iterate over the nodes held in memory."""
3644.2.8 by John Arbash Meinel Two quick tweaks.	241	nodes = self._nodes
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	242	if self.reference_lists:
3644.2.8 by John Arbash Meinel Two quick tweaks.	243	for key in sorted(nodes):
	244	references, value = nodes[key]
	245	yield self, key, value, references
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	246	else:
3644.2.8 by John Arbash Meinel Two quick tweaks.	247	for key in sorted(nodes):
	248	references, value = nodes[key]
	249	yield self, key, value
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	250
	251	def _iter_smallest(self, iterators_to_combine):
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	252	if len(iterators_to_combine) == 1:
	253	for value in iterators_to_combine[0]:
	254	yield value
	255	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	256	current_values = []
	257	for iterator in iterators_to_combine:
	258	try:
	259	current_values.append(iterator.next())
	260	except StopIteration:
	261	current_values.append(None)
	262	last = None
	263	while True:
	264	# Decorate candidates with the value to allow 2.4's min to be used.
	265	candidates = [(item[1][1], item) for item
	266	in enumerate(current_values) if item[1] is not None]
	267	if not len(candidates):
	268	return
	269	selected = min(candidates)
	270	# undecorate back to (pos, node)
	271	selected = selected[1]
	272	if last == selected[1][1]:
	273	raise errors.BadIndexDuplicateKey(last, self)
	274	last = selected[1][1]
	275	# Yield, with self as the index
	276	yield (self,) + selected[1][1:]
	277	pos = selected[0]
	278	try:
	279	current_values[pos] = iterators_to_combine[pos].next()
	280	except StopIteration:
	281	current_values[pos] = None
	282
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	283	def _add_key(self, string_key, line, rows, allow_optimize=True):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	284	"""Add a key to the current chunk.
	285
	286	:param string_key: The key to add.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	287	:param line: The fully serialised key and value.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	288	:param allow_optimize: If set to False, prevent setting the optimize
	289	flag when writing out. This is used by the _spill_mem_keys_to_disk
	290	functionality.
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	291	"""
	292	if rows[-1].writer is None:
	293	# opening a new leaf chunk;
	294	for pos, internal_row in enumerate(rows[:-1]):
	295	# flesh out any internal nodes that are needed to
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	296	# preserve the height of the tree
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	297	if internal_row.writer is None:
	298	length = _PAGE_SIZE
	299	if internal_row.nodes == 0:
	300	length -= _RESERVED_HEADER_BYTES # padded
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	301	if allow_optimize:
	302	optimize_for_size = self._optimize_for_size
	303	else:
	304	optimize_for_size = False
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	305	internal_row.writer = chunk_writer.ChunkWriter(length, 0,
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	306	optimize_for_size=optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	307	internal_row.writer.write(_INTERNAL_FLAG)
	308	internal_row.writer.write(_INTERNAL_OFFSET +
	309	str(rows[pos + 1].nodes) + "\n")
	310	# add a new leaf
	311	length = _PAGE_SIZE
	312	if rows[-1].nodes == 0:
	313	length -= _RESERVED_HEADER_BYTES # padded
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	314	rows[-1].writer = chunk_writer.ChunkWriter(length,
	315	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	316	rows[-1].writer.write(_LEAF_FLAG)
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	317	if rows[-1].writer.write(line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	318	# this key did not fit in the node:
	319	rows[-1].finish_node()
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	320	key_line = string_key + "\n"
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	321	new_row = True
	322	for row in reversed(rows[:-1]):
	323	# Mark the start of the next node in the node above. If it
4031.3.1 by Frank Aspell Fixing various typos	324	# doesn't fit then propagate upwards until we find one that
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	325	# it does fit into.
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	326	if row.writer.write(key_line):
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	327	row.finish_node()
	328	else:
	329	# We've found a node that can handle the pointer.
	330	new_row = False
	331	break
	332	# If we reached the current root without being able to mark the
	333	# division point, then we need a new root:
	334	if new_row:
	335	# We need a new row
	336	if 'index' in debug.debug_flags:
	337	trace.mutter('Inserting new global row.')
	338	new_row = _InternalBuilderRow()
	339	reserved_bytes = 0
	340	rows.insert(0, new_row)
	341	# This will be padded, hence the -100
	342	new_row.writer = chunk_writer.ChunkWriter(
	343	_PAGE_SIZE - _RESERVED_HEADER_BYTES,
3777.5.2 by John Arbash Meinel Change the name to ChunkWriter.set_optimize()	344	reserved_bytes,
	345	optimize_for_size=self._optimize_for_size)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	346	new_row.writer.write(_INTERNAL_FLAG)
	347	new_row.writer.write(_INTERNAL_OFFSET +
	348	str(rows[1].nodes - 1) + "\n")
3641.3.11 by John Arbash Meinel Start working on an alternate way to track compressed_chunk state.	349	new_row.writer.write(key_line)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	350	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.8 by John Arbash Meinel Move the add_key helper function into a separate func	351
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	352	def _write_nodes(self, node_iterator, allow_optimize=True):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	353	"""Write node_iterator out as a B+Tree.
	354
	355	:param node_iterator: An iterator of sorted nodes. Each node should
	356	match the output given by iter_all_entries.
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	357	:param allow_optimize: If set to False, prevent setting the optimize
	358	flag when writing out. This is used by the _spill_mem_keys_to_disk
	359	functionality.
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	360	:return: A file handle for a temporary file containing a B+Tree for
	361	the nodes.
	362	"""
	363	# The index rows - rows[0] is the root, rows[1] is the layer under it
	364	# etc.
	365	rows = []
	366	# forward sorted by key. In future we may consider topological sorting,
	367	# at the cost of table scans for direct lookup, or a second index for
	368	# direct lookup
	369	key_count = 0
	370	# A stack with the number of nodes of each size. 0 is the root node
	371	# and must always be 1 (if there are any nodes in the tree).
	372	self.row_lengths = []
	373	# Loop over all nodes adding them to the bottom row
	374	# (rows[-1]). When we finish a chunk in a row,
4031.3.1 by Frank Aspell Fixing various typos	375	# propagate the key that didn't fit (comes after the chunk) to the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	376	# row above, transitively.
	377	for node in node_iterator:
	378	if key_count == 0:
	379	# First key triggers the first row
	380	rows.append(_LeafBuilderRow())
	381	key_count += 1
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	382	string_key, line = _btree_serializer._flatten_node(node,
	383	self.reference_lists)
4168.2.1 by John Arbash Meinel Disable optimizations when spilling content to disk.	384	self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	385	for row in reversed(rows):
	386	pad = (type(row) != _LeafBuilderRow)
	387	row.finish_node(pad=pad)
	388	lines = [_BTSIGNATURE]
	389	lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
	390	lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
	391	lines.append(_OPTION_LEN + str(key_count) + '\n')
	392	row_lengths = [row.nodes for row in rows]
	393	lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
4708.1.1 by John Arbash Meinel Use a cStringIO.StringIO for 1-page btree indexes.	394	if row_lengths and row_lengths[-1] > 1:
	395	result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
	396	else:
	397	result = cStringIO.StringIO()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	398	result.writelines(lines)
	399	position = sum(map(len, lines))
	400	root_row = True
	401	if position > _RESERVED_HEADER_BYTES:
	402	raise AssertionError("Could not fit the header in the"
	403	" reserved space: %d > %d"
	404	% (position, _RESERVED_HEADER_BYTES))
	405	# write the rows out:
	406	for row in rows:
	407	reserved = _RESERVED_HEADER_BYTES # reserved space for first node
	408	row.spool.flush()
	409	row.spool.seek(0)
	410	# copy nodes to the finalised file.
	411	# Special case the first node as it may be prefixed
	412	node = row.spool.read(_PAGE_SIZE)
	413	result.write(node[reserved:])
	414	result.write("\x00" * (reserved - position))
	415	position = 0 # Only the root row actually has an offset
	416	copied_len = osutils.pumpfile(row.spool, result)
	417	if copied_len != (row.nodes - 1) * _PAGE_SIZE:
	418	if type(row) != _LeafBuilderRow:
3644.2.3 by John Arbash Meinel Do a bit more work to get all the tests to pass.	419	raise AssertionError("Incorrect amount of data copied"
	420	" expected: %d, got: %d"
	421	% ((row.nodes - 1) * _PAGE_SIZE,
	422	copied_len))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	423	result.flush()
	424	size = result.tell()
	425	result.seek(0)
	426	return result, size
	427
	428	def finish(self):
	429	"""Finalise the index.
	430
	431	:return: A file handle for a temporary file containing the nodes added
	432	to the index.
	433	"""
	434	return self._write_nodes(self.iter_all_entries())[0]
	435
	436	def iter_all_entries(self):
	437	"""Iterate over all keys within the index
	438
4343.2.2 by John Arbash Meinel Fix an important doc bug about the api of iter_all_entries()	439	:return: An iterable of (index, key, value, reference_lists). There is
	440	no defined order for the result iteration - it will be in the most
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	441	efficient order for the index (in this case dictionary hash order).
	442	"""
	443	if 'evil' in debug.debug_flags:
	444	trace.mutter_callsite(3,
	445	"iter_all_entries scales with size of history.")
	446	# Doing serial rather than ordered would be faster; but this shouldn't
	447	# be getting called routinely anyway.
3644.2.8 by John Arbash Meinel Two quick tweaks.	448	iterators = [self._iter_mem_nodes()]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	449	for backing in self._backing_indices:
	450	if backing is not None:
	451	iterators.append(backing.iter_all_entries())
3641.3.9 by John Arbash Meinel Special case around _iter_smallest when we have only	452	if len(iterators) == 1:
	453	return iterators[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	454	return self._iter_smallest(iterators)
	455
	456	def iter_entries(self, keys):
	457	"""Iterate over keys within the index.
	458
	459	:param keys: An iterable providing the keys to be retrieved.
	460	:return: An iterable of (index, key, value, reference_lists). There is no
	461	defined order for the result iteration - it will be in the most
	462	efficient order for the index (keys iteration order in this case).
	463	"""
	464	keys = set(keys)
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	465	local_keys = keys.intersection(self._keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	466	if self.reference_lists:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	467	for key in local_keys:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	468	node = self._nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	469	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	470	else:
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	471	for key in local_keys:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	472	node = self._nodes[key]
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	473	yield self, key, node[1]
3847.2.1 by John Arbash Meinel Shortcut BTreeBuilder.iter_entries when there are no backing indices.	474	# Find things that are in backing indices that have not been handled
	475	# yet.
3847.2.3 by John Arbash Meinel Bring back the shortcut	476	if not self._backing_indices:
3847.2.3 by John Arbash Meinel Bring back the shortcut	477	return # We won't find anything there either
3847.2.2 by John Arbash Meinel Rather than skipping the difference_update entirely, just restrict it to the intersection keys.	478	# Remove all of the keys that we found locally
	479	keys.difference_update(local_keys)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	480	for backing in self._backing_indices:
	481	if backing is None:
	482	continue
	483	if not keys:
	484	return
	485	for node in backing.iter_entries(keys):
	486	keys.remove(node[1])
	487	yield (self,) + node[1:]
	488
	489	def iter_entries_prefix(self, keys):
	490	"""Iterate over keys within the index using prefix matching.
	491
	492	Prefix matching is applied within the tuple of a key, not to within
	493	the bytestring of each key element. e.g. if you have the keys ('foo',
	494	'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
	495	only the former key is returned.
	496
	497	:param keys: An iterable providing the key prefixes to be retrieved.
	498	Each key prefix takes the form of a tuple the length of a key, but
	499	with the last N elements 'None' rather than a regular bytestring.
	500	The first element cannot be 'None'.
	501	:return: An iterable as per iter_all_entries, but restricted to the
	502	keys with a matching prefix to those supplied. No additional keys
	503	will be returned, and every match that is in the index will be
	504	returned.
	505	"""
	506	# XXX: To much duplication with the GraphIndex class; consider finding
	507	# a good place to pull out the actual common logic.
	508	keys = set(keys)
	509	if not keys:
	510	return
	511	for backing in self._backing_indices:
	512	if backing is None:
	513	continue
	514	for node in backing.iter_entries_prefix(keys):
	515	yield (self,) + node[1:]
	516	if self._key_length == 1:
	517	for key in keys:
	518	# sanity check
	519	if key[0] is None:
	520	raise errors.BadIndexKey(key)
	521	if len(key) != self._key_length:
	522	raise errors.BadIndexKey(key)
	523	try:
	524	node = self._nodes[key]
	525	except KeyError:
	526	continue
	527	if self.reference_lists:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	528	yield self, key, node[1], node[0]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	529	else:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	530	yield self, key, node[1]
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	531	return
	532	for key in keys:
	533	# sanity check
	534	if key[0] is None:
	535	raise errors.BadIndexKey(key)
	536	if len(key) != self._key_length:
	537	raise errors.BadIndexKey(key)
	538	# find what it refers to:
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	539	key_dict = self._get_nodes_by_key()
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	540	elements = list(key)
	541	# find the subdict to return
	542	try:
	543	while len(elements) and elements[0] is not None:
	544	key_dict = key_dict[elements[0]]
	545	elements.pop(0)
	546	except KeyError:
	547	# a non-existant lookup.
	548	continue
	549	if len(elements):
	550	dicts = [key_dict]
	551	while dicts:
	552	key_dict = dicts.pop(-1)
	553	# can't be empty or would not exist
	554	item, value = key_dict.iteritems().next()
	555	if type(value) == dict:
	556	# push keys
	557	dicts.extend(key_dict.itervalues())
	558	else:
	559	# yield keys
	560	for value in key_dict.itervalues():
	561	yield (self, ) + value
	562	else:
	563	yield (self, ) + key_dict
	564
3644.2.1 by John Arbash Meinel Change the IndexBuilders to not generate the nodes_by_key unless needed.	565	def _get_nodes_by_key(self):
	566	if self._nodes_by_key is None:
	567	nodes_by_key = {}
	568	if self.reference_lists:
	569	for key, (references, value) in self._nodes.iteritems():
	570	key_dict = nodes_by_key
	571	for subkey in key[:-1]:
	572	key_dict = key_dict.setdefault(subkey, {})
	573	key_dict[key[-1]] = key, value, references
	574	else:
	575	for key, (references, value) in self._nodes.iteritems():
	576	key_dict = nodes_by_key
	577	for subkey in key[:-1]:
	578	key_dict = key_dict.setdefault(subkey, {})
	579	key_dict[key[-1]] = key, value
	580	self._nodes_by_key = nodes_by_key
	581	return self._nodes_by_key
	582
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	583	def key_count(self):
	584	"""Return an estimate of the number of keys in this index.
	585
	586	For InMemoryGraphIndex the estimate is exact.
	587	"""
	588	return len(self._keys) + sum(backing.key_count() for backing in
	589	self._backing_indices if backing is not None)
	590
	591	def validate(self):
	592	"""In memory index's have no known corruption at the moment."""
	593
	594
	595	class _LeafNode(object):
	596	"""A leaf node for a serialised B+Tree index."""
	597
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	598	__slots__ = ('keys', 'min_key', 'max_key')
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	599
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	600	def __init__(self, bytes, key_length, ref_list_length):
	601	"""Parse bytes to create a leaf node object."""
	602	# splitlines mangles the \r delimiters.. don't use it.
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	603	key_list = _btree_serializer._parse_leaf_lines(bytes,
	604	key_length, ref_list_length)
	605	if key_list:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	606	self.min_key = key_list[0][0]
	607	self.max_key = key_list[-1][0]
4593.4.2 by John Arbash Meinel Removing the min(keys) and max(keys) calls saves 100ms in the inner loop	608	else:
	609	self.min_key = self.max_key = None
	610	self.keys = dict(key_list)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	611
	612
	613	class _InternalNode(object):
	614	"""An internal node for a serialised B+Tree index."""
	615
4274.1.2 by John Arbash Meinel Add slots to _LeafNode and _InternalNode.	616	__slots__ = ('keys', 'offset')
	617
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	618	def __init__(self, bytes):
	619	"""Parse bytes to create an internal node object."""
	620	# splitlines mangles the \r delimiters.. don't use it.
	621	self.keys = self._parse_lines(bytes.split('\n'))
	622
	623	def _parse_lines(self, lines):
	624	nodes = []
	625	self.offset = int(lines[1][7:])
	626	for line in lines[2:]:
	627	if line == '':
	628	break
4679.8.3 by John Arbash Meinel Expose bzrlib.static_tuple.StaticTuple as a thunk	629	# TODO: Switch to StaticTuple here.
4075.3.1 by John Arbash Meinel Use PyString_InternInPlace to intern() the various parts of keys that are processed.	630	nodes.append(tuple(map(intern, line.split('\0'))))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	631	return nodes
	632
	633
	634	class BTreeGraphIndex(object):
	635	"""Access to nodes via the standard GraphIndex interface for B+Tree's.
	636
	637	Individual nodes are held in a LRU cache. This holds the root node in
	638	memory except when very large walks are done.
	639	"""
	640
def __init__(self, transport, name, size, unlimited_cache=False):
    """Create a B+Tree index object on the index name.

    :param transport: The transport to read data for the index from.
    :param name: The file name of the index on transport.
    :param size: Optional size of the index in bytes. This allows
        compatibility with the GraphIndex API, as well as ensuring that
        the initial read (to read the root node header) can be done
        without over-reading even on empty indices, and on small indices
        allows single-IO to read the entire index.
    :param unlimited_cache: If set to True, then instead of using an
        LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
        cache all leaf nodes.
    """
    self._transport = transport
    self._name = name
    self._size = size
    self._file = None
    # Needs self._transport, so it has to follow the assignment above.
    self._recommended_pages = self._compute_recommended_pages()
    self._root_node = None
    # The key => value cache is disabled. When it was enabled the default
    # max size was 100,000 leaf values.
    self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
    if not unlimited_cache:
        self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
        # We use a FIFO here just to prevent possible blowout. However, a
        # 300k record btree has only 3k leaf nodes, and only 20 internal
        # nodes. A value of 100 scales to ~100*100*100 = 1M records.
        self._internal_node_cache = fifo_cache.FIFOCache(100)
    else:
        # Plain dicts never evict, so every node read stays cached.
        self._leaf_node_cache = {}
        self._internal_node_cache = {}
    self._key_count = None
    self._row_lengths = None
    self._row_offsets = None # Start of each row, [-1] is the end
	675
	676	def __eq__(self, other):
	677	"""Equal when self and other were created with the same parameters."""
	678	return (
	679	type(self) == type(other) and
	680	self._transport == other._transport and
	681	self._name == other._name and
	682	self._size == other._size)
	683
	684	def __ne__(self, other):
	685	return not self.__eq__(other)
	686
3763.8.12 by John Arbash Meinel Code cleanup.	687	def _get_and_cache_nodes(self, nodes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	688	"""Read nodes and cache them in the lru.
	689
	690	The nodes list supplied is sorted and then read from disk, each node
	691	being inserted it into the _node_cache.
	692
	693	Note: Asking for more nodes than the _node_cache can contain will
	694	result in some of the results being immediately discarded, to prevent
	695	this an assertion is raised if more nodes are asked for than are
	696	cachable.
	697
	698	:return: A dict of {node_pos: node}
	699	"""
	700	found = {}
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	701	start_of_leaves = None
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	702	for node_pos, node in self._read_nodes(sorted(nodes)):
	703	if node_pos == 0: # Special case
	704	self._root_node = node
	705	else:
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	706	if start_of_leaves is None:
	707	start_of_leaves = self._row_offsets[-2]
	708	if node_pos < start_of_leaves:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	709	self._internal_node_cache[node_pos] = node
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	710	else:
4634.71.2 by John Arbash Meinel If we are going to sometimes use a dict, we have to conform to just the dict interface.	711	self._leaf_node_cache[node_pos] = node
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	712	found[node_pos] = node
	713	return found
	714
def _compute_recommended_pages(self):
    """Convert transport's recommended_page_size into btree pages.

    recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
    pages fit in that length.  A partial page counts as a whole one.

    :return: The number of pages, as an int.
    """
    recommended_read = self._transport.recommended_page_size()
    # float() forces true division so that ceil() can round up.
    pages = math.ceil(recommended_read / float(_PAGE_SIZE))
    return int(pages)
	725
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	726	def _compute_total_pages_in_index(self):
	727	"""How many pages are in the index.
	728
	729	If we have read the header we will use the value stored there.
	730	Otherwise it will be computed based on the length of the index.
	731	"""
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	732	if self._size is None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	733	raise AssertionError('_compute_total_pages_in_index should not be'
	734	' called when self._size is None')
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	735	if self._root_node is not None:
	736	# This is the number of pages as defined by the header
	737	return self._row_offsets[-1]
	738	# This is the number of pages as defined by the size of the index. They
	739	# should be indentical.
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	740	total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
	741	return total_pages
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	742
def _expand_offsets(self, offsets):
    """Find extra pages to download.

    The idea is that we always want to make big-enough requests (like 64kB
    for http), so that we don't waste round trips. So given the entries
    that we already have cached and the new pages being downloaded figure
    out what other pages we might want to read.

    The request is returned untouched when it is already at least
    self._recommended_pages long, when the index size is unknown, or when
    too little has been read so far to justify expanding.

    See also doc/developers/btree_index_prefetch.txt for more details.

    :param offsets: The offsets to be read
    :return: A list of offsets to download
    """
    verbose = 'index' in debug.debug_flags
    if verbose:
        trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)

    if len(offsets) >= self._recommended_pages:
        # Don't add more, we are already requesting more than enough
        if verbose:
            trace.mutter(' not expanding large request (%s >= %s)',
                         len(offsets), self._recommended_pages)
        return offsets
    if self._size is None:
        # Don't try anything, because we don't know where the file ends
        if verbose:
            trace.mutter(' not expanding without knowing index size')
        return offsets

    total_pages = self._compute_total_pages_in_index()
    cached_offsets = self._get_offsets_to_cached_pages()
    unread_pages = total_pages - len(cached_offsets)
    if unread_pages <= self._recommended_pages:
        # Reading recommended_pages would read the rest of the index, so
        # just read whatever is left.
        if cached_offsets:
            expanded = [page for page in xrange(total_pages)
                        if page not in cached_offsets]
        else:
            expanded = range(total_pages)
        if verbose:
            trace.mutter(' reading all unread pages: %s', expanded)
        return expanded

    if self._root_node is None:
        # ATM on the first read of the root node of a large index, we don't
        # bother pre-reading any other pages. This is because the
        # likelihood of actually reading interesting pages is very low.
        # See doc/developers/btree_index_prefetch.txt for a discussion, and
        # a possible implementation when we are guessing that the second
        # layer index is small
        final_offsets = offsets
    else:
        tree_depth = len(self._row_lengths)
        if len(cached_offsets) < tree_depth and len(offsets) == 1:
            # We haven't read enough to justify expansion
            # If we are only going to read the root node, and 1 leaf node,
            # then it isn't worth expanding our request. Once we've read at
            # least 2 nodes, then we are probably doing a search, and we
            # start expanding our requests.
            if verbose:
                trace.mutter(' not expanding on first reads')
            return offsets
        final_offsets = self._expand_to_neighbors(offsets, cached_offsets,
                                                  total_pages)

    final_offsets = sorted(final_offsets)
    if verbose:
        trace.mutter('expanded: %s', final_offsets)
    return final_offsets
	811
	812	def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
	813	"""Expand requests to neighbors until we have enough pages.
	814
	815	This is called from _expand_offsets after policy has determined that we
	816	want to expand.
	817	We only want to expand requests within a given layer. We cheat a little
	818	bit and assume all requests will be in the same layer. This is true
	819	given the current design, but if it changes this algorithm may perform
	820	oddly.
	821
	822	:param offsets: requested offsets
	823	:param cached_offsets: offsets for pages we currently have cached
	824	:return: A set() of offsets after expansion
	825	"""
	826	final_offsets = set(offsets)
	827	first = end = None
	828	new_tips = set(final_offsets)
	829	while len(final_offsets) < self._recommended_pages and new_tips:
	830	next_tips = set()
	831	for pos in new_tips:
	832	if first is None:
	833	first, end = self._find_layer_first_and_end(pos)
	834	previous = pos - 1
	835	if (previous > 0
	836	and previous not in cached_offsets
	837	and previous not in final_offsets
	838	and previous >= first):
	839	next_tips.add(previous)
	840	after = pos + 1
	841	if (after < total_pages
	842	and after not in cached_offsets
	843	and after not in final_offsets
	844	and after < end):
	845	next_tips.add(after)
	846	# This would keep us from going bigger than
	847	# recommended_pages by only expanding the first offsets.
	848	# However, if we are making a 'wide' request, it is
	849	# reasonable to expand all points equally.
	850	# if len(final_offsets) > recommended_pages:
	851	# break
	852	final_offsets.update(next_tips)
	853	new_tips = next_tips
	854	return final_offsets
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	855
def clear_cache(self):
    """Clear out any cached/memoized values.

    This can be called at any time, but generally it is used when we have
    extracted some information, but don't expect to be requesting any more
    from this index.

    Only the leaf node cache is emptied.
    """
    # Note that we don't touch self._root_node or self._internal_node_cache
    # We don't expect either of those to be big, and keeping them can save
    # round-trips in the future. We may re-evaluate this if InternalNode
    # memory starts to be an issue.
    self._leaf_node_cache.clear()
	868
def external_references(self, ref_list_num):
    """Return the references that point outside this index.

    Every entry of the index is visited; the result holds each key
    mentioned in reference list ref_list_num of some entry that is not
    itself a key of an entry in this index.

    :param ref_list_num: Which reference list of each entry to inspect.
    :return: A set of keys.
    :raises ValueError: if the index has no reference list with that
        number.
    """
    if self._root_node is None:
        # Presumably reading the root is what makes self.node_ref_lists
        # available -- confirm against the header parsing code.
        self._get_root_node()
    if ref_list_num + 1 > self.node_ref_lists:
        raise ValueError('No ref list %d, index has %d ref lists'
            % (ref_list_num, self.node_ref_lists))
    present = set()
    referenced = set()
    for entry in self.iter_all_entries():
        # entry is (index, key, value, reference_lists)
        present.add(entry[1])
        referenced.update(entry[3][ref_list_num])
    return referenced.difference(present)
	881
3763.8.12 by John Arbash Meinel Code cleanup.	882	def _find_layer_first_and_end(self, offset):
	883	"""Find the start/stop nodes for the layer corresponding to offset.
	884
	885	:return: (first, end)
	886	first is the first node in this layer
	887	end is the first node of the next layer
	888	"""
	889	first = end = 0
	890	for roffset in self._row_offsets:
	891	first = end
	892	end = roffset
	893	if offset < roffset:
	894	break
	895	return first, end
	896
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	897	def _get_offsets_to_cached_pages(self):
3763.8.12 by John Arbash Meinel Code cleanup.	898	"""Determine what nodes we already have cached."""
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	899	cached_offsets = set(self._internal_node_cache.keys())
	900	cached_offsets.update(self._leaf_node_cache.keys())
3763.8.12 by John Arbash Meinel Code cleanup.	901	if self._root_node is not None:
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	902	cached_offsets.add(0)
	903	return cached_offsets
3763.8.12 by John Arbash Meinel Code cleanup.	904
	905	def _get_root_node(self):
	906	if self._root_node is None:
	907	# We may not have a root node yet
	908	self._get_internal_nodes([0])
	909	return self._root_node
	910
3641.5.18 by John Arbash Meinel Clean out the global state, good for prototyping and tuning, bad for production code.	911	def _get_nodes(self, cache, node_indexes):
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	912	found = {}
	913	needed = []
	914	for idx in node_indexes:
	915	if idx == 0 and self._root_node is not None:
	916	found[0] = self._root_node
	917	continue
	918	try:
	919	found[idx] = cache[idx]
	920	except KeyError:
	921	needed.append(idx)
3763.8.1 by John Arbash Meinel Playing around with expanding requests for btree index nodes into neighboring nodes.	922	if not needed:
	923	return found
3763.8.15 by John Arbash Meinel Review comments from Martin. Code clarity/variable name/docstring updates.	924	needed = self._expand_offsets(needed)
3763.8.12 by John Arbash Meinel Code cleanup.	925	found.update(self._get_and_cache_nodes(needed))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	926	return found
	927
def _get_internal_nodes(self, node_indexes):
    """Get internal nodes, from cache or disk.

    Delegates to _get_nodes using the internal node cache.

    :param node_indexes: The indexes of the nodes wanted.
    :return: Whatever _get_nodes returns for those indexes.
    """
    return self._get_nodes(self._internal_node_cache, node_indexes)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	934
3805.4.6 by John Arbash Meinel refactor for clarity.	935	def _cache_leaf_values(self, nodes):
3805.4.6 by John Arbash Meinel refactor for clarity.	936	"""Cache directly from key => value, skipping the btree."""
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	937	if self._leaf_value_cache is not None:
3805.4.6 by John Arbash Meinel refactor for clarity.	938	for node in nodes.itervalues():
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	939	for key, value in node.keys.iteritems():
	940	if key in self._leaf_value_cache:
	941	# Don't add the rest of the keys, we've seen this node
	942	# before.
	943	break
	944	self._leaf_value_cache[key] = value
3805.4.6 by John Arbash Meinel refactor for clarity.	945
def _get_leaf_nodes(self, node_indexes):
    """Get a bunch of leaf nodes, from cache or disk.

    :param node_indexes: The indexes of the nodes wanted.
    :return: Whatever _get_nodes returns for those indexes, looked up in
        the leaf node cache.
    """
    found = self._get_nodes(self._leaf_node_cache, node_indexes)
    # Offer the nodes to the key => value cache as well.
    self._cache_leaf_values(found)
    return found
	951
	952	def iter_all_entries(self):
	953	"""Iterate over all keys within the index.
	954
	955	:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
	956	The former tuple is used when there are no reference lists in the
	957	index, making the API compatible with simple key:value index types.
	958	There is no defined order for the result iteration - it will be in
	959	the most efficient order for the index.
	960	"""
	961	if 'evil' in debug.debug_flags:
	962	trace.mutter_callsite(3,
	963	"iter_all_entries scales with size of history.")
	964	if not self.key_count():
	965	return
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	966	if self._row_offsets[-1] == 1:
	967	# There is only the root node, and we read that via key_count()
	968	if self.node_ref_lists:
	969	for key, (value, refs) in sorted(self._root_node.keys.items()):
	970	yield (self, key, value, refs)
	971	else:
	972	for key, (value, refs) in sorted(self._root_node.keys.items()):
	973	yield (self, key, value)
	974	return
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	975	start_of_leaves = self._row_offsets[-2]
	976	end_of_leaves = self._row_offsets[-1]
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	977	needed_offsets = range(start_of_leaves, end_of_leaves)
	978	if needed_offsets == [0]:
	979	# Special case when we only have a root node, as we have already
	980	# read everything
	981	nodes = [(0, self._root_node)]
	982	else:
	983	nodes = self._read_nodes(needed_offsets)
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	984	# We iterate strictly in-order so that we can use this function
	985	# for spilling index builds to disk.
	986	if self.node_ref_lists:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	987	for _, node in nodes:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	988	for key, (value, refs) in sorted(node.keys.items()):
	989	yield (self, key, value, refs)
	990	else:
3824.1.2 by John Arbash Meinel iter_all_entries() shouldn't need to re-read the page.	991	for _, node in nodes:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	992	for key, (value, refs) in sorted(node.keys.items()):
	993	yield (self, key, value)
	994
	995	@staticmethod
	996	def _multi_bisect_right(in_keys, fixed_keys):
	997	"""Find the positions where each 'in_key' would fit in fixed_keys.
	998
	999	This is equivalent to doing "bisect_right" on each in_key into
	1000	fixed_keys
	1001
	1002	:param in_keys: A sorted list of keys to match with fixed_keys
	1003	:param fixed_keys: A sorted list of keys to match against
	1004	:return: A list of (integer position, [key list]) tuples.
	1005	"""
	1006	if not in_keys:
	1007	return []
	1008	if not fixed_keys:
	1009	# no pointers in the fixed_keys list, which means everything must
	1010	# fall to the left.
	1011	return [(0, in_keys)]
	1012
	1013	# TODO: Iterating both lists will generally take M + N steps
	1014	# Bisecting each key will generally take M * log2 N steps.
	1015	# If we had an efficient way to compare, we could pick the method
	1016	# based on which has the fewer number of steps.
	1017	# There is also the argument that bisect_right is a compiled
	1018	# function, so there is even more to be gained.
	1019	# iter_steps = len(in_keys) + len(fixed_keys)
	1020	# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
	1021	if len(in_keys) == 1: # Bisect will always be faster for M = 1
	1022	return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
	1023	# elif bisect_steps < iter_steps:
	1024	# offsets = {}
	1025	# for key in in_keys:
	1026	# offsets.setdefault(bisect_right(fixed_keys, key),
	1027	# []).append(key)
	1028	# return [(o, offsets[o]) for o in sorted(offsets)]
	1029	in_keys_iter = iter(in_keys)
	1030	fixed_keys_iter = enumerate(fixed_keys)
	1031	cur_in_key = in_keys_iter.next()
	1032	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
	1033
	1034	class InputDone(Exception): pass
	1035	class FixedDone(Exception): pass
	1036
	1037	output = []
	1038	cur_out = []
	1039
	1040	# TODO: Another possibility is that rather than iterating on each side,
	1041	# we could use a combination of bisecting and iterating. For
	1042	# example, while cur_in_key < fixed_key, bisect to find its
	1043	# point, then iterate all matching keys, then bisect (restricted
	1044	# to only the remainder) for the next one, etc.
	1045	try:
	1046	while True:
	1047	if cur_in_key < cur_fixed_key:
	1048	cur_keys = []
	1049	cur_out = (cur_fixed_offset, cur_keys)
	1050	output.append(cur_out)
	1051	while cur_in_key < cur_fixed_key:
	1052	cur_keys.append(cur_in_key)
	1053	try:
	1054	cur_in_key = in_keys_iter.next()
	1055	except StopIteration:
1056	raise InputDone
1057	# At this point cur_in_key must be >= cur_fixed_key
1058	# step the cur_fixed_key until we pass the cur key, or walk off
1059	# the end
1060	while cur_in_key >= cur_fixed_key:
1061	try:
1062	cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
1063	except StopIteration:
1064	raise FixedDone
1065	except InputDone:
1066	# We consumed all of the input, nothing more to do
1067	pass
1068	except FixedDone:
1069	# There was some input left, but we consumed all of fixed, so we
1070	# have to add one more for the tail
1071	cur_keys = [cur_in_key]
1072	cur_keys.extend(in_keys_iter)
1073	cur_out = (len(fixed_keys), cur_keys)
1074	output.append(cur_out)
1075	return output
1076
4593.4.5 by John Arbash Meinel Start adding some tests.	1077	def _walk_through_internal_nodes(self, keys):
	1078	"""Take the given set of keys, and find the corresponding LeafNodes.
	1079
	1080	:param keys: An unsorted iterable of keys to search for
	1081	:return: (nodes, index_and_keys)
	1082	nodes is a dict mapping {index: LeafNode}
	1083	keys_at_index is a list of tuples of [(index, [keys for Leaf])]
	1084	"""
	1085	# 6 seconds spent in miss_torture using the sorted() line.
	1086	# Even with out of order disk IO it seems faster not to sort it when
	1087	# large queries are being made.
	1088	keys_at_index = [(0, sorted(keys))]
	1089
	1090	for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
	1091	node_indexes = [idx for idx, s_keys in keys_at_index]
	1092	nodes = self._get_internal_nodes(node_indexes)
	1093
	1094	next_nodes_and_keys = []
	1095	for node_index, sub_keys in keys_at_index:
	1096	node = nodes[node_index]
	1097	positions = self._multi_bisect_right(sub_keys, node.keys)
	1098	node_offset = next_row_start + node.offset
	1099	next_nodes_and_keys.extend([(node_offset + pos, s_keys)
	1100	for pos, s_keys in positions])
	1101	keys_at_index = next_nodes_and_keys
	1102	# We should now be at the _LeafNodes
	1103	node_indexes = [idx for idx, s_keys in keys_at_index]
	1104
	1105	# TODO: We may not want to always read all the nodes in one
	1106	# big go. Consider setting a max size on this.
	1107	nodes = self._get_leaf_nodes(node_indexes)
	1108	return nodes, keys_at_index
	1109
def iter_entries(self, keys):
    """Iterate over keys within the index.

    :param keys: An iterable providing the keys to be retrieved.
    :return: An iterable as per iter_all_entries, but restricted to the
        keys supplied. No additional keys will be returned, and every
        key supplied that is in the index will be returned.
    """
    # 6 seconds spent in miss_torture using the sorted() line.
    # Even with out of order disk IO it seems faster not to sort it when
    # large queries are being made.
    # However, now that we are doing multi-way bisecting, we need the keys
    # in sorted order anyway. We could change the multi-way code to not
    # require sorted order. (For example, it bisects for the first node,
    # does an in-order search until a key comes before the current point,
    # which it then bisects for, etc.)
    keys = frozenset(keys)
    if not keys:
        return

    if not self.key_count():
        return

    if self._leaf_value_cache is None:
        needed_keys = keys
    else:
        needed_keys = []
        for key in keys:
            cached = self._leaf_value_cache.get(key, None)
            if cached is not None:
                # The entry for this key is already in the value cache, so
                # answer from there. The key must not be looked up in the
                # tree as well, or it would be yielded a second time.
                value, refs = cached
                if self.node_ref_lists:
                    yield (self, key, value, refs)
                else:
                    yield (self, key, value)
            else:
                needed_keys.append(key)

    if not needed_keys:
        return
    nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
    for node_index, sub_keys in nodes_and_keys:
        if not sub_keys:
            continue
        node = nodes[node_index]
        for next_sub_key in sub_keys:
            if next_sub_key in node.keys:
                value, refs = node.keys[next_sub_key]
                if self.node_ref_lists:
                    yield (self, next_sub_key, value, refs)
                else:
                    yield (self, next_sub_key, value)
	1165
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1166	def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1167	"""Find the parent_map information for the set of keys.
	1168
	1169	This populates the parent_map dict and missing_keys set based on the
	1170	queried keys. It also can fill out an arbitrary number of parents that
	1171	it finds while searching for the supplied keys.
	1172
	1173	It is unlikely that you want to call this directly. See
4593.4.12 by John Arbash Meinel Name the specific index api _find_ancestors, and the public CombinedGraphIndex api find_ancestry()	1174	"CombinedGraphIndex.find_ancestry()" for a more appropriate API.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1175
	1176	:param keys: A keys whose ancestry we want to return
	1177	Every key will either end up in 'parent_map' or 'missing_keys'.
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1178	:param ref_list_num: This index in the ref_lists is the parents we
	1179	care about.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1180	:param parent_map: {key: parent_keys} for keys that are present in this
	1181	index. This may contain more entries than were in 'keys', that are
	1182	reachable ancestors of the keys requested.
4593.4.5 by John Arbash Meinel Start adding some tests.	1183	:param missing_keys: keys which are known to be missing in this index.
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1184	This may include parents that were not directly requested, but we
	1185	were able to determine that they are not present in this index.
	1186	:return: search_keys parents that were found but not queried to know
	1187	if they are missing or present. Callers can re-query this index for
	1188	those keys, and they will be placed into parent_map or missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1189	"""
	1190	if not self.key_count():
	1191	# We use key_count() to trigger reading the root node and
	1192	# determining info about this BTreeGraphIndex
	1193	# If we don't have any keys, then everything is missing
4593.4.11 by John Arbash Meinel Snapshot the work in progress.	1194	missing_keys.update(keys)
	1195	return set()
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1196	if ref_list_num >= self.node_ref_lists:
	1197	raise ValueError('No ref list %d, index has %d ref lists'
	1198	% (ref_list_num, self.node_ref_lists))
	1199
	1200	# The main trick we are trying to accomplish is that when we find a
	1201	# key listing its parents, we expect that the parent key is also likely
	1202	# to sit on the same page. Allowing us to expand parents quickly
	1203	# without suffering the full stack of bisecting, etc.
4593.4.5 by John Arbash Meinel Start adding some tests.	1204	nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1205
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1206	# These are parent keys which could not be immediately resolved on the
	1207	# page where the child was present. Note that we may already be
	1208	# searching for that key, and it may actually be present [or known
	1209	# missing] on one of the other pages we are reading.
	1210	# TODO:
	1211	# We could try searching for them in the immediate previous or next
	1212	# page. If they occur "later" we could put them in a pending lookup
	1213	# set, and then for each node we read thereafter we could check to
	1214	# see if they are present.
	1215	# However, we don't know the impact of keeping this list of things
	1216	# that I'm going to search for every node I come across from here on
	1217	# out.
	1218	# It doesn't handle the case when the parent key is missing on a
	1219	# page that we don't read. So we already have to handle being
	1220	# re-entrant for that.
	1221	# Since most keys contain a date string, they are more likely to be
	1222	# found earlier in the file than later, but we would know that right
	1223	# away (key < min_key), and wouldn't keep searching it on every other
	1224	# page that we read.
	1225	# Mostly, it is an idea, one which should be benchmarked.
	1226	parents_not_on_page = set()
	1227
	1228	for node_index, sub_keys in nodes_and_keys:
	1229	if not sub_keys:
	1230	continue
	1231	# sub_keys is all of the keys we are looking for that should exist
	1232	# on this page, if they aren't here, then they won't be found
	1233	node = nodes[node_index]
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1234	node_keys = node.keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1235	parents_to_check = set()
	1236	for next_sub_key in sub_keys:
4593.4.5 by John Arbash Meinel Start adding some tests.	1237	if next_sub_key not in node_keys:
	1238	# This one is just not present in the index at all
	1239	missing_keys.add(next_sub_key)
	1240	else:
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1241	value, refs = node_keys[next_sub_key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1242	parent_keys = refs[ref_list_num]
	1243	parent_map[next_sub_key] = parent_keys
	1244	parents_to_check.update(parent_keys)
	1245	# Don't look for things we've already found
	1246	parents_to_check = parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1247	# this can be used to test the benefit of having the check loop
	1248	# inlined.
	1249	# parents_not_on_page.update(parents_to_check)
	1250	# continue
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1251	while parents_to_check:
	1252	next_parents_to_check = set()
	1253	for key in parents_to_check:
4593.4.3 by John Arbash Meinel Some minor attribute lookup cleanus, doesn't make a big difference.	1254	if key in node_keys:
	1255	value, refs = node_keys[key]
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1256	parent_keys = refs[ref_list_num]
	1257	parent_map[key] = parent_keys
	1258	next_parents_to_check.update(parent_keys)
	1259	else:
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1260	# This parent either is genuinely missing, or should be
	1261	# found on another page. Perf test whether it is better
	1262	# to check if this node should fit on this page or not.
	1263	# in the 'everything-in-one-pack' scenario, this not
	1264	# doing the check is 237ms vs 243ms.
	1265	# So slightly better, but I assume the standard 'lots
	1266	# of packs' is going to show a reasonable improvement
	1267	# from the check, because it avoids 'going around
	1268	# again' for everything that is in another index
4593.4.5 by John Arbash Meinel Start adding some tests.	1269	# parents_not_on_page.add(key)
	1270	# Missing for some reason
	1271	if key < node.min_key:
	1272	# in the case of bzr.dev, 3.4k/5.3k misses are
	1273	# 'earlier' misses (65%)
	1274	parents_not_on_page.add(key)
	1275	elif key > node.max_key:
	1276	# This parent key would be present on a different
	1277	# LeafNode
	1278	parents_not_on_page.add(key)
	1279	else:
	1280	# assert key != node.min_key and key != node.max_key
	1281	# If it was going to be present, it would be on
	1282	# this page, so mark it missing.
	1283	missing_keys.add(key)
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1284	parents_to_check = next_parents_to_check.difference(parent_map)
4593.4.4 by John Arbash Meinel Trying out a few more tweaks.	1285	# Might want to do another .difference() from missing_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1286	# parents_not_on_page could have been found on a different page, or be
	1287	# known to be missing. So cull out everything that has already been
	1288	# found.
4593.4.5 by John Arbash Meinel Start adding some tests.	1289	search_keys = parents_not_on_page.difference(
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1290	parent_map).difference(missing_keys)
4593.4.5 by John Arbash Meinel Start adding some tests.	1291	return search_keys
4593.4.1 by John Arbash Meinel Implement a function on btree that inlines the get_parent_map loop.	1292
def iter_entries_prefix(self, keys):
    """Iterate over keys within the index using prefix matching.

    Matching works on whole elements of the key tuple, never on a partial
    bytestring inside an element. With the keys ('foo', 'bar') and
    ('foobar', 'gam') in the index, a search for ('foo', None) returns
    only the first one.

    WARNING: this method currently parses the entire index every time it
    is called (reasonable given that it exists to thunk many small
    indices into a larger one, which still offers iter_all_entries at the
    thunk layer).

    :param keys: An iterable of key prefixes. Each prefix is a tuple as
        long as a key, with the last N elements set to None instead of a
        bytestring. The first element cannot be None.
    :return: An iterable as per iter_all_entries, restricted to entries
        whose key matches one of the prefixes. No other keys are
        returned, and every match in the index is returned.
    :raises errors.BadIndexKey: If a prefix starts with None or has the
        wrong number of elements.
    """
    prefixes = sorted(set(keys))
    if not prefixes:
        return
    # The key length is only known once the root node has been read.
    if self._key_count is None:
        self._get_root_node()
    # TODO: only touch nodes that can satisfy the requested prefixes. As
    # nothing in current bzrlib uses this method, it simply loads the
    # whole index and filters in memory.
    if self._key_length == 1:
        # Single-element keys: a prefix is necessarily a full key, so a
        # flat dict lookup is all that is needed.
        flat = {}
        if self.node_ref_lists:
            for _index, key, value, refs in self.iter_all_entries():
                flat[key] = value, refs
        else:
            for _index, key, value in self.iter_all_entries():
                flat[key] = value
        for prefix in prefixes:
            # sanity check
            if prefix[0] is None:
                raise errors.BadIndexKey(prefix)
            if len(prefix) != self._key_length:
                raise errors.BadIndexKey(prefix)
            try:
                if self.node_ref_lists:
                    value, node_refs = flat[prefix]
                    yield self, prefix, value, node_refs
                else:
                    yield self, prefix, flat[prefix]
            except KeyError:
                pass
        return

    # Multi-element keys: build nested dicts so that a key (foo, bar, baz)
    # is stored as nodes_by_key[foo][bar][baz] = record, where record is
    # the tuple to yield minus the leading index object.
    nodes_by_key = {}

    def remember(key, record):
        level = nodes_by_key
        for element in key[:-1]:
            level = level.setdefault(element, {})
        level[key[-1]] = record

    if self.node_ref_lists:
        for _index, key, value, refs in self.iter_all_entries():
            remember(key, (key, value, refs))
    else:
        for _index, key, value in self.iter_all_entries():
            remember(key, (key, value))

    for prefix in prefixes:
        # sanity check
        if prefix[0] is None:
            raise errors.BadIndexKey(prefix)
        if len(prefix) != self._key_length:
            raise errors.BadIndexKey(prefix)
        # Descend through the leading non-None elements of the prefix.
        subtree = nodes_by_key
        depth = 0
        prefix_len = len(prefix)
        try:
            while depth < prefix_len and prefix[depth] is not None:
                subtree = subtree[prefix[depth]]
                depth += 1
        except KeyError:
            # Nothing in the index starts with this prefix.
            continue
        if depth == prefix_len:
            # Every element was given, so the walk ended on a record.
            yield (self, ) + subtree
            continue
        # Otherwise emit every record stored below the reached subtree.
        stack = [subtree]
        while stack:
            current = stack.pop(-1)
            # Peek at one value to learn whether this level holds further
            # dicts or records; a level cannot be empty or it would not
            # have been created.
            _element, sample = current.iteritems().next()
            if isinstance(sample, dict):
                stack.extend(current.itervalues())
            else:
                for record in current.itervalues():
                    # record is (key, value[, refs]), ready to yield
                    yield (self, ) + record
1404
def key_count(self):
    """Return an estimate of the number of keys in this index.

    For BTreeGraphIndex the estimate is exact, since the count is stored
    in the header. If the count has not been loaded yet, the root node is
    read first.
    """
    count = self._key_count
    if count is None:
        # Reading the root node is expected to fill in _key_count.
        self._get_root_node()
        count = self._key_count
    return count
1414
3763.8.7 by John Arbash Meinel A bit of doc updates, start putting in tests for current behavior.	1415	def _compute_row_offsets(self):
	1416	"""Fill out the _row_offsets attribute based on _row_lengths."""
	1417	offsets = []
	1418	row_offset = 0
	1419	for row in self._row_lengths:
	1420	offsets.append(row_offset)
	1421	row_offset += row
	1422	offsets.append(row_offset)
	1423	self._row_offsets = offsets
	1424
def _parse_header_from_bytes(self, bytes):
    """Parse the header from a region of bytes.

    The header is the file signature followed by four option lines, in
    this order: number of reference lists, number of key elements, key
    count, and the comma separated row lengths. Parsing stores them in
    node_ref_lists, _key_length, _key_count and _row_lengths, and then
    recomputes _row_offsets.

    :param bytes: The data to parse.
    :return: An offset, data tuple such as readv yields, for the unparsed
        data. (which may be of length 0).
    :raises errors.BadIndexFormatSignature: If the data does not start
        with this index type's signature.
    :raises errors.BadIndexOptions: If an option line is missing, has the
        wrong prefix, or carries a value that is not an integer.
    """
    expected_signature = self._signature()
    signature = bytes[0:len(expected_signature)]
    if signature != expected_signature:
        raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
    lines = bytes[len(expected_signature):].splitlines()
    if len(lines) < 4:
        # A truncated header would otherwise surface as a bare IndexError
        # from the lines[n] lookups below.
        raise errors.BadIndexOptions(self)

    def option_text(line, prefix):
        # Return what follows 'prefix' on 'line', insisting on the prefix.
        if not line.startswith(prefix):
            raise errors.BadIndexOptions(self)
        return line[len(prefix):]

    def int_option(line, prefix):
        # As option_text, but the remainder must parse as an integer.
        text = option_text(line, prefix)
        try:
            return int(text)
        except ValueError:
            raise errors.BadIndexOptions(self)

    self.node_ref_lists = int_option(lines[0], _OPTION_NODE_REFS)
    self._key_length = int_option(lines[1], _OPTION_KEY_ELEMENTS)
    self._key_count = int_option(lines[2], _OPTION_LEN)
    row_text = option_text(lines[3], _OPTION_ROW_LENGTHS)
    try:
        # Empty fields (e.g. from a trailing comma) are skipped.
        self._row_lengths = [int(length)
                             for length in row_text.split(',')
                             if len(length)]
    except ValueError:
        raise errors.BadIndexOptions(self)
    self._compute_row_offsets()

    # calculate the bytes we have processed: the signature, the four
    # option lines, and one line terminator for each of those lines.
    header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
    return header_end, bytes[header_end:]
	1471
	1472	def _read_nodes(self, nodes):
	1473	"""Read some nodes from disk into the LRU cache.
	1474
	1475	This performs a readv to get the node data into memory, and parses each
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1476	node, then yields it to the caller. The nodes are requested in the
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1477	supplied order. If possible doing sort() on the list before requesting
	1478	a read may improve performance.
	1479
	1480	:param nodes: The nodes to read. 0 - first node, 1 - second node etc.
	1481	:return: None
	1482	"""
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1483	# may be the byte string of the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1484	bytes = None
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1485	# list of (offset, length) regions of the file that should, evenually
	1486	# be read in to data_ranges, either from 'bytes' or from the transport
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1487	ranges = []
	1488	for index in nodes:
	1489	offset = index * _PAGE_SIZE
	1490	size = _PAGE_SIZE
	1491	if index == 0:
	1492	# Root node - special case
	1493	if self._size:
	1494	size = min(_PAGE_SIZE, self._size)
	1495	else:
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1496	# The only case where we don't know the size, is for very
	1497	# small indexes. So we read the whole thing
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1498	bytes = self._transport.get_bytes(self._name)
	1499	self._size = len(bytes)
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1500	# the whole thing should be parsed out of 'bytes'
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1501	ranges.append((0, len(bytes)))
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1502	break
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1503	else:
3763.8.6 by John Arbash Meinel Fix the logic a bit, and add a bit more tweaking opportunities	1504	if offset > self._size:
	1505	raise AssertionError('tried to read past the end'
	1506	' of the file %s > %s'
	1507	% (offset, self._size))
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1508	size = min(size, self._size - offset)
	1509	ranges.append((offset, size))
	1510	if not ranges:
	1511	return
3868.1.1 by Martin Pool merge John's patch to avoid re-reading pack-names file	1512	elif bytes is not None:
	1513	# already have the whole file
3823.5.2 by John Arbash Meinel It turns out that we read the pack-names file 3-times because	1514	data_ranges = [(start, bytes[start:start+_PAGE_SIZE])
	1515	for start in xrange(0, len(bytes), _PAGE_SIZE)]
3824.1.1 by John Arbash Meinel Fix _read_nodes() to only issue a single read if there is no known size.	1516	elif self._file is None:
3641.3.1 by John Arbash Meinel Bring in the btree_index and chunk_writer code and their tests.	1517	data_ranges = self._transport.readv(self._name, ranges)
	1518	else:
	1519	data_ranges = []
	1520	for offset, size in ranges:
	1521	self._file.seek(offset)
	1522	data_ranges.append((offset, self._file.read(size)))
	1523	for offset, data in data_ranges:
	1524	if offset == 0:
	1525	# extract the header
	1526	offset, data = self._parse_header_from_bytes(data)
	1527	if len(data) == 0:
	1528	continue
	1529	bytes = zlib.decompress(data)
	1530	if bytes.startswith(_LEAF_FLAG):
	1531	node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
	1532	elif bytes.startswith(_INTERNAL_FLAG):
	1533	node = _InternalNode(bytes)
	1534	else:
	1535	raise AssertionError("Unknown node type for %r" % bytes)
	1536	yield offset / _PAGE_SIZE, node
	1537
	1538	def _signature(self):
	1539	"""The file signature for this index type."""
	1540	return _BTSIGNATURE
	1541
	1542	def validate(self):
	1543	"""Validate that everything in the index can be accessed."""
	1544	# just read and parse every node.
	1545	self._get_root_node()
	1546	if len(self._row_lengths) > 1:
	1547	start_node = self._row_offsets[1]
	1548	else:
	1549	# We shouldn't be reading anything anyway
	1550	start_node = 1
	1551	node_end = self._row_offsets[-1]
	1552	for node in self._read_nodes(range(start_node, node_end)):
	1553	pass
	1554
	1555
	1556	try:
4459.2.1 by Vincent Ladeuil Use a consistent scheme for naming pyrex source files.	1557	from bzrlib import _btree_serializer_pyx as _btree_serializer
4574.3.6 by Martin Pool More warnings when failing to load extensions	1558	except ImportError, e:
4574.3.8 by Martin Pool Only mutter extension load errors when they occur, and record for later	1559	osutils.failed_to_load_extension(e)
3641.3.30 by John Arbash Meinel Rename _parse_btree to _btree_serializer	1560	from bzrlib import _btree_serializer_py as _btree_serializer