# Copyright (C) 2008, 2009, 2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#

"""B+Tree indices"""

import cStringIO
from bisect import bisect_right
import math
import tempfile
import zlib

from bzrlib import (
    chunk_writer,
    debug,
    errors,
    fifo_cache,
    index,
    lru_cache,
    osutils,
    static_tuple,
    trace,
    )
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from bzrlib.transport import get_transport


_BTSIGNATURE = "B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = "row_lengths="
_LEAF_FLAG = "type=leaf\n"
_INTERNAL_FLAG = "type=internal\n"
_INTERNAL_OFFSET = "offset="

_RESERVED_HEADER_BYTES = 120
_PAGE_SIZE = 4096

# 4K per page: 4MB - 1000 entries
_NODE_CACHE_SIZE = 1000
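
# Layout note (illustrative, not from the original source): pages are
# fixed-size _PAGE_SIZE chunks, so page i of an index spans bytes
# [i * _PAGE_SIZE, (i + 1) * _PAGE_SIZE) on disk; for example, page 3
# starts at byte 3 * 4096 = 12288. The root page additionally carries up
# to _RESERVED_HEADER_BYTES of header at its start.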


class _BuilderRow(object):
    """The stored state accumulated while writing out a row in the index.

    :ivar spool: A temporary file used to accumulate nodes for this row
        in the tree.
    :ivar nodes: The count of nodes emitted so far.
    """

    def __init__(self):
        """Create a _BuilderRow."""
        self.nodes = 0
        self.spool = None # tempfile.TemporaryFile(prefix='bzr-index-row-')
        self.writer = None

    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = cStringIO.StringIO()
            # placeholder for the reserved header space on the first page
            self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
            spool.write(self.spool.getvalue())
            self.spool = spool
        skipped_bytes = 0
        if not pad and padding:
            del byte_lines[-1]
            skipped_bytes = padding
        self.spool.writelines(byte_lines)
        remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
        if remainder != 0:
            raise AssertionError("incorrect node length: %d, %d"
                                 % (self.spool.tell(), remainder))
        self.nodes += 1
        self.writer = None


class _InternalBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out internal rows."""

    def finish_node(self, pad=True):
        if not pad:
            raise AssertionError("Must pad internal nodes only.")
        _BuilderRow.finish_node(self)


class _LeafBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out leaf rows."""


class BTreeBuilder(index.GraphIndexBuilder):
    """A Builder for B+Tree based Graph indices.

    The resulting graph has the structure:

    _SIGNATURE OPTIONS NODES
    _SIGNATURE     := 'B+Tree Graph Index 2' NEWLINE
    OPTIONS        := REF_LISTS KEY_ELEMENTS LENGTH
    REF_LISTS      := 'node_ref_lists=' DIGITS NEWLINE
    KEY_ELEMENTS   := 'key_elements=' DIGITS NEWLINE
    LENGTH         := 'len=' DIGITS NEWLINE
    ROW_LENGTHS    := 'row_lengths=' DIGITS (COMMA DIGITS)*
    NODES          := NODE_COMPRESSED*
    NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
    NODE_RAW       := INTERNAL | LEAF
    INTERNAL       := INTERNAL_FLAG POINTERS
    LEAF           := LEAF_FLAG ROWS
    KEY_ELEMENT    := Not-whitespace-utf8
    KEY            := KEY_ELEMENT (NULL KEY_ELEMENT)*
    ROWS           := ROW*
    ROW            := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
    ABSENT         := 'a'
    REFERENCES     := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
    REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
    REFERENCE      := KEY
    VALUE          := no-newline-no-null-bytes
    """
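
    # Typical use (illustrative sketch; the keys and values are examples
    # only):
    #
    #   builder = BTreeBuilder(reference_lists=1, key_elements=1)
    #   builder.add_node(('key1',), 'value1', ([],))
    #   builder.add_node(('key2',), 'value2', ((('key1',),),))
    #   index_file = builder.finish() # file handle holding the serialised tree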

    def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
        """See GraphIndexBuilder.__init__.

        :param spill_at: Optional parameter controlling the maximum number
            of nodes that BTreeBuilder will hold in memory.
        """
        index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
            key_elements=key_elements)
        self._spill_at = spill_at
        self._backing_indices = []
        # A map of {key: (node_refs, value)}
        self._nodes = {}
        # Indicate it hasn't been built yet
        self._nodes_by_key = None
        self._optimize_for_size = False

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        If adding the node causes the builder to reach its spill_at threshold,
        disk spilling will be triggered.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \0 or \n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
        # we don't care about absent_references
        node_refs, _ = self._check_key_ref_value(key, references, value)
        if key in self._nodes:
            raise errors.BadIndexDuplicateKey(key, self)
        self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
        if len(self._nodes) < self._spill_at:
            return
        self._spill_mem_keys_to_disk()

    def _spill_mem_keys_to_disk(self):
        """Write the in memory keys down to disk to cap memory consumption.

        If we already have some keys written to disk, we will combine them so
        as to preserve the sorted order.  The algorithm for combining uses
        powers of two.  So on the first spill, write all mem nodes into a
        single index. On the second spill, combine the mem nodes with the nodes
        on disk to create a 2x sized disk index and get rid of the first index.
        On the third spill, create a single new disk index, which will contain
        the mem nodes, and preserve the existing 2x sized index.  On the fourth,
        combine mem with the first and second indexes, creating a new one of
        size 4x. On the fifth create a single new one, etc.
        """
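        # Worked example (illustrative): with combining enabled, the list of
        # backing indices evolves roughly as:
        #   spill 1: [1x]
        #   spill 2: [None, 2x]        (mem combined with the 1x index)
        #   spill 3: [1x, 2x]          (mem written out alone)
        #   spill 4: [None, None, 4x]  (mem combined with 1x and 2x)
        #   spill 5: [1x, None, 4x]
        # where "Nx" stands for an index holding roughly N spills of nodes.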
        if self._combine_backing_indices:
            (new_backing_file, size,
             backing_pos) = self._spill_mem_keys_and_combine()
        else:
            new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(get_transport('.'), '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:
            if len(self._backing_indices) == backing_pos:
                self._backing_indices.append(None)
            self._backing_indices[backing_pos] = new_backing
            for backing_pos in range(backing_pos):
                self._backing_indices[backing_pos] = None
        else:
            self._backing_indices.append(new_backing)
        self._nodes = {}
        self._nodes_by_key = None

    def _spill_mem_keys_without_combining(self):
        return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)

    def _spill_mem_keys_and_combine(self):
        iterators_to_combine = [self._iter_mem_nodes()]
        pos = -1
        for pos, backing in enumerate(self._backing_indices):
            if backing is None:
                pos -= 1
                break
            iterators_to_combine.append(backing.iter_all_entries())
        backing_pos = pos + 1
        new_backing_file, size = \
            self._write_nodes(self._iter_smallest(iterators_to_combine),
                              allow_optimize=False)
        return new_backing_file, size, backing_pos

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, value, node_refs) entries to add.
        """
        if self.reference_lists:
            for (key, value, node_refs) in nodes:
                self.add_node(key, value, node_refs)
        else:
            for (key, value) in nodes:
                self.add_node(key, value)

    def _iter_mem_nodes(self):
        """Iterate over the nodes held in memory."""
        nodes = self._nodes
        if self.reference_lists:
            for key in sorted(nodes):
                references, value = nodes[key]
                yield self, key, value, references
        else:
            for key in sorted(nodes):
                references, value = nodes[key]
                yield self, key, value

    def _iter_smallest(self, iterators_to_combine):
        if len(iterators_to_combine) == 1:
            for value in iterators_to_combine[0]:
                yield value
            return
        current_values = []
        for iterator in iterators_to_combine:
            try:
                current_values.append(iterator.next())
            except StopIteration:
                current_values.append(None)
        last = None
        while True:
            # Decorate candidates with the value to allow 2.4's min to be used.
            candidates = [(item[1][1], item) for item
                          in enumerate(current_values) if item[1] is not None]
            if not len(candidates):
                return
            selected = min(candidates)
            # undecorate back to (pos, node)
            selected = selected[1]
            if last == selected[1][1]:
                raise errors.BadIndexDuplicateKey(last, self)
            last = selected[1][1]
            # Yield, with self as the index
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = iterators_to_combine[pos].next()
            except StopIteration:
                current_values[pos] = None
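
    # Merge sketch (illustrative): given two sorted iterators yielding entries
    # keyed ('a',), ('c',) and ('b',), ('d',), _iter_smallest yields them as
    # ('a',), ('b',), ('c',), ('d',); seeing the same key twice in a row
    # raises BadIndexDuplicateKey.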

    def _add_key(self, string_key, line, rows, allow_optimize=True):
        """Add a key to the current chunk.

        :param string_key: The key to add.
        :param line: The fully serialised key and value.
        :param allow_optimize: If set to False, prevent setting the optimize
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        """
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                if internal_row.writer is None:
                    length = _PAGE_SIZE
                    if internal_row.nodes == 0:
                        length -= _RESERVED_HEADER_BYTES # padded
                    if allow_optimize:
                        optimize_for_size = self._optimize_for_size
                    else:
                        optimize_for_size = False
                    internal_row.writer = chunk_writer.ChunkWriter(length, 0,
                        optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET +
                        str(rows[pos + 1].nodes) + "\n")
            # add a new leaf
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES # padded
            rows[-1].writer = chunk_writer.ChunkWriter(length,
                optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + "\n"
            new_row = True
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                # doesn't fit then propagate upwards until we find one that
                # it does fit into.
                if row.writer.write(key_line):
                    row.finish_node()
                else:
                    # We've found a node that can handle the pointer.
                    new_row = False
                    break
            # If we reached the current root without being able to mark the
            # division point, then we need a new root:
            if new_row:
                # We need a new row
                if 'index' in debug.debug_flags:
                    trace.mutter('Inserting new global row.')
                new_row = _InternalBuilderRow()
                reserved_bytes = 0
                rows.insert(0, new_row)
                # This will be padded, hence subtract _RESERVED_HEADER_BYTES
                new_row.writer = chunk_writer.ChunkWriter(
                    _PAGE_SIZE - _RESERVED_HEADER_BYTES,
                    reserved_bytes,
                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET +
                    str(rows[1].nodes - 1) + "\n")
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)

    def _write_nodes(self, node_iterator, allow_optimize=True):
        """Write node_iterator out as a B+Tree.

        :param node_iterator: An iterator of sorted nodes. Each node should
            match the output given by iter_all_entries.
        :param allow_optimize: If set to False, prevent setting the optimize
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        :return: A file handle for a temporary file containing a B+Tree for
            the nodes.
        """
        # The index rows - rows[0] is the root, rows[1] is the layer under it
        # etc.
        rows = []
        # forward sorted by key. In future we may consider topological sorting,
        # at the cost of table scans for direct lookup, or a second index for
        # direct lookup
        key_count = 0
        # A stack with the number of nodes of each size. 0 is the root node
        # and must always be 1 (if there are any nodes in the tree).
        self.row_lengths = []
        # Loop over all nodes adding them to the bottom row
        # (rows[-1]). When we finish a chunk in a row,
        # propagate the key that didn't fit (comes after the chunk) to the
        # row above, transitively.
        for node in node_iterator:
            if key_count == 0:
                # First key triggers the first row
                rows.append(_LeafBuilderRow())
            key_count += 1
            string_key, line = _btree_serializer._flatten_node(node,
                                    self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (type(row) != _LeafBuilderRow)
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
        lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
        lines.append(_OPTION_LEN + str(key_count) + '\n')
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
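        # For example (illustrative values), a 100-key index with one
        # reference list and a two-row tree would serialise a header like:
        #   "B+Tree Graph Index 2\n" "node_ref_lists=1\n" "key_elements=1\n"
        #   "len=100\n" "row_lengths=1,10\n"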
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = cStringIO.StringIO()
        result.writelines(lines)
        position = sum(map(len, lines))
        root_row = True
        if position > _RESERVED_HEADER_BYTES:
            raise AssertionError("Could not fit the header in the"
                                 " reserved space: %d > %d"
                                 % (position, _RESERVED_HEADER_BYTES))
        # write the rows out:
        for row in rows:
            reserved = _RESERVED_HEADER_BYTES # reserved space for first node
            row.spool.flush()
            row.spool.seek(0)
            # copy nodes to the finalised file.
            # Special case the first node as it may be prefixed
            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write("\x00" * (reserved - position))
            position = 0 # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if type(row) != _LeafBuilderRow:
                    raise AssertionError("Incorrect amount of data copied"
                        " expected: %d, got: %d"
                        % ((row.nodes - 1) * _PAGE_SIZE,
                           copied_len))
        result.flush()
        size = result.tell()
        result.seek(0)
        return result, size

    def finish(self):
        """Finalise the index.

        :return: A file handle for a temporary file containing the nodes added
            to the index.
        """
        return self._write_nodes(self.iter_all_entries())[0]

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        # Doing serial rather than ordered would be faster; but this shouldn't
        # be getting called routinely anyway.
        iterators = [self._iter_mem_nodes()]
        for backing in self._backing_indices:
            if backing is not None:
                iterators.append(backing.iter_all_entries())
        if len(iterators) == 1:
            return iterators[0]
        return self._iter_smallest(iterators)

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        keys = set(keys)
        # Note: We don't use keys.intersection() here. If you read the C api,
        #       set.intersection(other) special cases when other is a set and
        #       will iterate the smaller of the two and lookup in the other.
        #       It does *not* do this for any other type (even dict, unlike
        #       some other set functions.) Since we expect keys is generally <<
        #       self._nodes, it is faster to iterate over it in a list
        #       comprehension
        nodes = self._nodes
        local_keys = [key for key in keys if key in nodes]
        if self.reference_lists:
            for key in local_keys:
                node = nodes[key]
                yield self, key, node[1], node[0]
        else:
            for key in local_keys:
                node = nodes[key]
                yield self, key, node[1]
        # Find things that are in backing indices that have not been handled
        # yet.
        if not self._backing_indices:
            return # We won't find anything there either
        # Remove all of the keys that we found locally
        keys.difference_update(local_keys)
        for backing in self._backing_indices:
            if backing is None:
                continue
            if not keys:
                return
            for node in backing.iter_entries(keys):
                keys.remove(node[1])
                yield (self,) + node[1:]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        # XXX: Too much duplication with the GraphIndex class; consider finding
        # a good place to pull out the actual common logic.
        keys = set(keys)
        if not keys:
            return
        for backing in self._backing_indices:
            if backing is None:
                continue
            for node in backing.iter_entries_prefix(keys):
                yield (self,) + node[1:]
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.reference_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]
            return
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = self._get_nodes_by_key()
            elements = list(key)
            # find the subdict to return
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existent lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push keys
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            yield (self, ) + tuple(value)
            else:
                yield (self, ) + key_dict

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in self._nodes.iteritems():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in self._nodes.iteritems():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key
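
    # Shape sketch (illustrative): for a 2-element key index holding
    # ('pkg', 'mod') -> 'v', _get_nodes_by_key() returns nested dicts like
    #   {'pkg': {'mod': (('pkg', 'mod'), 'v')}}
    # (with a references element appended when reference_lists is set).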

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For BTreeBuilder the estimate is exact.
        """
        return len(self._nodes) + sum(backing.key_count() for backing in
            self._backing_indices if backing is not None)

    def validate(self):
        """In memory indexes have no known corruption at the moment."""


class _LeafNode(object):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('keys', 'min_key', 'max_key')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        key_list = _btree_serializer._parse_leaf_lines(bytes,
            key_length, ref_list_length)
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        self.keys = dict(key_list)


class _InternalNode(object):
    """An internal node for a serialised B+Tree index."""

    __slots__ = ('keys', 'offset')

    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        self.keys = self._parse_lines(bytes.split('\n'))

    def _parse_lines(self, lines):
        nodes = []
        self.offset = int(lines[1][7:])
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:
            if line == '':
                break
            nodes.append(as_st(map(intern, line.split('\0'))).intern())
        return nodes


class BTreeGraphIndex(object):
    """Access to nodes via the standard GraphIndex interface for B+Trees.

    Individual nodes are held in a LRU cache. This holds the root node in
    memory except when very large walks are done.
    """
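
    # Typical use (illustrative sketch; the file name is an example only):
    #
    #   index = BTreeGraphIndex(get_transport('.'), 'example.bix', size)
    #   for entry in index.iter_all_entries():
    #       ...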

    def __init__(self, transport, name, size, unlimited_cache=False):
        """Create a B+Tree index object on the index name.

        :param transport: The transport to read data for the index from.
        :param name: The file name of the index on transport.
        :param size: Optional size of the index in bytes. This allows
            compatibility with the GraphIndex API, as well as ensuring that
            the initial read (to read the root node header) can be done
            without over-reading even on empty indices, and on small indices
            allows single-IO to read the entire index.
        :param unlimited_cache: If set to True, then instead of using an
            LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
            cache all leaf nodes.
        """
        self._transport = transport
        self._name = name
        self._size = size
        self._file = None
        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
        if unlimited_cache:
            self._leaf_node_cache = {}
            self._internal_node_cache = {}
        else:
            self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
            # We use a FIFO here just to prevent possible blowout. However, a
            # 300k record btree has only 3k leaf nodes, and only 20 internal
            # nodes. A value of 100 scales to ~100*100*100 = 1M records.
            self._internal_node_cache = fifo_cache.FIFOCache(100)
        self._key_count = None
        self._row_lengths = None
        self._row_offsets = None # Start of each row, [-1] is the end

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            type(self) == type(other) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

    def __ne__(self, other):
        return not self.__eq__(other)

    def _get_and_cache_nodes(self, nodes):
        """Read nodes and cache them in the lru.

        The nodes list supplied is sorted and then read from disk, each node
        being inserted into the _node_cache.

        Note: Asking for more nodes than the _node_cache can contain will
        result in some of the results being immediately discarded; to prevent
        this, an assertion is raised if more nodes are asked for than are
        cachable.

        :return: A dict of {node_pos: node}
        """
        found = {}
        start_of_leaves = None
        for node_pos, node in self._read_nodes(sorted(nodes)):
            if node_pos == 0: # Special case
                self._root_node = node
            else:
                if start_of_leaves is None:
                    start_of_leaves = self._row_offsets[-2]
                if node_pos < start_of_leaves:
                    self._internal_node_cache[node_pos] = node
                else:
                    self._leaf_node_cache[node_pos] = node
            found[node_pos] = node
        return found

    def _compute_recommended_pages(self):
        """Convert transport's recommended_page_size into btree pages.

        recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        recommended_pages = int(math.ceil(recommended_read /
                                          float(_PAGE_SIZE)))
        return recommended_pages
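
    # For example (illustrative): a transport recommending 64kB reads gives
    # ceil(65536 / 4096.0) = 16 pages per request.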

    def _compute_total_pages_in_index(self):
        """How many pages are in the index.

        If we have read the header we will use the value stored there.
        Otherwise it will be computed based on the length of the index.
        """
        if self._size is None:
            raise AssertionError('_compute_total_pages_in_index should not be'
                                 ' called when self._size is None')
        if self._root_node is not None:
            # This is the number of pages as defined by the header
            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index. They
        # should be identical.
        total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
        return total_pages

    def _expand_offsets(self, offsets):
        """Find extra pages to download.

        The idea is that we always want to make big-enough requests (like 64kB
        for http), so that we don't waste round trips. So given the entries
        that we already have cached and the new pages being downloaded, figure
        out what other pages we might want to read.

        See also doc/developers/btree_index_prefetch.txt for more details.

        :param offsets: The offsets to be read
        :return: A list of offsets to download
        """
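        # Decision sketch (illustrative): with _recommended_pages = 16, a
        # request for a single offset such as [5] may be expanded to
        # neighboring pages in the same layer, while a request for 16 or
        # more offsets is returned unchanged.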
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
765  | 
if 'index' in debug.debug_flags:  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
766  | 
trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)  | 
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
767  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
768  | 
if len(offsets) >= self._recommended_pages:  | 
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
769  | 
            # Don't add more, we are already requesting more than enough
 | 
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
770  | 
if 'index' in debug.debug_flags:  | 
771  | 
trace.mutter(' not expanding large request (%s >= %s)',  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
772  | 
len(offsets), self._recommended_pages)  | 
773  | 
return offsets  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
774  | 
if self._size is None:  | 
775  | 
            # Don't try anything, because we don't know where the file ends
 | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
776  | 
if 'index' in debug.debug_flags:  | 
777  | 
trace.mutter(' not expanding without knowing index size')  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
778  | 
return offsets  | 
779  | 
total_pages = self._compute_total_pages_in_index()  | 
|
780  | 
cached_offsets = self._get_offsets_to_cached_pages()  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
781  | 
        # If reading recommended_pages would read the rest of the index, just
 | 
782  | 
        # do so.
 | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
783  | 
if total_pages - len(cached_offsets) <= self._recommended_pages:  | 
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
784  | 
            # Read whatever is left
 | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
785  | 
if cached_offsets:  | 
786  | 
expanded = [x for x in xrange(total_pages)  | 
|
787  | 
if x not in cached_offsets]  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
788  | 
else:  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
789  | 
expanded = range(total_pages)  | 
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
790  | 
if 'index' in debug.debug_flags:  | 
791  | 
trace.mutter(' reading all unread pages: %s', expanded)  | 
|
792  | 
return expanded  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
793  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
794  | 
if self._root_node is None:  | 
795  | 
            # ATM on the first read of the root node of a large index, we don't
 | 
|
796  | 
            # bother pre-reading any other pages. This is because the
 | 
|
797  | 
            # likelyhood of actually reading interesting pages is very low.
 | 
|
798  | 
            # See doc/developers/btree_index_prefetch.txt for a discussion, and
 | 
|
799  | 
            # a possible implementation when we are guessing that the second
 | 
|
800  | 
            # layer index is small
 | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
801  | 
final_offsets = offsets  | 
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
802  | 
else:  | 
| 
3763.8.14
by John Arbash Meinel
 Add in a shortcut when we haven't cached much yet.  | 
803  | 
tree_depth = len(self._row_lengths)  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
804  | 
if len(cached_offsets) < tree_depth and len(offsets) == 1:  | 
| 
3763.8.14
by John Arbash Meinel
 Add in a shortcut when we haven't cached much yet.  | 
805  | 
                # We haven't read enough to justify expansion
 | 
806  | 
                # If we are only going to read the root node, and 1 leaf node,
 | 
|
807  | 
                # then it isn't worth expanding our request. Once we've read at
 | 
|
808  | 
                # least 2 nodes, then we are probably doing a search, and we
 | 
|
809  | 
                # start expanding our requests.
 | 
|
810  | 
if 'index' in debug.debug_flags:  | 
|
811  | 
trace.mutter(' not expanding on first reads')  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
812  | 
return offsets  | 
813  | 
final_offsets = self._expand_to_neighbors(offsets, cached_offsets,  | 
|
814  | 
total_pages)  | 
|
815  | 
||
816  | 
final_offsets = sorted(final_offsets)  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
817  | 
if 'index' in debug.debug_flags:  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
818  | 
trace.mutter('expanded: %s', final_offsets)  | 
819  | 
return final_offsets  | 
|
820  | 
||
821  | 
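    # A worked sketch of the policy above, with hypothetical numbers: given
    # _recommended_pages = 8 and a 106-page index, a request for 10 offsets
    # is returned unchanged (already >= 8); a request made when 100 of the
    # 106 pages are cached returns every uncached page (106 - 100 <= 8); and
    # a single-offset request made before the root node is cached is returned
    # unchanged. Only requests that pass all of these checks reach
    # _expand_to_neighbors().
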
    def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
        """Expand requests to neighbors until we have enough pages.

        This is called from _expand_offsets after policy has determined that we
        want to expand.
        We only want to expand requests within a given layer. We cheat a little
        bit and assume all requests will be in the same layer. This is true
        given the current design, but if it changes this algorithm may perform
        oddly.

        :param offsets: requested offsets
        :param cached_offsets: offsets for pages we currently have cached
        :return: A set() of offsets after expansion
        """
        final_offsets = set(offsets)
        first = end = None
        new_tips = set(final_offsets)
        while len(final_offsets) < self._recommended_pages and new_tips:
            next_tips = set()
            for pos in new_tips:
                if first is None:
                    first, end = self._find_layer_first_and_end(pos)
                previous = pos - 1
                if (previous > 0
                    and previous not in cached_offsets
                    and previous not in final_offsets
                    and previous >= first):
                    next_tips.add(previous)
                after = pos + 1
                if (after < total_pages
                    and after not in cached_offsets
                    and after not in final_offsets
                    and after < end):
                    next_tips.add(after)
                # This would keep us from going bigger than
                # recommended_pages by only expanding the first offsets.
                # However, if we are making a 'wide' request, it is
                # reasonable to expand all points equally.
                # if len(final_offsets) > recommended_pages:
                #     break
            final_offsets.update(next_tips)
            new_tips = next_tips
        return final_offsets

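    # A worked example (hypothetical layout): with _recommended_pages = 8,
    # _row_offsets = [0, 1, 6, 106] (so the leaf layer is pages 6..105) and
    # nothing cached, expanding offsets = [50] grows the request by one page
    # on each side per pass:
    #   {50} -> {49, 50, 51} -> {48..52} -> {47..53} -> {46..54}
    # The loop stops once len(final_offsets) >= 8, so the final request is
    # the 9 contiguous pages 46..54 (both tips expand in the last pass).
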
    def clear_cache(self):
        """Clear out any cached/memoized values.

        This can be called at any time, but generally it is used when we have
        extracted some information, but don't expect to be requesting any more
        from this index.
        """
        # Note that we don't touch self._root_node or self._internal_node_cache
        # We don't expect either of those to be big, and it can save
        # round-trips in the future. We may re-evaluate this if InternalNode
        # memory starts to be an issue.
        self._leaf_node_cache.clear()

    def external_references(self, ref_list_num):
        if self._root_node is None:
            self._get_root_node()
        if ref_list_num + 1 > self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                % (ref_list_num, self.node_ref_lists))
        keys = set()
        refs = set()
        for node in self.iter_all_entries():
            keys.add(node[1])
            refs.update(node[3][ref_list_num])
        return refs - keys

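    # For illustration: if ref list 0 holds parent references, and the index
    # contains keys A and B where B lists parents (A, C), then
    # external_references(0) returns set([C]): the reference to A resolves
    # within the index, while C is referenced but never defined here.
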
    def _find_layer_first_and_end(self, offset):
        """Find the start/stop nodes for the layer corresponding to offset.

        :return: (first, end)
            first is the first node in this layer
            end is the first node of the next layer
        """
        first = end = 0
        for roffset in self._row_offsets:
            first = end
            end = roffset
            if offset < roffset:
                break
        return first, end

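    # Example: with _row_offsets = [0, 1, 6, 106], offset 0 falls in the root
    # layer and yields (0, 1), offset 3 in the internal layer yields (1, 6),
    # and offset 50 in the leaf layer yields (6, 106).
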
    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache.keys())
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
        return cached_offsets

    def _get_root_node(self):
        if self._root_node is None:
            # We may not have a root node yet
            self._get_internal_nodes([0])
        return self._root_node

    def _get_nodes(self, cache, node_indexes):
        found = {}
        needed = []
        for idx in node_indexes:
            if idx == 0 and self._root_node is not None:
                found[0] = self._root_node
                continue
            try:
                found[idx] = cache[idx]
            except KeyError:
                needed.append(idx)
        if not needed:
            return found
        needed = self._expand_offsets(needed)
        found.update(self._get_and_cache_nodes(needed))
        return found

    def _get_internal_nodes(self, node_indexes):
        """Get a node, from cache or disk.

        After getting it, the node will be cached.
        """
        return self._get_nodes(self._internal_node_cache, node_indexes)

    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in nodes.itervalues():
                for key, value in node.keys.iteritems():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
                        # before.
                        break
                    self._leaf_value_cache[key] = value

    def _get_leaf_nodes(self, node_indexes):
        """Get a bunch of nodes, from cache or disk."""
        found = self._get_nodes(self._leaf_node_cache, node_indexes)
        self._cache_leaf_values(found)
        return found

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value) or
            (index, key, value, reference_lists).
            The former tuple is used when there are no reference lists in the
            index, making the API compatible with simple key:value index types.
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if not self.key_count():
            return
        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in sorted(self._root_node.keys.items()):
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in sorted(self._root_node.keys.items()):
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = range(start_of_leaves, end_of_leaves)
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
            nodes = [(0, self._root_node)]
        else:
            nodes = self._read_nodes(needed_offsets)
        # We iterate strictly in-order so that we can use this function
        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in sorted(node.keys.items()):
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in sorted(node.keys.items()):
                    yield (self, key, value)

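    # Usage sketch: every yielded tuple leads with the index object itself,
    # so entries from several indices can be mixed and still traced back to
    # their source. 'btree_index' is a hypothetical instance:
    #
    #   for entry in btree_index.iter_all_entries():
    #       key, value = entry[1], entry[2]
    #       # entry[3] (the reference lists) is present only when the index
    #       # was built with node_ref_lists set
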
    @staticmethod
    def _multi_bisect_right(in_keys, fixed_keys):
        """Find the positions where each 'in_key' would fit in fixed_keys.

        This is equivalent to doing "bisect_right" on each in_key into
        fixed_keys.

        :param in_keys: A sorted list of keys to match with fixed_keys
        :param fixed_keys: A sorted list of keys to match against
        :return: A list of (integer position, [key list]) tuples.
        """
        if not in_keys:
            return []
        if not fixed_keys:
            # no pointers in the fixed_keys list, which means everything must
            # fall to the left.
            return [(0, in_keys)]

        # TODO: Iterating both lists will generally take M + N steps
        #       Bisecting each key will generally take M * log2 N steps.
        #       If we had an efficient way to compare, we could pick the method
        #       based on which has the fewer number of steps.
        #       There is also the argument that bisect_right is a compiled
        #       function, so there is even more to be gained.
        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1: # Bisect will always be faster for M = 1
            return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     offsets = {}
        #     for key in in_keys:
        #         offsets.setdefault(bisect_right(fixed_keys, key),
        #                            []).append(key)
        #     return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = in_keys_iter.next()
        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()

        class InputDone(Exception): pass
        class FixedDone(Exception): pass

        output = []
        cur_out = []

        # TODO: Another possibility is that rather than iterating on each side,
        #       we could use a combination of bisecting and iterating. For
        #       example, while cur_in_key < fixed_key, bisect to find its
        #       point, then iterate all matching keys, then bisect (restricted
        #       to only the remainder) for the next one, etc.
        try:
            while True:
                if cur_in_key < cur_fixed_key:
                    cur_keys = []
                    cur_out = (cur_fixed_offset, cur_keys)
                    output.append(cur_out)
                    while cur_in_key < cur_fixed_key:
                        cur_keys.append(cur_in_key)
                        try:
                            cur_in_key = in_keys_iter.next()
                        except StopIteration:
                            raise InputDone
                    # At this point cur_in_key must be >= cur_fixed_key
                # step the cur_fixed_key until we pass the cur key, or walk off
                # the end
                while cur_in_key >= cur_fixed_key:
                    try:
                        cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
                    except StopIteration:
                        raise FixedDone
        except InputDone:
            # We consumed all of the input, nothing more to do
            pass
        except FixedDone:
            # There was some input left, but we consumed all of fixed, so we
            # have to add one more for the tail
            cur_keys = [cur_in_key]
            cur_keys.extend(in_keys_iter)
            cur_out = (len(fixed_keys), cur_keys)
            output.append(cur_out)
        return output

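    # A doctest-style sketch of the result shape (string keys are purely
    # illustrative; real callers pass key tuples):
    #
    #   >>> BTreeGraphIndex._multi_bisect_right(['a', 'c', 'z'], ['b', 'd'])
    #   [(0, ['a']), (1, ['c']), (2, ['z'])]
    #   >>> BTreeGraphIndex._multi_bisect_right(['c'], ['b', 'd'])
    #   [(1, ['c'])]
    #
    # 'a' sorts before pivot 'b' (child 0), 'c' falls between 'b' and 'd'
    # (child 1), and 'z' sorts past every pivot (child 2, the FixedDone tail).
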
    def _walk_through_internal_nodes(self, keys):
        """Take the given set of keys, and find the corresponding LeafNodes.

        :param keys: An unsorted iterable of keys to search for
        :return: (nodes, keys_at_index)
            nodes is a dict mapping {index: LeafNode}
            keys_at_index is a list of tuples of [(index, [keys for Leaf])]
        """
        # 6 seconds spent in miss_torture using the sorted() line.
        # Even with out of order disk IO it seems faster not to sort it when
        # large queries are being made.
        keys_at_index = [(0, sorted(keys))]

        for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
            node_indexes = [idx for idx, s_keys in keys_at_index]
            nodes = self._get_internal_nodes(node_indexes)

            next_nodes_and_keys = []
            for node_index, sub_keys in keys_at_index:
                node = nodes[node_index]
                positions = self._multi_bisect_right(sub_keys, node.keys)
                node_offset = next_row_start + node.offset
                next_nodes_and_keys.extend([(node_offset + pos, s_keys)
                                            for pos, s_keys in positions])
            keys_at_index = next_nodes_and_keys
        # We should now be at the _LeafNodes
        node_indexes = [idx for idx, s_keys in keys_at_index]

        # TODO: We may *not* want to always read all the nodes in one
        #       big go. Consider setting a max size on this.
        nodes = self._get_leaf_nodes(node_indexes)
        return nodes, keys_at_index

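    # Descent sketch (hypothetical shape): with _row_offsets = [0, 1, 6, 106]
    # the loop sees next_row_start = 1 and then 6. Starting from
    # [(0, sorted(keys))] at the root, each pass splits a node's keys into
    # per-child buckets via _multi_bisect_right and rebases the bucket
    # positions to absolute page numbers with next_row_start + node.offset
    # + pos. After the last internal row, keys_at_index pairs each leaf page
    # with the keys expected on it.
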
    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys supplied. No additional keys will be returned, and every
            key supplied that is in the index will be returned.
        """
        # 6 seconds spent in miss_torture using the sorted() line.
        # Even with out of order disk IO it seems faster not to sort it when
        # large queries are being made.
        # However, now that we are doing multi-way bisecting, we need the keys
        # in sorted order anyway. We could change the multi-way code to not
        # require sorted order. (For example, it bisects for the first node,
        # does an in-order search until a key comes before the current point,
        # which it then bisects for, etc.)
        keys = frozenset(keys)
        if not keys:
            return

        if not self.key_count():
            return

        needed_keys = []
        if self._leaf_value_cache is None:
            needed_keys = keys
        else:
            for key in keys:
                value = self._leaf_value_cache.get(key, None)
                if value is not None:
                    # This key is already cached, yield it without touching
                    # the btree at all
                    value, refs = value
                    if self.node_ref_lists:
                        yield (self, key, value, refs)
                    else:
                        yield (self, key, value)
                else:
                    needed_keys.append(key)
        if not needed_keys:
            return
        nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
        for node_index, sub_keys in nodes_and_keys:
            if not sub_keys:
                continue
            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node.keys:
                    value, refs = node.keys[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)
                    else:
                        yield (self, next_sub_key, value)

    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
        """Find the parent_map information for the set of keys.

        This populates the parent_map dict and missing_keys set based on the
        queried keys. It also can fill out an arbitrary number of parents that
        it finds while searching for the supplied keys.

        It is unlikely that you want to call this directly. See
        "CombinedGraphIndex.find_ancestry()" for a more appropriate API.

        :param keys: The keys whose ancestry we want to return.
            Every key will either end up in 'parent_map' or 'missing_keys'.
        :param ref_list_num: This index in the ref_lists is the parents we
            care about.
        :param parent_map: {key: parent_keys} for keys that are present in this
            index. This may contain more entries than were in 'keys', that are
            reachable ancestors of the keys requested.
        :param missing_keys: keys which are known to be missing in this index.
            This may include parents that were not directly requested, but we
            were able to determine that they are not present in this index.
        :return: search_keys    parents that were found but not queried to know
            if they are missing or present. Callers can re-query this index for
            those keys, and they will be placed into parent_map or missing_keys
        """
        if not self.key_count():
            # We use key_count() to trigger reading the root node and
            # determining info about this BTreeGraphIndex
            # If we don't have any keys, then everything is missing
            missing_keys.update(keys)
            return set()
        if ref_list_num >= self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                % (ref_list_num, self.node_ref_lists))

        # The main trick we are trying to accomplish is that when we find a
        # key listing its parents, we expect that the parent key is also likely
        # to sit on the same page. Allowing us to expand parents quickly
        # without suffering the full stack of bisecting, etc.
        nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)

        # These are parent keys which could not be immediately resolved on the
        # page where the child was present. Note that we may already be
        # searching for that key, and it may actually be present [or known
        # missing] on one of the other pages we are reading.
        # TODO:
        #   We could try searching for them in the immediate previous or next
        #   page. If they occur "later" we could put them in a pending lookup
        #   set, and then for each node we read thereafter we could check to
        #   see if they are present.
        #   However, we don't know the impact of keeping this list of things
        #   that I'm going to search for every node I come across from here on
        #   out.
        #   It doesn't handle the case when the parent key is missing on a
        #   page that we *don't* read. So we already have to handle being
        #   re-entrant for that.
        #   Since most keys contain a date string, they are more likely to be
        #   found earlier in the file than later, but we would know that right
        #   away (key < min_key), and wouldn't keep searching it on every other
        #   page that we read.
        #   Mostly, it is an idea, one which should be benchmarked.
        parents_not_on_page = set()

        for node_index, sub_keys in nodes_and_keys:
            if not sub_keys:
                continue
            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            node_keys = node.keys
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node_keys:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node_keys[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)
            # Don't look for things we've already found
            parents_to_check = parents_to_check.difference(parent_map)
            # this can be used to test the benefit of having the check loop
            # inlined.
            # parents_not_on_page.update(parents_to_check)
            # continue
            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node_keys:
                        value, refs = node_keys[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
                    else:
                        # This parent either is genuinely missing, or should be
                        # found on another page. Perf test whether it is better
                        # to check if this node should fit on this page or not.
                        # in the 'everything-in-one-pack' scenario, this *not*
                        # doing the check is 237ms vs 243ms.
                        # So slightly better, but I assume the standard 'lots
                        # of packs' is going to show a reasonable improvement
                        # from the check, because it avoids 'going around
                        # again' for everything that is in another index
                        # parents_not_on_page.add(key)
                        # Missing for some reason
                        if key < node.min_key:
                            # in the case of bzr.dev, 3.4k/5.3k misses are
                            # 'earlier' misses (65%)
                            parents_not_on_page.add(key)
                        elif key > node.max_key:
                            # This parent key would be present on a different
                            # LeafNode
                            parents_not_on_page.add(key)
                        else:
                            # assert key != node.min_key and key != node.max_key
                            # If it was going to be present, it would be on
                            # *this* page, so mark it missing.
                            missing_keys.add(key)
                parents_to_check = next_parents_to_check.difference(parent_map)
                # Might want to do another .difference() from missing_keys
        # parents_not_on_page could have been found on a different page, or be
        # known to be missing. So cull out everything that has already been
        # found.
        search_keys = parents_not_on_page.difference(
            parent_map).difference(missing_keys)
        return search_keys

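    # Usage sketch: the caller owns both accumulators and re-queries with the
    # returned keys until nothing is left unresolved (ref list 0 is assumed
    # to hold the parent references). CombinedGraphIndex.find_ancestry()
    # drives the same loop across multiple indices:
    #
    #   parent_map = {}
    #   missing_keys = set()
    #   search_keys = index._find_ancestors(keys, 0, parent_map, missing_keys)
    #   while search_keys:
    #       search_keys = index._find_ancestors(search_keys, 0, parent_map,
    #                                           missing_keys)
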
    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        WARNING: Note that this method currently causes a full index parse
        unconditionally (which is reasonably appropriate as it is a means for
        thunking many small indices into one larger one and still supplies
        iter_all_entries at the thunk layer).

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = sorted(set(keys))
        if not keys:
            return
        # Load if needed to check key lengths
        if self._key_count is None:
            self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current bzrlib) just suck the entire index and iterate in memory.
        nodes = {}
        if self.node_ref_lists:
            if self._key_length == 1:
                for _1, key, value, refs in self.iter_all_entries():
                    nodes[key] = value, refs
            else:
                nodes_by_key = {}
                for _1, key, value, refs in self.iter_all_entries():
                    key_value = key, value, refs
                    # For a key of (foo, bar, baz) create
                    # _nodes_by_key[foo][bar][baz] = key_value
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key_value
        else:
            if self._key_length == 1:
                for _1, key, value in self.iter_all_entries():
                    nodes[key] = value
            else:
                nodes_by_key = {}
                for _1, key, value in self.iter_all_entries():
                    key_value = key, value
                    # For a key of (foo, bar, baz) create
                    # _nodes_by_key[foo][bar][baz] = key_value
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = nodes_by_key
            elements = list(key)
            # find the subdict whose contents should be returned.
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existent lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push keys
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            # each value is the key:value:node refs tuple
                            # ready to yield.
                            yield (self, ) + value
            else:
                # the last thing looked up was a terminal element
                yield (self, ) + key_dict

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For BTreeGraphIndex the estimate is exact as it is contained in the
        header.
        """
        if self._key_count is None:
            self._get_root_node()
        return self._key_count

    def _compute_row_offsets(self):
        """Fill out the _row_offsets attribute based on _row_lengths."""
        offsets = []
        row_offset = 0
        for row in self._row_lengths:
            offsets.append(row_offset)
            row_offset += row
        offsets.append(row_offset)
        self._row_offsets = offsets

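    # Example: _row_lengths = [1, 5, 100] (root row, internal row, leaf row)
    # produces _row_offsets = [0, 1, 6, 106]: the cumulative first-page
    # number of each row, with the total page count appended.
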
    def _parse_header_from_bytes(self, bytes):
        """Parse the header from a region of bytes.

        :param bytes: The data to parse.
        :return: An offset, data tuple such as readv yields, for the unparsed
            data. (which may be of length 0).
        """
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise errors.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise errors.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise errors.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[3]
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise errors.BadIndexOptions(self)
        try:
            self._row_lengths = map(int, [length for length in
                options_line[len(_OPTION_ROW_LENGTHS):].split(',')
                if len(length)])
        except ValueError:
            raise errors.BadIndexOptions(self)
        self._compute_row_offsets()

        # calculate the bytes we have processed
        header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
        return header_end, bytes[header_end:]

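    # A representative header (values are illustrative; the option prefixes
    # come from bzrlib.index). The four option lines follow the signature,
    # and header_end adds 4 for the newlines that splitlines() stripped:
    #
    #   B+Tree Graph Index 2
    #   node_ref_lists=1
    #   key_elements=1
    #   len=100
    #   row_lengths=1,5
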
    def _read_nodes(self, nodes):
        """Read some nodes from disk into the LRU cache.

        This performs a readv to get the node data into memory, and parses
        each node, then yields it to the caller. The nodes are requested in
        the supplied order. If possible doing sort() on the list before
        requesting a read may improve performance.

        :param nodes: The nodes to read. 0 - first node, 1 - second node etc.
        :return: None
        """
        # may be the byte string of the whole file
        bytes = None
        # list of (offset, length) regions of the file that should, eventually
        # be read in to data_ranges, either from 'bytes' or from the transport
        ranges = []
        for index in nodes:
            offset = index * _PAGE_SIZE
            size = _PAGE_SIZE
            if index == 0:
                # Root node - special case
                if self._size:
                    size = min(_PAGE_SIZE, self._size)
                else:
                    # The only case where we don't know the size, is for very
                    # small indexes. So we read the whole thing
                    bytes = self._transport.get_bytes(self._name)
                    self._size = len(bytes)
                    # the whole thing should be parsed out of 'bytes'
                    ranges.append((0, len(bytes)))
                    break
            else:
                if offset > self._size:
                    raise AssertionError('tried to read past the end'
                                         ' of the file %s > %s'
                                         % (offset, self._size))
                size = min(size, self._size - offset)
            ranges.append((offset, size))
        if not ranges:
            return
        elif bytes is not None:
            # already have the whole file
            data_ranges = [(start, bytes[start:start+_PAGE_SIZE])
                           for start in xrange(0, len(bytes), _PAGE_SIZE)]
        elif self._file is None:
            data_ranges = self._transport.readv(self._name, ranges)
        else:
            data_ranges = []
            for offset, size in ranges:
                self._file.seek(offset)
                data_ranges.append((offset, self._file.read(size)))
        for offset, data in data_ranges:
            if offset == 0:
                # extract the header
                offset, data = self._parse_header_from_bytes(data)
                if len(data) == 0:
                    continue
            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            yield offset / _PAGE_SIZE, node

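    # Range arithmetic example (hypothetical sizes): with self._size = 10000
    # and nodes = [0, 2], ranges becomes [(0, 4096), (8192, 1808)]; the last
    # page is clamped to the end of the file. When the size is unknown, the
    # root-node branch above falls back to reading the whole file in one go.
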
    def _signature(self):
        """The file signature for this index type."""
        return _BTSIGNATURE

    def validate(self):
        """Validate that everything in the index can be accessed."""
        # just read and parse every node.
        self._get_root_node()
        if len(self._row_lengths) > 1:
            start_node = self._row_offsets[1]
        else:
            # We shouldn't be reading anything anyway
            start_node = 1
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(range(start_node, node_end)):
            pass


try:
    from bzrlib import _btree_serializer_pyx as _btree_serializer
except ImportError, e:
    osutils.failed_to_load_extension(e)
    from bzrlib import _btree_serializer_py as _btree_serializer
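
# A minimal usage sketch (the path and index name are hypothetical; the
# reader takes a transport, a file name and the file's size):
#
#   transport = get_transport('file:///path/to/indices')
#   size = transport.stat('example.bix').st_size
#   index = BTreeGraphIndex(transport, 'example.bix', size)
#   for entry in index.iter_all_entries():
#       print entry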