bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
| 
3641.3.29
by John Arbash Meinel
 Cleanup the copyright headers  | 
1  | 
# Copyright (C) 2008 Canonical Ltd
 | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
2  | 
#
 | 
3  | 
# This program is free software; you can redistribute it and/or modify
 | 
|
| 
3641.3.29
by John Arbash Meinel
 Cleanup the copyright headers  | 
4  | 
# it under the terms of the GNU General Public License as published by
 | 
5  | 
# the Free Software Foundation; either version 2 of the License, or
 | 
|
6  | 
# (at your option) any later version.
 | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
7  | 
#
 | 
8  | 
# This program is distributed in the hope that it will be useful,
 | 
|
9  | 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
|
10  | 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
|
11  | 
# GNU General Public License for more details.
 | 
|
12  | 
#
 | 
|
13  | 
# You should have received a copy of the GNU General Public License
 | 
|
14  | 
# along with this program; if not, write to the Free Software
 | 
|
| 
3641.3.29
by John Arbash Meinel
 Cleanup the copyright headers  | 
15  | 
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
16  | 
#
 | 
17  | 
||
18  | 
"""B+Tree indices"""
 | 
|
19  | 
||
20  | 
import array  | 
|
21  | 
import bisect  | 
|
22  | 
from bisect import bisect_right  | 
|
23  | 
from copy import deepcopy  | 
|
24  | 
import math  | 
|
25  | 
import struct  | 
|
26  | 
import tempfile  | 
|
27  | 
import zlib  | 
|
28  | 
||
29  | 
from bzrlib import (  | 
|
30  | 
chunk_writer,  | 
|
31  | 
debug,  | 
|
32  | 
errors,  | 
|
33  | 
index,  | 
|
34  | 
lru_cache,  | 
|
35  | 
osutils,  | 
|
36  | 
trace,  | 
|
37  | 
    )
 | 
|
38  | 
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN  | 
|
39  | 
from bzrlib.transport import get_transport  | 
|
40  | 
||
41  | 
||
| 
3641.3.3
by John Arbash Meinel
 Change the header to indicate these indexes are  | 
42  | 
_BTSIGNATURE = "B+Tree Graph Index 2\n"  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
43  | 
_OPTION_ROW_LENGTHS = "row_lengths="  | 
44  | 
_LEAF_FLAG = "type=leaf\n"  | 
|
45  | 
_INTERNAL_FLAG = "type=internal\n"  | 
|
46  | 
_INTERNAL_OFFSET = "offset="  | 
|
47  | 
||
48  | 
_RESERVED_HEADER_BYTES = 120  | 
|
49  | 
_PAGE_SIZE = 4096  | 
|
50  | 
||
51  | 
# 4K per page: 4MB - 1000 entries
 | 
|
52  | 
_NODE_CACHE_SIZE = 1000  | 
|
53  | 
||
54  | 
||
55  | 
class _BuilderRow(object):  | 
|
56  | 
"""The stored state accumulated while writing out a row in the index.  | 
|
57  | 
||
58  | 
    :ivar spool: A temporary file used to accumulate nodes for this row
 | 
|
59  | 
        in the tree.
 | 
|
60  | 
    :ivar nodes: The count of nodes emitted so far.
 | 
|
61  | 
    """
 | 
|
62  | 
||
63  | 
def __init__(self):  | 
|
64  | 
"""Create a _BuilderRow."""  | 
|
65  | 
self.nodes = 0  | 
|
66  | 
self.spool = tempfile.TemporaryFile()  | 
|
67  | 
self.writer = None  | 
|
68  | 
||
69  | 
def finish_node(self, pad=True):  | 
|
70  | 
byte_lines, _, padding = self.writer.finish()  | 
|
71  | 
if self.nodes == 0:  | 
|
72  | 
            # padded note:
 | 
|
73  | 
self.spool.write("\x00" * _RESERVED_HEADER_BYTES)  | 
|
74  | 
skipped_bytes = 0  | 
|
75  | 
if not pad and padding:  | 
|
76  | 
del byte_lines[-1]  | 
|
77  | 
skipped_bytes = padding  | 
|
78  | 
self.spool.writelines(byte_lines)  | 
|
| 
3644.2.3
by John Arbash Meinel
 Do a bit more work to get all the tests to pass.  | 
79  | 
remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE  | 
80  | 
if remainder != 0:  | 
|
81  | 
raise AssertionError("incorrect node length: %d, %d"  | 
|
82  | 
% (self.spool.tell(), remainder))  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
83  | 
self.nodes += 1  | 
84  | 
self.writer = None  | 
|
85  | 
||
86  | 
||
87  | 
class _InternalBuilderRow(_BuilderRow):  | 
|
88  | 
"""The stored state accumulated while writing out internal rows."""  | 
|
89  | 
||
90  | 
def finish_node(self, pad=True):  | 
|
91  | 
if not pad:  | 
|
92  | 
raise AssertionError("Must pad internal nodes only.")  | 
|
93  | 
_BuilderRow.finish_node(self)  | 
|
94  | 
||
95  | 
||
96  | 
class _LeafBuilderRow(_BuilderRow):  | 
|
97  | 
"""The stored state accumulated while writing out a leaf rows."""  | 
|
98  | 
||
99  | 
||
100  | 
class BTreeBuilder(index.GraphIndexBuilder):  | 
|
101  | 
"""A Builder for B+Tree based Graph indices.  | 
|
102  | 
||
103  | 
    The resulting graph has the structure:
 | 
|
104  | 
||
105  | 
    _SIGNATURE OPTIONS NODES
 | 
|
106  | 
    _SIGNATURE     := 'B+Tree Graph Index 1' NEWLINE
 | 
|
107  | 
    OPTIONS        := REF_LISTS KEY_ELEMENTS LENGTH
 | 
|
108  | 
    REF_LISTS      := 'node_ref_lists=' DIGITS NEWLINE
 | 
|
109  | 
    KEY_ELEMENTS   := 'key_elements=' DIGITS NEWLINE
 | 
|
110  | 
    LENGTH         := 'len=' DIGITS NEWLINE
 | 
|
111  | 
    ROW_LENGTHS    := 'row_lengths' DIGITS (COMMA DIGITS)*
 | 
|
112  | 
    NODES          := NODE_COMPRESSED*
 | 
|
113  | 
    NODE_COMPRESSED:= COMPRESSED_BYTES{4096}
 | 
|
114  | 
    NODE_RAW       := INTERNAL | LEAF
 | 
|
115  | 
    INTERNAL       := INTERNAL_FLAG POINTERS
 | 
|
116  | 
    LEAF           := LEAF_FLAG ROWS
 | 
|
117  | 
    KEY_ELEMENT    := Not-whitespace-utf8
 | 
|
118  | 
    KEY            := KEY_ELEMENT (NULL KEY_ELEMENT)*
 | 
|
119  | 
    ROWS           := ROW*
 | 
|
120  | 
    ROW            := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
 | 
|
121  | 
    ABSENT         := 'a'
 | 
|
122  | 
    REFERENCES     := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
 | 
|
123  | 
    REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
 | 
|
124  | 
    REFERENCE      := KEY
 | 
|
125  | 
    VALUE          := no-newline-no-null-bytes
 | 
|
126  | 
    """
 | 
|
127  | 
||
128  | 
def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):  | 
|
129  | 
"""See GraphIndexBuilder.__init__.  | 
|
130  | 
||
131  | 
        :param spill_at: Optional parameter controlling the maximum number
 | 
|
132  | 
            of nodes that BTreeBuilder will hold in memory.
 | 
|
133  | 
        """
 | 
|
134  | 
index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,  | 
|
135  | 
key_elements=key_elements)  | 
|
136  | 
self._spill_at = spill_at  | 
|
137  | 
self._backing_indices = []  | 
|
| 
3644.2.11
by John Arbash Meinel
 Document the new form of _nodes and remove an unnecessary cast.  | 
138  | 
        # A map of {key: (node_refs, value)}
 | 
139  | 
self._nodes = {}  | 
|
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
140  | 
        # Indicate it hasn't been built yet
 | 
141  | 
self._nodes_by_key = None  | 
|
| 
3777.5.2
by John Arbash Meinel
 Change the name to ChunkWriter.set_optimize()  | 
142  | 
self._optimize_for_size = False  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
143  | 
|
144  | 
def add_node(self, key, value, references=()):  | 
|
145  | 
"""Add a node to the index.  | 
|
146  | 
||
147  | 
        If adding the node causes the builder to reach its spill_at threshold,
 | 
|
148  | 
        disk spilling will be triggered.
 | 
|
149  | 
||
150  | 
        :param key: The key. keys are non-empty tuples containing
 | 
|
151  | 
            as many whitespace-free utf8 bytestrings as the key length
 | 
|
152  | 
            defined for this index.
 | 
|
153  | 
        :param references: An iterable of iterables of keys. Each is a
 | 
|
154  | 
            reference to another key.
 | 
|
155  | 
        :param value: The value to associate with the key. It may be any
 | 
|
156  | 
            bytes as long as it does not contain \0 or \n.
 | 
|
157  | 
        """
 | 
|
| 
3644.2.9
by John Arbash Meinel
 Refactor some code.  | 
158  | 
        # we don't care about absent_references
 | 
159  | 
node_refs, _ = self._check_key_ref_value(key, references, value)  | 
|
| 
3644.2.2
by John Arbash Meinel
 the new btree index doesn't have 'absent' keys in its _nodes  | 
160  | 
if key in self._nodes:  | 
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
161  | 
raise errors.BadIndexDuplicateKey(key, self)  | 
| 
3644.2.11
by John Arbash Meinel
 Document the new form of _nodes and remove an unnecessary cast.  | 
162  | 
self._nodes[key] = (node_refs, value)  | 
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
163  | 
self._keys.add(key)  | 
| 
3644.2.9
by John Arbash Meinel
 Refactor some code.  | 
164  | 
if self._nodes_by_key is not None and self._key_length > 1:  | 
165  | 
self._update_nodes_by_key(key, value, node_refs)  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
166  | 
if len(self._keys) < self._spill_at:  | 
167  | 
            return
 | 
|
| 
3644.2.9
by John Arbash Meinel
 Refactor some code.  | 
168  | 
self._spill_mem_keys_to_disk()  | 
169  | 
||
170  | 
def _spill_mem_keys_to_disk(self):  | 
|
171  | 
"""Write the in memory keys down to disk to cap memory consumption.  | 
|
172  | 
||
173  | 
        If we already have some keys written to disk, we will combine them so
 | 
|
174  | 
        as to preserve the sorted order.  The algorithm for combining uses
 | 
|
175  | 
        powers of two.  So on the first spill, write all mem nodes into a
 | 
|
176  | 
        single index. On the second spill, combine the mem nodes with the nodes
 | 
|
177  | 
        on disk to create a 2x sized disk index and get rid of the first index.
 | 
|
178  | 
        On the third spill, create a single new disk index, which will contain
 | 
|
179  | 
        the mem nodes, and preserve the existing 2x sized index.  On the fourth,
 | 
|
180  | 
        combine mem with the first and second indexes, creating a new one of
 | 
|
181  | 
        size 4x. On the fifth create a single new one, etc.
 | 
|
182  | 
        """
 | 
|
| 
3644.2.8
by John Arbash Meinel
 Two quick tweaks.  | 
183  | 
iterators_to_combine = [self._iter_mem_nodes()]  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
184  | 
pos = -1  | 
185  | 
for pos, backing in enumerate(self._backing_indices):  | 
|
186  | 
if backing is None:  | 
|
187  | 
pos -= 1  | 
|
188  | 
                break
 | 
|
189  | 
iterators_to_combine.append(backing.iter_all_entries())  | 
|
190  | 
backing_pos = pos + 1  | 
|
191  | 
new_backing_file, size = \  | 
|
192  | 
self._write_nodes(self._iter_smallest(iterators_to_combine))  | 
|
| 
3644.2.9
by John Arbash Meinel
 Refactor some code.  | 
193  | 
dir_path, base_name = osutils.split(new_backing_file.name)  | 
194  | 
        # Note: The transport here isn't strictly needed, because we will use
 | 
|
195  | 
        #       direct access to the new_backing._file object
 | 
|
196  | 
new_backing = BTreeGraphIndex(get_transport(dir_path),  | 
|
197  | 
base_name, size)  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
198  | 
        # GC will clean up the file
 | 
199  | 
new_backing._file = new_backing_file  | 
|
200  | 
if len(self._backing_indices) == backing_pos:  | 
|
201  | 
self._backing_indices.append(None)  | 
|
202  | 
self._backing_indices[backing_pos] = new_backing  | 
|
203  | 
for pos in range(backing_pos):  | 
|
204  | 
self._backing_indices[pos] = None  | 
|
205  | 
self._keys = set()  | 
|
206  | 
self._nodes = {}  | 
|
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
207  | 
self._nodes_by_key = None  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
208  | 
|
209  | 
def add_nodes(self, nodes):  | 
|
210  | 
"""Add nodes to the index.  | 
|
211  | 
||
212  | 
        :param nodes: An iterable of (key, node_refs, value) entries to add.
 | 
|
213  | 
        """
 | 
|
214  | 
if self.reference_lists:  | 
|
215  | 
for (key, value, node_refs) in nodes:  | 
|
216  | 
self.add_node(key, value, node_refs)  | 
|
217  | 
else:  | 
|
218  | 
for (key, value) in nodes:  | 
|
219  | 
self.add_node(key, value)  | 
|
220  | 
||
221  | 
def _iter_mem_nodes(self):  | 
|
222  | 
"""Iterate over the nodes held in memory."""  | 
|
| 
3644.2.8
by John Arbash Meinel
 Two quick tweaks.  | 
223  | 
nodes = self._nodes  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
224  | 
if self.reference_lists:  | 
| 
3644.2.8
by John Arbash Meinel
 Two quick tweaks.  | 
225  | 
for key in sorted(nodes):  | 
226  | 
references, value = nodes[key]  | 
|
227  | 
yield self, key, value, references  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
228  | 
else:  | 
| 
3644.2.8
by John Arbash Meinel
 Two quick tweaks.  | 
229  | 
for key in sorted(nodes):  | 
230  | 
references, value = nodes[key]  | 
|
231  | 
yield self, key, value  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
232  | 
|
233  | 
def _iter_smallest(self, iterators_to_combine):  | 
|
| 
3641.3.9
by John Arbash Meinel
 Special case around _iter_smallest when we have only  | 
234  | 
if len(iterators_to_combine) == 1:  | 
235  | 
for value in iterators_to_combine[0]:  | 
|
236  | 
yield value  | 
|
237  | 
            return
 | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
238  | 
current_values = []  | 
239  | 
for iterator in iterators_to_combine:  | 
|
240  | 
try:  | 
|
241  | 
current_values.append(iterator.next())  | 
|
242  | 
except StopIteration:  | 
|
243  | 
current_values.append(None)  | 
|
244  | 
last = None  | 
|
245  | 
while True:  | 
|
246  | 
            # Decorate candidates with the value to allow 2.4's min to be used.
 | 
|
247  | 
candidates = [(item[1][1], item) for item  | 
|
248  | 
in enumerate(current_values) if item[1] is not None]  | 
|
249  | 
if not len(candidates):  | 
|
250  | 
                return
 | 
|
251  | 
selected = min(candidates)  | 
|
252  | 
            # undecorate back to (pos, node)
 | 
|
253  | 
selected = selected[1]  | 
|
254  | 
if last == selected[1][1]:  | 
|
255  | 
raise errors.BadIndexDuplicateKey(last, self)  | 
|
256  | 
last = selected[1][1]  | 
|
257  | 
            # Yield, with self as the index
 | 
|
258  | 
yield (self,) + selected[1][1:]  | 
|
259  | 
pos = selected[0]  | 
|
260  | 
try:  | 
|
261  | 
current_values[pos] = iterators_to_combine[pos].next()  | 
|
262  | 
except StopIteration:  | 
|
263  | 
current_values[pos] = None  | 
|
264  | 
||
| 
3641.3.11
by John Arbash Meinel
 Start working on an alternate way to track compressed_chunk state.  | 
265  | 
def _add_key(self, string_key, line, rows):  | 
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
266  | 
"""Add a key to the current chunk.  | 
267  | 
||
268  | 
        :param string_key: The key to add.
 | 
|
| 
3641.3.11
by John Arbash Meinel
 Start working on an alternate way to track compressed_chunk state.  | 
269  | 
        :param line: The fully serialised key and value.
 | 
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
270  | 
        """
 | 
271  | 
if rows[-1].writer is None:  | 
|
272  | 
            # opening a new leaf chunk;
 | 
|
273  | 
for pos, internal_row in enumerate(rows[:-1]):  | 
|
274  | 
                # flesh out any internal nodes that are needed to
 | 
|
| 
3641.3.11
by John Arbash Meinel
 Start working on an alternate way to track compressed_chunk state.  | 
275  | 
                # preserve the height of the tree
 | 
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
276  | 
if internal_row.writer is None:  | 
277  | 
length = _PAGE_SIZE  | 
|
278  | 
if internal_row.nodes == 0:  | 
|
279  | 
length -= _RESERVED_HEADER_BYTES # padded  | 
|
| 
3777.5.2
by John Arbash Meinel
 Change the name to ChunkWriter.set_optimize()  | 
280  | 
internal_row.writer = chunk_writer.ChunkWriter(length, 0,  | 
281  | 
optimize_for_size=self._optimize_for_size)  | 
|
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
282  | 
internal_row.writer.write(_INTERNAL_FLAG)  | 
283  | 
internal_row.writer.write(_INTERNAL_OFFSET +  | 
|
284  | 
str(rows[pos + 1].nodes) + "\n")  | 
|
285  | 
            # add a new leaf
 | 
|
286  | 
length = _PAGE_SIZE  | 
|
287  | 
if rows[-1].nodes == 0:  | 
|
288  | 
length -= _RESERVED_HEADER_BYTES # padded  | 
|
| 
3777.5.2
by John Arbash Meinel
 Change the name to ChunkWriter.set_optimize()  | 
289  | 
rows[-1].writer = chunk_writer.ChunkWriter(length,  | 
290  | 
optimize_for_size=self._optimize_for_size)  | 
|
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
291  | 
rows[-1].writer.write(_LEAF_FLAG)  | 
| 
3641.3.11
by John Arbash Meinel
 Start working on an alternate way to track compressed_chunk state.  | 
292  | 
if rows[-1].writer.write(line):  | 
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
293  | 
            # this key did not fit in the node:
 | 
294  | 
rows[-1].finish_node()  | 
|
| 
3641.3.11
by John Arbash Meinel
 Start working on an alternate way to track compressed_chunk state.  | 
295  | 
key_line = string_key + "\n"  | 
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
296  | 
new_row = True  | 
297  | 
for row in reversed(rows[:-1]):  | 
|
298  | 
                # Mark the start of the next node in the node above. If it
 | 
|
299  | 
                # doesn't fit then propogate upwards until we find one that
 | 
|
300  | 
                # it does fit into.
 | 
|
| 
3641.3.11
by John Arbash Meinel
 Start working on an alternate way to track compressed_chunk state.  | 
301  | 
if row.writer.write(key_line):  | 
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
302  | 
row.finish_node()  | 
303  | 
else:  | 
|
304  | 
                    # We've found a node that can handle the pointer.
 | 
|
305  | 
new_row = False  | 
|
306  | 
                    break
 | 
|
307  | 
            # If we reached the current root without being able to mark the
 | 
|
308  | 
            # division point, then we need a new root:
 | 
|
309  | 
if new_row:  | 
|
310  | 
                # We need a new row
 | 
|
311  | 
if 'index' in debug.debug_flags:  | 
|
312  | 
trace.mutter('Inserting new global row.')  | 
|
313  | 
new_row = _InternalBuilderRow()  | 
|
314  | 
reserved_bytes = 0  | 
|
315  | 
rows.insert(0, new_row)  | 
|
316  | 
                # This will be padded, hence the -100
 | 
|
317  | 
new_row.writer = chunk_writer.ChunkWriter(  | 
|
318  | 
_PAGE_SIZE - _RESERVED_HEADER_BYTES,  | 
|
| 
3777.5.2
by John Arbash Meinel
 Change the name to ChunkWriter.set_optimize()  | 
319  | 
reserved_bytes,  | 
320  | 
optimize_for_size=self._optimize_for_size)  | 
|
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
321  | 
new_row.writer.write(_INTERNAL_FLAG)  | 
322  | 
new_row.writer.write(_INTERNAL_OFFSET +  | 
|
323  | 
str(rows[1].nodes - 1) + "\n")  | 
|
| 
3641.3.11
by John Arbash Meinel
 Start working on an alternate way to track compressed_chunk state.  | 
324  | 
new_row.writer.write(key_line)  | 
325  | 
self._add_key(string_key, line, rows)  | 
|
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
326  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
327  | 
def _write_nodes(self, node_iterator):  | 
328  | 
"""Write node_iterator out as a B+Tree.  | 
|
329  | 
||
330  | 
        :param node_iterator: An iterator of sorted nodes. Each node should
 | 
|
331  | 
            match the output given by iter_all_entries.
 | 
|
332  | 
        :return: A file handle for a temporary file containing a B+Tree for
 | 
|
333  | 
            the nodes.
 | 
|
334  | 
        """
 | 
|
335  | 
        # The index rows - rows[0] is the root, rows[1] is the layer under it
 | 
|
336  | 
        # etc.
 | 
|
337  | 
rows = []  | 
|
338  | 
        # forward sorted by key. In future we may consider topological sorting,
 | 
|
339  | 
        # at the cost of table scans for direct lookup, or a second index for
 | 
|
340  | 
        # direct lookup
 | 
|
341  | 
key_count = 0  | 
|
342  | 
        # A stack with the number of nodes of each size. 0 is the root node
 | 
|
343  | 
        # and must always be 1 (if there are any nodes in the tree).
 | 
|
344  | 
self.row_lengths = []  | 
|
345  | 
        # Loop over all nodes adding them to the bottom row
 | 
|
346  | 
        # (rows[-1]). When we finish a chunk in a row,
 | 
|
347  | 
        # propogate the key that didn't fit (comes after the chunk) to the
 | 
|
348  | 
        # row above, transitively.
 | 
|
349  | 
for node in node_iterator:  | 
|
350  | 
if key_count == 0:  | 
|
351  | 
                # First key triggers the first row
 | 
|
352  | 
rows.append(_LeafBuilderRow())  | 
|
353  | 
key_count += 1  | 
|
| 
3641.3.30
by John Arbash Meinel
 Rename _parse_btree to _btree_serializer  | 
354  | 
string_key, line = _btree_serializer._flatten_node(node,  | 
355  | 
self.reference_lists)  | 
|
| 
3641.3.8
by John Arbash Meinel
 Move the add_key helper function into a separate func  | 
356  | 
self._add_key(string_key, line, rows)  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
357  | 
for row in reversed(rows):  | 
358  | 
pad = (type(row) != _LeafBuilderRow)  | 
|
359  | 
row.finish_node(pad=pad)  | 
|
360  | 
result = tempfile.NamedTemporaryFile()  | 
|
361  | 
lines = [_BTSIGNATURE]  | 
|
362  | 
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')  | 
|
363  | 
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')  | 
|
364  | 
lines.append(_OPTION_LEN + str(key_count) + '\n')  | 
|
365  | 
row_lengths = [row.nodes for row in rows]  | 
|
366  | 
lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')  | 
|
367  | 
result.writelines(lines)  | 
|
368  | 
position = sum(map(len, lines))  | 
|
369  | 
root_row = True  | 
|
370  | 
if position > _RESERVED_HEADER_BYTES:  | 
|
371  | 
raise AssertionError("Could not fit the header in the"  | 
|
372  | 
" reserved space: %d > %d"  | 
|
373  | 
% (position, _RESERVED_HEADER_BYTES))  | 
|
374  | 
        # write the rows out:
 | 
|
375  | 
for row in rows:  | 
|
376  | 
reserved = _RESERVED_HEADER_BYTES # reserved space for first node  | 
|
377  | 
row.spool.flush()  | 
|
378  | 
row.spool.seek(0)  | 
|
379  | 
            # copy nodes to the finalised file.
 | 
|
380  | 
            # Special case the first node as it may be prefixed
 | 
|
381  | 
node = row.spool.read(_PAGE_SIZE)  | 
|
382  | 
result.write(node[reserved:])  | 
|
383  | 
result.write("\x00" * (reserved - position))  | 
|
384  | 
position = 0 # Only the root row actually has an offset  | 
|
385  | 
copied_len = osutils.pumpfile(row.spool, result)  | 
|
386  | 
if copied_len != (row.nodes - 1) * _PAGE_SIZE:  | 
|
387  | 
if type(row) != _LeafBuilderRow:  | 
|
| 
3644.2.3
by John Arbash Meinel
 Do a bit more work to get all the tests to pass.  | 
388  | 
raise AssertionError("Incorrect amount of data copied"  | 
389  | 
" expected: %d, got: %d"  | 
|
390  | 
% ((row.nodes - 1) * _PAGE_SIZE,  | 
|
391  | 
copied_len))  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
392  | 
result.flush()  | 
393  | 
size = result.tell()  | 
|
394  | 
result.seek(0)  | 
|
395  | 
return result, size  | 
|
396  | 
||
397  | 
def finish(self):  | 
|
398  | 
"""Finalise the index.  | 
|
399  | 
||
400  | 
        :return: A file handle for a temporary file containing the nodes added
 | 
|
401  | 
            to the index.
 | 
|
402  | 
        """
 | 
|
403  | 
return self._write_nodes(self.iter_all_entries())[0]  | 
|
404  | 
||
405  | 
def iter_all_entries(self):  | 
|
406  | 
"""Iterate over all keys within the index  | 
|
407  | 
||
408  | 
        :return: An iterable of (index, key, reference_lists, value). There is no
 | 
|
409  | 
            defined order for the result iteration - it will be in the most
 | 
|
410  | 
            efficient order for the index (in this case dictionary hash order).
 | 
|
411  | 
        """
 | 
|
412  | 
if 'evil' in debug.debug_flags:  | 
|
413  | 
trace.mutter_callsite(3,  | 
|
414  | 
"iter_all_entries scales with size of history.")  | 
|
415  | 
        # Doing serial rather than ordered would be faster; but this shouldn't
 | 
|
416  | 
        # be getting called routinely anyway.
 | 
|
| 
3644.2.8
by John Arbash Meinel
 Two quick tweaks.  | 
417  | 
iterators = [self._iter_mem_nodes()]  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
418  | 
for backing in self._backing_indices:  | 
419  | 
if backing is not None:  | 
|
420  | 
iterators.append(backing.iter_all_entries())  | 
|
| 
3641.3.9
by John Arbash Meinel
 Special case around _iter_smallest when we have only  | 
421  | 
if len(iterators) == 1:  | 
422  | 
return iterators[0]  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
423  | 
return self._iter_smallest(iterators)  | 
424  | 
||
425  | 
def iter_entries(self, keys):  | 
|
426  | 
"""Iterate over keys within the index.  | 
|
427  | 
||
428  | 
        :param keys: An iterable providing the keys to be retrieved.
 | 
|
429  | 
        :return: An iterable of (index, key, value, reference_lists). There is no
 | 
|
430  | 
            defined order for the result iteration - it will be in the most
 | 
|
431  | 
            efficient order for the index (keys iteration order in this case).
 | 
|
432  | 
        """
 | 
|
433  | 
keys = set(keys)  | 
|
| 
3847.2.2
by John Arbash Meinel
 Rather than skipping the difference_update entirely, just restrict it to the intersection keys.  | 
434  | 
local_keys = keys.intersection(self._keys)  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
435  | 
if self.reference_lists:  | 
| 
3847.2.2
by John Arbash Meinel
 Rather than skipping the difference_update entirely, just restrict it to the intersection keys.  | 
436  | 
for key in local_keys:  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
437  | 
node = self._nodes[key]  | 
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
438  | 
yield self, key, node[1], node[0]  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
439  | 
else:  | 
| 
3847.2.2
by John Arbash Meinel
 Rather than skipping the difference_update entirely, just restrict it to the intersection keys.  | 
440  | 
for key in local_keys:  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
441  | 
node = self._nodes[key]  | 
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
442  | 
yield self, key, node[1]  | 
| 
3847.2.1
by John Arbash Meinel
 Shortcut BTreeBuilder.iter_entries when there are no backing indices.  | 
443  | 
        # Find things that are in backing indices that have not been handled
 | 
444  | 
        # yet.
 | 
|
| 
3847.2.3
by John Arbash Meinel
 Bring back the shortcut  | 
445  | 
if not self._backing_indices:  | 
446  | 
return # We won't find anything there either  | 
|
| 
3847.2.2
by John Arbash Meinel
 Rather than skipping the difference_update entirely, just restrict it to the intersection keys.  | 
447  | 
        # Remove all of the keys that we found locally
 | 
448  | 
keys.difference_update(local_keys)  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
449  | 
for backing in self._backing_indices:  | 
450  | 
if backing is None:  | 
|
451  | 
                continue
 | 
|
452  | 
if not keys:  | 
|
453  | 
                return
 | 
|
454  | 
for node in backing.iter_entries(keys):  | 
|
455  | 
keys.remove(node[1])  | 
|
456  | 
yield (self,) + node[1:]  | 
|
457  | 
||
458  | 
def iter_entries_prefix(self, keys):  | 
|
459  | 
"""Iterate over keys within the index using prefix matching.  | 
|
460  | 
||
461  | 
        Prefix matching is applied within the tuple of a key, not to within
 | 
|
462  | 
        the bytestring of each key element. e.g. if you have the keys ('foo',
 | 
|
463  | 
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
 | 
|
464  | 
        only the former key is returned.
 | 
|
465  | 
||
466  | 
        :param keys: An iterable providing the key prefixes to be retrieved.
 | 
|
467  | 
            Each key prefix takes the form of a tuple the length of a key, but
 | 
|
468  | 
            with the last N elements 'None' rather than a regular bytestring.
 | 
|
469  | 
            The first element cannot be 'None'.
 | 
|
470  | 
        :return: An iterable as per iter_all_entries, but restricted to the
 | 
|
471  | 
            keys with a matching prefix to those supplied. No additional keys
 | 
|
472  | 
            will be returned, and every match that is in the index will be
 | 
|
473  | 
            returned.
 | 
|
474  | 
        """
 | 
|
475  | 
        # XXX: To much duplication with the GraphIndex class; consider finding
 | 
|
476  | 
        # a good place to pull out the actual common logic.
 | 
|
477  | 
keys = set(keys)  | 
|
478  | 
if not keys:  | 
|
479  | 
            return
 | 
|
480  | 
for backing in self._backing_indices:  | 
|
481  | 
if backing is None:  | 
|
482  | 
                continue
 | 
|
483  | 
for node in backing.iter_entries_prefix(keys):  | 
|
484  | 
yield (self,) + node[1:]  | 
|
485  | 
if self._key_length == 1:  | 
|
486  | 
for key in keys:  | 
|
487  | 
                # sanity check
 | 
|
488  | 
if key[0] is None:  | 
|
489  | 
raise errors.BadIndexKey(key)  | 
|
490  | 
if len(key) != self._key_length:  | 
|
491  | 
raise errors.BadIndexKey(key)  | 
|
492  | 
try:  | 
|
493  | 
node = self._nodes[key]  | 
|
494  | 
except KeyError:  | 
|
495  | 
                    continue
 | 
|
496  | 
if self.reference_lists:  | 
|
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
497  | 
yield self, key, node[1], node[0]  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
498  | 
else:  | 
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
499  | 
yield self, key, node[1]  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
500  | 
            return
 | 
501  | 
for key in keys:  | 
|
502  | 
            # sanity check
 | 
|
503  | 
if key[0] is None:  | 
|
504  | 
raise errors.BadIndexKey(key)  | 
|
505  | 
if len(key) != self._key_length:  | 
|
506  | 
raise errors.BadIndexKey(key)  | 
|
507  | 
            # find what it refers to:
 | 
|
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
508  | 
key_dict = self._get_nodes_by_key()  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
509  | 
elements = list(key)  | 
510  | 
            # find the subdict to return
 | 
|
511  | 
try:  | 
|
512  | 
while len(elements) and elements[0] is not None:  | 
|
513  | 
key_dict = key_dict[elements[0]]  | 
|
514  | 
elements.pop(0)  | 
|
515  | 
except KeyError:  | 
|
516  | 
                # a non-existant lookup.
 | 
|
517  | 
                continue
 | 
|
518  | 
if len(elements):  | 
|
519  | 
dicts = [key_dict]  | 
|
520  | 
while dicts:  | 
|
521  | 
key_dict = dicts.pop(-1)  | 
|
522  | 
                    # can't be empty or would not exist
 | 
|
523  | 
item, value = key_dict.iteritems().next()  | 
|
524  | 
if type(value) == dict:  | 
|
525  | 
                        # push keys
 | 
|
526  | 
dicts.extend(key_dict.itervalues())  | 
|
527  | 
else:  | 
|
528  | 
                        # yield keys
 | 
|
529  | 
for value in key_dict.itervalues():  | 
|
530  | 
yield (self, ) + value  | 
|
531  | 
else:  | 
|
532  | 
yield (self, ) + key_dict  | 
|
533  | 
||
| 
3644.2.1
by John Arbash Meinel
 Change the IndexBuilders to not generate the nodes_by_key unless needed.  | 
534  | 
def _get_nodes_by_key(self):  | 
535  | 
if self._nodes_by_key is None:  | 
|
536  | 
nodes_by_key = {}  | 
|
537  | 
if self.reference_lists:  | 
|
538  | 
for key, (references, value) in self._nodes.iteritems():  | 
|
539  | 
key_dict = nodes_by_key  | 
|
540  | 
for subkey in key[:-1]:  | 
|
541  | 
key_dict = key_dict.setdefault(subkey, {})  | 
|
542  | 
key_dict[key[-1]] = key, value, references  | 
|
543  | 
else:  | 
|
544  | 
for key, (references, value) in self._nodes.iteritems():  | 
|
545  | 
key_dict = nodes_by_key  | 
|
546  | 
for subkey in key[:-1]:  | 
|
547  | 
key_dict = key_dict.setdefault(subkey, {})  | 
|
548  | 
key_dict[key[-1]] = key, value  | 
|
549  | 
self._nodes_by_key = nodes_by_key  | 
|
550  | 
return self._nodes_by_key  | 
|
551  | 
||
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
552  | 
def key_count(self):  | 
553  | 
"""Return an estimate of the number of keys in this index.  | 
|
554  | 
||
555  | 
        For InMemoryGraphIndex the estimate is exact.
 | 
|
556  | 
        """
 | 
|
557  | 
return len(self._keys) + sum(backing.key_count() for backing in  | 
|
558  | 
self._backing_indices if backing is not None)  | 
|
559  | 
||
560  | 
def validate(self):  | 
|
561  | 
"""In memory index's have no known corruption at the moment."""  | 
|
562  | 
||
563  | 
||
564  | 
class _LeafNode(object):  | 
|
565  | 
"""A leaf node for a serialised B+Tree index."""  | 
|
566  | 
||
567  | 
def __init__(self, bytes, key_length, ref_list_length):  | 
|
568  | 
"""Parse bytes to create a leaf node object."""  | 
|
569  | 
        # splitlines mangles the \r delimiters.. don't use it.
 | 
|
| 
3641.3.30
by John Arbash Meinel
 Rename _parse_btree to _btree_serializer  | 
570  | 
self.keys = dict(_btree_serializer._parse_leaf_lines(bytes,  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
571  | 
key_length, ref_list_length))  | 
572  | 
||
573  | 
||
574  | 
class _InternalNode(object):  | 
|
575  | 
"""An internal node for a serialised B+Tree index."""  | 
|
576  | 
||
577  | 
def __init__(self, bytes):  | 
|
578  | 
"""Parse bytes to create an internal node object."""  | 
|
579  | 
        # splitlines mangles the \r delimiters.. don't use it.
 | 
|
580  | 
self.keys = self._parse_lines(bytes.split('\n'))  | 
|
581  | 
||
582  | 
def _parse_lines(self, lines):  | 
|
583  | 
nodes = []  | 
|
584  | 
self.offset = int(lines[1][7:])  | 
|
585  | 
for line in lines[2:]:  | 
|
586  | 
if line == '':  | 
|
587  | 
                break
 | 
|
588  | 
nodes.append(tuple(line.split('\0')))  | 
|
589  | 
return nodes  | 
|
590  | 
||
591  | 
||
592  | 
class BTreeGraphIndex(object):  | 
|
593  | 
"""Access to nodes via the standard GraphIndex interface for B+Tree's.  | 
|
594  | 
||
595  | 
    Individual nodes are held in a LRU cache. This holds the root node in
 | 
|
596  | 
    memory except when very large walks are done.
 | 
|
597  | 
    """
 | 
|
598  | 
||
599  | 
def __init__(self, transport, name, size):  | 
|
600  | 
"""Create a B+Tree index object on the index name.  | 
|
601  | 
||
602  | 
        :param transport: The transport to read data for the index from.
 | 
|
603  | 
        :param name: The file name of the index on transport.
 | 
|
604  | 
        :param size: Optional size of the index in bytes. This allows
 | 
|
605  | 
            compatibility with the GraphIndex API, as well as ensuring that
 | 
|
606  | 
            the initial read (to read the root node header) can be done
 | 
|
607  | 
            without over-reading even on empty indices, and on small indices
 | 
|
608  | 
            allows single-IO to read the entire index.
 | 
|
609  | 
        """
 | 
|
610  | 
self._transport = transport  | 
|
611  | 
self._name = name  | 
|
612  | 
self._size = size  | 
|
613  | 
self._file = None  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
614  | 
self._recommended_pages = self._compute_recommended_pages()  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
615  | 
self._root_node = None  | 
616  | 
        # Default max size is 100,000 leave values
 | 
|
617  | 
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)  | 
|
618  | 
self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)  | 
|
619  | 
self._internal_node_cache = lru_cache.LRUCache()  | 
|
620  | 
self._key_count = None  | 
|
621  | 
self._row_lengths = None  | 
|
622  | 
self._row_offsets = None # Start of each row, [-1] is the end  | 
|
623  | 
||
624  | 
def __eq__(self, other):  | 
|
625  | 
"""Equal when self and other were created with the same parameters."""  | 
|
626  | 
return (  | 
|
627  | 
type(self) == type(other) and  | 
|
628  | 
self._transport == other._transport and  | 
|
629  | 
self._name == other._name and  | 
|
630  | 
self._size == other._size)  | 
|
631  | 
||
632  | 
def __ne__(self, other):  | 
|
633  | 
return not self.__eq__(other)  | 
|
634  | 
||
| 
3763.8.12
by John Arbash Meinel
 Code cleanup.  | 
635  | 
def _get_and_cache_nodes(self, nodes):  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
636  | 
"""Read nodes and cache them in the lru.  | 
637  | 
||
638  | 
        The nodes list supplied is sorted and then read from disk, each node
 | 
|
639  | 
        being inserted it into the _node_cache.
 | 
|
640  | 
||
641  | 
        Note: Asking for more nodes than the _node_cache can contain will
 | 
|
642  | 
        result in some of the results being immediately discarded, to prevent
 | 
|
643  | 
        this an assertion is raised if more nodes are asked for than are
 | 
|
644  | 
        cachable.
 | 
|
645  | 
||
646  | 
        :return: A dict of {node_pos: node}
 | 
|
647  | 
        """
 | 
|
648  | 
found = {}  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
649  | 
start_of_leaves = None  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
650  | 
for node_pos, node in self._read_nodes(sorted(nodes)):  | 
651  | 
if node_pos == 0: # Special case  | 
|
652  | 
self._root_node = node  | 
|
653  | 
else:  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
654  | 
if start_of_leaves is None:  | 
655  | 
start_of_leaves = self._row_offsets[-2]  | 
|
656  | 
if node_pos < start_of_leaves:  | 
|
657  | 
self._internal_node_cache.add(node_pos, node)  | 
|
658  | 
else:  | 
|
659  | 
self._leaf_node_cache.add(node_pos, node)  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
660  | 
found[node_pos] = node  | 
661  | 
return found  | 
|
662  | 
||
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
663  | 
def _compute_recommended_pages(self):  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
664  | 
"""Convert transport's recommended_page_size into btree pages.  | 
665  | 
||
666  | 
        recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
 | 
|
667  | 
        pages fit in that length.
 | 
|
668  | 
        """
 | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
669  | 
recommended_read = self._transport.recommended_page_size()  | 
670  | 
recommended_pages = int(math.ceil(recommended_read /  | 
|
671  | 
float(_PAGE_SIZE)))  | 
|
672  | 
return recommended_pages  | 
|
673  | 
||
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
674  | 
def _compute_total_pages_in_index(self):  | 
675  | 
"""How many pages are in the index.  | 
|
676  | 
||
677  | 
        If we have read the header we will use the value stored there.
 | 
|
678  | 
        Otherwise it will be computed based on the length of the index.
 | 
|
679  | 
        """
 | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
680  | 
if self._size is None:  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
681  | 
raise AssertionError('_compute_total_pages_in_index should not be'  | 
682  | 
' called when self._size is None')  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
683  | 
if self._root_node is not None:  | 
684  | 
            # This is the number of pages as defined by the header
 | 
|
685  | 
return self._row_offsets[-1]  | 
|
686  | 
        # This is the number of pages as defined by the size of the index. They
 | 
|
687  | 
        # should be indentical.
 | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
688  | 
total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))  | 
689  | 
return total_pages  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
690  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
691  | 
def _expand_offsets(self, offsets):  | 
692  | 
"""Find extra pages to download.  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
693  | 
|
694  | 
        The idea is that we always want to make big-enough requests (like 64kB
 | 
|
695  | 
        for http), so that we don't waste round trips. So given the entries
 | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
696  | 
        that we already have cached and the new pages being downloaded figure
 | 
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
697  | 
        out what other pages we might want to read.
 | 
698  | 
||
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
699  | 
        See also doc/developers/btree_index_prefetch.txt for more details.
 | 
700  | 
||
701  | 
        :param offsets: The offsets to be read
 | 
|
702  | 
        :return: A list of offsets to download
 | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
703  | 
        """
 | 
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
704  | 
if 'index' in debug.debug_flags:  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
705  | 
trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)  | 
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
706  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
707  | 
if len(offsets) >= self._recommended_pages:  | 
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
708  | 
            # Don't add more, we are already requesting more than enough
 | 
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
709  | 
if 'index' in debug.debug_flags:  | 
710  | 
trace.mutter(' not expanding large request (%s >= %s)',  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
711  | 
len(offsets), self._recommended_pages)  | 
712  | 
return offsets  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
713  | 
if self._size is None:  | 
714  | 
            # Don't try anything, because we don't know where the file ends
 | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
715  | 
if 'index' in debug.debug_flags:  | 
716  | 
trace.mutter(' not expanding without knowing index size')  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
717  | 
return offsets  | 
718  | 
total_pages = self._compute_total_pages_in_index()  | 
|
719  | 
cached_offsets = self._get_offsets_to_cached_pages()  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
720  | 
        # If reading recommended_pages would read the rest of the index, just
 | 
721  | 
        # do so.
 | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
722  | 
if total_pages - len(cached_offsets) <= self._recommended_pages:  | 
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
723  | 
            # Read whatever is left
 | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
724  | 
if cached_offsets:  | 
725  | 
expanded = [x for x in xrange(total_pages)  | 
|
726  | 
if x not in cached_offsets]  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
727  | 
else:  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
728  | 
expanded = range(total_pages)  | 
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
729  | 
if 'index' in debug.debug_flags:  | 
730  | 
trace.mutter(' reading all unread pages: %s', expanded)  | 
|
731  | 
return expanded  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
732  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
733  | 
if self._root_node is None:  | 
734  | 
            # ATM on the first read of the root node of a large index, we don't
 | 
|
735  | 
            # bother pre-reading any other pages. This is because the
 | 
|
736  | 
            # likelyhood of actually reading interesting pages is very low.
 | 
|
737  | 
            # See doc/developers/btree_index_prefetch.txt for a discussion, and
 | 
|
738  | 
            # a possible implementation when we are guessing that the second
 | 
|
739  | 
            # layer index is small
 | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
740  | 
final_offsets = offsets  | 
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
741  | 
else:  | 
| 
3763.8.14
by John Arbash Meinel
 Add in a shortcut when we haven't cached much yet.  | 
742  | 
tree_depth = len(self._row_lengths)  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
743  | 
if len(cached_offsets) < tree_depth and len(offsets) == 1:  | 
| 
3763.8.14
by John Arbash Meinel
 Add in a shortcut when we haven't cached much yet.  | 
744  | 
                # We haven't read enough to justify expansion
 | 
745  | 
                # If we are only going to read the root node, and 1 leaf node,
 | 
|
746  | 
                # then it isn't worth expanding our request. Once we've read at
 | 
|
747  | 
                # least 2 nodes, then we are probably doing a search, and we
 | 
|
748  | 
                # start expanding our requests.
 | 
|
749  | 
if 'index' in debug.debug_flags:  | 
|
750  | 
trace.mutter(' not expanding on first reads')  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
751  | 
return offsets  | 
752  | 
final_offsets = self._expand_to_neighbors(offsets, cached_offsets,  | 
|
753  | 
total_pages)  | 
|
754  | 
||
755  | 
final_offsets = sorted(final_offsets)  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
756  | 
if 'index' in debug.debug_flags:  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
757  | 
trace.mutter('expanded: %s', final_offsets)  | 
758  | 
return final_offsets  | 
|
759  | 
||
760  | 
def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):  | 
|
761  | 
"""Expand requests to neighbors until we have enough pages.  | 
|
762  | 
||
763  | 
        This is called from _expand_offsets after policy has determined that we
 | 
|
764  | 
        want to expand.
 | 
|
765  | 
        We only want to expand requests within a given layer. We cheat a little
 | 
|
766  | 
        bit and assume all requests will be in the same layer. This is true
 | 
|
767  | 
        given the current design, but if it changes this algorithm may perform
 | 
|
768  | 
        oddly.
 | 
|
769  | 
||
770  | 
        :param offsets: requested offsets
 | 
|
771  | 
        :param cached_offsets: offsets for pages we currently have cached
 | 
|
772  | 
        :return: A set() of offsets after expansion
 | 
|
773  | 
        """
 | 
|
774  | 
final_offsets = set(offsets)  | 
|
775  | 
first = end = None  | 
|
776  | 
new_tips = set(final_offsets)  | 
|
777  | 
while len(final_offsets) < self._recommended_pages and new_tips:  | 
|
778  | 
next_tips = set()  | 
|
779  | 
for pos in new_tips:  | 
|
780  | 
if first is None:  | 
|
781  | 
first, end = self._find_layer_first_and_end(pos)  | 
|
782  | 
previous = pos - 1  | 
|
783  | 
if (previous > 0  | 
|
784  | 
and previous not in cached_offsets  | 
|
785  | 
and previous not in final_offsets  | 
|
786  | 
and previous >= first):  | 
|
787  | 
next_tips.add(previous)  | 
|
788  | 
after = pos + 1  | 
|
789  | 
if (after < total_pages  | 
|
790  | 
and after not in cached_offsets  | 
|
791  | 
and after not in final_offsets  | 
|
792  | 
and after < end):  | 
|
793  | 
next_tips.add(after)  | 
|
794  | 
                # This would keep us from going bigger than
 | 
|
795  | 
                # recommended_pages by only expanding the first offsets.
 | 
|
796  | 
                # However, if we are making a 'wide' request, it is
 | 
|
797  | 
                # reasonable to expand all points equally.
 | 
|
798  | 
                # if len(final_offsets) > recommended_pages:
 | 
|
799  | 
                #     break
 | 
|
800  | 
final_offsets.update(next_tips)  | 
|
801  | 
new_tips = next_tips  | 
|
802  | 
return final_offsets  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
803  | 
|
| 
4011.5.3
by Andrew Bennetts
 Implement and test external_references on GraphIndex and BTreeGraphIndex.  | 
804  | 
def external_references(self, ref_list_num):  | 
805  | 
if self._root_node is None:  | 
|
806  | 
self._get_root_node()  | 
|
807  | 
if ref_list_num + 1 > self.node_ref_lists:  | 
|
808  | 
raise ValueError('No ref list %d, index has %d ref lists'  | 
|
809  | 
% (ref_list_num, self.node_ref_lists))  | 
|
810  | 
keys = set()  | 
|
811  | 
refs = set()  | 
|
812  | 
for node in self.iter_all_entries():  | 
|
813  | 
keys.add(node[1])  | 
|
814  | 
refs.update(node[3][ref_list_num])  | 
|
815  | 
return refs - keys  | 
|
816  | 
||
| 
3763.8.12
by John Arbash Meinel
 Code cleanup.  | 
817  | 
def _find_layer_first_and_end(self, offset):  | 
818  | 
"""Find the start/stop nodes for the layer corresponding to offset.  | 
|
819  | 
||
820  | 
        :return: (first, end)
 | 
|
821  | 
            first is the first node in this layer
 | 
|
822  | 
            end is the first node of the next layer
 | 
|
823  | 
        """
 | 
|
824  | 
first = end = 0  | 
|
825  | 
for roffset in self._row_offsets:  | 
|
826  | 
first = end  | 
|
827  | 
end = roffset  | 
|
828  | 
if offset < roffset:  | 
|
829  | 
                break
 | 
|
830  | 
return first, end  | 
|
831  | 
||
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
832  | 
def _get_offsets_to_cached_pages(self):  | 
| 
3763.8.12
by John Arbash Meinel
 Code cleanup.  | 
833  | 
"""Determine what nodes we already have cached."""  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
834  | 
cached_offsets = set(self._internal_node_cache.keys())  | 
835  | 
cached_offsets.update(self._leaf_node_cache.keys())  | 
|
| 
3763.8.12
by John Arbash Meinel
 Code cleanup.  | 
836  | 
if self._root_node is not None:  | 
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
837  | 
cached_offsets.add(0)  | 
838  | 
return cached_offsets  | 
|
| 
3763.8.12
by John Arbash Meinel
 Code cleanup.  | 
839  | 
|
840  | 
def _get_root_node(self):  | 
|
841  | 
if self._root_node is None:  | 
|
842  | 
            # We may not have a root node yet
 | 
|
843  | 
self._get_internal_nodes([0])  | 
|
844  | 
return self._root_node  | 
|
845  | 
||
| 
3641.5.18
by John Arbash Meinel
 Clean out the global state, good for prototyping and tuning, bad for production code.  | 
846  | 
def _get_nodes(self, cache, node_indexes):  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
847  | 
found = {}  | 
848  | 
needed = []  | 
|
849  | 
for idx in node_indexes:  | 
|
850  | 
if idx == 0 and self._root_node is not None:  | 
|
851  | 
found[0] = self._root_node  | 
|
852  | 
                continue
 | 
|
853  | 
try:  | 
|
854  | 
found[idx] = cache[idx]  | 
|
855  | 
except KeyError:  | 
|
856  | 
needed.append(idx)  | 
|
| 
3763.8.1
by John Arbash Meinel
 Playing around with expanding requests for btree index nodes into neighboring nodes.  | 
857  | 
if not needed:  | 
858  | 
return found  | 
|
| 
3763.8.15
by John Arbash Meinel
 Review comments from Martin. Code clarity/variable name/docstring updates.  | 
859  | 
needed = self._expand_offsets(needed)  | 
| 
3763.8.12
by John Arbash Meinel
 Code cleanup.  | 
860  | 
found.update(self._get_and_cache_nodes(needed))  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
861  | 
return found  | 
862  | 
||
863  | 
def _get_internal_nodes(self, node_indexes):  | 
|
864  | 
"""Get a node, from cache or disk.  | 
|
865  | 
||
866  | 
        After getting it, the node will be cached.
 | 
|
867  | 
        """
 | 
|
| 
3641.5.18
by John Arbash Meinel
 Clean out the global state, good for prototyping and tuning, bad for production code.  | 
868  | 
return self._get_nodes(self._internal_node_cache, node_indexes)  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
869  | 
|
| 
3805.4.6
by John Arbash Meinel
 refactor for clarity.  | 
870  | 
def _cache_leaf_values(self, nodes):  | 
871  | 
"""Cache directly from key => value, skipping the btree."""  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
872  | 
if self._leaf_value_cache is not None:  | 
| 
3805.4.6
by John Arbash Meinel
 refactor for clarity.  | 
873  | 
for node in nodes.itervalues():  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
874  | 
for key, value in node.keys.iteritems():  | 
875  | 
if key in self._leaf_value_cache:  | 
|
876  | 
                        # Don't add the rest of the keys, we've seen this node
 | 
|
877  | 
                        # before.
 | 
|
878  | 
                        break
 | 
|
879  | 
self._leaf_value_cache[key] = value  | 
|
| 
3805.4.6
by John Arbash Meinel
 refactor for clarity.  | 
880  | 
|
881  | 
def _get_leaf_nodes(self, node_indexes):  | 
|
882  | 
"""Get a bunch of nodes, from cache or disk."""  | 
|
883  | 
found = self._get_nodes(self._leaf_node_cache, node_indexes)  | 
|
884  | 
self._cache_leaf_values(found)  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
885  | 
return found  | 
886  | 
||
887  | 
def iter_all_entries(self):  | 
|
888  | 
"""Iterate over all keys within the index.  | 
|
889  | 
||
890  | 
        :return: An iterable of (index, key, value) or (index, key, value, reference_lists).
 | 
|
891  | 
            The former tuple is used when there are no reference lists in the
 | 
|
892  | 
            index, making the API compatible with simple key:value index types.
 | 
|
893  | 
            There is no defined order for the result iteration - it will be in
 | 
|
894  | 
            the most efficient order for the index.
 | 
|
895  | 
        """
 | 
|
896  | 
if 'evil' in debug.debug_flags:  | 
|
897  | 
trace.mutter_callsite(3,  | 
|
898  | 
"iter_all_entries scales with size of history.")  | 
|
899  | 
if not self.key_count():  | 
|
900  | 
            return
 | 
|
| 
3823.5.2
by John Arbash Meinel
 It turns out that we read the pack-names file 3-times because  | 
901  | 
if self._row_offsets[-1] == 1:  | 
902  | 
            # There is only the root node, and we read that via key_count()
 | 
|
903  | 
if self.node_ref_lists:  | 
|
904  | 
for key, (value, refs) in sorted(self._root_node.keys.items()):  | 
|
905  | 
yield (self, key, value, refs)  | 
|
906  | 
else:  | 
|
907  | 
for key, (value, refs) in sorted(self._root_node.keys.items()):  | 
|
908  | 
yield (self, key, value)  | 
|
909  | 
            return
 | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
910  | 
start_of_leaves = self._row_offsets[-2]  | 
911  | 
end_of_leaves = self._row_offsets[-1]  | 
|
| 
3824.1.2
by John Arbash Meinel
 iter_all_entries() shouldn't need to re-read the page.  | 
912  | 
needed_offsets = range(start_of_leaves, end_of_leaves)  | 
913  | 
if needed_offsets == [0]:  | 
|
914  | 
            # Special case when we only have a root node, as we have already
 | 
|
915  | 
            # read everything
 | 
|
916  | 
nodes = [(0, self._root_node)]  | 
|
917  | 
else:  | 
|
918  | 
nodes = self._read_nodes(needed_offsets)  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
919  | 
        # We iterate strictly in-order so that we can use this function
 | 
920  | 
        # for spilling index builds to disk.
 | 
|
921  | 
if self.node_ref_lists:  | 
|
| 
3824.1.2
by John Arbash Meinel
 iter_all_entries() shouldn't need to re-read the page.  | 
922  | 
for _, node in nodes:  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
923  | 
for key, (value, refs) in sorted(node.keys.items()):  | 
924  | 
yield (self, key, value, refs)  | 
|
925  | 
else:  | 
|
| 
3824.1.2
by John Arbash Meinel
 iter_all_entries() shouldn't need to re-read the page.  | 
926  | 
for _, node in nodes:  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
927  | 
for key, (value, refs) in sorted(node.keys.items()):  | 
928  | 
yield (self, key, value)  | 
|
929  | 
||
930  | 
    @staticmethod
 | 
|
931  | 
def _multi_bisect_right(in_keys, fixed_keys):  | 
|
932  | 
"""Find the positions where each 'in_key' would fit in fixed_keys.  | 
|
933  | 
||
934  | 
        This is equivalent to doing "bisect_right" on each in_key into
 | 
|
935  | 
        fixed_keys
 | 
|
936  | 
||
937  | 
        :param in_keys: A sorted list of keys to match with fixed_keys
 | 
|
938  | 
        :param fixed_keys: A sorted list of keys to match against
 | 
|
939  | 
        :return: A list of (integer position, [key list]) tuples.
 | 
|
940  | 
        """
 | 
|
941  | 
if not in_keys:  | 
|
942  | 
return []  | 
|
943  | 
if not fixed_keys:  | 
|
944  | 
            # no pointers in the fixed_keys list, which means everything must
 | 
|
945  | 
            # fall to the left.
 | 
|
946  | 
return [(0, in_keys)]  | 
|
947  | 
||
948  | 
        # TODO: Iterating both lists will generally take M + N steps
 | 
|
949  | 
        #       Bisecting each key will generally take M * log2 N steps.
 | 
|
950  | 
        #       If we had an efficient way to compare, we could pick the method
 | 
|
951  | 
        #       based on which has the fewer number of steps.
 | 
|
952  | 
        #       There is also the argument that bisect_right is a compiled
 | 
|
953  | 
        #       function, so there is even more to be gained.
 | 
|
954  | 
        # iter_steps = len(in_keys) + len(fixed_keys)
 | 
|
955  | 
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
 | 
|
956  | 
if len(in_keys) == 1: # Bisect will always be faster for M = 1  | 
|
957  | 
return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]  | 
|
958  | 
        # elif bisect_steps < iter_steps:
 | 
|
959  | 
        #     offsets = {}
 | 
|
960  | 
        #     for key in in_keys:
 | 
|
961  | 
        #         offsets.setdefault(bisect_right(fixed_keys, key),
 | 
|
962  | 
        #                            []).append(key)
 | 
|
963  | 
        #     return [(o, offsets[o]) for o in sorted(offsets)]
 | 
|
964  | 
in_keys_iter = iter(in_keys)  | 
|
965  | 
fixed_keys_iter = enumerate(fixed_keys)  | 
|
966  | 
cur_in_key = in_keys_iter.next()  | 
|
967  | 
cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()  | 
|
968  | 
||
969  | 
class InputDone(Exception): pass  | 
|
970  | 
class FixedDone(Exception): pass  | 
|
971  | 
||
972  | 
output = []  | 
|
973  | 
cur_out = []  | 
|
974  | 
||
975  | 
        # TODO: Another possibility is that rather than iterating on each side,
 | 
|
976  | 
        #       we could use a combination of bisecting and iterating. For
 | 
|
977  | 
        #       example, while cur_in_key < fixed_key, bisect to find its
 | 
|
978  | 
        #       point, then iterate all matching keys, then bisect (restricted
 | 
|
979  | 
        #       to only the remainder) for the next one, etc.
 | 
|
980  | 
try:  | 
|
981  | 
while True:  | 
|
982  | 
if cur_in_key < cur_fixed_key:  | 
|
983  | 
cur_keys = []  | 
|
984  | 
cur_out = (cur_fixed_offset, cur_keys)  | 
|
985  | 
output.append(cur_out)  | 
|
986  | 
while cur_in_key < cur_fixed_key:  | 
|
987  | 
cur_keys.append(cur_in_key)  | 
|
988  | 
try:  | 
|
989  | 
cur_in_key = in_keys_iter.next()  | 
|
990  | 
except StopIteration:  | 
|
991  | 
raise InputDone  | 
|
992  | 
                    # At this point cur_in_key must be >= cur_fixed_key
 | 
|
993  | 
                # step the cur_fixed_key until we pass the cur key, or walk off
 | 
|
994  | 
                # the end
 | 
|
995  | 
while cur_in_key >= cur_fixed_key:  | 
|
996  | 
try:  | 
|
997  | 
cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()  | 
|
998  | 
except StopIteration:  | 
|
999  | 
raise FixedDone  | 
|
1000  | 
except InputDone:  | 
|
1001  | 
            # We consumed all of the input, nothing more to do
 | 
|
1002  | 
            pass
 | 
|
1003  | 
except FixedDone:  | 
|
1004  | 
            # There was some input left, but we consumed all of fixed, so we
 | 
|
1005  | 
            # have to add one more for the tail
 | 
|
1006  | 
cur_keys = [cur_in_key]  | 
|
1007  | 
cur_keys.extend(in_keys_iter)  | 
|
1008  | 
cur_out = (len(fixed_keys), cur_keys)  | 
|
1009  | 
output.append(cur_out)  | 
|
1010  | 
return output  | 
|
1011  | 
||
1012  | 
def iter_entries(self, keys):  | 
|
1013  | 
"""Iterate over keys within the index.  | 
|
1014  | 
||
1015  | 
        :param keys: An iterable providing the keys to be retrieved.
 | 
|
1016  | 
        :return: An iterable as per iter_all_entries, but restricted to the
 | 
|
1017  | 
            keys supplied. No additional keys will be returned, and every
 | 
|
1018  | 
            key supplied that is in the index will be returned.
 | 
|
1019  | 
        """
 | 
|
1020  | 
        # 6 seconds spent in miss_torture using the sorted() line.
 | 
|
1021  | 
        # Even with out of order disk IO it seems faster not to sort it when
 | 
|
1022  | 
        # large queries are being made.
 | 
|
1023  | 
        # However, now that we are doing multi-way bisecting, we need the keys
 | 
|
1024  | 
        # in sorted order anyway. We could change the multi-way code to not
 | 
|
1025  | 
        # require sorted order. (For example, it bisects for the first node,
 | 
|
1026  | 
        # does an in-order search until a key comes before the current point,
 | 
|
1027  | 
        # which it then bisects for, etc.)
 | 
|
1028  | 
keys = frozenset(keys)  | 
|
1029  | 
if not keys:  | 
|
1030  | 
            return
 | 
|
1031  | 
||
1032  | 
if not self.key_count():  | 
|
1033  | 
            return
 | 
|
1034  | 
||
1035  | 
needed_keys = []  | 
|
1036  | 
if self._leaf_value_cache is None:  | 
|
1037  | 
needed_keys = keys  | 
|
1038  | 
else:  | 
|
1039  | 
for key in keys:  | 
|
1040  | 
value = self._leaf_value_cache.get(key, None)  | 
|
1041  | 
if value is not None:  | 
|
1042  | 
                    # This key is known not to be here, skip it
 | 
|
1043  | 
value, refs = value  | 
|
1044  | 
if self.node_ref_lists:  | 
|
1045  | 
yield (self, key, value, refs)  | 
|
1046  | 
else:  | 
|
1047  | 
yield (self, key, value)  | 
|
1048  | 
else:  | 
|
1049  | 
needed_keys.append(key)  | 
|
1050  | 
||
1051  | 
last_key = None  | 
|
1052  | 
needed_keys = keys  | 
|
1053  | 
if not needed_keys:  | 
|
1054  | 
            return
 | 
|
1055  | 
        # 6 seconds spent in miss_torture using the sorted() line.
 | 
|
1056  | 
        # Even with out of order disk IO it seems faster not to sort it when
 | 
|
1057  | 
        # large queries are being made.
 | 
|
1058  | 
needed_keys = sorted(needed_keys)  | 
|
1059  | 
||
1060  | 
nodes_and_keys = [(0, needed_keys)]  | 
|
1061  | 
||
1062  | 
for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):  | 
|
1063  | 
node_indexes = [idx for idx, s_keys in nodes_and_keys]  | 
|
1064  | 
nodes = self._get_internal_nodes(node_indexes)  | 
|
1065  | 
||
1066  | 
next_nodes_and_keys = []  | 
|
1067  | 
for node_index, sub_keys in nodes_and_keys:  | 
|
1068  | 
node = nodes[node_index]  | 
|
1069  | 
positions = self._multi_bisect_right(sub_keys, node.keys)  | 
|
1070  | 
node_offset = next_row_start + node.offset  | 
|
1071  | 
next_nodes_and_keys.extend([(node_offset + pos, s_keys)  | 
|
1072  | 
for pos, s_keys in positions])  | 
|
1073  | 
nodes_and_keys = next_nodes_and_keys  | 
|
1074  | 
        # We should now be at the _LeafNodes
 | 
|
1075  | 
node_indexes = [idx for idx, s_keys in nodes_and_keys]  | 
|
1076  | 
||
1077  | 
        # TODO: We may *not* want to always read all the nodes in one
 | 
|
1078  | 
        #       big go. Consider setting a max size on this.
 | 
|
1079  | 
||
1080  | 
nodes = self._get_leaf_nodes(node_indexes)  | 
|
1081  | 
for node_index, sub_keys in nodes_and_keys:  | 
|
1082  | 
if not sub_keys:  | 
|
1083  | 
                continue
 | 
|
1084  | 
node = nodes[node_index]  | 
|
1085  | 
for next_sub_key in sub_keys:  | 
|
1086  | 
if next_sub_key in node.keys:  | 
|
1087  | 
value, refs = node.keys[next_sub_key]  | 
|
1088  | 
if self.node_ref_lists:  | 
|
1089  | 
yield (self, next_sub_key, value, refs)  | 
|
1090  | 
else:  | 
|
1091  | 
yield (self, next_sub_key, value)  | 
|
1092  | 
||
1093  | 
def iter_entries_prefix(self, keys):  | 
|
1094  | 
"""Iterate over keys within the index using prefix matching.  | 
|
1095  | 
||
1096  | 
        Prefix matching is applied within the tuple of a key, not to within
 | 
|
1097  | 
        the bytestring of each key element. e.g. if you have the keys ('foo',
 | 
|
1098  | 
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
 | 
|
1099  | 
        only the former key is returned.
 | 
|
1100  | 
||
1101  | 
        WARNING: Note that this method currently causes a full index parse
 | 
|
1102  | 
        unconditionally (which is reasonably appropriate as it is a means for
 | 
|
1103  | 
        thunking many small indices into one larger one and still supplies
 | 
|
1104  | 
        iter_all_entries at the thunk layer).
 | 
|
1105  | 
||
1106  | 
        :param keys: An iterable providing the key prefixes to be retrieved.
 | 
|
1107  | 
            Each key prefix takes the form of a tuple the length of a key, but
 | 
|
1108  | 
            with the last N elements 'None' rather than a regular bytestring.
 | 
|
1109  | 
            The first element cannot be 'None'.
 | 
|
1110  | 
        :return: An iterable as per iter_all_entries, but restricted to the
 | 
|
1111  | 
            keys with a matching prefix to those supplied. No additional keys
 | 
|
1112  | 
            will be returned, and every match that is in the index will be
 | 
|
1113  | 
            returned.
 | 
|
1114  | 
        """
 | 
|
1115  | 
keys = sorted(set(keys))  | 
|
1116  | 
if not keys:  | 
|
1117  | 
            return
 | 
|
1118  | 
        # Load if needed to check key lengths
 | 
|
1119  | 
if self._key_count is None:  | 
|
1120  | 
self._get_root_node()  | 
|
1121  | 
        # TODO: only access nodes that can satisfy the prefixes we are looking
 | 
|
1122  | 
        # for. For now, to meet API usage (as this function is not used by
 | 
|
1123  | 
        # current bzrlib) just suck the entire index and iterate in memory.
 | 
|
1124  | 
nodes = {}  | 
|
1125  | 
if self.node_ref_lists:  | 
|
1126  | 
if self._key_length == 1:  | 
|
1127  | 
for _1, key, value, refs in self.iter_all_entries():  | 
|
1128  | 
nodes[key] = value, refs  | 
|
1129  | 
else:  | 
|
1130  | 
nodes_by_key = {}  | 
|
1131  | 
for _1, key, value, refs in self.iter_all_entries():  | 
|
1132  | 
key_value = key, value, refs  | 
|
1133  | 
                    # For a key of (foo, bar, baz) create
 | 
|
1134  | 
                    # _nodes_by_key[foo][bar][baz] = key_value
 | 
|
1135  | 
key_dict = nodes_by_key  | 
|
1136  | 
for subkey in key[:-1]:  | 
|
1137  | 
key_dict = key_dict.setdefault(subkey, {})  | 
|
1138  | 
key_dict[key[-1]] = key_value  | 
|
1139  | 
else:  | 
|
1140  | 
if self._key_length == 1:  | 
|
1141  | 
for _1, key, value in self.iter_all_entries():  | 
|
1142  | 
nodes[key] = value  | 
|
1143  | 
else:  | 
|
1144  | 
nodes_by_key = {}  | 
|
1145  | 
for _1, key, value in self.iter_all_entries():  | 
|
1146  | 
key_value = key, value  | 
|
1147  | 
                    # For a key of (foo, bar, baz) create
 | 
|
1148  | 
                    # _nodes_by_key[foo][bar][baz] = key_value
 | 
|
1149  | 
key_dict = nodes_by_key  | 
|
1150  | 
for subkey in key[:-1]:  | 
|
1151  | 
key_dict = key_dict.setdefault(subkey, {})  | 
|
1152  | 
key_dict[key[-1]] = key_value  | 
|
1153  | 
if self._key_length == 1:  | 
|
1154  | 
for key in keys:  | 
|
1155  | 
                # sanity check
 | 
|
1156  | 
if key[0] is None:  | 
|
1157  | 
raise errors.BadIndexKey(key)  | 
|
1158  | 
if len(key) != self._key_length:  | 
|
1159  | 
raise errors.BadIndexKey(key)  | 
|
1160  | 
try:  | 
|
1161  | 
if self.node_ref_lists:  | 
|
1162  | 
value, node_refs = nodes[key]  | 
|
1163  | 
yield self, key, value, node_refs  | 
|
1164  | 
else:  | 
|
1165  | 
yield self, key, nodes[key]  | 
|
1166  | 
except KeyError:  | 
|
1167  | 
                    pass
 | 
|
1168  | 
            return
 | 
|
1169  | 
for key in keys:  | 
|
1170  | 
            # sanity check
 | 
|
1171  | 
if key[0] is None:  | 
|
1172  | 
raise errors.BadIndexKey(key)  | 
|
1173  | 
if len(key) != self._key_length:  | 
|
1174  | 
raise errors.BadIndexKey(key)  | 
|
1175  | 
            # find what it refers to:
 | 
|
1176  | 
key_dict = nodes_by_key  | 
|
1177  | 
elements = list(key)  | 
|
1178  | 
            # find the subdict whose contents should be returned.
 | 
|
1179  | 
try:  | 
|
1180  | 
while len(elements) and elements[0] is not None:  | 
|
1181  | 
key_dict = key_dict[elements[0]]  | 
|
1182  | 
elements.pop(0)  | 
|
1183  | 
except KeyError:  | 
|
1184  | 
                # a non-existant lookup.
 | 
|
1185  | 
                continue
 | 
|
1186  | 
if len(elements):  | 
|
1187  | 
dicts = [key_dict]  | 
|
1188  | 
while dicts:  | 
|
1189  | 
key_dict = dicts.pop(-1)  | 
|
1190  | 
                    # can't be empty or would not exist
 | 
|
1191  | 
item, value = key_dict.iteritems().next()  | 
|
1192  | 
if type(value) == dict:  | 
|
1193  | 
                        # push keys
 | 
|
1194  | 
dicts.extend(key_dict.itervalues())  | 
|
1195  | 
else:  | 
|
1196  | 
                        # yield keys
 | 
|
1197  | 
for value in key_dict.itervalues():  | 
|
1198  | 
                            # each value is the key:value:node refs tuple
 | 
|
1199  | 
                            # ready to yield.
 | 
|
1200  | 
yield (self, ) + value  | 
|
1201  | 
else:  | 
|
1202  | 
                # the last thing looked up was a terminal element
 | 
|
1203  | 
yield (self, ) + key_dict  | 
|
1204  | 
||
1205  | 
def key_count(self):  | 
|
1206  | 
"""Return an estimate of the number of keys in this index.  | 
|
1207  | 
||
1208  | 
        For BTreeGraphIndex the estimate is exact as it is contained in the
 | 
|
1209  | 
        header.
 | 
|
1210  | 
        """
 | 
|
1211  | 
if self._key_count is None:  | 
|
1212  | 
self._get_root_node()  | 
|
1213  | 
return self._key_count  | 
|
1214  | 
||
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
1215  | 
def _compute_row_offsets(self):  | 
1216  | 
"""Fill out the _row_offsets attribute based on _row_lengths."""  | 
|
1217  | 
offsets = []  | 
|
1218  | 
row_offset = 0  | 
|
1219  | 
for row in self._row_lengths:  | 
|
1220  | 
offsets.append(row_offset)  | 
|
1221  | 
row_offset += row  | 
|
1222  | 
offsets.append(row_offset)  | 
|
1223  | 
self._row_offsets = offsets  | 
|
1224  | 
||
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
1225  | 
def _parse_header_from_bytes(self, bytes):  | 
1226  | 
"""Parse the header from a region of bytes.  | 
|
1227  | 
||
1228  | 
        :param bytes: The data to parse.
 | 
|
1229  | 
        :return: An offset, data tuple such as readv yields, for the unparsed
 | 
|
1230  | 
            data. (which may be of length 0).
 | 
|
1231  | 
        """
 | 
|
1232  | 
signature = bytes[0:len(self._signature())]  | 
|
1233  | 
if not signature == self._signature():  | 
|
1234  | 
raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)  | 
|
1235  | 
lines = bytes[len(self._signature()):].splitlines()  | 
|
1236  | 
options_line = lines[0]  | 
|
1237  | 
if not options_line.startswith(_OPTION_NODE_REFS):  | 
|
1238  | 
raise errors.BadIndexOptions(self)  | 
|
1239  | 
try:  | 
|
1240  | 
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])  | 
|
1241  | 
except ValueError:  | 
|
1242  | 
raise errors.BadIndexOptions(self)  | 
|
1243  | 
options_line = lines[1]  | 
|
1244  | 
if not options_line.startswith(_OPTION_KEY_ELEMENTS):  | 
|
1245  | 
raise errors.BadIndexOptions(self)  | 
|
1246  | 
try:  | 
|
1247  | 
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])  | 
|
1248  | 
except ValueError:  | 
|
1249  | 
raise errors.BadIndexOptions(self)  | 
|
1250  | 
options_line = lines[2]  | 
|
1251  | 
if not options_line.startswith(_OPTION_LEN):  | 
|
1252  | 
raise errors.BadIndexOptions(self)  | 
|
1253  | 
try:  | 
|
1254  | 
self._key_count = int(options_line[len(_OPTION_LEN):])  | 
|
1255  | 
except ValueError:  | 
|
1256  | 
raise errors.BadIndexOptions(self)  | 
|
1257  | 
options_line = lines[3]  | 
|
1258  | 
if not options_line.startswith(_OPTION_ROW_LENGTHS):  | 
|
1259  | 
raise errors.BadIndexOptions(self)  | 
|
1260  | 
try:  | 
|
1261  | 
self._row_lengths = map(int, [length for length in  | 
|
1262  | 
options_line[len(_OPTION_ROW_LENGTHS):].split(',')  | 
|
1263  | 
if len(length)])  | 
|
1264  | 
except ValueError:  | 
|
1265  | 
raise errors.BadIndexOptions(self)  | 
|
| 
3763.8.7
by John Arbash Meinel
 A bit of doc updates, start putting in tests for current behavior.  | 
1266  | 
self._compute_row_offsets()  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
1267  | 
|
1268  | 
        # calculate the bytes we have processed
 | 
|
1269  | 
header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)  | 
|
1270  | 
return header_end, bytes[header_end:]  | 
|
1271  | 
||
1272  | 
def _read_nodes(self, nodes):  | 
|
1273  | 
"""Read some nodes from disk into the LRU cache.  | 
|
1274  | 
||
1275  | 
        This performs a readv to get the node data into memory, and parses each
 | 
|
| 
3868.1.1
by Martin Pool
 merge John's patch to avoid re-reading pack-names file  | 
1276  | 
        node, then yields it to the caller. The nodes are requested in the
 | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
1277  | 
        supplied order. If possible doing sort() on the list before requesting
 | 
1278  | 
        a read may improve performance.
 | 
|
1279  | 
||
1280  | 
        :param nodes: The nodes to read. 0 - first node, 1 - second node etc.
 | 
|
1281  | 
        :return: None
 | 
|
1282  | 
        """
 | 
|
| 
3868.1.1
by Martin Pool
 merge John's patch to avoid re-reading pack-names file  | 
1283  | 
        # may be the byte string of the whole file
 | 
| 
3823.5.2
by John Arbash Meinel
 It turns out that we read the pack-names file 3-times because  | 
1284  | 
bytes = None  | 
| 
3868.1.1
by Martin Pool
 merge John's patch to avoid re-reading pack-names file  | 
1285  | 
        # list of (offset, length) regions of the file that should, evenually
 | 
1286  | 
        # be read in to data_ranges, either from 'bytes' or from the transport
 | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
1287  | 
ranges = []  | 
1288  | 
for index in nodes:  | 
|
1289  | 
offset = index * _PAGE_SIZE  | 
|
1290  | 
size = _PAGE_SIZE  | 
|
1291  | 
if index == 0:  | 
|
1292  | 
                # Root node - special case
 | 
|
1293  | 
if self._size:  | 
|
1294  | 
size = min(_PAGE_SIZE, self._size)  | 
|
1295  | 
else:  | 
|
| 
3824.1.1
by John Arbash Meinel
 Fix _read_nodes() to only issue a single read if there is no known size.  | 
1296  | 
                    # The only case where we don't know the size, is for very
 | 
1297  | 
                    # small indexes. So we read the whole thing
 | 
|
| 
3823.5.2
by John Arbash Meinel
 It turns out that we read the pack-names file 3-times because  | 
1298  | 
bytes = self._transport.get_bytes(self._name)  | 
1299  | 
self._size = len(bytes)  | 
|
| 
3868.1.1
by Martin Pool
 merge John's patch to avoid re-reading pack-names file  | 
1300  | 
                    # the whole thing should be parsed out of 'bytes'
 | 
| 
3824.1.1
by John Arbash Meinel
 Fix _read_nodes() to only issue a single read if there is no known size.  | 
1301  | 
ranges.append((0, len(bytes)))  | 
| 
3823.5.2
by John Arbash Meinel
 It turns out that we read the pack-names file 3-times because  | 
1302  | 
                    break
 | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
1303  | 
else:  | 
| 
3763.8.6
by John Arbash Meinel
 Fix the logic a bit, and add a bit more tweaking opportunities  | 
1304  | 
if offset > self._size:  | 
1305  | 
raise AssertionError('tried to read past the end'  | 
|
1306  | 
' of the file %s > %s'  | 
|
1307  | 
% (offset, self._size))  | 
|
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
1308  | 
size = min(size, self._size - offset)  | 
1309  | 
ranges.append((offset, size))  | 
|
1310  | 
if not ranges:  | 
|
1311  | 
            return
 | 
|
| 
3868.1.1
by Martin Pool
 merge John's patch to avoid re-reading pack-names file  | 
1312  | 
elif bytes is not None:  | 
1313  | 
            # already have the whole file
 | 
|
| 
3823.5.2
by John Arbash Meinel
 It turns out that we read the pack-names file 3-times because  | 
1314  | 
data_ranges = [(start, bytes[start:start+_PAGE_SIZE])  | 
1315  | 
for start in xrange(0, len(bytes), _PAGE_SIZE)]  | 
|
| 
3824.1.1
by John Arbash Meinel
 Fix _read_nodes() to only issue a single read if there is no known size.  | 
1316  | 
elif self._file is None:  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
1317  | 
data_ranges = self._transport.readv(self._name, ranges)  | 
1318  | 
else:  | 
|
1319  | 
data_ranges = []  | 
|
1320  | 
for offset, size in ranges:  | 
|
1321  | 
self._file.seek(offset)  | 
|
1322  | 
data_ranges.append((offset, self._file.read(size)))  | 
|
1323  | 
for offset, data in data_ranges:  | 
|
1324  | 
if offset == 0:  | 
|
1325  | 
                # extract the header
 | 
|
1326  | 
offset, data = self._parse_header_from_bytes(data)  | 
|
1327  | 
if len(data) == 0:  | 
|
1328  | 
                    continue
 | 
|
1329  | 
bytes = zlib.decompress(data)  | 
|
1330  | 
if bytes.startswith(_LEAF_FLAG):  | 
|
1331  | 
node = _LeafNode(bytes, self._key_length, self.node_ref_lists)  | 
|
1332  | 
elif bytes.startswith(_INTERNAL_FLAG):  | 
|
1333  | 
node = _InternalNode(bytes)  | 
|
1334  | 
else:  | 
|
1335  | 
raise AssertionError("Unknown node type for %r" % bytes)  | 
|
1336  | 
yield offset / _PAGE_SIZE, node  | 
|
1337  | 
||
1338  | 
def _signature(self):  | 
|
1339  | 
"""The file signature for this index type."""  | 
|
1340  | 
return _BTSIGNATURE  | 
|
1341  | 
||
1342  | 
def validate(self):  | 
|
1343  | 
"""Validate that everything in the index can be accessed."""  | 
|
1344  | 
        # just read and parse every node.
 | 
|
1345  | 
self._get_root_node()  | 
|
1346  | 
if len(self._row_lengths) > 1:  | 
|
1347  | 
start_node = self._row_offsets[1]  | 
|
1348  | 
else:  | 
|
1349  | 
            # We shouldn't be reading anything anyway
 | 
|
1350  | 
start_node = 1  | 
|
1351  | 
node_end = self._row_offsets[-1]  | 
|
1352  | 
for node in self._read_nodes(range(start_node, node_end)):  | 
|
1353  | 
            pass
 | 
|
1354  | 
||
1355  | 
||
1356  | 
try:  | 
|
| 
3641.3.30
by John Arbash Meinel
 Rename _parse_btree to _btree_serializer  | 
1357  | 
from bzrlib import _btree_serializer_c as _btree_serializer  | 
| 
3641.3.1
by John Arbash Meinel
 Bring in the btree_index and chunk_writer code and their tests.  | 
1358  | 
except ImportError:  | 
| 
3641.3.30
by John Arbash Meinel
 Rename _parse_btree to _btree_serializer  | 
1359  | 
from bzrlib import _btree_serializer_py as _btree_serializer  |