/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1
# groupcompress, a bzr plugin providing new compression logic.
2
# Copyright (C) 2008 Canonical Limited.
3
# 
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License version 2 as published
6
# by the Free Software Foundation.
7
# 
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
# 
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
16
# 
17
18
"""Core compression logic for compressing streams of related files."""
19
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
20
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
21
from cStringIO import StringIO
22
import zlib
23
0.17.4 by Robert Collins
Annotate.
24
from bzrlib import (
25
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
26
    debug,
0.17.4 by Robert Collins
Annotate.
27
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
28
    errors,
0.17.4 by Robert Collins
Annotate.
29
    graph as _mod_graph,
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
30
    osutils,
0.17.4 by Robert Collins
Annotate.
31
    pack,
32
    patiencediff,
    progress,
    trace,
33
    )
34
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
35
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
36
from bzrlib.osutils import (
37
    contains_whitespace,
38
    contains_linebreaks,
39
    sha_string,
40
    sha_strings,
41
    split_lines,
42
    )
0.17.21 by Robert Collins
Update groupcompress to bzrlib 1.10.
43
from bzrlib.btree_index import BTreeBuilder
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
44
from bzrlib.lru_cache import LRUSizeCache
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
45
from bzrlib.plugins.groupcompress import equivalence_table
0.17.9 by Robert Collins
Initial stab at repository format support.
46
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
47
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
48
    adapter_registry,
49
    AbsentContentFactory,
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
50
    ChunkedContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
51
    FulltextContentFactory,
52
    VersionedFiles,
53
    )
54
55
0.17.5 by Robert Collins
nograph tests completely passing.
56
def parse(line_list):
0.17.2 by Robert Collins
Core proof of concept working.
57
    result = []
0.17.5 by Robert Collins
nograph tests completely passing.
58
    lines = iter(line_list)
0.17.2 by Robert Collins
Core proof of concept working.
59
    next = lines.next
0.17.5 by Robert Collins
nograph tests completely passing.
60
    label_line = lines.next()
61
    sha1_line = lines.next()
62
    if (not label_line.startswith('label: ') or
63
        not sha1_line.startswith('sha1: ')):
64
        raise AssertionError("bad text record %r" % line_list)
65
    label = tuple(label_line[7:-1].split('\x00'))
66
    sha1 = sha1_line[6:-1]
0.17.2 by Robert Collins
Core proof of concept working.
67
    for header in lines:
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
68
        op = header[0]
69
        numbers = header[2:]
70
        numbers = [int(n) for n in header[2:].split(',')]
71
        if op == 'c':
72
            result.append((op, numbers[0], numbers[1], None))
73
        else:
74
            contents = [next() for i in xrange(numbers[0])]
75
            result.append((op, None, numbers[0], contents))
0.17.5 by Robert Collins
nograph tests completely passing.
76
    return label, sha1, result
0.17.2 by Robert Collins
Core proof of concept working.
77
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
78
0.20.9 by John Arbash Meinel
Revert previous change.
79
def apply_delta(basis, delta):
0.17.2 by Robert Collins
Core proof of concept working.
80
    """Apply delta to this object to become new_version_id."""
81
    lines = []
82
    last_offset = 0
83
    # eq ranges occur where gaps occur
84
    # start, end refer to offsets in basis
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
85
    for op, start, count, delta_lines in delta:
86
        if op == 'c':
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
87
            lines.append(basis[start:start+count])
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
88
        else:
89
            lines.extend(delta_lines)
0.17.2 by Robert Collins
Core proof of concept working.
90
    trim_encoding_newline(lines)
91
    return lines
92
93
94
def trim_encoding_newline(lines):
95
    if lines[-1] == '\n':
96
        del lines[-1]
97
    else:
98
        lines[-1] = lines[-1][:-1]
99
100
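# The wire form handled by parse() and apply_delta() can be shown with a
# small hand-built record.  Everything below (key, digest, offsets) is made
# up purely for illustration.
def _example_parse_and_apply():
    basis = 'hello\nworld\n'
    record_lines = [
        'label: file-id\x00rev-1\n',
        'sha1: da39a3ee5e6b4b0d3255bfef95601890afd80709\n',  # placeholder digest
        'c,0,6\n',   # copy 6 bytes starting at byte 0 of the basis: 'hello\n'
        'i,1\n',     # insert one fresh line, which follows immediately
        'goodbye\n',
        ]
    label, sha1, delta = parse(record_lines)
    # label == ('file-id', 'rev-1')
    # delta == [('c', 0, 6, None), ('i', None, 1, ['goodbye\n'])]
    lines = apply_delta(basis, delta)
    # trim_encoding_newline() has stripped the encoding newline, leaving
    # ['hello\n', 'goodbye'] for this made-up input.
    return label, sha1, delta, lines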
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
101
102
def sort_gc_optimal(parent_map):
103
    """Sort and group the keys in parent_map into gc-optimal order.
104
105
    gc-optimal is defined (currently) as reverse-topological order, grouped by
106
    the key prefix.
107
108
    :return: A sorted-list of keys
109
    """
110
    # gc-optimal ordering is approximately reverse topological,
111
    # properly grouped by file-id.
112
    per_prefix_map = {'': []}
113
    present_keys = []
114
    for item in parent_map.iteritems():
115
        key = item[0]
116
        if isinstance(key, str) or len(key) == 1:
117
            per_prefix_map[''].append(item)
118
        else:
119
            try:
120
                per_prefix_map[key[0]].append(item)
121
            except KeyError:
122
                per_prefix_map[key[0]] = [item]
123
124
    for prefix in sorted(per_prefix_map):
125
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
126
    return present_keys
127
128
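# A worked example of the ordering (keys and parents are made up): within
# each prefix the keys come out newest-first, and prefixes are emitted in
# sorted order.
def _example_sort_gc_optimal():
    parent_map = {
        ('file-a', 'rev-2'): (('file-a', 'rev-1'),),
        ('file-a', 'rev-1'): (),
        ('file-b', 'rev-1'): (),
        }
    return sort_gc_optimal(parent_map)
    # == [('file-a', 'rev-2'), ('file-a', 'rev-1'), ('file-b', 'rev-1')]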
0.17.2 by Robert Collins
Core proof of concept working.
129
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
130
    """Produce a serialised group of compressed texts.
131
    
132
    It contains code very similar to SequenceMatcher because it has a similar
133
    task. However, some key differences apply:
134
     - there is no junk, we want a minimal edit not a human readable diff.
135
     - we don't filter very common lines (because we don't know where a good
136
       range will start, and after the first text we want to be emitting minimal
137
       edits only).
138
     - we chain the left side, not the right side
139
     - we incrementally update the adjacency matrix as new lines are provided.
140
     - we look for matches in all of the left side, so the routine which does
141
       the analogous task of find_longest_match does not need to filter on the
142
       left side.
143
    """
0.17.2 by Robert Collins
Core proof of concept working.
144
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
145
    _equivalence_table_class = equivalence_table.EquivalenceTable
146
0.17.2 by Robert Collins
Core proof of concept working.
147
    def __init__(self, delta=True):
148
        """Create a GroupCompressor.
149
150
        :param delta: If False, do not compress records.
151
        """
152
        self._delta = delta
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
153
        self.line_offsets = []
0.17.2 by Robert Collins
Core proof of concept working.
154
        self.endpoint = 0
155
        self.input_bytes = 0
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
156
        self.line_locations = self._equivalence_table_class([])
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
157
        self.lines = self.line_locations.lines
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
158
        self.labels_deltas = {}
0.20.12 by John Arbash Meinel
Change the code a little bit.
159
        self._present_prefixes = set()
0.17.2 by Robert Collins
Core proof of concept working.
160
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
161
    def get_matching_blocks(self, lines, soft=False):
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
162
        """Return an the ranges in lines which match self.lines.
163
164
        :param lines: lines to compress
165
        :return: A list of (old_start, new_start, length) tuples which reflect
166
            a region in self.lines that is present in lines.  The last element
167
            of the list is always (old_len, new_len, 0) to provide a end point
168
            for generating instructions from the matching blocks list.
169
        """
170
        result = []
171
        pos = 0
172
        line_locations = self.line_locations
0.18.11 by John Arbash Meinel
Convert back into grabbing a right-lines ahead of time.
173
        line_locations.set_right_lines(lines)
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
174
        # We either copy a range (while there are reusable lines) or we 
175
        # insert new lines. To find reusable lines we traverse 
0.18.24 by John Arbash Meinel
Factor out the most compute intensive portion, with plans to turn it into a compiled func.
176
        locations = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
177
        max_pos = len(lines)
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
178
        result_append = result.append
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
179
        min_match_bytes = 10
180
        if soft:
181
            min_match_bytes = 200
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
182
        while pos < max_pos:
0.18.35 by John Arbash Meinel
remove the timing calls
183
            block, pos, locations = _get_longest_match(line_locations, pos,
184
                                                       max_pos, locations)
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
185
            if block is not None:
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
186
                # Check to see if we are matching fewer than min_match_bytes
187
                # bytes, which is turned into a simple 'insert', rather than a copy.
188
                # If we have more lines than min_match_bytes, we definitely have
189
                # more bytes than min_match_bytes.
190
                if block[-1] < min_match_bytes:
191
                    # This block may be a 'short' block, check
192
                    old_start, new_start, range_len = block
193
                    matched_bytes = sum(map(len,
194
                        lines[new_start:new_start + range_len]))
195
                    if matched_bytes < min_match_bytes:
196
                        block = None
197
            if block is not None:
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
198
                result_append(block)
199
        result_append((len(self.lines), len(lines), 0))
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
200
        return result
201
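    # Example (illustrative; assumes the EquivalenceTable reports no matches
    # for an empty left side): on a freshly created compressor,
    #   self.get_matching_blocks(['a\n', 'b\n', 'c\n']) == [(0, 3, 0)]
    # i.e. only the sentinel block is returned.  Once earlier texts have been
    # output, (old_start, new_start, length) tuples for each reusable range
    # precede that sentinel.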
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
202
    def compress(self, key, lines, expected_sha, soft=False):
0.17.2 by Robert Collins
Core proof of concept working.
203
        """Compress lines with label key.
204
205
        :param key: A key tuple. It is stored in the output
0.17.26 by Robert Collins
Working better --gc-plain-chk.
206
            for identification of the text during decompression. If the last
207
            element is 'None' it is replaced with the sha1 of the text -
208
            e.g. sha1:xxxxxxx.
0.17.2 by Robert Collins
Core proof of concept working.
209
        :param lines: The lines to be compressed. Must be split
210
            on \n, with the \n preserved.
211
        :param expected_sha: If non-None, the sha the lines are believed to
212
            have. During compression the sha is calculated; a mismatch will
213
            cause an error.
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
214
        :param soft: Do a 'soft' compression. This means that we require larger
215
            ranges to match to be considered for a copy command.
0.17.2 by Robert Collins
Core proof of concept working.
216
        :return: The sha1 of lines, and the number of bytes accumulated in
217
            the group output so far.
218
        """
219
        sha1 = sha_strings(lines)
0.17.26 by Robert Collins
Working better --gc-plain-chk.
220
        if key[-1] is None:
221
            key = key[:-1] + ('sha1:' + sha1,)
0.17.2 by Robert Collins
Core proof of concept working.
222
        label = '\x00'.join(key)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
223
        new_lines = []
224
        new_lines.append('label: %s\n' % label)
225
        new_lines.append('sha1: %s\n' % sha1)
226
        index_lines = [False, False]
0.17.2 by Robert Collins
Core proof of concept working.
227
        # setup good encoding for trailing \n support.
228
        if not lines or lines[-1].endswith('\n'):
229
            lines.append('\n')
230
        else:
231
            lines[-1] = lines[-1] + '\n'
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
232
        pos = 0
0.17.14 by Robert Collins
Cleaner code.
233
        range_len = 0
234
        range_start = 0
235
        flush_range = self.flush_range
236
        copy_ends = None
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
237
        blocks = self.get_matching_blocks(lines, soft=soft)
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
238
        current_pos = 0
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
239
        copies_without_insertion = []
240
        # We either copy a range (while there are reusable lines) or we
241
        # insert new lines. To find reusable lines we traverse
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
242
        for old_start, new_start, range_len in blocks:
243
            if new_start != current_pos:
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
244
                # if copies_without_insertion:
245
                #     self.flush_multi(copies_without_insertion,
246
                #                      lines, new_lines, index_lines)
247
                #     copies_without_insertion = []
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
248
                # non-matching region
0.19.1 by Robert Collins
Start to simplify flush_range.
249
                flush_range(current_pos, None, new_start - current_pos,
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
250
                    lines, new_lines, index_lines)
251
            current_pos = new_start + range_len
252
            if not range_len:
253
                continue
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
254
            # copies_without_insertion.append((new_start, old_start, range_len))
255
            flush_range(new_start, old_start, range_len,
256
                        lines, new_lines, index_lines)
257
        # if copies_without_insertion:
258
        #     self.flush_multi(copies_without_insertion,
259
        #                      lines, new_lines, index_lines)
260
        #     copies_without_insertion = []
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
261
        delta_start = (self.endpoint, len(self.lines))
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
262
        self.output_lines(new_lines, index_lines)
0.17.2 by Robert Collins
Core proof of concept working.
263
        trim_encoding_newline(lines)
264
        self.input_bytes += sum(map(len, lines))
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
265
        delta_end = (self.endpoint, len(self.lines))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
266
        self.labels_deltas[key] = (delta_start, delta_end)
0.17.2 by Robert Collins
Core proof of concept working.
267
        return sha1, self.endpoint
268
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
269
    def extract(self, key):
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
270
        """Extract a key previously added to the compressor.
271
        
272
        :param key: The key to extract.
273
        :return: An iterable over bytes and the sha1.
274
        """
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
275
        delta_details = self.labels_deltas[key]
276
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
277
        label, sha1, delta = parse(delta_lines)
278
        if label != key:
279
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
280
        # Perhaps we want to keep the line offsets too in memory at least?
281
        lines = apply_delta(''.join(self.lines), delta)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
282
        sha1 = sha_strings(lines)
283
        return lines, sha1
284
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
285
    def flush_multi(self, instructions, lines, new_lines, index_lines):
286
        """Flush a bunch of different ranges out.
287
288
        This should only be called with data that are "pure" copies.
289
        """
290
        flush_range = self.flush_range
291
        if len(instructions) > 2:
292
            # This is the number of lines to be copied
293
            total_copy_range = sum(i[2] for i in instructions)
294
            if len(instructions) > 0.5 * total_copy_range:
295
                # We are copying N lines, but taking more than N/2
296
                # copy instructions to do so. We will go ahead and expand this
297
                # text so that other code is able to match against it
298
                flush_range(instructions[0][0], None, total_copy_range,
299
                            lines, new_lines, index_lines)
300
                return
301
        for ns, os, rl in instructions:
302
            flush_range(ns, os, rl, lines, new_lines, index_lines)
303
0.19.1 by Robert Collins
Start to simplify flush_range.
304
    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
0.17.14 by Robert Collins
Cleaner code.
305
        insert_instruction = "i,%d\n" % range_len
0.19.1 by Robert Collins
Start to simplify flush_range.
306
        if copy_start is not None:
0.17.14 by Robert Collins
Cleaner code.
307
            # range stops, flush and start a new copy range
308
            stop_byte = self.line_offsets[copy_start + range_len - 1]
309
            if copy_start == 0:
310
                start_byte = 0
311
            else:
312
                start_byte = self.line_offsets[copy_start - 1]
313
            bytes = stop_byte - start_byte
314
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
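            # Worked example (made-up numbers): copying one 4-byte line that
            # starts at byte 1000 costs len('c,1000,4\n') == 9 bytes, while
            # inserting it costs len('i,1\n') + 4 == 8 bytes, so in that case
            # the copy is skipped and the raw line is inserted below instead.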
315
            if (bytes + len(insert_instruction) >
316
                len(copy_control_instruction)):
317
                new_lines.append(copy_control_instruction)
318
                index_lines.append(False)
319
                return
320
        # not copying, or inserting is shorter than copying, so insert.
321
        new_lines.append(insert_instruction)
322
        new_lines.extend(lines[range_start:range_start+range_len])
323
        index_lines.append(False)
0.19.1 by Robert Collins
Start to simplify flush_range.
324
        index_lines.extend([copy_start is None]*range_len)
0.17.14 by Robert Collins
Cleaner code.
325
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
326
    def output_lines(self, new_lines, index_lines):
327
        """Output some lines.
328
329
        :param new_lines: The lines to output.
330
        :param index_lines: A boolean flag for each line - when True, index
331
            that line.
332
        """
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
333
        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
334
        #                          if val and new_lines[idx] == '\n']
335
        # if indexed_newlines:
336
        #     import pdb; pdb.set_trace()
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
337
        endpoint = self.endpoint
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
338
        self.line_locations.extend_lines(new_lines, index_lines)
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
339
        for line in new_lines:
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
340
            endpoint += len(line)
341
            self.line_offsets.append(endpoint)
342
        self.endpoint = endpoint
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
343
0.17.2 by Robert Collins
Core proof of concept working.
344
    def ratio(self):
345
        """Return the overall compression ratio."""
346
        return float(self.input_bytes) / float(self.endpoint)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
347
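# A minimal usage sketch for GroupCompressor (keys and text are made up; it
# assumes an empty compressor finds no lines to reuse for the first text).
def _example_compress_and_extract():
    compressor = GroupCompressor(delta=True)
    key = ('file-id', 'rev-1')
    lines = ['one\n', 'two\n']
    sha1, end_point = compressor.compress(key, lines, None)
    # The labelled delta now lives in the (still uncompressed) group, and can
    # be read back before the group is flushed to a pack:
    text_lines, extracted_sha1 = compressor.extract(key)
    # text_lines == ['one\n', 'two\n'] and extracted_sha1 == sha1
    return sha1, end_point, text_lines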
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
348
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
349
def make_pack_factory(graph, delta, keylength):
350
    """Create a factory for creating a pack based groupcompress.
351
352
    This is only functional enough to run interface tests; it doesn't try to
353
    provide a full pack environment.
354
    
355
    :param graph: Store a graph.
356
    :param delta: Delta compress contents.
357
    :param keylength: How long should keys be.
358
    """
359
    def factory(transport):
360
        parents = graph or delta
361
        ref_length = 0
362
        if graph:
363
            ref_length += 1
0.17.7 by Robert Collins
Update for current index2 changes.
364
        graph_index = BTreeBuilder(reference_lists=ref_length,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
365
            key_elements=keylength)
366
        stream = transport.open_write_stream('newpack')
367
        writer = pack.ContainerWriter(stream.write)
368
        writer.begin()
369
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
0.17.9 by Robert Collins
Initial stab at repository format support.
370
            add_callback=graph_index.add_nodes)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
371
        access = _DirectPackAccess({})
372
        access.set_writer(writer, graph_index, (transport, 'newpack'))
0.17.2 by Robert Collins
Core proof of concept working.
373
        result = GroupCompressVersionedFiles(index, access, delta)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
374
        result.stream = stream
375
        result.writer = writer
376
        return result
377
    return factory
378
379
380
def cleanup_pack_group(versioned_files):
0.17.23 by Robert Collins
Only decompress as much of the zlib data as is needed to read the text recipe.
381
    versioned_files.writer.end()
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
382
    versioned_files.stream.close()
383
384
385
class GroupCompressVersionedFiles(VersionedFiles):
386
    """A group-compress based VersionedFiles implementation."""
387
0.17.2 by Robert Collins
Core proof of concept working.
388
    def __init__(self, index, access, delta=True):
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
389
        """Create a GroupCompressVersionedFiles object.
390
391
        :param index: The index object storing access and graph data.
392
        :param access: The access object storing raw data.
0.17.2 by Robert Collins
Core proof of concept working.
393
        :param delta: Whether to delta compress or just entropy compress.
394
        """
395
        self._index = index
396
        self._access = access
397
        self._delta = delta
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
398
        self._unadded_refs = {}
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
399
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
0.17.2 by Robert Collins
Core proof of concept working.
400
401
    def add_lines(self, key, parents, lines, parent_texts=None,
402
        left_matching_blocks=None, nostore_sha=None, random_id=False,
403
        check_content=True):
404
        """Add a text to the store.
405
406
        :param key: The key tuple of the text to add.
407
        :param parents: The parents key tuples of the text to add.
408
        :param lines: A list of lines. Each line must be a bytestring. And all
409
            of them except the last must be terminated with \n and contain no
410
            other \n's. The last line may either contain no \n's or a single
411
            terminating \n. If the lines list does not meet this constraint, the add
412
            routine may error or may succeed - but you will be unable to read
413
            the data back accurately. (Checking the lines have been split
414
            correctly is expensive and extremely unlikely to catch bugs so it
415
            is not done at runtime unless check_content is True.)
416
        :param parent_texts: An optional dictionary containing the opaque 
417
            representations of some or all of the parents of version_id to
418
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
419
            returned by add_lines or data corruption can be caused.
420
        :param left_matching_blocks: a hint about which areas are common
421
            between the text and its left-hand-parent.  The format is
422
            the SequenceMatcher.get_matching_blocks format.
423
        :param nostore_sha: Raise ExistingContent and do not add the lines to
424
            the versioned file if the digest of the lines matches this.
425
        :param random_id: If True a random id has been selected rather than
426
            an id determined by some deterministic process such as a converter
427
            from a foreign VCS. When True the backend may choose not to check
428
            for uniqueness of the resulting key within the versioned file, so
429
            this should only be done when the result is expected to be unique
430
            anyway.
431
        :param check_content: If True, the lines supplied are verified to be
432
            bytestrings that are correctly formed lines.
433
        :return: The text sha1, the number of bytes in the text, and an opaque
434
                 representation of the inserted version which can be provided
435
                 back to future add_lines calls in the parent_texts dictionary.
436
        """
437
        self._index._check_write_ok()
438
        self._check_add(key, lines, random_id, check_content)
439
        if parents is None:
440
            # The caller might pass None if there is no graph data, but kndx
441
            # indexes can't directly store that, so we give them
442
            # an empty tuple instead.
443
            parents = ()
444
        # double handling for now. Make it work until then.
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
445
        length = sum(map(len, lines))
446
        record = ChunkedContentFactory(key, parents, None, lines)
0.17.5 by Robert Collins
nograph tests completely passing.
447
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
448
        return sha1, length, None
0.17.2 by Robert Collins
Core proof of concept working.
449
0.17.4 by Robert Collins
Annotate.
450
    def annotate(self, key):
451
        """See VersionedFiles.annotate."""
452
        graph = Graph(self)
0.17.5 by Robert Collins
nograph tests completely passing.
453
        parent_map = self.get_parent_map([key])
454
        if not parent_map:
455
            raise errors.RevisionNotPresent(key, self)
456
        if parent_map[key] is not None:
457
            search = graph._make_breadth_first_searcher([key])
458
            keys = set()
459
            while True:
460
                try:
461
                    present, ghosts = search.next_with_ghosts()
462
                except StopIteration:
463
                    break
464
                keys.update(present)
465
            parent_map = self.get_parent_map(keys)
466
        else:
467
            keys = [key]
468
            parent_map = {key:()}
0.17.4 by Robert Collins
Annotate.
469
        head_cache = _mod_graph.FrozenHeadsCache(graph)
470
        parent_cache = {}
471
        reannotate = annotate.reannotate
472
        for record in self.get_record_stream(keys, 'topological', True):
473
            key = record.key
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
474
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
0.17.4 by Robert Collins
Annotate.
475
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
476
            parent_cache[key] = list(
477
                reannotate(parent_lines, chunks, key, None, head_cache))
478
        return parent_cache[key]
479
0.17.5 by Robert Collins
nograph tests completely passing.
480
    def check(self, progress_bar=None):
481
        """See VersionedFiles.check()."""
482
        keys = self.keys()
483
        for record in self.get_record_stream(keys, 'unordered', True):
484
            record.get_bytes_as('fulltext')
485
0.17.2 by Robert Collins
Core proof of concept working.
486
    def _check_add(self, key, lines, random_id, check_content):
487
        """check that version_id and lines are safe to add."""
488
        version_id = key[-1]
0.17.26 by Robert Collins
Working better --gc-plain-chk.
489
        if version_id is not None:
490
            if contains_whitespace(version_id):
491
                raise errors.InvalidRevisionId(version_id, self)
0.17.2 by Robert Collins
Core proof of concept working.
492
        self.check_not_reserved_id(version_id)
493
        # TODO: If random_id==False and the key is already present, we should
494
        # probably check that the existing content is identical to what is
495
        # being inserted, and otherwise raise an exception.  This would make
496
        # the bundle code simpler.
497
        if check_content:
498
            self._check_lines_not_unicode(lines)
499
            self._check_lines_are_lines(lines)
500
0.17.5 by Robert Collins
nograph tests completely passing.
501
    def get_parent_map(self, keys):
502
        """Get a map of the parents of keys.
503
504
        :param keys: The keys to look up parents for.
505
        :return: A mapping from keys to parents. Absent keys are absent from
506
            the mapping.
507
        """
508
        result = {}
509
        sources = [self._index]
510
        source_results = []
511
        missing = set(keys)
512
        for source in sources:
513
            if not missing:
514
                break
515
            new_result = source.get_parent_map(missing)
516
            source_results.append(new_result)
517
            result.update(new_result)
518
            missing.difference_update(set(new_result))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
519
        if self._unadded_refs:
520
            for key in missing:
521
                if key in self._unadded_refs:
522
                    result[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
523
        return result
524
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
525
    def _get_delta_lines(self, key):
526
        """Helper function for manual debugging.
527
528
        This is a convenience function that shouldn't be used in production
529
        code.
530
        """
531
        build_details = self._index.get_build_details([key])[key]
532
        index_memo = build_details[0]
533
        group, delta_lines = self._get_group_and_delta_lines(index_memo)
534
        return delta_lines
535
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
536
    def _get_group_and_delta_lines(self, index_memo):
537
        read_memo = index_memo[0:3]
538
        # get the group:
539
        try:
540
            plain = self._group_cache[read_memo]
541
        except KeyError:
542
            # read the group
543
            zdata = self._access.get_raw_records([read_memo]).next()
544
            # decompress - whole thing - this is not a bug, as it
545
            # permits caching. We might want to store the partially
546
            # decompressed group and decompress object, so that recent
547
            # texts are not penalised by big groups.
548
            plain = zlib.decompress(zdata) #, index_memo[4])
549
            self._group_cache[read_memo] = plain
550
        # cheapo debugging:
551
        # print len(zdata), len(plain)
552
        # parse - requires split_lines, better to have byte offsets
553
        # here (but not by much - we only split the region for the
554
        # recipe, and we often want to end up with lines anyway).
555
        return plain, split_lines(plain[index_memo[3]:index_memo[4]])
556
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
557
    def get_missing_compression_parent_keys(self):
558
        """Return the keys of missing compression parents.
559
560
        Missing compression parents occur when a record stream was missing
561
        basis texts, or an index was scanned that had missing basis texts.
562
        """
563
        # GroupCompress cannot currently reference texts that are not in the
564
        # group, so this is valid for now
565
        return frozenset()
566
0.17.5 by Robert Collins
nograph tests completely passing.
567
    def get_record_stream(self, keys, ordering, include_delta_closure):
568
        """Get a stream of records for keys.
569
570
        :param keys: The keys to include.
571
        :param ordering: Either 'unordered' or 'topological'. A topologically
572
            sorted stream has compression parents strictly before their
573
            children.
574
        :param include_delta_closure: If True then the closure across any
575
            compression parents will be included (in the opaque data).
576
        :return: An iterator of ContentFactory objects, each of which is only
577
            valid until the iterator is advanced.
578
        """
579
        # keys might be a generator
580
        keys = set(keys)
581
        if not keys:
582
            return
583
        if not self._index.has_graph:
584
            # Cannot topological order when no graph has been stored.
585
            ordering = 'unordered'
586
        # Cheap: iterate
587
        locations = self._index.get_build_details(keys)
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
588
        local_keys = frozenset(keys).intersection(set(self._unadded_refs))
589
        locations.update((key, None) for key in local_keys)
0.17.5 by Robert Collins
nograph tests completely passing.
590
        if ordering == 'topological':
591
            # would be better to not globally sort initially but instead
592
            # start with one key, recurse to its oldest parent, then grab
593
            # everything in the same group, etc.
594
            parent_map = dict((key, details[2]) for key, details in
595
                locations.iteritems())
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
596
            for key in local_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
597
                parent_map[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
598
            present_keys = topo_sort(parent_map)
599
            # Now group by source:
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
600
        elif ordering == 'gc-optimal':
601
            parent_map = dict((key, details[2]) for key, details in
602
                locations.iteritems())
603
            for key in local_keys:
604
                parent_map[key] = self._unadded_refs[key]
605
            # XXX: This only optimizes for the target ordering. We may need to
606
            #      balance that with the time it takes to extract ordering, by
607
            #      somehow grouping based on locations[key][0:3]
608
            present_keys = sort_gc_optimal(parent_map)
0.17.5 by Robert Collins
nograph tests completely passing.
609
        else:
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
610
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
611
            # Otherwise we thrash the _group_cache and destroy performance
612
            def get_group(key):
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
613
                # This is the group the bytes are stored in, followed by the
614
                # location in the group
615
                return locations[key][0]
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
616
            present_keys = sorted(locations.iterkeys(), key=get_group)
617
            # We don't have an ordering for keys in the in-memory object, but
618
            # let's process the in-memory ones first.
619
            local = list(local_keys)
620
            present_keys = list(local_keys) + present_keys
0.17.5 by Robert Collins
nograph tests completely passing.
621
        absent_keys = keys.difference(set(locations))
622
        for key in absent_keys:
623
            yield AbsentContentFactory(key)
624
        for key in present_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
625
            if key in self._unadded_refs:
626
                lines, sha1 = self._compressor.extract(key)
627
                parents = self._unadded_refs[key]
628
            else:
629
                index_memo, _, parents, (method, _) = locations[key]
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
630
                plain, delta_lines = self._get_group_and_delta_lines(index_memo)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
631
                label, sha1, delta = parse(delta_lines)
632
                if label != key:
633
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.20.9 by John Arbash Meinel
Revert previous change.
634
                lines = apply_delta(plain, delta)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
635
                if sha_strings(lines) != sha1:
636
                    raise AssertionError('sha1 sum did not match')
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
637
            yield ChunkedContentFactory(key, parents, sha1, lines)
638
0.17.5 by Robert Collins
nograph tests completely passing.
639
    def get_sha1s(self, keys):
640
        """See VersionedFiles.get_sha1s()."""
641
        result = {}
642
        for record in self.get_record_stream(keys, 'unordered', True):
643
            if record.sha1 is not None:
644
                result[record.key] = record.sha1
645
            else:
646
                if record.storage_kind != 'absent':
647
                    result[record.key] = sha_string(record.get_bytes_as(
648
                        'fulltext'))
649
        return result
650
0.17.2 by Robert Collins
Core proof of concept working.
651
    def insert_record_stream(self, stream):
652
        """Insert a record stream into this container.
653
654
        :param stream: A stream of records to insert. 
655
        :return: None
656
        :seealso VersionedFiles.get_record_stream:
657
        """
0.17.5 by Robert Collins
nograph tests completely passing.
658
        for _ in self._insert_record_stream(stream):
659
            pass
0.17.2 by Robert Collins
Core proof of concept working.
660
0.17.5 by Robert Collins
nograph tests completely passing.
661
    def _insert_record_stream(self, stream, random_id=False):
0.17.2 by Robert Collins
Core proof of concept working.
662
        """Internal core to insert a record stream into this container.
663
664
        This helper function has a different interface than insert_record_stream
665
        to allow add_lines to be minimal, but still return the needed data.
666
667
        :param stream: A stream of records to insert. 
668
        :return: An iterator over the sha1 of the inserted records.
669
        :seealso insert_record_stream:
670
        :seealso add_lines:
671
        """
0.17.5 by Robert Collins
nograph tests completely passing.
672
        def get_adapter(adapter_key):
673
            try:
674
                return adapters[adapter_key]
675
            except KeyError:
676
                adapter_factory = adapter_registry.get(adapter_key)
677
                adapter = adapter_factory(self)
678
                adapters[adapter_key] = adapter
679
                return adapter
680
        adapters = {}
0.17.2 by Robert Collins
Core proof of concept working.
681
        # This will go up to fulltexts for gc to gc fetching, which isn't
682
        # ideal.
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
683
        self._compressor = GroupCompressor(self._delta)
684
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
685
        keys_to_add = []
686
        basis_end = 0
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
687
        groups = 1
688
        def flush():
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
689
            compressed = zlib.compress(''.join(self._compressor.lines))
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
690
            index, start, length = self._access.add_raw_records(
691
                [(None, len(compressed))], compressed)[0]
692
            nodes = []
693
            for key, reads, refs in keys_to_add:
694
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
695
            self._index.add_records(nodes, random_id=random_id)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
696
        last_prefix = None
0.17.2 by Robert Collins
Core proof of concept working.
697
        for record in stream:
0.17.5 by Robert Collins
nograph tests completely passing.
698
            # Raise an error when a record is missing.
699
            if record.storage_kind == 'absent':
700
                raise errors.RevisionNotPresent([record.key], self)
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
701
            try:
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
702
                lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
703
            except errors.UnavailableRepresentation:
0.17.5 by Robert Collins
nograph tests completely passing.
704
                adapter_key = record.storage_kind, 'fulltext'
705
                adapter = get_adapter(adapter_key)
706
                bytes = adapter.get_bytes(record,
707
                    record.get_bytes_as(record.storage_kind))
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
708
                lines = osutils.split_lines(bytes)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
709
            soft = False
0.20.13 by John Arbash Meinel
Play around a bit.
710
            if len(record.key) > 1:
711
                prefix = record.key[0]
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
712
                if (last_prefix is not None and prefix != last_prefix):
713
                    soft = True
714
                    if basis_end > 1024 * 1024 * 4:
715
                        flush()
716
                        self._compressor = GroupCompressor(self._delta)
717
                        self._unadded_refs = {}
718
                        keys_to_add = []
719
                        basis_end = 0
720
                        groups += 1
721
                last_prefix = prefix
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
722
            found_sha1, end_point = self._compressor.compress(record.key,
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
723
                lines, record.sha1, soft=soft)
0.17.26 by Robert Collins
Working better --gc-plain-chk.
724
            if record.key[-1] is None:
725
                key = record.key[:-1] + ('sha1:' + found_sha1,)
726
            else:
727
                key = record.key
728
            self._unadded_refs[key] = record.parents
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
729
            yield found_sha1
0.17.26 by Robert Collins
Working better --gc-plain-chk.
730
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
0.17.5 by Robert Collins
nograph tests completely passing.
731
                (record.parents,)))
732
            basis_end = end_point
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
733
            if basis_end > 1024 * 1024 * 8:
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
734
                flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
735
                self._compressor = GroupCompressor(self._delta)
736
                self._unadded_refs = {}
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
737
                keys_to_add = []
738
                basis_end = 0
739
                groups += 1
0.17.8 by Robert Collins
Flush pending updates at the end of _insert_record_stream
740
        if len(keys_to_add):
741
            flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
742
        self._compressor = None
743
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
744
745
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
746
        """Iterate over the lines in the versioned files from keys.
747
748
        This may return lines from other keys. Each item the returned
749
        iterator yields is a tuple of a line and a text version that that line
750
        is present in (not introduced in).
751
752
        Ordering of results is in whatever order is most suitable for the
753
        underlying storage format.
754
755
        If a progress bar is supplied, it may be used to indicate progress.
756
        The caller is responsible for cleaning up progress bars (because this
757
        is an iterator).
758
759
        NOTES:
760
         * Lines are normalised by the underlying store: they will all have \n
761
           terminators.
762
         * Lines are returned in arbitrary order.
763
764
        :return: An iterator over (line, key).
765
        """
766
        if pb is None:
767
            pb = progress.DummyProgress()
768
        keys = set(keys)
769
        total = len(keys)
770
        # we don't care about inclusions, the caller cares.
771
        # but we need to setup a list of records to visit.
772
        # we need key, position, length
773
        for key_idx, record in enumerate(self.get_record_stream(keys,
774
            'unordered', True)):
775
            # XXX: todo - optimise to use less than full texts.
776
            key = record.key
777
            pb.update('Walking content.', key_idx, total)
778
            if record.storage_kind == 'absent':
779
                raise errors.RevisionNotPresent(record.key, self)
780
            lines = split_lines(record.get_bytes_as('fulltext'))
781
            for line in lines:
782
                yield line, key
783
        pb.update('Walking content.', total, total)
784
785
    def keys(self):
786
        """See VersionedFiles.keys."""
787
        if 'evil' in debug.debug_flags:
788
            trace.mutter_callsite(2, "keys scales with size of history")
789
        sources = [self._index]
790
        result = set()
791
        for source in sources:
792
            result.update(source.keys())
793
        return result
794
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
795
796
class _GCGraphIndex(object):
797
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
798
0.17.9 by Robert Collins
Initial stab at repository format support.
799
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
800
        add_callback=None):
801
        """Construct a _GCGraphIndex on a graph_index.
802
803
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
804
        :param is_locked: A callback to check whether the object should answer
805
            queries.
806
        :param parents: If True, record knit parents; if not, do not record
807
            parents.
808
        :param add_callback: If not None, allow additions to the index and call
809
            this callback with a list of added GraphIndex nodes:
810
            [(node, value, node_refs), ...]
811
813
        """
814
        self._add_callback = add_callback
815
        self._graph_index = graph_index
816
        self._parents = parents
817
        self.has_graph = parents
818
        self._is_locked = is_locked
819
0.17.5 by Robert Collins
nograph tests completely passing.
820
    def add_records(self, records, random_id=False):
821
        """Add multiple records to the index.
822
        
823
        This function does not insert data into the Immutable GraphIndex
824
        backing the KnitGraphIndex, instead it prepares data for insertion by
825
        the caller and checks that it is safe to insert then calls
826
        self._add_callback with the prepared GraphIndex nodes.
827
828
        :param records: a list of tuples:
829
                         (key, options, access_memo, parents).
830
        :param random_id: If True the ids being added were randomly generated
831
            and no check for existence will be performed.
832
        """
833
        if not self._add_callback:
834
            raise errors.ReadOnlyError(self)
835
        # we hope there are no repositories with inconsistent parentage
836
        # anymore.
837
838
        changed = False
839
        keys = {}
840
        for (key, value, refs) in records:
841
            if not self._parents:
842
                if refs:
843
                    for ref in refs:
844
                        if ref:
845
                            raise errors.KnitCorrupt(self,
846
                                "attempt to add node with parents "
847
                                "in parentless index.")
848
                    refs = ()
849
                    changed = True
850
            keys[key] = (value, refs)
851
        # check for dups
852
        if not random_id:
853
            present_nodes = self._get_entries(keys)
854
            for (index, key, value, node_refs) in present_nodes:
855
                if node_refs != keys[key][1]:
856
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
857
                        ": %s %s" % ((value, node_refs), keys[key]))
858
                del keys[key]
859
                changed = True
860
        if changed:
861
            result = []
862
            if self._parents:
863
                for key, (value, node_refs) in keys.iteritems():
864
                    result.append((key, value, node_refs))
865
            else:
866
                for key, (value, node_refs) in keys.iteritems():
867
                    result.append((key, value))
868
            records = result
869
        self._add_callback(records)
870
        
871
    def _check_read(self):
872
        """raise if reads are not permitted."""
873
        if not self._is_locked():
874
            raise errors.ObjectNotLocked(self)
875
0.17.2 by Robert Collins
Core proof of concept working.
876
    def _check_write_ok(self):
877
        """Assert if writes are not permitted."""
878
        if not self._is_locked():
879
            raise errors.ObjectNotLocked(self)
880
0.17.5 by Robert Collins
nograph tests completely passing.
881
    def _get_entries(self, keys, check_present=False):
882
        """Get the entries for keys.
883
        
884
        :param keys: An iterable of index key tuples.
885
        """
886
        keys = set(keys)
887
        found_keys = set()
888
        if self._parents:
889
            for node in self._graph_index.iter_entries(keys):
890
                yield node
891
                found_keys.add(node[1])
892
        else:
893
            # adapt parentless index to the rest of the code.
894
            for node in self._graph_index.iter_entries(keys):
895
                yield node[0], node[1], node[2], ()
896
                found_keys.add(node[1])
897
        if check_present:
898
            missing_keys = keys.difference(found_keys)
899
            if missing_keys:
900
                raise errors.RevisionNotPresent(missing_keys.pop(), self)
901
902
    def get_parent_map(self, keys):
903
        """Get a map of the parents of keys.
904
905
        :param keys: The keys to look up parents for.
906
        :return: A mapping from keys to parents. Absent keys are absent from
907
            the mapping.
908
        """
909
        self._check_read()
910
        nodes = self._get_entries(keys)
911
        result = {}
912
        if self._parents:
913
            for node in nodes:
914
                result[node[1]] = node[3][0]
915
        else:
916
            for node in nodes:
917
                result[node[1]] = None
918
        return result
919
920
    def get_build_details(self, keys):
921
        """Get the various build details for keys.
922
923
        Ghosts are omitted from the result.
924
925
        :param keys: An iterable of keys.
926
        :return: A dict of key:
927
            (index_memo, compression_parent, parents, record_details).
928
            index_memo
929
                opaque structure to pass to read_records to extract the raw
930
                data
931
            compression_parent
932
                Content that this record is built upon, may be None
933
            parents
934
                Logical parents of this node
935
            record_details
936
                extra information about the content which needs to be passed to
937
                Factory.parse_record
938
        """
939
        self._check_read()
940
        result = {}
941
        entries = self._get_entries(keys, False)
942
        for entry in entries:
943
            key = entry[1]
944
            if not self._parents:
945
                parents = None
946
            else:
947
                parents = entry[3][0]
948
            value = entry[2]
949
            method = 'group'
950
            result[key] = (self._node_to_position(entry),
951
                                  None, parents, (method, None))
952
        return result
953
    
954
    def keys(self):
955
        """Get all the keys in the collection.
956
        
957
        The keys are not ordered.
958
        """
959
        self._check_read()
960
        return [node[1] for node in self._graph_index.iter_all_entries()]
961
    
962
    def _node_to_position(self, node):
963
        """Convert an index value to position details."""
964
        bits = node[2].split(' ')
965
        # It would be nice not to read the entire gzip.
966
        start = int(bits[0])
967
        stop = int(bits[1])
968
        basis_end = int(bits[2])
969
        delta_end = int(bits[3])
970
        return node[0], start, stop, basis_end, delta_end
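# Illustration of the stored value format: flush() above records
# "%d %d %s" % (start, length, '%d %d' % (basis_end, end_point)), so a value
# such as '0 4096 0 1200' (numbers made up) locates the zlib-compressed group
# in the pack and this text's recipe within the uncompressed group.
def _example_node_to_position(gc_index):
    fake_node = ('an-index-object', ('file-id', 'rev-1'), '0 4096 0 1200')
    return gc_index._node_to_position(fake_node)
    # == ('an-index-object', 0, 4096, 0, 1200)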
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
971
972
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
973
def _get_longest_match(equivalence_table, pos, max_pos, locations):
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
974
    """Get the longest possible match for the current position."""
975
    range_start = pos
976
    range_len = 0
977
    copy_ends = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
978
    while pos < max_pos:
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
979
        if locations is None:
980
            locations = equivalence_table.get_idx_matches(pos)
981
        if locations is None:
982
            # No more matches, just return whatever we have, but we know that
983
            # this last position is not going to match anything
984
            pos += 1
985
            break
986
        else:
987
            if copy_ends is None:
988
                # We are starting a new range
989
                copy_ends = [loc + 1 for loc in locations]
990
                range_len = 1
991
                locations = None # Consumed
992
            else:
993
                # We are currently in the middle of a match
994
                next_locations = set(copy_ends).intersection(locations)
995
                if len(next_locations):
996
                    # range continues
997
                    copy_ends = [loc + 1 for loc in next_locations]
998
                    range_len += 1
999
                    locations = None # Consumed
1000
                else:
1001
                    # But we are done with this match, we should be
1002
                    # starting a new one, though. We will pass back 'locations'
1003
                    # so that we don't have to do another lookup.
1004
                    break
1005
        pos += 1
1006
    if copy_ends is None:
1007
        return None, pos, locations
1008
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
1009
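# A toy run of the pure-Python _get_longest_match above, using a stand-in for
# the equivalence table (the get_idx_matches contract is assumed here, for
# illustration only): right-side lines 0 and 1 match consecutive left-side
# lines 10 and 11, and line 2 matches nothing.
_py_get_longest_match = _get_longest_match


class _ExampleMatches(object):

    def __init__(self, matches_by_pos):
        self._matches = matches_by_pos

    def get_idx_matches(self, pos):
        return self._matches.get(pos)


def _example_get_longest_match():
    table = _ExampleMatches({0: [10], 1: [11]})
    block, pos, locations = _py_get_longest_match(table, 0, 3, None)
    # block == (10, 0, 2): copy two lines from left offset 10 to right
    # offset 0; pos == 3 because position 2 matched nothing.
    return block, pos, locations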
1010
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1011
try:
1012
    from bzrlib.plugins.groupcompress import _groupcompress_c
1013
except ImportError:
1014
    pass
1015
else:
1016
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
1017
    _get_longest_match = _groupcompress_c._get_longest_match