/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1
# groupcompress, a bzr plugin providing new compression logic.
2
# Copyright (C) 2008 Canonical Limited.
3
# 
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License version 2 as published
6
# by the Free Software Foundation.
7
# 
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
# 
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
16
# 
17
18
"""Core compression logic for compressing streams of related files."""
19
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
20
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
21
from cStringIO import StringIO
22
import zlib
23
0.17.4 by Robert Collins
Annotate.
24
from bzrlib import (
25
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
26
    debug,
0.17.4 by Robert Collins
Annotate.
27
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
28
    errors,
0.17.4 by Robert Collins
Annotate.
29
    graph as _mod_graph,
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
30
    osutils,
0.17.4 by Robert Collins
Annotate.
31
    pack,
32
    patiencediff,
33
    )
34
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
35
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
36
from bzrlib.osutils import (
37
    contains_whitespace,
38
    contains_linebreaks,
39
    sha_string,
40
    sha_strings,
41
    split_lines,
42
    )
0.17.21 by Robert Collins
Update groupcompress to bzrlib 1.10.
43
from bzrlib.btree_index import BTreeBuilder
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
44
from bzrlib.lru_cache import LRUSizeCache
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
45
from bzrlib.plugins.groupcompress import equivalence_table
0.17.9 by Robert Collins
Initial stab at repository format support.
46
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
47
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
48
    adapter_registry,
49
    AbsentContentFactory,
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
50
    ChunkedContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
51
    FulltextContentFactory,
52
    VersionedFiles,
53
    )
54
55
0.17.5 by Robert Collins
nograph tests completely passing.
56
def parse(line_list):
0.17.2 by Robert Collins
Core proof of concept working.
57
    result = []
0.17.5 by Robert Collins
nograph tests completely passing.
58
    lines = iter(line_list)
0.17.2 by Robert Collins
Core proof of concept working.
59
    next = lines.next
0.22.2 by John Arbash Meinel
experiment with removing the label and sha1 fields. Seems to shrink texts by 10-30%.
60
    ## label_line = lines.next()
61
    ## sha1_line = lines.next()
62
    ## if (not label_line.startswith('label: ') or
63
    ##     not sha1_line.startswith('sha1: ')):
64
    ##     raise AssertionError("bad text record %r" % lines)
65
    ## label = tuple(label_line[7:-1].split('\x00'))
66
    ## sha1 = sha1_line[6:-1]
0.17.2 by Robert Collins
Core proof of concept working.
67
    for header in lines:
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
68
        op = header[0]
69
        numbers = header[2:]
70
        numbers = [int(n) for n in header[2:].split(',')]
71
        if op == 'c':
72
            result.append((op, numbers[0], numbers[1], None))
73
        else:
74
            contents = [next() for i in xrange(numbers[0])]
75
            result.append((op, None, numbers[0], contents))
0.22.2 by John Arbash Meinel
experiment with removing the label and sha1 fields. Seems to shrink texts by 10-30%.
76
    return result
77
    ## return label, sha1, result
0.17.2 by Robert Collins
Core proof of concept working.
78
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
79
0.20.9 by John Arbash Meinel
Revert previous change.
80
def apply_delta(basis, delta):
0.17.2 by Robert Collins
Core proof of concept working.
81
    """Apply delta to this object to become new_version_id."""
82
    lines = []
83
    last_offset = 0
84
    # eq ranges occur where gaps occur
85
    # start, end refer to offsets in basis
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
86
    for op, start, count, delta_lines in delta:
87
        if op == 'c':
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
88
            lines.append(basis[start:start+count])
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
89
        else:
90
            lines.extend(delta_lines)
0.17.2 by Robert Collins
Core proof of concept working.
91
    trim_encoding_newline(lines)
92
    return lines
93
94
95
def trim_encoding_newline(lines):
96
    if lines[-1] == '\n':
97
        del lines[-1]
98
    else:
99
        lines[-1] = lines[-1][:-1]
100
101
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
102
103
def sort_gc_optimal(parent_map):
104
    """Sort and group the keys in parent_map into gc-optimal order.
105
106
    gc-optimal is defined (currently) as reverse-topological order, grouped by
107
    the key prefix.
108
109
    :return: A sorted-list of keys
110
    """
111
    # gc-optimal ordering is approximately reverse topological,
112
    # properly grouped by file-id.
113
    per_prefix_map = {'': []}
114
    present_keys = []
115
    for item in parent_map.iteritems():
116
        key = item[0]
117
        if isinstance(key, str) or len(key) == 1:
118
            per_prefix_map[''].append(item)
119
        else:
120
            try:
121
                per_prefix_map[key[0]].append(item)
122
            except KeyError:
123
                per_prefix_map[key[0]] = [item]
124
125
    for prefix in sorted(per_prefix_map):
126
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
127
    return present_keys
128
129
0.17.2 by Robert Collins
Core proof of concept working.
130
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
131
    """Produce a serialised group of compressed texts.
132
    
133
    It contains code very similar to SequenceMatcher because of having a similar
134
    task. However some key differences apply:
135
     - there is no junk, we want a minimal edit not a human readable diff.
136
     - we don't filter very common lines (because we don't know where a good
137
       range will start, and after the first text we want to be emitting minmal
138
       edits only.
139
     - we chain the left side, not the right side
140
     - we incrementally update the adjacency matrix as new lines are provided.
141
     - we look for matches in all of the left side, so the routine which does
142
       the analagous task of find_longest_match does not need to filter on the
143
       left side.
144
    """
0.17.2 by Robert Collins
Core proof of concept working.
145
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
146
    _equivalence_table_class = equivalence_table.EquivalenceTable
147
0.17.2 by Robert Collins
Core proof of concept working.
148
    def __init__(self, delta=True):
149
        """Create a GroupCompressor.
150
151
        :paeam delta: If False, do not compress records.
152
        """
153
        self._delta = delta
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
154
        self.line_offsets = []
0.17.2 by Robert Collins
Core proof of concept working.
155
        self.endpoint = 0
156
        self.input_bytes = 0
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
157
        self.line_locations = self._equivalence_table_class([])
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
158
        self.lines = self.line_locations.lines
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
159
        self.labels_deltas = {}
0.20.12 by John Arbash Meinel
Change the code a little bit.
160
        self._present_prefixes = set()
0.17.2 by Robert Collins
Core proof of concept working.
161
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
162
    def get_matching_blocks(self, lines, soft=False):
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
163
        """Return an the ranges in lines which match self.lines.
164
165
        :param lines: lines to compress
166
        :return: A list of (old_start, new_start, length) tuples which reflect
167
            a region in self.lines that is present in lines.  The last element
168
            of the list is always (old_len, new_len, 0) to provide a end point
169
            for generating instructions from the matching blocks list.
170
        """
171
        result = []
172
        pos = 0
173
        line_locations = self.line_locations
0.18.11 by John Arbash Meinel
Convert back into grabbing a right-lines ahead of time.
174
        line_locations.set_right_lines(lines)
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
175
        # We either copy a range (while there are reusable lines) or we 
176
        # insert new lines. To find reusable lines we traverse 
0.18.24 by John Arbash Meinel
Factor out the most compute intensive portion, with plans to turn it into a compiled func.
177
        locations = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
178
        max_pos = len(lines)
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
179
        result_append = result.append
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
180
        min_match_bytes = 10
181
        if soft:
182
            min_match_bytes = 200
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
183
        while pos < max_pos:
0.18.35 by John Arbash Meinel
remove the timing calls
184
            block, pos, locations = _get_longest_match(line_locations, pos,
185
                                                       max_pos, locations)
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
186
            if block is not None:
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
187
                # Check to see if we are matching fewer than 5 characters,
188
                # which is turned into a simple 'insert', rather than a copy
189
                # If we have more than 5 lines, we definitely have more than 5
190
                # chars
191
                if block[-1] < min_match_bytes:
192
                    # This block may be a 'short' block, check
193
                    old_start, new_start, range_len = block
194
                    matched_bytes = sum(map(len,
195
                        lines[new_start:new_start + range_len]))
196
                    if matched_bytes < min_match_bytes:
197
                        block = None
198
            if block is not None:
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
199
                result_append(block)
200
        result_append((len(self.lines), len(lines), 0))
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
201
        return result
202
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
203
    def compress(self, key, lines, expected_sha, soft=False):
0.17.2 by Robert Collins
Core proof of concept working.
204
        """Compress lines with label key.
205
206
        :param key: A key tuple. It is stored in the output
0.17.26 by Robert Collins
Working better --gc-plain-chk.
207
            for identification of the text during decompression. If the last
208
            element is 'None' it is replaced with the sha1 of the text -
209
            e.g. sha1:xxxxxxx.
0.17.2 by Robert Collins
Core proof of concept working.
210
        :param lines: The lines to be compressed. Must be split
211
            on \n, with the \n preserved.'
212
        :param expected_sha: If non-None, the sha the lines are blieved to
213
            have. During compression the sha is calculated; a mismatch will
214
            cause an error.
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
215
        :param soft: Do a 'soft' compression. This means that we require larger
216
            ranges to match to be considered for a copy command.
0.17.2 by Robert Collins
Core proof of concept working.
217
        :return: The sha1 of lines, and the number of bytes accumulated in
218
            the group output so far.
219
        """
220
        sha1 = sha_strings(lines)
0.17.26 by Robert Collins
Working better --gc-plain-chk.
221
        if key[-1] is None:
222
            key = key[:-1] + ('sha1:' + sha1,)
0.17.2 by Robert Collins
Core proof of concept working.
223
        label = '\x00'.join(key)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
224
        new_lines = []
0.22.2 by John Arbash Meinel
experiment with removing the label and sha1 fields. Seems to shrink texts by 10-30%.
225
        # new_lines.append('label: %s\n' % label)
226
        # new_lines.append('sha1: %s\n' % sha1)
227
        index_lines = []
0.17.2 by Robert Collins
Core proof of concept working.
228
        # setup good encoding for trailing \n support.
229
        if not lines or lines[-1].endswith('\n'):
230
            lines.append('\n')
231
        else:
232
            lines[-1] = lines[-1] + '\n'
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
233
        pos = 0
0.17.14 by Robert Collins
Cleaner code.
234
        range_len = 0
235
        range_start = 0
236
        flush_range = self.flush_range
237
        copy_ends = None
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
238
        blocks = self.get_matching_blocks(lines, soft=soft)
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
239
        current_pos = 0
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
240
        copies_without_insertion = []
241
        # We either copy a range (while there are reusable lines) or we
242
        # insert new lines. To find reusable lines we traverse
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
243
        for old_start, new_start, range_len in blocks:
244
            if new_start != current_pos:
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
245
                # if copies_without_insertion:
246
                #     self.flush_multi(copies_without_insertion,
247
                #                      lines, new_lines, index_lines)
248
                #     copies_without_insertion = []
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
249
                # non-matching region
0.19.1 by Robert Collins
Start to simplify flush_range.
250
                flush_range(current_pos, None, new_start - current_pos,
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
251
                    lines, new_lines, index_lines)
252
            current_pos = new_start + range_len
253
            if not range_len:
254
                continue
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
255
            # copies_without_insertion.append((new_start, old_start, range_len))
256
            flush_range(new_start, old_start, range_len,
257
                        lines, new_lines, index_lines)
258
        # if copies_without_insertion:
259
        #     self.flush_multi(copies_without_insertion,
260
        #                      lines, new_lines, index_lines)
261
        #     copies_without_insertion = []
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
262
        delta_start = (self.endpoint, len(self.lines))
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
263
        self.output_lines(new_lines, index_lines)
0.17.2 by Robert Collins
Core proof of concept working.
264
        trim_encoding_newline(lines)
265
        self.input_bytes += sum(map(len, lines))
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
266
        delta_end = (self.endpoint, len(self.lines))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
267
        self.labels_deltas[key] = (delta_start, delta_end)
0.17.2 by Robert Collins
Core proof of concept working.
268
        return sha1, self.endpoint
269
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
270
    def extract(self, key):
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
271
        """Extract a key previously added to the compressor.
272
        
273
        :param key: The key to extract.
274
        :return: An iterable over bytes and the sha1.
275
        """
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
276
        delta_details = self.labels_deltas[key]
277
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
0.22.2 by John Arbash Meinel
experiment with removing the label and sha1 fields. Seems to shrink texts by 10-30%.
278
        ## label, sha1, delta = parse(delta_lines)
279
        delta = parse(delta_lines)
280
        ## if label != key:
281
        ##     raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
282
        # Perhaps we want to keep the line offsets too in memory at least?
283
        lines = apply_delta(''.join(self.lines), delta)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
284
        sha1 = sha_strings(lines)
285
        return lines, sha1
286
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
287
    def flush_multi(self, instructions, lines, new_lines, index_lines):
288
        """Flush a bunch of different ranges out.
289
290
        This should only be called with data that are "pure" copies.
291
        """
292
        flush_range = self.flush_range
293
        if len(instructions) > 2:
294
            # This is the number of lines to be copied
295
            total_copy_range = sum(i[2] for i in instructions)
296
            if len(instructions) > 0.5 * total_copy_range:
297
                # We are copying N lines, but taking more than N/2
298
                # copy instructions to do so. We will go ahead and expand this
299
                # text so that other code is able to match against it
300
                flush_range(instructions[0][0], None, total_copy_range,
301
                            lines, new_lines, index_lines)
302
                return
303
        for ns, os, rl in instructions:
304
            flush_range(ns, os, rl, lines, new_lines, index_lines)
305
0.19.1 by Robert Collins
Start to simplify flush_range.
306
    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
0.17.14 by Robert Collins
Cleaner code.
307
        insert_instruction = "i,%d\n" % range_len
0.19.1 by Robert Collins
Start to simplify flush_range.
308
        if copy_start is not None:
0.17.14 by Robert Collins
Cleaner code.
309
            # range stops, flush and start a new copy range
310
            stop_byte = self.line_offsets[copy_start + range_len - 1]
311
            if copy_start == 0:
312
                start_byte = 0
313
            else:
314
                start_byte = self.line_offsets[copy_start - 1]
315
            bytes = stop_byte - start_byte
316
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
317
            if (bytes + len(insert_instruction) >
318
                len(copy_control_instruction)):
319
                new_lines.append(copy_control_instruction)
320
                index_lines.append(False)
321
                return
322
        # not copying, or inserting is shorter than copying, so insert.
323
        new_lines.append(insert_instruction)
324
        new_lines.extend(lines[range_start:range_start+range_len])
325
        index_lines.append(False)
0.19.1 by Robert Collins
Start to simplify flush_range.
326
        index_lines.extend([copy_start is None]*range_len)
0.17.14 by Robert Collins
Cleaner code.
327
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
328
    def output_lines(self, new_lines, index_lines):
329
        """Output some lines.
330
331
        :param new_lines: The lines to output.
332
        :param index_lines: A boolean flag for each line - when True, index
333
            that line.
334
        """
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
335
        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
336
        #                          if val and new_lines[idx] == '\n']
337
        # if indexed_newlines:
338
        #     import pdb; pdb.set_trace()
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
339
        endpoint = self.endpoint
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
340
        self.line_locations.extend_lines(new_lines, index_lines)
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
341
        for line in new_lines:
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
342
            endpoint += len(line)
343
            self.line_offsets.append(endpoint)
344
        self.endpoint = endpoint
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
345
0.17.2 by Robert Collins
Core proof of concept working.
346
    def ratio(self):
347
        """Return the overall compression ratio."""
348
        return float(self.input_bytes) / float(self.endpoint)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
349
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
350
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
351
def make_pack_factory(graph, delta, keylength):
352
    """Create a factory for creating a pack based groupcompress.
353
354
    This is only functional enough to run interface tests, it doesn't try to
355
    provide a full pack environment.
356
    
357
    :param graph: Store a graph.
358
    :param delta: Delta compress contents.
359
    :param keylength: How long should keys be.
360
    """
361
    def factory(transport):
362
        parents = graph or delta
363
        ref_length = 0
364
        if graph:
365
            ref_length += 1
0.17.7 by Robert Collins
Update for current index2 changes.
366
        graph_index = BTreeBuilder(reference_lists=ref_length,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
367
            key_elements=keylength)
368
        stream = transport.open_write_stream('newpack')
369
        writer = pack.ContainerWriter(stream.write)
370
        writer.begin()
371
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
0.17.9 by Robert Collins
Initial stab at repository format support.
372
            add_callback=graph_index.add_nodes)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
373
        access = _DirectPackAccess({})
374
        access.set_writer(writer, graph_index, (transport, 'newpack'))
0.17.2 by Robert Collins
Core proof of concept working.
375
        result = GroupCompressVersionedFiles(index, access, delta)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
376
        result.stream = stream
377
        result.writer = writer
378
        return result
379
    return factory
380
381
382
def cleanup_pack_group(versioned_files):
0.17.23 by Robert Collins
Only decompress as much of the zlib data as is needed to read the text recipe.
383
    versioned_files.writer.end()
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
384
    versioned_files.stream.close()
385
386
387
class GroupCompressVersionedFiles(VersionedFiles):
388
    """A group-compress based VersionedFiles implementation."""
389
0.17.2 by Robert Collins
Core proof of concept working.
390
    def __init__(self, index, access, delta=True):
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
391
        """Create a GroupCompressVersionedFiles object.
392
393
        :param index: The index object storing access and graph data.
394
        :param access: The access object storing raw data.
0.17.2 by Robert Collins
Core proof of concept working.
395
        :param delta: Whether to delta compress or just entropy compress.
396
        """
397
        self._index = index
398
        self._access = access
399
        self._delta = delta
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
400
        self._unadded_refs = {}
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
401
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
0.17.2 by Robert Collins
Core proof of concept working.
402
403
    def add_lines(self, key, parents, lines, parent_texts=None,
404
        left_matching_blocks=None, nostore_sha=None, random_id=False,
405
        check_content=True):
406
        """Add a text to the store.
407
408
        :param key: The key tuple of the text to add.
409
        :param parents: The parents key tuples of the text to add.
410
        :param lines: A list of lines. Each line must be a bytestring. And all
411
            of them except the last must be terminated with \n and contain no
412
            other \n's. The last line may either contain no \n's or a single
413
            terminating \n. If the lines list does meet this constraint the add
414
            routine may error or may succeed - but you will be unable to read
415
            the data back accurately. (Checking the lines have been split
416
            correctly is expensive and extremely unlikely to catch bugs so it
417
            is not done at runtime unless check_content is True.)
418
        :param parent_texts: An optional dictionary containing the opaque 
419
            representations of some or all of the parents of version_id to
420
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
421
            returned by add_lines or data corruption can be caused.
422
        :param left_matching_blocks: a hint about which areas are common
423
            between the text and its left-hand-parent.  The format is
424
            the SequenceMatcher.get_matching_blocks format.
425
        :param nostore_sha: Raise ExistingContent and do not add the lines to
426
            the versioned file if the digest of the lines matches this.
427
        :param random_id: If True a random id has been selected rather than
428
            an id determined by some deterministic process such as a converter
429
            from a foreign VCS. When True the backend may choose not to check
430
            for uniqueness of the resulting key within the versioned file, so
431
            this should only be done when the result is expected to be unique
432
            anyway.
433
        :param check_content: If True, the lines supplied are verified to be
434
            bytestrings that are correctly formed lines.
435
        :return: The text sha1, the number of bytes in the text, and an opaque
436
                 representation of the inserted version which can be provided
437
                 back to future add_lines calls in the parent_texts dictionary.
438
        """
439
        self._index._check_write_ok()
440
        self._check_add(key, lines, random_id, check_content)
441
        if parents is None:
442
            # The caller might pass None if there is no graph data, but kndx
443
            # indexes can't directly store that, so we give them
444
            # an empty tuple instead.
445
            parents = ()
446
        # double handling for now. Make it work until then.
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
447
        length = sum(map(len, lines))
448
        record = ChunkedContentFactory(key, parents, None, lines)
0.17.5 by Robert Collins
nograph tests completely passing.
449
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
450
        return sha1, length, None
0.17.2 by Robert Collins
Core proof of concept working.
451
0.17.4 by Robert Collins
Annotate.
452
    def annotate(self, key):
453
        """See VersionedFiles.annotate."""
454
        graph = Graph(self)
0.17.5 by Robert Collins
nograph tests completely passing.
455
        parent_map = self.get_parent_map([key])
456
        if not parent_map:
457
            raise errors.RevisionNotPresent(key, self)
458
        if parent_map[key] is not None:
459
            search = graph._make_breadth_first_searcher([key])
460
            keys = set()
461
            while True:
462
                try:
463
                    present, ghosts = search.next_with_ghosts()
464
                except StopIteration:
465
                    break
466
                keys.update(present)
467
            parent_map = self.get_parent_map(keys)
468
        else:
469
            keys = [key]
470
            parent_map = {key:()}
0.17.4 by Robert Collins
Annotate.
471
        head_cache = _mod_graph.FrozenHeadsCache(graph)
472
        parent_cache = {}
473
        reannotate = annotate.reannotate
474
        for record in self.get_record_stream(keys, 'topological', True):
475
            key = record.key
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
476
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
0.17.4 by Robert Collins
Annotate.
477
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
478
            parent_cache[key] = list(
479
                reannotate(parent_lines, fulltext, key, None, head_cache))
480
        return parent_cache[key]
481
0.17.5 by Robert Collins
nograph tests completely passing.
482
    def check(self, progress_bar=None):
483
        """See VersionedFiles.check()."""
484
        keys = self.keys()
485
        for record in self.get_record_stream(keys, 'unordered', True):
486
            record.get_bytes_as('fulltext')
487
0.17.2 by Robert Collins
Core proof of concept working.
488
    def _check_add(self, key, lines, random_id, check_content):
489
        """check that version_id and lines are safe to add."""
490
        version_id = key[-1]
0.17.26 by Robert Collins
Working better --gc-plain-chk.
491
        if version_id is not None:
492
            if contains_whitespace(version_id):
493
                raise InvalidRevisionId(version_id, self)
0.17.2 by Robert Collins
Core proof of concept working.
494
        self.check_not_reserved_id(version_id)
495
        # TODO: If random_id==False and the key is already present, we should
496
        # probably check that the existing content is identical to what is
497
        # being inserted, and otherwise raise an exception.  This would make
498
        # the bundle code simpler.
499
        if check_content:
500
            self._check_lines_not_unicode(lines)
501
            self._check_lines_are_lines(lines)
502
0.17.5 by Robert Collins
nograph tests completely passing.
503
    def get_parent_map(self, keys):
504
        """Get a map of the parents of keys.
505
506
        :param keys: The keys to look up parents for.
507
        :return: A mapping from keys to parents. Absent keys are absent from
508
            the mapping.
509
        """
510
        result = {}
511
        sources = [self._index]
512
        source_results = []
513
        missing = set(keys)
514
        for source in sources:
515
            if not missing:
516
                break
517
            new_result = source.get_parent_map(missing)
518
            source_results.append(new_result)
519
            result.update(new_result)
520
            missing.difference_update(set(new_result))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
521
        if self._unadded_refs:
522
            for key in missing:
523
                if key in self._unadded_refs:
524
                    result[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
525
        return result
526
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
527
    def _get_delta_lines(self, key):
528
        """Helper function for manual debugging.
529
530
        This is a convenience function that shouldn't be used in production
531
        code.
532
        """
533
        build_details = self._index.get_build_details([key])[key]
534
        index_memo = build_details[0]
535
        group, delta_lines = self._get_group_and_delta_lines(index_memo)
536
        return delta_lines
537
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
538
    def _get_group_and_delta_lines(self, index_memo):
539
        read_memo = index_memo[0:3]
540
        # get the group:
541
        try:
542
            plain = self._group_cache[read_memo]
543
        except KeyError:
544
            # read the group
545
            zdata = self._access.get_raw_records([read_memo]).next()
546
            # decompress - whole thing - this is not a bug, as it
547
            # permits caching. We might want to store the partially
548
            # decompresed group and decompress object, so that recent
549
            # texts are not penalised by big groups.
550
            plain = zlib.decompress(zdata) #, index_memo[4])
551
            self._group_cache[read_memo] = plain
552
        # cheapo debugging:
553
        # print len(zdata), len(plain)
554
        # parse - requires split_lines, better to have byte offsets
555
        # here (but not by much - we only split the region for the
556
        # recipe, and we often want to end up with lines anyway.
557
        return plain, split_lines(plain[index_memo[3]:index_memo[4]])
558
0.17.5 by Robert Collins
nograph tests completely passing.
559
    def get_record_stream(self, keys, ordering, include_delta_closure):
560
        """Get a stream of records for keys.
561
562
        :param keys: The keys to include.
563
        :param ordering: Either 'unordered' or 'topological'. A topologically
564
            sorted stream has compression parents strictly before their
565
            children.
566
        :param include_delta_closure: If True then the closure across any
567
            compression parents will be included (in the opaque data).
568
        :return: An iterator of ContentFactory objects, each of which is only
569
            valid until the iterator is advanced.
570
        """
571
        # keys might be a generator
572
        keys = set(keys)
573
        if not keys:
574
            return
575
        if not self._index.has_graph:
576
            # Cannot topological order when no graph has been stored.
577
            ordering = 'unordered'
578
        # Cheap: iterate
579
        locations = self._index.get_build_details(keys)
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
580
        local_keys = frozenset(keys).intersection(set(self._unadded_refs))
581
        locations.update((key, None) for key in local_keys)
0.17.5 by Robert Collins
nograph tests completely passing.
582
        if ordering == 'topological':
583
            # would be better to not globally sort initially but instead
584
            # start with one key, recurse to its oldest parent, then grab
585
            # everything in the same group, etc.
586
            parent_map = dict((key, details[2]) for key, details in
587
                locations.iteritems())
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
588
            for key in local_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
589
                parent_map[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
590
            present_keys = topo_sort(parent_map)
591
            # Now group by source:
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
592
        elif ordering == 'gc-optimal':
593
            parent_map = dict((key, details[2]) for key, details in
594
                locations.iteritems())
595
            for key in local_keys:
596
                parent_map[key] = self._unadded_refs[key]
597
            # XXX: This only optimizes for the target ordering. We may need to
598
            #      balance that with the time it takes to extract ordering, by
599
            #      somehow grouping based on locations[key][0:3]
600
            present_keys = sort_gc_optimal(parent_map)
0.17.5 by Robert Collins
nograph tests completely passing.
601
        else:
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
602
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
603
            # Otherwise we thrash the _group_cache and destroy performance
604
            def get_group(key):
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
605
                # This is the group the bytes are stored in, followed by the
606
                # location in the group
607
                return locations[key][0]
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
608
            present_keys = sorted(locations.iterkeys(), key=get_group)
609
            # We don't have an ordering for keys in the in-memory object, but
610
            # lets process the in-memory ones first.
611
            local = list(local_keys)
612
            present_keys = list(local_keys) + present_keys
0.17.5 by Robert Collins
nograph tests completely passing.
613
        absent_keys = keys.difference(set(locations))
614
        for key in absent_keys:
615
            yield AbsentContentFactory(key)
616
        for key in present_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
617
            if key in self._unadded_refs:
618
                lines, sha1 = self._compressor.extract(key)
619
                parents = self._unadded_refs[key]
620
            else:
621
                index_memo, _, parents, (method, _) = locations[key]
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
622
                plain, delta_lines = self._get_group_and_delta_lines(index_memo)
0.22.2 by John Arbash Meinel
experiment with removing the label and sha1 fields. Seems to shrink texts by 10-30%.
623
                delta = parse(delta_lines)
624
                # label, sha1, delta = parse(delta_lines)
625
                # if label != key:
626
                #     raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.20.9 by John Arbash Meinel
Revert previous change.
627
                lines = apply_delta(plain, delta)
0.22.2 by John Arbash Meinel
experiment with removing the label and sha1 fields. Seems to shrink texts by 10-30%.
628
                sha1 = sha_strings(lines)
629
                # if sha_strings(lines) != sha1:
630
                #     raise AssertionError('sha1 sum did not match')
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
631
            yield ChunkedContentFactory(key, parents, sha1, lines)
632
0.17.5 by Robert Collins
nograph tests completely passing.
633
    def get_sha1s(self, keys):
634
        """See VersionedFiles.get_sha1s()."""
635
        result = {}
636
        for record in self.get_record_stream(keys, 'unordered', True):
637
            if record.sha1 != None:
638
                result[record.key] = record.sha1
639
            else:
640
                if record.storage_kind != 'absent':
641
                    result[record.key] == sha_string(record.get_bytes_as(
642
                        'fulltext'))
643
        return result
644
0.17.2 by Robert Collins
Core proof of concept working.
645
    def insert_record_stream(self, stream):
646
        """Insert a record stream into this container.
647
648
        :param stream: A stream of records to insert. 
649
        :return: None
650
        :seealso VersionedFiles.get_record_stream:
651
        """
0.17.5 by Robert Collins
nograph tests completely passing.
652
        for _ in self._insert_record_stream(stream):
653
            pass
0.17.2 by Robert Collins
Core proof of concept working.
654
0.17.5 by Robert Collins
nograph tests completely passing.
655
    def _insert_record_stream(self, stream, random_id=False):
0.17.2 by Robert Collins
Core proof of concept working.
656
        """Internal core to insert a record stream into this container.
657
658
        This helper function has a different interface than insert_record_stream
659
        to allow add_lines to be minimal, but still return the needed data.
660
661
        :param stream: A stream of records to insert. 
662
        :return: An iterator over the sha1 of the inserted records.
663
        :seealso insert_record_stream:
664
        :seealso add_lines:
665
        """
0.17.5 by Robert Collins
nograph tests completely passing.
666
        def get_adapter(adapter_key):
667
            try:
668
                return adapters[adapter_key]
669
            except KeyError:
670
                adapter_factory = adapter_registry.get(adapter_key)
671
                adapter = adapter_factory(self)
672
                adapters[adapter_key] = adapter
673
                return adapter
674
        adapters = {}
0.17.2 by Robert Collins
Core proof of concept working.
675
        # This will go up to fulltexts for gc to gc fetching, which isn't
676
        # ideal.
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
677
        self._compressor = GroupCompressor(self._delta)
678
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
679
        keys_to_add = []
680
        basis_end = 0
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
681
        groups = 1
682
        def flush():
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
683
            compressed = zlib.compress(''.join(self._compressor.lines))
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
684
            index, start, length = self._access.add_raw_records(
685
                [(None, len(compressed))], compressed)[0]
686
            nodes = []
687
            for key, reads, refs in keys_to_add:
688
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
689
            self._index.add_records(nodes, random_id=random_id)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
690
        last_prefix = None
0.17.2 by Robert Collins
Core proof of concept working.
691
        for record in stream:
0.17.5 by Robert Collins
nograph tests completely passing.
692
            # Raise an error when a record is missing.
693
            if record.storage_kind == 'absent':
694
                raise errors.RevisionNotPresent([record.key], self)
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
695
            elif record.storage_kind in ('chunked', 'fulltext'):
696
                lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
0.17.5 by Robert Collins
nograph tests completely passing.
697
            else:
698
                adapter_key = record.storage_kind, 'fulltext'
699
                adapter = get_adapter(adapter_key)
700
                bytes = adapter.get_bytes(record,
701
                    record.get_bytes_as(record.storage_kind))
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
702
                lines = osutils.split_lines(bytes)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
703
            soft = False
0.20.13 by John Arbash Meinel
Play around a bit.
704
            if len(record.key) > 1:
705
                prefix = record.key[0]
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
706
                if (last_prefix is not None and prefix != last_prefix):
707
                    soft = True
708
                    if basis_end > 1024 * 1024 * 4:
709
                        flush()
710
                        self._compressor = GroupCompressor(self._delta)
711
                        self._unadded_refs = {}
712
                        keys_to_add = []
713
                        basis_end = 0
714
                        groups += 1
715
                last_prefix = prefix
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
716
            found_sha1, end_point = self._compressor.compress(record.key,
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
717
                lines, record.sha1, soft=soft)
0.17.26 by Robert Collins
Working better --gc-plain-chk.
718
            if record.key[-1] is None:
719
                key = record.key[:-1] + ('sha1:' + found_sha1,)
720
            else:
721
                key = record.key
722
            self._unadded_refs[key] = record.parents
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
723
            yield found_sha1
0.17.26 by Robert Collins
Working better --gc-plain-chk.
724
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
0.17.5 by Robert Collins
nograph tests completely passing.
725
                (record.parents,)))
726
            basis_end = end_point
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
727
            if basis_end > 1024 * 1024 * 8:
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
728
                flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
729
                self._compressor = GroupCompressor(self._delta)
730
                self._unadded_refs = {}
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
731
                keys_to_add = []
732
                basis_end = 0
733
                groups += 1
0.17.8 by Robert Collins
Flush pending updates at the end of _insert_record_stream
734
        if len(keys_to_add):
735
            flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
736
        self._compressor = None
737
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
738
739
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
740
        """Iterate over the lines in the versioned files from keys.
741
742
        This may return lines from other keys. Each item the returned
743
        iterator yields is a tuple of a line and a text version that that line
744
        is present in (not introduced in).
745
746
        Ordering of results is in whatever order is most suitable for the
747
        underlying storage format.
748
749
        If a progress bar is supplied, it may be used to indicate progress.
750
        The caller is responsible for cleaning up progress bars (because this
751
        is an iterator).
752
753
        NOTES:
754
         * Lines are normalised by the underlying store: they will all have \n
755
           terminators.
756
         * Lines are returned in arbitrary order.
757
758
        :return: An iterator over (line, key).
759
        """
760
        if pb is None:
761
            pb = progress.DummyProgress()
762
        keys = set(keys)
763
        total = len(keys)
764
        # we don't care about inclusions, the caller cares.
765
        # but we need to setup a list of records to visit.
766
        # we need key, position, length
767
        for key_idx, record in enumerate(self.get_record_stream(keys,
768
            'unordered', True)):
769
            # XXX: todo - optimise to use less than full texts.
770
            key = record.key
771
            pb.update('Walking content.', key_idx, total)
772
            if record.storage_kind == 'absent':
773
                raise errors.RevisionNotPresent(record.key, self)
774
            lines = split_lines(record.get_bytes_as('fulltext'))
775
            for line in lines:
776
                yield line, key
777
        pb.update('Walking content.', total, total)
778
779
    def keys(self):
780
        """See VersionedFiles.keys."""
781
        if 'evil' in debug.debug_flags:
782
            trace.mutter_callsite(2, "keys scales with size of history")
783
        sources = [self._index]
784
        result = set()
785
        for source in sources:
786
            result.update(source.keys())
787
        return result
788
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
789
790
class _GCGraphIndex(object):
791
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
792
0.17.9 by Robert Collins
Initial stab at repository format support.
793
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
794
        add_callback=None):
795
        """Construct a _GCGraphIndex on a graph_index.
796
797
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
798
        :param is_locked: A callback to check whether the object should answer
799
            queries.
800
        :param parents: If True, record knits parents, if not do not record 
801
            parents.
802
        :param add_callback: If not None, allow additions to the index and call
803
            this callback with a list of added GraphIndex nodes:
804
            [(node, value, node_refs), ...]
805
        :param is_locked: A callback, returns True if the index is locked and
806
            thus usable.
807
        """
808
        self._add_callback = add_callback
809
        self._graph_index = graph_index
810
        self._parents = parents
811
        self.has_graph = parents
812
        self._is_locked = is_locked
813
0.17.5 by Robert Collins
nograph tests completely passing.
814
    def add_records(self, records, random_id=False):
815
        """Add multiple records to the index.
816
        
817
        This function does not insert data into the Immutable GraphIndex
818
        backing the KnitGraphIndex, instead it prepares data for insertion by
819
        the caller and checks that it is safe to insert then calls
820
        self._add_callback with the prepared GraphIndex nodes.
821
822
        :param records: a list of tuples:
823
                         (key, options, access_memo, parents).
824
        :param random_id: If True the ids being added were randomly generated
825
            and no check for existence will be performed.
826
        """
827
        if not self._add_callback:
828
            raise errors.ReadOnlyError(self)
829
        # we hope there are no repositories with inconsistent parentage
830
        # anymore.
831
832
        changed = False
833
        keys = {}
834
        for (key, value, refs) in records:
835
            if not self._parents:
836
                if refs:
837
                    for ref in refs:
838
                        if ref:
839
                            raise KnitCorrupt(self,
840
                                "attempt to add node with parents "
841
                                "in parentless index.")
842
                    refs = ()
843
                    changed = True
844
            keys[key] = (value, refs)
845
        # check for dups
846
        if not random_id:
847
            present_nodes = self._get_entries(keys)
848
            for (index, key, value, node_refs) in present_nodes:
849
                if node_refs != keys[key][1]:
850
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
851
                        ": %s %s" % ((value, node_refs), keys[key]))
852
                del keys[key]
853
                changed = True
854
        if changed:
855
            result = []
856
            if self._parents:
857
                for key, (value, node_refs) in keys.iteritems():
858
                    result.append((key, value, node_refs))
859
            else:
860
                for key, (value, node_refs) in keys.iteritems():
861
                    result.append((key, value))
862
            records = result
863
        self._add_callback(records)
864
        
865
    def _check_read(self):
866
        """raise if reads are not permitted."""
867
        if not self._is_locked():
868
            raise errors.ObjectNotLocked(self)
869
0.17.2 by Robert Collins
Core proof of concept working.
870
    def _check_write_ok(self):
871
        """Assert if writes are not permitted."""
872
        if not self._is_locked():
873
            raise errors.ObjectNotLocked(self)
874
0.17.5 by Robert Collins
nograph tests completely passing.
875
    def _get_entries(self, keys, check_present=False):
876
        """Get the entries for keys.
877
        
878
        :param keys: An iterable of index key tuples.
879
        """
880
        keys = set(keys)
881
        found_keys = set()
882
        if self._parents:
883
            for node in self._graph_index.iter_entries(keys):
884
                yield node
885
                found_keys.add(node[1])
886
        else:
887
            # adapt parentless index to the rest of the code.
888
            for node in self._graph_index.iter_entries(keys):
889
                yield node[0], node[1], node[2], ()
890
                found_keys.add(node[1])
891
        if check_present:
892
            missing_keys = keys.difference(found_keys)
893
            if missing_keys:
894
                raise RevisionNotPresent(missing_keys.pop(), self)
895
896
    def get_parent_map(self, keys):
897
        """Get a map of the parents of keys.
898
899
        :param keys: The keys to look up parents for.
900
        :return: A mapping from keys to parents. Absent keys are absent from
901
            the mapping.
902
        """
903
        self._check_read()
904
        nodes = self._get_entries(keys)
905
        result = {}
906
        if self._parents:
907
            for node in nodes:
908
                result[node[1]] = node[3][0]
909
        else:
910
            for node in nodes:
911
                result[node[1]] = None
912
        return result
913
914
    def get_build_details(self, keys):
915
        """Get the various build details for keys.
916
917
        Ghosts are omitted from the result.
918
919
        :param keys: An iterable of keys.
920
        :return: A dict of key:
921
            (index_memo, compression_parent, parents, record_details).
922
            index_memo
923
                opaque structure to pass to read_records to extract the raw
924
                data
925
            compression_parent
926
                Content that this record is built upon, may be None
927
            parents
928
                Logical parents of this node
929
            record_details
930
                extra information about the content which needs to be passed to
931
                Factory.parse_record
932
        """
933
        self._check_read()
934
        result = {}
935
        entries = self._get_entries(keys, False)
936
        for entry in entries:
937
            key = entry[1]
938
            if not self._parents:
939
                parents = None
940
            else:
941
                parents = entry[3][0]
942
            value = entry[2]
943
            method = 'group'
944
            result[key] = (self._node_to_position(entry),
945
                                  None, parents, (method, None))
946
        return result
947
    
948
    def keys(self):
949
        """Get all the keys in the collection.
950
        
951
        The keys are not ordered.
952
        """
953
        self._check_read()
954
        return [node[1] for node in self._graph_index.iter_all_entries()]
955
    
956
    def _node_to_position(self, node):
957
        """Convert an index value to position details."""
958
        bits = node[2].split(' ')
959
        # It would be nice not to read the entire gzip.
960
        start = int(bits[0])
961
        stop = int(bits[1])
962
        basis_end = int(bits[2])
963
        delta_end = int(bits[3])
964
        return node[0], start, stop, basis_end, delta_end
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
965
966
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
967
def _get_longest_match(equivalence_table, pos, max_pos, locations):
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
968
    """Get the longest possible match for the current position."""
969
    range_start = pos
970
    range_len = 0
971
    copy_ends = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
972
    while pos < max_pos:
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
973
        if locations is None:
974
            locations = equivalence_table.get_idx_matches(pos)
975
        if locations is None:
976
            # No more matches, just return whatever we have, but we know that
977
            # this last position is not going to match anything
978
            pos += 1
979
            break
980
        else:
981
            if copy_ends is None:
982
                # We are starting a new range
983
                copy_ends = [loc + 1 for loc in locations]
984
                range_len = 1
985
                locations = None # Consumed
986
            else:
987
                # We are currently in the middle of a match
988
                next_locations = set(copy_ends).intersection(locations)
989
                if len(next_locations):
990
                    # range continues
991
                    copy_ends = [loc + 1 for loc in next_locations]
992
                    range_len += 1
993
                    locations = None # Consumed
994
                else:
995
                    # But we are done with this match, we should be
996
                    # starting a new one, though. We will pass back 'locations'
997
                    # so that we don't have to do another lookup.
998
                    break
999
        pos += 1
1000
    if copy_ends is None:
1001
        return None, pos, locations
1002
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
1003
1004
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1005
try:
1006
    from bzrlib.plugins.groupcompress import _groupcompress_c
1007
except ImportError:
1008
    pass
1009
else:
1010
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
1011
    _get_longest_match = _groupcompress_c._get_longest_match