/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1
# groupcompress, a bzr plugin providing new compression logic.
2
# Copyright (C) 2008 Canonical Limited.
3
# 
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License version 2 as published
6
# by the Free Software Foundation.
7
# 
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
# 
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
16
# 
17
18
"""Core compression logic for compressing streams of related files."""
19
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
20
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
21
from cStringIO import StringIO
22
import zlib
23
0.17.4 by Robert Collins
Annotate.
24
from bzrlib import (
25
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
26
    debug,
0.17.4 by Robert Collins
Annotate.
27
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
28
    errors,
0.17.4 by Robert Collins
Annotate.
29
    graph as _mod_graph,
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
30
    osutils,
0.17.4 by Robert Collins
Annotate.
31
    pack,
32
    patiencediff,
    progress,
    trace,
33
    )
34
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
35
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
36
from bzrlib.osutils import (
37
    contains_whitespace,
38
    contains_linebreaks,
39
    sha_string,
40
    sha_strings,
41
    split_lines,
42
    )
0.17.21 by Robert Collins
Update groupcompress to bzrlib 1.10.
43
from bzrlib.btree_index import BTreeBuilder
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
44
from bzrlib.lru_cache import LRUSizeCache
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
45
from bzrlib.plugins.groupcompress import equivalence_table
0.17.9 by Robert Collins
Initial stab at repository format support.
46
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
47
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
48
    adapter_registry,
49
    AbsentContentFactory,
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
50
    ChunkedContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
51
    FulltextContentFactory,
52
    VersionedFiles,
53
    )
54
0.23.15 by John Arbash Meinel
Handle when self._index is NULL, mostly because the source text was the empty string.
55
_NO_LABELS = False
0.17.2 by Robert Collins
Core proof of concept working.
56
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
57
def parse(bytes):
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
58
    if _NO_LABELS:
59
        action_byte = bytes[0]
60
        action = {'f':'fulltext', 'd':'delta'}[action_byte]
61
        return action, None, None, bytes[1:]
0.23.12 by John Arbash Meinel
Add a 'len:' field to the data.
62
    (action, label_line, sha1_line, len_line,
0.23.16 by John Arbash Meinel
Properly restore the label functionality.
63
     delta_bytes) = bytes.split('\n', 4)
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
64
    if (action not in ('fulltext', 'delta')
0.23.27 by John Arbash Meinel
Fix up some failing tests.
65
        or not label_line.startswith('label:')
66
        or not sha1_line.startswith('sha1:')
67
        or not len_line.startswith('len:')
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
68
        ):
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
69
        raise AssertionError("bad text record %r" % (bytes,))
0.23.34 by John Arbash Meinel
Update groupcompress to allow it to read older conversions.
70
    label = tuple(label_line[6:].lstrip(' ').split('\x00'))
71
    sha1 = sha1_line[5:].lstrip(' ')
0.23.27 by John Arbash Meinel
Fix up some failing tests.
72
    length = int(len_line[4:])
0.23.12 by John Arbash Meinel
Add a 'len:' field to the data.
73
    if not len(delta_bytes) == length:
74
        raise AssertionError("bad length record %r" % (bytes,))
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
75
    return action, label, sha1, delta_bytes
76
77
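For illustration, a minimal sketch of the labelled record layout that parse() expects when _NO_LABELS is False; the key, sha1 and delta bytes below are placeholders rather than output from a real group:

example_delta = 'raw delta bytes from make_delta'
example_record = ('delta\n'
                  'label:file-id\x00rev-id\n'
                  'sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709\n'
                  'len:%d\n' % len(example_delta)) + example_delta
action, label, sha1, delta_bytes = parse(example_record)
# action == 'delta', label == ('file-id', 'rev-id'), delta_bytes == example_delta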
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
78
def sort_gc_optimal(parent_map):
79
    """Sort and group the keys in parent_map into gc-optimal order.
80
81
    gc-optimal is defined (currently) as reverse-topological order, grouped by
82
    the key prefix.
83
84
    :return: A sorted-list of keys
85
    """
86
    # gc-optimal ordering is approximately reverse topological,
87
    # properly grouped by file-id.
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
88
    per_prefix_map = {}
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
89
    for item in parent_map.iteritems():
90
        key = item[0]
91
        if isinstance(key, str) or len(key) == 1:
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
92
            prefix = ''
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
93
        else:
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
94
            prefix = key[0]
95
        try:
96
            per_prefix_map[prefix].append(item)
97
        except KeyError:
98
            per_prefix_map[prefix] = [item]
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
99
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
100
    present_keys = []
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
101
    for prefix in sorted(per_prefix_map):
102
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
103
    return present_keys
104
105
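For illustration, a small sketch of how sort_gc_optimal groups keys by prefix and emits each prefix's keys newest-first; the file and revision ids are made up:

example_parent_map = {
    ('file-a', 'rev-1'): (),
    ('file-a', 'rev-2'): (('file-a', 'rev-1'),),
    ('file-b', 'rev-1'): (),
    }
assert sort_gc_optimal(example_parent_map) == [
    ('file-a', 'rev-2'), ('file-a', 'rev-1'), ('file-b', 'rev-1')]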
0.17.2 by Robert Collins
Core proof of concept working.
106
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
107
    """Produce a serialised group of compressed texts.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
108
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
109
    It contains code very similar to SequenceMatcher because it has a similar
110
    task. However, some key differences apply:
111
     - there is no junk, we want a minimal edit not a human readable diff.
112
     - we don't filter very common lines (because we don't know where a good
113
       range will start, and after the first text we want to be emitting minimal
114
       edits only).
115
     - we chain the left side, not the right side
116
     - we incrementally update the adjacency matrix as new lines are provided.
117
     - we look for matches in all of the left side, so the routine which does
118
       the analogous task of find_longest_match does not need to filter on the
119
       left side.
120
    """
0.17.2 by Robert Collins
Core proof of concept working.
121
122
    def __init__(self, delta=True):
123
        """Create a GroupCompressor.
124
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
125
        :param delta: If False, do not delta-compress records.
0.17.2 by Robert Collins
Core proof of concept working.
126
        """
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
127
        # Consider seeding the lines with some sort of GC Start flag, or
128
        # putting it as part of the output stream, rather than in the
129
        # compressed bytes.
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
130
        self.lines = []
0.17.2 by Robert Collins
Core proof of concept working.
131
        self.endpoint = 0
132
        self.input_bytes = 0
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
133
        self.labels_deltas = {}
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
134
        self._delta_index = _groupcompress_pyx.DeltaIndex()
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
135
136
    def compress(self, key, chunks, expected_sha, soft=False):
0.17.2 by Robert Collins
Core proof of concept working.
137
        """Compress lines with label key.
138
139
        :param key: A key tuple. It is stored in the output
0.17.26 by Robert Collins
Working better --gc-plain-chk.
140
            for identification of the text during decompression. If the last
141
            element is 'None' it is replaced with the sha1 of the text -
142
            e.g. sha1:xxxxxxx.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
143
        :param chunks: The chunks to be compressed
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
144
        :param expected_sha: If non-None, the sha the lines are believed to
0.17.2 by Robert Collins
Core proof of concept working.
145
            have. During compression the sha is calculated; a mismatch will
146
            cause an error.
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
147
        :param soft: Do a 'soft' compression. This means that we require larger
148
            ranges to match to be considered for a copy command.
0.17.2 by Robert Collins
Core proof of concept working.
149
        :return: The sha1 of lines, and the number of bytes accumulated in
150
            the group output so far.
151
        """
0.23.37 by John Arbash Meinel
If you are going to join the bytes anyway, use sha_string instead of sha_strings.
152
        target_text = ''.join(chunks)
153
        sha1 = sha_string(target_text)
0.17.26 by Robert Collins
Working better --gc-plain-chk.
154
        if key[-1] is None:
155
            key = key[:-1] + ('sha1:' + sha1,)
0.17.2 by Robert Collins
Core proof of concept working.
156
        label = '\x00'.join(key)
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
157
        input_len = len(target_text)
0.23.12 by John Arbash Meinel
Add a 'len:' field to the data.
158
        # By having action/label/sha1/len, we can parse the group if the index
159
        # was ever destroyed, we have the key in 'label', we know the final
160
        # bytes are valid from sha1, and we know where to find the end of this
161
        # record because of 'len'. (the delta record itself will store the
162
        # total length for the expanded record)
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
163
        # 'len: %d\n' costs approximately 1% increase in total data
164
        # Having the labels at all costs us 9-10% increase, 38% increase for
165
        # inventory pages, and 5.8% increase for text pages
166
        if _NO_LABELS:
167
            new_chunks = []
168
        else:
0.23.27 by John Arbash Meinel
Fix up some failing tests.
169
            new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
170
        if self._delta_index._source_offset != self.endpoint:
171
            raise AssertionError('_source_offset != endpoint'
172
                ' somehow the DeltaIndex got out of sync with'
173
                ' the output lines')
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
174
        delta = self._delta_index.make_delta(target_text)
0.23.11 by John Arbash Meinel
Insert a fulltext if the delta is more than half the total size.
175
        if (delta is None
176
            or len(delta) > len(target_text) / 2):
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
177
            # We can't delta (perhaps source_text is empty)
178
            # so mark this as an insert
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
179
            if _NO_LABELS:
180
                new_chunks = ['f']
181
            else:
182
                new_chunks.insert(0, 'fulltext\n')
0.23.27 by John Arbash Meinel
Fix up some failing tests.
183
                new_chunks.append('len:%s\n' % (input_len,))
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
184
            unadded_bytes = sum(map(len, new_chunks))
185
            self._delta_index.add_source(target_text, unadded_bytes)
186
            new_chunks.append(target_text)
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
187
        else:
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
188
            if _NO_LABELS:
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
189
                new_chunks = ['d']
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
190
            else:
191
                new_chunks.insert(0, 'delta\n')
0.23.27 by John Arbash Meinel
Fix up some failing tests.
192
                new_chunks.append('len:%s\n' % (len(delta),))
0.23.33 by John Arbash Meinel
Fix a bug when handling multiple large-range copies.
193
            # unadded_bytes = sum(map(len, new_chunks))
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
194
            # self._delta_index.add_source(delta, unadded_bytes)
0.23.26 by John Arbash Meinel
We now start to make use of the ability to extend the delta index
195
            new_chunks.append(delta)
0.23.32 by John Arbash Meinel
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
196
            unadded_bytes = sum(map(len, new_chunks))
197
            self._delta_index._source_offset += unadded_bytes
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
198
        delta_start = (self.endpoint, len(self.lines))
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
199
        self.output_chunks(new_chunks)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
200
        self.input_bytes += input_len
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
201
        delta_end = (self.endpoint, len(self.lines))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
202
        self.labels_deltas[key] = (delta_start, delta_end)
0.23.29 by John Arbash Meinel
Forgot to add the delta bytes to the index objects.
203
        if not self._delta_index._source_offset == self.endpoint:
204
            raise AssertionError('the delta index is out of sync'
205
                ' with the output lines %s != %s'
206
                % (self._delta_index._source_offset, self.endpoint))
0.17.2 by Robert Collins
Core proof of concept working.
207
        return sha1, self.endpoint
208
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
209
    def extract(self, key):
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
210
        """Extract a key previously added to the compressor.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
211
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
212
        :param key: The key to extract.
213
        :return: An iterable over bytes and the sha1.
214
        """
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
215
        delta_details = self.labels_deltas[key]
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
216
        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
217
        action, label, sha1, delta = parse(''.join(delta_chunks))
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
218
        if not _NO_LABELS and label != key:
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
219
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
220
        if action == 'fulltext':
221
            bytes = delta
222
        else:
223
            source = ''.join(self.lines[delta_details[0][0]])
0.23.21 by John Arbash Meinel
Rename the extension to _pyx, since Robert prefers that form
224
            bytes = _groupcompress_pyx.apply_delta(source, delta)
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
225
        if _NO_LABELS:
226
            sha1 = sha_string(bytes)
227
        else:
228
            assert sha1 == sha_string(bytes)
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
229
        return [bytes], sha1
230
231
    def output_chunks(self, new_chunks):
232
        """Output some chunks.
233
234
        :param new_chunks: The chunks to output.
235
        """
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
236
        endpoint = self.endpoint
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
237
        self.lines.extend(new_chunks)
238
        endpoint += sum(map(len, new_chunks))
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
239
        self.endpoint = endpoint
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
240
0.17.2 by Robert Collins
Core proof of concept working.
241
    def ratio(self):
242
        """Return the overall compression ratio."""
243
        return float(self.input_bytes) / float(self.endpoint)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
244
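For illustration, a minimal sketch of the GroupCompressor round trip; it assumes the _groupcompress_pyx extension imported at the bottom of this module is available, and the key and text are made up:

compressor = GroupCompressor()
chunks = ['line one\n', 'line two\n']
sha1, end_point = compressor.compress(('file-id', 'rev-1'), chunks, None)
# The first text in a group has nothing to delta against, so it is stored as a
# fulltext and extract() recovers it byte for byte.
extracted, extracted_sha1 = compressor.extract(('file-id', 'rev-1'))
assert ''.join(extracted) == ''.join(chunks)
assert extracted_sha1 == sha1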
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
245
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
246
def make_pack_factory(graph, delta, keylength):
247
    """Create a factory for creating a pack based groupcompress.
248
249
    This is only functional enough to run interface tests; it doesn't try to
250
    provide a full pack environment.
251
    
252
    :param graph: Store a graph.
253
    :param delta: Delta compress contents.
254
    :param keylength: How long keys should be.
255
    """
256
    def factory(transport):
257
        parents = graph or delta
258
        ref_length = 0
259
        if graph:
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
260
            ref_length = 1
0.17.7 by Robert Collins
Update for current index2 changes.
261
        graph_index = BTreeBuilder(reference_lists=ref_length,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
262
            key_elements=keylength)
263
        stream = transport.open_write_stream('newpack')
264
        writer = pack.ContainerWriter(stream.write)
265
        writer.begin()
266
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
0.17.9 by Robert Collins
Initial stab at repository format support.
267
            add_callback=graph_index.add_nodes)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
268
        access = _DirectPackAccess({})
269
        access.set_writer(writer, graph_index, (transport, 'newpack'))
0.17.2 by Robert Collins
Core proof of concept working.
270
        result = GroupCompressVersionedFiles(index, access, delta)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
271
        result.stream = stream
272
        result.writer = writer
273
        return result
274
    return factory
275
276
277
def cleanup_pack_group(versioned_files):
0.17.23 by Robert Collins
Only decompress as much of the zlib data as is needed to read the text recipe.
278
    versioned_files.writer.end()
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
279
    versioned_files.stream.close()
280
281
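For illustration, a hypothetical helper (not part of the plugin) showing how the two functions above fit together; the transport URL and the key are assumptions:

from bzrlib import transport as _mod_transport

def _demo_pack_group(url):
    """Create a pack-backed store at url, add one text, then finish the pack."""
    t = _mod_transport.get_transport(url)
    factory = make_pack_factory(graph=True, delta=True, keylength=1)
    vf = factory(t)
    try:
        vf.add_lines(('rev-1',), (), ['some content\n'])
    finally:
        cleanup_pack_group(vf)  # ends the pack writer and closes the stream
    return vf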
282
class GroupCompressVersionedFiles(VersionedFiles):
283
    """A group-compress based VersionedFiles implementation."""
284
0.17.2 by Robert Collins
Core proof of concept working.
285
    def __init__(self, index, access, delta=True):
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
286
        """Create a GroupCompressVersionedFiles object.
287
288
        :param index: The index object storing access and graph data.
289
        :param access: The access object storing raw data.
0.17.2 by Robert Collins
Core proof of concept working.
290
        :param delta: Whether to delta compress or just entropy compress.
291
        """
292
        self._index = index
293
        self._access = access
294
        self._delta = delta
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
295
        self._unadded_refs = {}
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
296
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
0.17.2 by Robert Collins
Core proof of concept working.
297
298
    def add_lines(self, key, parents, lines, parent_texts=None,
299
        left_matching_blocks=None, nostore_sha=None, random_id=False,
300
        check_content=True):
301
        """Add a text to the store.
302
303
        :param key: The key tuple of the text to add.
304
        :param parents: The parents key tuples of the text to add.
305
        :param lines: A list of lines. Each line must be a bytestring. And all
306
            of them except the last must be terminated with \n and contain no
307
            other \n's. The last line may either contain no \n's or a single
308
            terminating \n. If the lines list does not meet this constraint the add
309
            routine may error or may succeed - but you will be unable to read
310
            the data back accurately. (Checking the lines have been split
311
            correctly is expensive and extremely unlikely to catch bugs so it
312
            is not done at runtime unless check_content is True.)
313
        :param parent_texts: An optional dictionary containing the opaque 
314
            representations of some or all of the parents of version_id to
315
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
316
            returned by add_lines or data corruption can be caused.
317
        :param left_matching_blocks: a hint about which areas are common
318
            between the text and its left-hand-parent.  The format is
319
            the SequenceMatcher.get_matching_blocks format.
320
        :param nostore_sha: Raise ExistingContent and do not add the lines to
321
            the versioned file if the digest of the lines matches this.
322
        :param random_id: If True a random id has been selected rather than
323
            an id determined by some deterministic process such as a converter
324
            from a foreign VCS. When True the backend may choose not to check
325
            for uniqueness of the resulting key within the versioned file, so
326
            this should only be done when the result is expected to be unique
327
            anyway.
328
        :param check_content: If True, the lines supplied are verified to be
329
            bytestrings that are correctly formed lines.
330
        :return: The text sha1, the number of bytes in the text, and an opaque
331
                 representation of the inserted version which can be provided
332
                 back to future add_lines calls in the parent_texts dictionary.
333
        """
334
        self._index._check_write_ok()
335
        self._check_add(key, lines, random_id, check_content)
336
        if parents is None:
337
            # The caller might pass None if there is no graph data, but kndx
338
            # indexes can't directly store that, so we give them
339
            # an empty tuple instead.
340
            parents = ()
341
        # double handling for now. Make it work until then.
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
342
        length = sum(map(len, lines))
343
        record = ChunkedContentFactory(key, parents, None, lines)
0.17.5 by Robert Collins
nograph tests completely passing.
344
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
345
        return sha1, length, None
0.17.2 by Robert Collins
Core proof of concept working.
346
0.17.4 by Robert Collins
Annotate.
347
    def annotate(self, key):
348
        """See VersionedFiles.annotate."""
349
        graph = Graph(self)
0.17.5 by Robert Collins
nograph tests completely passing.
350
        parent_map = self.get_parent_map([key])
351
        if not parent_map:
352
            raise errors.RevisionNotPresent(key, self)
353
        if parent_map[key] is not None:
354
            search = graph._make_breadth_first_searcher([key])
355
            keys = set()
356
            while True:
357
                try:
358
                    present, ghosts = search.next_with_ghosts()
359
                except StopIteration:
360
                    break
361
                keys.update(present)
362
            parent_map = self.get_parent_map(keys)
363
        else:
364
            keys = [key]
365
            parent_map = {key:()}
0.17.4 by Robert Collins
Annotate.
366
        head_cache = _mod_graph.FrozenHeadsCache(graph)
367
        parent_cache = {}
368
        reannotate = annotate.reannotate
369
        for record in self.get_record_stream(keys, 'topological', True):
370
            key = record.key
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
371
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
0.17.4 by Robert Collins
Annotate.
372
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
373
            parent_cache[key] = list(
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
374
                reannotate(parent_lines, chunks, key, None, head_cache))
0.17.4 by Robert Collins
Annotate.
375
        return parent_cache[key]
376
0.17.5 by Robert Collins
nograph tests completely passing.
377
    def check(self, progress_bar=None):
378
        """See VersionedFiles.check()."""
379
        keys = self.keys()
380
        for record in self.get_record_stream(keys, 'unordered', True):
381
            record.get_bytes_as('fulltext')
382
0.17.2 by Robert Collins
Core proof of concept working.
383
    def _check_add(self, key, lines, random_id, check_content):
384
        """check that version_id and lines are safe to add."""
385
        version_id = key[-1]
0.17.26 by Robert Collins
Working better --gc-plain-chk.
386
        if version_id is not None:
387
            if contains_whitespace(version_id):
388
                raise errors.InvalidRevisionId(version_id, self)
0.17.2 by Robert Collins
Core proof of concept working.
389
        self.check_not_reserved_id(version_id)
390
        # TODO: If random_id==False and the key is already present, we should
391
        # probably check that the existing content is identical to what is
392
        # being inserted, and otherwise raise an exception.  This would make
393
        # the bundle code simpler.
394
        if check_content:
395
            self._check_lines_not_unicode(lines)
396
            self._check_lines_are_lines(lines)
397
0.17.5 by Robert Collins
nograph tests completely passing.
398
    def get_parent_map(self, keys):
399
        """Get a map of the parents of keys.
400
401
        :param keys: The keys to look up parents for.
402
        :return: A mapping from keys to parents. Absent keys are absent from
403
            the mapping.
404
        """
405
        result = {}
406
        sources = [self._index]
407
        source_results = []
408
        missing = set(keys)
409
        for source in sources:
410
            if not missing:
411
                break
412
            new_result = source.get_parent_map(missing)
413
            source_results.append(new_result)
414
            result.update(new_result)
415
            missing.difference_update(set(new_result))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
416
        if self._unadded_refs:
417
            for key in missing:
418
                if key in self._unadded_refs:
419
                    result[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
420
        return result
421
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
422
    def _get_group_and_delta_bytes(self, index_memo):
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
423
        read_memo = index_memo[0:3]
424
        # get the group:
425
        try:
426
            plain = self._group_cache[read_memo]
427
        except KeyError:
428
            # read the group
429
            zdata = self._access.get_raw_records([read_memo]).next()
430
            # decompress - whole thing - this is not a bug, as it
431
            # permits caching. We might want to store the partially
432
            # decompressed group and decompress object, so that recent
433
            # texts are not penalised by big groups.
434
            plain = zlib.decompress(zdata) #, index_memo[4])
435
            self._group_cache[read_memo] = plain
436
        # cheapo debugging:
437
        # print len(zdata), len(plain)
438
        # parse - requires split_lines, better to have byte offsets
439
        # here (but not by much - we only split the region for the
440
        # recipe, and we often want to end up with lines anyway).
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
441
        return plain, plain[index_memo[3]:index_memo[4]]
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
442
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
443
    def get_missing_compression_parent_keys(self):
444
        """Return the keys of missing compression parents.
445
446
        Missing compression parents occur when a record stream was missing
447
        basis texts, or an index was scanned that had missing basis texts.
448
        """
449
        # GroupCompress cannot currently reference texts that are not in the
450
        # group, so this is valid for now
451
        return frozenset()
452
0.17.5 by Robert Collins
nograph tests completely passing.
453
    def get_record_stream(self, keys, ordering, include_delta_closure):
454
        """Get a stream of records for keys.
455
456
        :param keys: The keys to include.
457
        :param ordering: Either 'unordered' or 'topological'. A topologically
458
            sorted stream has compression parents strictly before their
459
            children.
460
        :param include_delta_closure: If True then the closure across any
461
            compression parents will be included (in the opaque data).
462
        :return: An iterator of ContentFactory objects, each of which is only
463
            valid until the iterator is advanced.
464
        """
465
        # keys might be a generator
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
466
        orig_keys = list(keys)
467
        keys = set(orig_keys)
0.17.5 by Robert Collins
nograph tests completely passing.
468
        if not keys:
469
            return
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
470
        if (not self._index.has_graph
471
            and ordering in ('topological', 'gc-optimal')):
0.17.5 by Robert Collins
nograph tests completely passing.
472
            # Cannot topological order when no graph has been stored.
473
            ordering = 'unordered'
474
        # Cheap: iterate
475
        locations = self._index.get_build_details(keys)
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
476
        local_keys = frozenset(keys).intersection(set(self._unadded_refs))
0.17.5 by Robert Collins
nograph tests completely passing.
477
        if ordering == 'topological':
478
            # would be better to not globally sort initially but instead
479
            # start with one key, recurse to its oldest parent, then grab
480
            # everything in the same group, etc.
481
            parent_map = dict((key, details[2]) for key, details in
482
                locations.iteritems())
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
483
            for key in local_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
484
                parent_map[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
485
            present_keys = topo_sort(parent_map)
486
            # Now group by source:
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
487
        elif ordering == 'gc-optimal':
488
            parent_map = dict((key, details[2]) for key, details in
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
489
                              locations.iteritems())
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
490
            for key in local_keys:
491
                parent_map[key] = self._unadded_refs[key]
492
            # XXX: This only optimizes for the target ordering. We may need to
493
            #      balance that with the time it takes to extract ordering, by
494
            #      somehow grouping based on locations[key][0:3]
495
            present_keys = sort_gc_optimal(parent_map)
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
496
        elif ordering == 'as-requested':
0.20.32 by John Arbash Meinel
Fix bug #336373 by adding local keys to locations after the fact, rather than before.
497
            present_keys = [key for key in orig_keys if key in locations
498
                            or key in local_keys]
0.17.5 by Robert Collins
nograph tests completely passing.
499
        else:
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
500
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
501
            # Otherwise we thrash the _group_cache and destroy performance
502
            def get_group(key):
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
503
                # This is the group the bytes are stored in, followed by the
504
                # location in the group
505
                return locations[key][0]
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
506
            present_keys = sorted(locations.iterkeys(), key=get_group)
507
            # We don't have an ordering for keys in the in-memory object, but
508
            # let's process the in-memory ones first.
509
            present_keys = list(local_keys) + present_keys
0.20.32 by John Arbash Meinel
Fix bug #336373 by adding local keys to locations after the fact, rather than before.
510
        locations.update((key, None) for key in local_keys)
0.17.5 by Robert Collins
nograph tests completely passing.
511
        absent_keys = keys.difference(set(locations))
512
        for key in absent_keys:
513
            yield AbsentContentFactory(key)
514
        for key in present_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
515
            if key in self._unadded_refs:
0.20.28 by John Arbash Meinel
Fix typo with the recent lines => chunks rename.
516
                chunks, sha1 = self._compressor.extract(key)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
517
                parents = self._unadded_refs[key]
518
            else:
519
                index_memo, _, parents, (method, _) = locations[key]
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
520
                plain, delta_bytes = self._get_group_and_delta_bytes(index_memo)
521
                action, label, sha1, delta = parse(delta_bytes)
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
522
                if not _NO_LABELS and label != key:
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
523
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
524
                if action == 'fulltext':
525
                    chunks = [delta]
526
                else:
527
                    # TODO: relax apply_delta so that it can allow source to be
528
                    #       longer than expected
0.23.21 by John Arbash Meinel
Rename the extension to _pyx, since Robert prefers that form
529
                    bytes = _groupcompress_pyx.apply_delta(plain, delta)
0.23.10 by John Arbash Meinel
Allowing the source bytes to be longer than expected.
530
                    if bytes is None:
531
                        raise AssertionError('apply_delta returned None')
532
                    chunks = [bytes]
533
                    del bytes
0.23.13 by John Arbash Meinel
Factor out the ability to have/not have labels.
534
                if _NO_LABELS:
535
                    sha1 = sha_strings(chunks)
536
                else:
537
                    if sha_strings(chunks) != sha1:
538
                        raise AssertionError('sha1 sum did not match')
0.20.22 by John Arbash Meinel
Make it clear that the bits you get from 'apply_delta' are chunks, not lines.
539
            yield ChunkedContentFactory(key, parents, sha1, chunks)
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
540
0.17.5 by Robert Collins
nograph tests completely passing.
541
    def get_sha1s(self, keys):
542
        """See VersionedFiles.get_sha1s()."""
543
        result = {}
544
        for record in self.get_record_stream(keys, 'unordered', True):
545
            if record.sha1 is not None:
546
                result[record.key] = record.sha1
547
            else:
548
                if record.storage_kind != 'absent':
549
                    result[record.key] = sha_string(record.get_bytes_as(
550
                        'fulltext'))
551
        return result
552
0.17.2 by Robert Collins
Core proof of concept working.
553
    def insert_record_stream(self, stream):
554
        """Insert a record stream into this container.
555
556
        :param stream: A stream of records to insert. 
557
        :return: None
558
        :seealso VersionedFiles.get_record_stream:
559
        """
0.17.5 by Robert Collins
nograph tests completely passing.
560
        for _ in self._insert_record_stream(stream):
561
            pass
0.17.2 by Robert Collins
Core proof of concept working.
562
0.17.5 by Robert Collins
nograph tests completely passing.
563
    def _insert_record_stream(self, stream, random_id=False):
0.17.2 by Robert Collins
Core proof of concept working.
564
        """Internal core to insert a record stream into this container.
565
566
        This helper function has a different interface than insert_record_stream
567
        to allow add_lines to be minimal, but still return the needed data.
568
569
        :param stream: A stream of records to insert. 
570
        :return: An iterator over the sha1 of the inserted records.
571
        :seealso insert_record_stream:
572
        :seealso add_lines:
573
        """
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
574
        adapters = {}
0.17.5 by Robert Collins
nograph tests completely passing.
575
        def get_adapter(adapter_key):
576
            try:
577
                return adapters[adapter_key]
578
            except KeyError:
579
                adapter_factory = adapter_registry.get(adapter_key)
580
                adapter = adapter_factory(self)
581
                adapters[adapter_key] = adapter
582
                return adapter
0.17.2 by Robert Collins
Core proof of concept working.
583
        # This will go up to fulltexts for gc to gc fetching, which isn't
584
        # ideal.
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
585
        self._compressor = GroupCompressor(self._delta)
586
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
587
        keys_to_add = []
588
        basis_end = 0
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
589
        groups = 1
590
        def flush():
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
591
            compressed = zlib.compress(''.join(self._compressor.lines))
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
592
            index, start, length = self._access.add_raw_records(
593
                [(None, len(compressed))], compressed)[0]
594
            nodes = []
595
            for key, reads, refs in keys_to_add:
596
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
597
            self._index.add_records(nodes, random_id=random_id)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
598
        last_prefix = None
0.17.2 by Robert Collins
Core proof of concept working.
599
        for record in stream:
0.17.5 by Robert Collins
nograph tests completely passing.
600
            # Raise an error when a record is missing.
601
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
602
                raise errors.RevisionNotPresent(record.key, self)
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
603
            try:
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
604
                lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
605
            except errors.UnavailableRepresentation:
0.17.5 by Robert Collins
nograph tests completely passing.
606
                adapter_key = record.storage_kind, 'fulltext'
607
                adapter = get_adapter(adapter_key)
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
608
                bytes = adapter.get_bytes(record)
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
609
                lines = osutils.split_lines(bytes)
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
610
            soft = False
0.20.13 by John Arbash Meinel
Play around a bit.
611
            if len(record.key) > 1:
612
                prefix = record.key[0]
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
613
                if (last_prefix is not None and prefix != last_prefix):
614
                    soft = True
0.23.11 by John Arbash Meinel
Insert a fulltext if the delta is more than half the total size.
615
                    if basis_end > 1024 * 1024 * 2:
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
616
                        flush()
617
                        self._compressor = GroupCompressor(self._delta)
618
                        self._unadded_refs = {}
619
                        keys_to_add = []
620
                        basis_end = 0
621
                        groups += 1
622
                last_prefix = prefix
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
623
            found_sha1, end_point = self._compressor.compress(record.key,
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
624
                lines, record.sha1, soft=soft)
0.17.26 by Robert Collins
Working better --gc-plain-chk.
625
            if record.key[-1] is None:
626
                key = record.key[:-1] + ('sha1:' + found_sha1,)
627
            else:
628
                key = record.key
629
            self._unadded_refs[key] = record.parents
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
630
            yield found_sha1
0.17.26 by Robert Collins
Working better --gc-plain-chk.
631
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
0.17.5 by Robert Collins
nograph tests completely passing.
632
                (record.parents,)))
633
            basis_end = end_point
0.23.11 by John Arbash Meinel
Insert a fulltext if the delta is more than half the total size.
634
            if basis_end > 1024 * 1024 * 4:
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
635
                flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
636
                self._compressor = GroupCompressor(self._delta)
637
                self._unadded_refs = {}
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
638
                keys_to_add = []
639
                basis_end = 0
640
                groups += 1
0.17.8 by Robert Collins
Flush pending updates at the end of _insert_record_stream
641
        if len(keys_to_add):
642
            flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
643
        self._compressor = None
644
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
645
646
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
647
        """Iterate over the lines in the versioned files from keys.
648
649
        This may return lines from other keys. Each item the returned
650
        iterator yields is a tuple of a line and a text version that that line
651
        is present in (not introduced in).
652
653
        Ordering of results is in whatever order is most suitable for the
654
        underlying storage format.
655
656
        If a progress bar is supplied, it may be used to indicate progress.
657
        The caller is responsible for cleaning up progress bars (because this
658
        is an iterator).
659
660
        NOTES:
661
         * Lines are normalised by the underlying store: they will all have \n
662
           terminators.
663
         * Lines are returned in arbitrary order.
664
665
        :return: An iterator over (line, key).
666
        """
667
        if pb is None:
668
            pb = progress.DummyProgress()
669
        keys = set(keys)
670
        total = len(keys)
671
        # we don't care about inclusions, the caller cares.
672
        # but we need to setup a list of records to visit.
673
        # we need key, position, length
674
        for key_idx, record in enumerate(self.get_record_stream(keys,
675
            'unordered', True)):
676
            # XXX: todo - optimise to use less than full texts.
677
            key = record.key
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
678
            pb.update('Walking content.', key_idx + 1, total)
0.17.5 by Robert Collins
nograph tests completely passing.
679
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
680
                raise errors.RevisionNotPresent(key, self)
0.17.5 by Robert Collins
nograph tests completely passing.
681
            lines = split_lines(record.get_bytes_as('fulltext'))
682
            for line in lines:
683
                yield line, key
684
        pb.update('Walking content.', total, total)
685
686
    def keys(self):
687
        """See VersionedFiles.keys."""
688
        if 'evil' in debug.debug_flags:
689
            trace.mutter_callsite(2, "keys scales with size of history")
690
        sources = [self._index]
691
        result = set()
692
        for source in sources:
693
            result.update(source.keys())
694
        return result
695
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
696
697
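For illustration, a hypothetical read-back helper for a GroupCompressVersionedFiles store such as the one built by make_pack_factory above; the helper name and the choice of 'as-requested' ordering are assumptions:

def _demo_read_back(vf, keys):
    """Return a dict mapping each present key to its fulltext."""
    texts = {}
    for record in vf.get_record_stream(keys, 'as-requested', True):
        if record.storage_kind == 'absent':
            continue
        texts[record.key] = record.get_bytes_as('fulltext')
    return texts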
class _GCGraphIndex(object):
698
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
699
0.17.9 by Robert Collins
Initial stab at repository format support.
700
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
701
        add_callback=None):
702
        """Construct a _GCGraphIndex on a graph_index.
703
704
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
705
        :param is_locked: A callback, returns True if the index is locked and
706
            thus usable.
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
707
        :param parents: If True, record knits parents, if not do not record 
708
            parents.
709
        :param add_callback: If not None, allow additions to the index and call
710
            this callback with a list of added GraphIndex nodes:
711
            [(node, value, node_refs), ...]
712
        """
713
        self._add_callback = add_callback
714
        self._graph_index = graph_index
715
        self._parents = parents
716
        self.has_graph = parents
717
        self._is_locked = is_locked
718
0.17.5 by Robert Collins
nograph tests completely passing.
719
    def add_records(self, records, random_id=False):
720
        """Add multiple records to the index.
721
        
722
        This function does not insert data into the Immutable GraphIndex
723
        backing the _GCGraphIndex; instead it prepares data for insertion by
724
        the caller and checks that it is safe to insert then calls
725
        self._add_callback with the prepared GraphIndex nodes.
726
727
        :param records: a list of tuples:
728
                         (key, options, access_memo, parents).
729
        :param random_id: If True the ids being added were randomly generated
730
            and no check for existence will be performed.
731
        """
732
        if not self._add_callback:
733
            raise errors.ReadOnlyError(self)
734
        # we hope there are no repositories with inconsistent parentage
735
        # anymore.
736
737
        changed = False
738
        keys = {}
739
        for (key, value, refs) in records:
740
            if not self._parents:
741
                if refs:
742
                    for ref in refs:
743
                        if ref:
744
                            raise errors.KnitCorrupt(self,
745
                                "attempt to add node with parents "
746
                                "in parentless index.")
747
                    refs = ()
748
                    changed = True
749
            keys[key] = (value, refs)
750
        # check for dups
751
        if not random_id:
752
            present_nodes = self._get_entries(keys)
753
            for (index, key, value, node_refs) in present_nodes:
754
                if node_refs != keys[key][1]:
755
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
756
                        ": %s %s" % ((value, node_refs), keys[key]))
757
                del keys[key]
758
                changed = True
759
        if changed:
760
            result = []
761
            if self._parents:
762
                for key, (value, node_refs) in keys.iteritems():
763
                    result.append((key, value, node_refs))
764
            else:
765
                for key, (value, node_refs) in keys.iteritems():
766
                    result.append((key, value))
767
            records = result
768
        self._add_callback(records)
769
        
770
    def _check_read(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
771
        """Raise an exception if reads are not permitted."""
0.17.5 by Robert Collins
nograph tests completely passing.
772
        if not self._is_locked():
773
            raise errors.ObjectNotLocked(self)
774
0.17.2 by Robert Collins
Core proof of concept working.
775
    def _check_write_ok(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
776
        """Raise an exception if writes are not permitted."""
0.17.2 by Robert Collins
Core proof of concept working.
777
        if not self._is_locked():
778
            raise errors.ObjectNotLocked(self)
779
0.17.5 by Robert Collins
nograph tests completely passing.
780
    def _get_entries(self, keys, check_present=False):
781
        """Get the entries for keys.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
782
783
        Note: Callers are responsible for checking that the index is locked
784
        before calling this method.
785
0.17.5 by Robert Collins
nograph tests completely passing.
786
        :param keys: An iterable of index key tuples.
787
        """
788
        keys = set(keys)
789
        found_keys = set()
790
        if self._parents:
791
            for node in self._graph_index.iter_entries(keys):
792
                yield node
793
                found_keys.add(node[1])
794
        else:
795
            # adapt parentless index to the rest of the code.
796
            for node in self._graph_index.iter_entries(keys):
797
                yield node[0], node[1], node[2], ()
798
                found_keys.add(node[1])
799
        if check_present:
800
            missing_keys = keys.difference(found_keys)
801
            if missing_keys:
802
                raise errors.RevisionNotPresent(missing_keys.pop(), self)
803
804
    def get_parent_map(self, keys):
805
        """Get a map of the parents of keys.
806
807
        :param keys: The keys to look up parents for.
808
        :return: A mapping from keys to parents. Absent keys are absent from
809
            the mapping.
810
        """
811
        self._check_read()
812
        nodes = self._get_entries(keys)
813
        result = {}
814
        if self._parents:
815
            for node in nodes:
816
                result[node[1]] = node[3][0]
817
        else:
818
            for node in nodes:
819
                result[node[1]] = None
820
        return result
821
822
    def get_build_details(self, keys):
823
        """Get the various build details for keys.
824
825
        Ghosts are omitted from the result.
826
827
        :param keys: An iterable of keys.
828
        :return: A dict of key:
829
            (index_memo, compression_parent, parents, record_details).
830
            index_memo
831
                opaque structure to pass to read_records to extract the raw
832
                data
833
            compression_parent
834
                Content that this record is built upon, may be None
835
            parents
836
                Logical parents of this node
837
            record_details
838
                extra information about the content which needs to be passed to
839
                Factory.parse_record
840
        """
841
        self._check_read()
842
        result = {}
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
843
        entries = self._get_entries(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
844
        for entry in entries:
845
            key = entry[1]
846
            if not self._parents:
847
                parents = None
848
            else:
849
                parents = entry[3][0]
850
            method = 'group'
851
            result[key] = (self._node_to_position(entry),
852
                                  None, parents, (method, None))
853
        return result
854
    
855
    def keys(self):
856
        """Get all the keys in the collection.
857
        
858
        The keys are not ordered.
859
        """
860
        self._check_read()
861
        return [node[1] for node in self._graph_index.iter_all_entries()]
862
    
863
    def _node_to_position(self, node):
864
        """Convert an index value to position details."""
865
        bits = node[2].split(' ')
866
        # It would be nice not to read the entire gzip.
867
        start = int(bits[0])
868
        stop = int(bits[1])
869
        basis_end = int(bits[2])
870
        delta_end = int(bits[3])
871
        return node[0], start, stop, basis_end, delta_end
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
872
873
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
874
def _get_longest_match(equivalence_table, pos, max_pos, locations):
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
875
    """Get the longest possible match for the current position."""
876
    range_start = pos
877
    range_len = 0
878
    copy_ends = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
879
    while pos < max_pos:
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
880
        if locations is None:
881
            locations = equivalence_table.get_idx_matches(pos)
882
        if locations is None:
883
            # No more matches, just return whatever we have, but we know that
884
            # this last position is not going to match anything
885
            pos += 1
886
            break
887
        else:
888
            if copy_ends is None:
889
                # We are starting a new range
890
                copy_ends = [loc + 1 for loc in locations]
891
                range_len = 1
892
                locations = None # Consumed
893
            else:
894
                # We are currently in the middle of a match
895
                next_locations = set(copy_ends).intersection(locations)
896
                if len(next_locations):
897
                    # range continues
898
                    copy_ends = [loc + 1 for loc in next_locations]
899
                    range_len += 1
900
                    locations = None # Consumed
901
                else:
902
                    # But we are done with this match, we should be
903
                    # starting a new one, though. We will pass back 'locations'
904
                    # so that we don't have to do another lookup.
905
                    break
906
        pos += 1
907
    if copy_ends is None:
908
        return None, pos, locations
909
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
910
911
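For illustration, a minimal sketch of the contract _get_longest_match relies on; _FakeEquivalenceTable is a made-up stand-in for the plugin's EquivalenceTable, whose get_idx_matches(pos) returns the left-hand positions matching right-hand line pos, or None when there is no match:

class _FakeEquivalenceTable(object):

    def __init__(self, matches):
        self._matches = matches

    def get_idx_matches(self, pos):
        return self._matches.get(pos)

# Right-hand lines 0 and 1 match left-hand lines 10 and 11 consecutively, so
# the longest match starting at position 0 is two lines long and begins at
# left-hand line 10.
_table = _FakeEquivalenceTable({0: [10], 1: [11]})
match, next_pos, _ = _get_longest_match(_table, 0, 2, None)
assert match == (10, 0, 2) and next_pos == 2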
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
912
try:
0.23.21 by John Arbash Meinel
Rename the extension to _pyx, since Robert prefers that form
913
    from bzrlib.plugins.groupcompress_rabin import _groupcompress_pyx
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
914
except ImportError:
915
    pass