# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import struct
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    osutils,
    pack,
    patiencediff,
    progress,  # used by iter_lines_added_or_present_in_keys below
    trace,     # used by keys() below
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.btree_index import BTreeBuilder
from bzrlib.lru_cache import LRUSizeCache
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    ChunkedContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )
from bzrlib.plugins.groupcompress import errors as gc_errors
# The compiled extension is assumed to live alongside this module; it
# provides the DeltaIndex and apply_delta primitives used below.
from bzrlib.plugins.groupcompress import _groupcompress_pyx

_NO_LABELS = False
_FAST = False

def parse(bytes):
    if _NO_LABELS:
        action_byte = bytes[0]
        action = {'f':'fulltext', 'd':'delta'}[action_byte]
        return action, None, None, bytes[1:]
    (action, label_line, sha1_line, len_line,
     delta_bytes) = bytes.split('\n', 4)
    if (action not in ('fulltext', 'delta')
        or not label_line.startswith('label:')
        or not sha1_line.startswith('sha1:')
        or not len_line.startswith('len:')
        ):
        raise AssertionError("bad text record %r" % (bytes,))
    label = tuple(label_line[6:].split('\x00'))
    sha1 = sha1_line[5:]
    length = int(len_line[4:])
    if not len(delta_bytes) == length:
        raise AssertionError("bad length record %r" % (bytes,))
    return action, label, sha1, delta_bytes
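
# Illustrative record layout (not part of the original module): with labels
# enabled, a serialised record handed to parse() looks roughly like
#
#   'fulltext\nlabel:file-id\x00rev-id\nsha1:<hex sha1>\nlen:11\nhello world'
#
# and parse() returns ('fulltext', ('file-id', 'rev-id'), '<hex sha1>',
# 'hello world').  Delta records use the 'delta' action and carry the raw
# delta bytes in place of the fulltext.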


def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding."""
    bytes = []
    count = 0
    while val >= 0x80:
        bytes.append(chr((val | 0x80) & 0xFF))
        val >>= 7
    bytes.append(chr(val))
    return ''.join(bytes)


def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding."""
    offset = 0
    val = 0
    shift = 0
    bval = ord(bytes[offset])
    while bval >= 0x80:
        val |= (bval & 0x7F) << shift
        shift += 7
        offset += 1
        bval = ord(bytes[offset])
    val |= bval << shift
    offset += 1
    return val, offset
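
# Illustrative round trip (not part of the original module): the encoding is
# little-endian base-128, with the high bit of each byte used as a
# continuation flag, so
#   encode_base128_int(130) == '\x82\x01'
#   decode_base128_int('\x82\x01') == (130, 2)
# where the 2 is the number of bytes consumed.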


def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into gc-optimal order.

    gc-optimal is defined (currently) as reverse-topological order, grouped by
    the key prefix.

    :return: A sorted-list of keys
    """
    # gc-optimal ordering is approximately reverse topological,
    # properly grouped by file-id.
    per_prefix_map = {}
    for item in parent_map.iteritems():
        key = item[0]
        if isinstance(key, str) or len(key) == 1:
            prefix = ''
        else:
            prefix = key[0]
        try:
            per_prefix_map[prefix].append(item)
        except KeyError:
            per_prefix_map[prefix] = [item]

    present_keys = []
    for prefix in sorted(per_prefix_map):
        present_keys.extend(reversed(topo_sort(per_prefix_map[prefix])))
    return present_keys
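
# Illustrative example (not part of the original module): for
#   {('f1', 'b'): (('f1', 'a'),), ('f1', 'a'): (), ('f2', 'x'): ()}
# the keys are grouped by prefix ('f1', 'f2') and each group is emitted in
# reverse topological order, giving roughly
#   [('f1', 'b'), ('f1', 'a'), ('f2', 'x')]
# i.e. newest first within a file, files in sorted prefix order.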


class GroupCompressBlockEntry(object):
    """Track the information about a single object inside a GC group.

    This is generally just the dumb data structure.
    """

    def __init__(self, key, type, sha1, start, length):
        self.key = key
        self.type = type # delta, fulltext, external?
        self.sha1 = sha1 # Sha1 of content
        self.start = start # Byte offset to start of data
        self.length = length # Length of content


class GroupCompressBlock(object):
    """An object which maintains the internal structure of the compressed data.

    This tracks the meta info (start of text, length, type, etc.)
    """

    # Group Compress Block v1 Plain
    GCB_HEADER = 'gcb1p\n'

    def __init__(self):
        # map by key? or just order in file?
        self._entries = {}

    def _parse_header(self):
        """Parse the meta-info from the stream."""

    @classmethod
    def from_zlib_bytes(cls, bytes):
        """Get the info about this block from the compressed bytes.

        :return: A new GroupCompressBlock
        """
        return cls()

    @classmethod
    def from_bytes(cls, bytes):
        out = cls()
        if bytes[:6] != cls.GCB_HEADER:
            raise gc_errors.InvalidGroupCompressBlock(
                'bytes did not start with %r' % (cls.GCB_HEADER,))
        return out

    def extract(self, key, sha1=None):
        """Extract the text for a specific key.

        :param key: The label used for this content
        :param sha1: TODO (should we validate only when sha1 is supplied?)
        :return: The bytes for the content
        """

    def add_entry(self, key, type, sha1, start, length):
        """Add new meta info about an entry.

        :param key: The key for the new content
        :param type: Whether this is a delta or fulltext entry (external?)
        :param sha1: sha1sum of the fulltext of this entry
        :param start: where the encoded bytes start
        :param length: total number of bytes in the encoded form
        :return: The entry?
        """
        entry = GroupCompressBlockEntry(key, type, sha1, start, length)
        assert key not in self._entries
        self._entries[key] = entry
        return entry

    def to_bytes(self):
        """Encode the information into a byte stream."""
        chunks = []
        for key in sorted(self._entries):
            entry = self._entries[key]
            chunk = ('key:%s\n'
                     'type:%s\n'
                     'sha1:%s\n'
                     'start:%s\n'
                     'length:%s\n'
                     '\n'
                     ) % ('\x00'.join(entry.key),
                          entry.type,
                          entry.sha1,
                          entry.start,
                          entry.length,
                          )
            chunks.append(chunk)
        info_len = sum(map(len, chunks))
        chunks = [self.GCB_HEADER, '%d\n' % (info_len,)] + chunks
        return ''.join(chunks)
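
    # Illustrative serialised layout (not part of the original module):
    # to_bytes() writes the 'gcb1p\n' header, then the total length of the
    # per-entry metadata on its own line, then one stanza per entry:
    #   key:<\x00-joined key tuple>
    #   type:<fulltext|delta>
    #   sha1:<hex sha1 of the fulltext>
    #   start:<byte offset of the entry's data>
    #   length:<byte length of the entry's data>
    #   <blank line>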


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because it has a similar
    task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting
       minimal edits only).
     - we chain the left side, not the right side
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        # Consider seeding the lines with some sort of GC Start flag, or
        # putting it as part of the output stream, rather than in the
        # compressed bytes.
        self.lines = []
        self.endpoint = 0
        self.input_bytes = 0
        self.labels_deltas = {}
        self._delta_index = _groupcompress_pyx.DeltaIndex()

    def compress(self, key, bytes, expected_sha, soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param bytes: The bytes to be compressed
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :param soft: Do a 'soft' compression. This means that we require larger
            ranges to match to be considered for a copy command.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        if not _FAST or expected_sha is None:
            sha1 = sha_string(bytes)
        else:
            sha1 = expected_sha
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        label = '\x00'.join(key)
        input_len = len(bytes)
        # By having action/label/sha1/len, we can parse the group if the index
        # was ever destroyed, we have the key in 'label', we know the final
        # bytes are valid from sha1, and we know where to find the end of this
        # record because of 'len'. (the delta record itself will store the
        # total length for the expanded record)
        # 'len: %d\n' costs approximately 1% increase in total data
        # Having the labels at all costs us 9-10% increase, 38% increase for
        # inventory pages, and 5.8% increase for text pages
        if _NO_LABELS:
            new_chunks = []
        else:
            new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
        if self._delta_index._source_offset != self.endpoint:
            raise AssertionError('_source_offset != endpoint'
                ' somehow the DeltaIndex got out of sync with'
                ' the output lines')
        max_delta_size = len(bytes) / 2
        delta = self._delta_index.make_delta(bytes, max_delta_size)
        if (delta is None):
            # We can't delta (perhaps source_text is empty)
            # so mark this as an insert
            if _NO_LABELS:
                new_chunks = ['f']
            else:
                new_chunks.insert(0, 'fulltext\n')
                new_chunks.append('len:%s\n' % (input_len,))
            unadded_bytes = sum(map(len, new_chunks))
            self._delta_index.add_source(bytes, unadded_bytes)
            new_chunks.append(bytes)
        else:
            if _NO_LABELS:
                new_chunks = ['d']
            else:
                new_chunks.insert(0, 'delta\n')
                new_chunks.append('len:%s\n' % (len(delta),))
            if _FAST:
                new_chunks.append(delta)
                unadded_bytes = sum(map(len, new_chunks))
                self._delta_index._source_offset += unadded_bytes
            else:
                unadded_bytes = sum(map(len, new_chunks))
                self._delta_index.add_delta_source(delta, unadded_bytes)
                new_chunks.append(delta)
        delta_start = (self.endpoint, len(self.lines))
        self.output_chunks(new_chunks)
        self.input_bytes += input_len
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        if not self._delta_index._source_offset == self.endpoint:
            raise AssertionError('the delta index is out of sync'
                ' with the output lines %s != %s'
                % (self._delta_index._source_offset, self.endpoint))
        return sha1, self.endpoint

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
        action, label, sha1, delta = parse(''.join(delta_chunks))
        if not _NO_LABELS and label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        if action == 'fulltext':
            bytes = delta
        else:
            source = ''.join(self.lines[delta_details[0][0]])
            bytes = _groupcompress_pyx.apply_delta(source, delta)
        if _NO_LABELS:
            sha1 = sha_string(bytes)
        else:
            assert sha1 == sha_string(bytes)
        return [bytes], sha1

    def output_chunks(self, new_chunks):
        """Output some chunks.

        :param new_chunks: The chunks to output.
        """
        endpoint = self.endpoint
        self.lines.extend(new_chunks)
        endpoint += sum(map(len, new_chunks))
        self.endpoint = endpoint

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)
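
# Illustrative usage of GroupCompressor (not part of the original module):
# texts are fed in one at a time; compress() returns the sha1 and the running
# size of the group, and extract() recovers a previously added text.
#
#   compressor = GroupCompressor()
#   sha1, end_point = compressor.compress(('file-id', 'rev-1'),
#                                          'hello world\n', None)
#   chunks, sha1 = compressor.extract(('file-id', 'rev-1'))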


def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests, it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length = 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory


def cleanup_pack_group(versioned_files):
    versioned_files.writer.end()
    versioned_files.stream.close()
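
# Illustrative usage (not part of the original module): make_pack_factory()
# wires a BTreeBuilder-backed _GCGraphIndex and a pack container writer onto
# a transport; cleanup_pack_group() finalises the pack once writing is done.
#
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(transport)   # 'transport' is any writable bzrlib transport
#   vf.add_lines(('rev-1',), (), ['hello\n'])
#   cleanup_pack_group(vf)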


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint
            the add routine may error or may succeed - but you will be unable
            to read the data back accurately. (Checking the lines have been
            split correctly is expensive and extremely unlikely to catch bugs
            so it is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        length = sum(map(len, lines))
        record = ChunkedContentFactory(key, parents, None, lines)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, length, None

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, chunks, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """Check that version_id and lines are safe to add."""
        version_id = key[-1]
        if version_id is not None:
            if contains_whitespace(version_id):
                raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result

    def _get_group_and_delta_bytes(self, index_memo):
        read_memo = index_memo[0:3]
        # get the group:
        try:
            plain = self._group_cache[read_memo]
        except KeyError:
            # read the group
            zdata = self._access.get_raw_records([read_memo]).next()
            # decompress - whole thing - this is not a bug, as it
            # permits caching. We might want to store the partially
            # decompressed group and decompress object, so that recent
            # texts are not penalised by big groups.
            plain = zlib.decompress(zdata) #, index_memo[4])
            self._group_cache[read_memo] = plain
        # cheapo debugging:
        # print len(zdata), len(plain)
        # parse - requires split_lines, better to have byte offsets
        # here (but not by much - we only split the region for the
        # recipe, and we often want to end up with lines anyway).
        return plain, plain[index_memo[3]:index_memo[4]]

    def get_missing_compression_parent_keys(self):
        """Return the keys of missing compression parents.

        Missing compression parents occur when a record stream was missing
        basis texts, or an index was scanned that had missing basis texts.
        """
        # GroupCompress cannot currently reference texts that are not in the
        # group, so this is valid for now
        return frozenset()

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        orig_keys = list(keys)
        keys = set(orig_keys)
        if not keys:
            return
        if (not self._index.has_graph
            and ordering in ('topological', 'gc-optimal')):
            # Cannot topological order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        local_keys = frozenset(keys).intersection(set(self._unadded_refs))
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            for key in local_keys:
                parent_map[key] = self._unadded_refs[key]
            present_keys = topo_sort(parent_map)
            # Now group by source:
        elif ordering == 'gc-optimal':
            parent_map = dict((key, details[2]) for key, details in
                              locations.iteritems())
            for key in local_keys:
                parent_map[key] = self._unadded_refs[key]
            # XXX: This only optimizes for the target ordering. We may need to
            #      balance that with the time it takes to extract ordering, by
            #      somehow grouping based on locations[key][0:3]
            present_keys = sort_gc_optimal(parent_map)
        elif ordering == 'as-requested':
            present_keys = [key for key in orig_keys if key in locations
                            or key in local_keys]
        else:
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
            # Otherwise we thrash the _group_cache and destroy performance
            def get_group(key):
                # This is the group the bytes are stored in, followed by the
                # location in the group
                return locations[key][0]
            present_keys = sorted(locations.iterkeys(), key=get_group)
            # We don't have an ordering for keys in the in-memory object, but
            # lets process the in-memory ones first.
            present_keys = list(local_keys) + present_keys
        locations.update((key, None) for key in local_keys)
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                chunks, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                plain, delta_bytes = self._get_group_and_delta_bytes(index_memo)
                action, label, sha1, delta = parse(delta_bytes)
                if not _NO_LABELS and label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                if action == 'fulltext':
                    chunks = [delta]
                else:
                    # TODO: relax apply_delta so that it can allow source to be
                    #       longer than expected
                    bytes = _groupcompress_pyx.apply_delta(plain, delta)
                    if bytes is None:
                        import pdb; pdb.set_trace()
                    chunks = [bytes]
                    del bytes
                if _NO_LABELS:
                    sha1 = sha_strings(chunks)
                else:
                    if not _FAST and sha_strings(chunks) != sha1:
                        raise AssertionError('sha1 sum did not match')
            yield ChunkedContentFactory(key, parents, sha1, chunks)

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass

    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        adapters = {}
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        last_prefix = None
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            try:
                bytes = record.get_bytes_as('fulltext')
            except errors.UnavailableRepresentation:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record)
            soft = False
            if len(record.key) > 1:
                prefix = record.key[0]
                if (last_prefix is not None and prefix != last_prefix):
                    soft = True
                    if basis_end > 1024 * 1024 * 2:
                        flush()
                        self._compressor = GroupCompressor(self._delta)
                        self._unadded_refs = {}
                        keys_to_add = []
                        basis_end = 0
                        groups += 1
                last_prefix = prefix
            found_sha1, end_point = self._compressor.compress(record.key,
                bytes, record.sha1, soft=soft)
            if record.key[-1] is None:
                key = record.key[:-1] + ('sha1:' + found_sha1,)
            else:
                key = record.key
            self._unadded_refs[key] = record.parents
            yield found_sha1
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 4:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
770
771
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
772
        """Iterate over the lines in the versioned files from keys.
773
774
        This may return lines from other keys. Each item the returned
775
        iterator yields is a tuple of a line and a text version that that line
776
        is present in (not introduced in).
777
778
        Ordering of results is in whatever order is most suitable for the
779
        underlying storage format.
780
781
        If a progress bar is supplied, it may be used to indicate progress.
782
        The caller is responsible for cleaning up progress bars (because this
783
        is an iterator).
784
785
        NOTES:
786
         * Lines are normalised by the underlying store: they will all have \n
787
           terminators.
788
         * Lines are returned in arbitrary order.
789
790
        :return: An iterator over (line, key).
791
        """
792
        if pb is None:
793
            pb = progress.DummyProgress()
794
        keys = set(keys)
795
        total = len(keys)
796
        # we don't care about inclusions, the caller cares.
797
        # but we need to setup a list of records to visit.
798
        # we need key, position, length
799
        for key_idx, record in enumerate(self.get_record_stream(keys,
800
            'unordered', True)):
801
            # XXX: todo - optimise to use less than full texts.
802
            key = record.key
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
803
            pb.update('Walking content.', key_idx, total)
0.17.5 by Robert Collins
nograph tests completely passing.
804
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
805
                raise errors.RevisionNotPresent(key, self)
0.17.5 by Robert Collins
nograph tests completely passing.
806
            lines = split_lines(record.get_bytes_as('fulltext'))
807
            for line in lines:
808
                yield line, key
809
        pb.update('Walking content.', total, total)
810
811
    def keys(self):
812
        """See VersionedFiles.keys."""
813
        if 'evil' in debug.debug_flags:
814
            trace.mutter_callsite(2, "keys scales with size of history")
815
        sources = [self._index]
816
        result = set()
817
        for source in sources:
818
            result.update(source.keys())
819
        return result
820
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
821
822
class _GCGraphIndex(object):
823
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
824
0.17.9 by Robert Collins
Initial stab at repository format support.
825
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
826
        add_callback=None):
827
        """Construct a _GCGraphIndex on a graph_index.
828
829
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
830
        :param is_locked: A callback, returns True if the index is locked and
831
            thus usable.
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
832
        :param parents: If True, record knits parents, if not do not record 
833
            parents.
834
        :param add_callback: If not None, allow additions to the index and call
835
            this callback with a list of added GraphIndex nodes:
836
            [(node, value, node_refs), ...]
837
        """
838
        self._add_callback = add_callback
839
        self._graph_index = graph_index
840
        self._parents = parents
841
        self.has_graph = parents
842
        self._is_locked = is_locked
843
0.17.5 by Robert Collins
nograph tests completely passing.
844
    def add_records(self, records, random_id=False):
845
        """Add multiple records to the index.
846
        
847
        This function does not insert data into the Immutable GraphIndex
848
        backing the KnitGraphIndex, instead it prepares data for insertion by
849
        the caller and checks that it is safe to insert then calls
850
        self._add_callback with the prepared GraphIndex nodes.
851
852
        :param records: a list of tuples:
853
                         (key, options, access_memo, parents).
854
        :param random_id: If True the ids being added were randomly generated
855
            and no check for existence will be performed.
856
        """
857
        if not self._add_callback:
858
            raise errors.ReadOnlyError(self)
859
        # we hope there are no repositories with inconsistent parentage
860
        # anymore.
861
862
        changed = False
863
        keys = {}
864
        for (key, value, refs) in records:
865
            if not self._parents:
866
                if refs:
867
                    for ref in refs:
868
                        if ref:
869
                            raise errors.KnitCorrupt(self,
870
                                "attempt to add node with parents "
871
                                "in parentless index.")
872
                    refs = ()
873
                    changed = True
874
            keys[key] = (value, refs)
875
        # check for dups
876
        if not random_id:
877
            present_nodes = self._get_entries(keys)
878
            for (index, key, value, node_refs) in present_nodes:
879
                if node_refs != keys[key][1]:
880
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
881
                        ": %s %s" % ((value, node_refs), keys[key]))
882
                del keys[key]
883
                changed = True
884
        if changed:
885
            result = []
886
            if self._parents:
887
                for key, (value, node_refs) in keys.iteritems():
888
                    result.append((key, value, node_refs))
889
            else:
890
                for key, (value, node_refs) in keys.iteritems():
891
                    result.append((key, value))
892
            records = result
893
        self._add_callback(records)
894
        
895
    def _check_read(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
896
        """Raise an exception if reads are not permitted."""
0.17.5 by Robert Collins
nograph tests completely passing.
897
        if not self._is_locked():
898
            raise errors.ObjectNotLocked(self)
899
0.17.2 by Robert Collins
Core proof of concept working.
900
    def _check_write_ok(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
901
        """Raise an exception if writes are not permitted."""
0.17.2 by Robert Collins
Core proof of concept working.
902
        if not self._is_locked():
903
            raise errors.ObjectNotLocked(self)
904
0.17.5 by Robert Collins
nograph tests completely passing.
905
    def _get_entries(self, keys, check_present=False):
906
        """Get the entries for keys.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
907
908
        Note: Callers are responsible for checking that the index is locked
909
        before calling this method.
910
0.17.5 by Robert Collins
nograph tests completely passing.
911
        :param keys: An iterable of index key tuples.
912
        """
913
        keys = set(keys)
914
        found_keys = set()
915
        if self._parents:
916
            for node in self._graph_index.iter_entries(keys):
917
                yield node
918
                found_keys.add(node[1])
919
        else:
920
            # adapt parentless index to the rest of the code.
921
            for node in self._graph_index.iter_entries(keys):
922
                yield node[0], node[1], node[2], ()
923
                found_keys.add(node[1])
924
        if check_present:
925
            missing_keys = keys.difference(found_keys)
926
            if missing_keys:
927
                raise errors.RevisionNotPresent(missing_keys.pop(), self)
928
929
    def get_parent_map(self, keys):
930
        """Get a map of the parents of keys.
931
932
        :param keys: The keys to look up parents for.
933
        :return: A mapping from keys to parents. Absent keys are absent from
934
            the mapping.
935
        """
936
        self._check_read()
937
        nodes = self._get_entries(keys)
938
        result = {}
939
        if self._parents:
940
            for node in nodes:
941
                result[node[1]] = node[3][0]
942
        else:
943
            for node in nodes:
944
                result[node[1]] = None
945
        return result
946
947
    def get_build_details(self, keys):
948
        """Get the various build details for keys.
949
950
        Ghosts are omitted from the result.
951
952
        :param keys: An iterable of keys.
953
        :return: A dict of key:
954
            (index_memo, compression_parent, parents, record_details).
955
            index_memo
956
                opaque structure to pass to read_records to extract the raw
957
                data
958
            compression_parent
959
                Content that this record is built upon; may be None
960
            parents
961
                Logical parents of this node
962
            record_details
963
                extra information about the content which needs to be passed to
964
                Factory.parse_record
965
        """
966
        self._check_read()
967
        result = {}
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
968
        entries = self._get_entries(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
969
        for entry in entries:
970
            key = entry[1]
971
            if not self._parents:
972
                parents = None
973
            else:
974
                parents = entry[3][0]
975
            method = 'group'
976
            result[key] = (self._node_to_position(entry),
977
                                  None, parents, (method, None))
978
        return result
979
    
980
    def keys(self):
981
        """Get all the keys in the collection.
982
        
983
        The keys are not ordered.
984
        """
985
        self._check_read()
986
        return [node[1] for node in self._graph_index.iter_all_entries()]
987
    
988
    def _node_to_position(self, node):
989
        """Convert an index value to position details."""
990
        bits = node[2].split(' ')
991
        # It would be nice not to read the entire gzip.
992
        start = int(bits[0])
993
        stop = int(bits[1])
994
        basis_end = int(bits[2])
995
        delta_end = int(bits[3])
996
        return node[0], start, stop, basis_end, delta_end
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
997
998
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
999
def _get_longest_match(equivalence_table, pos, max_pos, locations):
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
1000
    """Get the longest possible match for the current position."""
1001
    range_start = pos
1002
    range_len = 0
1003
    copy_ends = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
1004
    while pos < max_pos:
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
1005
        if locations is None:
1006
            locations = equivalence_table.get_idx_matches(pos)
1007
        if locations is None:
1008
            # No more matches, just return whatever we have, but we know that
1009
            # this last position is not going to match anything
1010
            pos += 1
1011
            break
1012
        else:
1013
            if copy_ends is None:
1014
                # We are starting a new range
1015
                copy_ends = [loc + 1 for loc in locations]
1016
                range_len = 1
1017
                locations = None # Consumed
1018
            else:
1019
                # We are currently in the middle of a match
1020
                next_locations = set(copy_ends).intersection(locations)
1021
                if len(next_locations):
1022
                    # range continues
1023
                    copy_ends = [loc + 1 for loc in next_locations]
1024
                    range_len += 1
1025
                    locations = None # Consumed
1026
                else:
1027
                    # This match is over; the caller should start a new one.
1028
                    # We pass back 'locations' so that the next call does not
1029
                    # have to do another lookup.
1030
                    break
1031
        pos += 1
1032
    if copy_ends is None:
1033
        return None, pos, locations
1034
    return (min(copy_ends) - range_len, range_start, range_len), pos, locations
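# Sketch of how a caller might drive _get_longest_match, threading `pos` and
# the returned `locations` back into each call (the names `table` and `lines`
# are hypothetical; this is not necessarily the plugin's actual caller):
#   pos, locations = 0, None
#   max_pos = len(lines)
#   while pos < max_pos:
#       match, pos, locations = _get_longest_match(table, pos, max_pos,
#                                                  locations)
#       if match is not None:
#           copy_start, range_start, range_len = match
#           # emit a copy of range_len lines starting at copy_start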
1035
1036
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1037
try:
0.17.31 by John Arbash Meinel
Bring in the 'rabin' experiment.
1038
    from bzrlib.plugins.groupcompress import _groupcompress_pyx
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1039
except ImportError:
1040
    pass
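    # The compiled extension is treated as optional here: if it cannot be
    # imported, the ImportError is silently ignored.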