/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1
# Copyright (C) 2008, 2009 Canonical Ltd
2
#
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
3
# This program is free software; you can redistribute it and/or modify
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
12
#
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
16
17
"""Core compression logic for compressing streams of related files."""
18
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
19
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
20
from cStringIO import StringIO
0.25.3 by John Arbash Meinel
Add a encode/decode base128 functions.
21
import struct
0.17.5 by Robert Collins
nograph tests completely passing.
22
import zlib
0.17.44 by John Arbash Meinel
Use the bit field to allow both lzma groups and zlib groups.
23
try:
24
    import pylzma
25
except ImportError:
26
    pylzma = None
0.17.5 by Robert Collins
nograph tests completely passing.
27
0.17.4 by Robert Collins
Annotate.
28
from bzrlib import (
29
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
30
    debug,
0.17.4 by Robert Collins
Annotate.
31
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
32
    errors,
0.17.4 by Robert Collins
Annotate.
33
    graph as _mod_graph,
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
34
    osutils,
0.17.4 by Robert Collins
Annotate.
35
    pack,
36
    patiencediff,
37
    )
38
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
39
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
40
from bzrlib.osutils import (
41
    contains_whitespace,
42
    sha_string,
43
    split_lines,
44
    )
0.17.21 by Robert Collins
Update groupcompress to bzrlib 1.10.
45
from bzrlib.btree_index import BTreeBuilder
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
46
from bzrlib.lru_cache import LRUSizeCache
0.17.9 by Robert Collins
Initial stab at repository format support.
47
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
48
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
49
    adapter_registry,
50
    AbsentContentFactory,
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
51
    ChunkedContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
52
    FulltextContentFactory,
53
    VersionedFiles,
54
    )
55
0.17.44 by John Arbash Meinel
Use the bit field to allow both lzma groups and zlib groups.
56
# Whether to compress group blocks with lzma instead of zlib. The leading
# `False and` hard-wires this off even when pylzma imported successfully.
_USE_LZMA = False and (pylzma is not None)
# When True, blocks are written and read without the per-entry label header
# (see GroupCompressBlock._parse_header_bytes / to_bytes / extract).
_NO_LABELS = False
# When True, trade safety for speed: compress() trusts a supplied
# expected_sha instead of re-hashing, and skips feeding delta bytes back
# into the delta index.
_FAST = False
0.17.2 by Robert Collins
Core proof of concept working.
59
0.25.3 by John Arbash Meinel
Add a encode/decode base128 functions.
60
def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding.

    Each output byte carries 7 data bits, least-significant group first.
    The high bit is set on every byte except the last, which terminates
    the sequence.

    :param val: A non-negative integer to encode.
    :return: A string containing the encoded bytes.
    """
    # Fixed: dropped the unused local `count` and stopped shadowing the
    # builtin name `bytes`.
    data = []
    while val >= 0x80:
        # Emit the low 7 bits with the continuation bit (0x80) set.
        data.append(chr((val | 0x80) & 0xFF))
        val >>= 7
    data.append(chr(val))
    return ''.join(data)
69
70
71
def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding.

    Continuation bytes have the high bit (0x80) set; the first byte
    without it terminates the number.

    :param bytes: A string whose leading bytes hold an encoded integer.
    :return: A tuple of (value, number_of_bytes_consumed).
    """
    value = 0
    shift = 0
    pos = 0
    while True:
        byte_val = ord(bytes[pos])
        pos += 1
        value |= (byte_val & 0x7F) << shift
        if byte_val < 0x80:
            # Terminator byte: we are done.
            return value, pos
        shift += 7
85
86
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
87
def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into groupcompress order.

    groupcompress is defined (currently) as reverse-topological order, grouped
    by the key prefix.

    :return: A sorted-list of keys
    """
    # groupcompress ordering is approximately reverse topological, properly
    # grouped by file-id: bucket every (key, parents) item by its prefix
    # first, then topo-sort each bucket independently.
    buckets = {}
    for item in parent_map.iteritems():
        key = item[0]
        if isinstance(key, str) or len(key) == 1:
            bucket_key = ''
        else:
            bucket_key = key[0]
        buckets.setdefault(bucket_key, []).append(item)

    ordered_keys = []
    for bucket_key in sorted(buckets):
        ordered_keys.extend(reversed(topo_sort(buckets[bucket_key])))
    return ordered_keys
113
114
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
115
class GroupCompressBlockEntry(object):
    """Track the information about a single object inside a GC group.

    This is generally just the dumb data structure.
    """

    def __init__(self, key, type, sha1, start, length):
        # The key this content is stored under
        self.key = key
        # delta, fulltext, external?
        self.type = type
        # Sha1 of content
        self.sha1 = sha1
        # Byte offset to start of data
        self.start = start
        # Length of content
        self.length = length

    def __repr__(self):
        fields = (self.key, self.type, self.sha1, self.start, self.length)
        return '%s(%s, %s, %s, %s, %s)' % (
            (self.__class__.__name__,) + fields)
133
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
134
135
class GroupCompressBlock(object):
    """An object which maintains the internal structure of the compressed data.

    This tracks the meta info (start of text, length, type, etc.)
    """

    # Group Compress Block v1 Zlib
    GCB_HEADER = 'gcb1z\n'
    # Group Compress Block v1 LZMA
    GCB_LZ_HEADER = 'gcb1l\n'

    def __init__(self):
        # map by key? or just order in file?
        self._entries = {}
        # Decompressed content bytes, once known.
        self._content = None
        self._size = 0

    def _parse_header(self):
        """Parse the meta-info from the stream."""

    def __len__(self):
        return self._size

    def _parse_header_bytes(self, header_bytes):
        """Parse the header part of the block."""
        if _NO_LABELS:
            # Don't parse the label structure if we aren't going to use it
            return
        info_dict = {}
        for line in header_bytes.split('\n'):
            if not line:
                # Blank line terminates one record (or the whole header if
                # no fields were collected).
                if not info_dict:
                    break
                self.add_entry(**info_dict)
                info_dict = {}
                continue
            key, value = line.split(':', 1)
            if key == 'key':
                value = tuple(map(intern, value.split('\x00')))
            elif key in ('start', 'length'):
                value = int(value)
            elif key == 'type':
                value = intern(value)
            info_dict[key] = value

    @classmethod
    def from_bytes(cls, bytes):
        """Build a block by deserializing its byte representation.

        :param bytes: The serialized form produced by to_bytes().
        :return: A new GroupCompressBlock instance.
        """
        out = cls()
        if bytes[:6] not in (cls.GCB_HEADER, cls.GCB_LZ_HEADER):
            raise ValueError('bytes did not start with %r' % (cls.GCB_HEADER,))
        # Byte 4 of the magic names the compressor used for this block.
        if bytes[4] == 'z':
            decomp = zlib.decompress
        elif bytes[4] == 'l':
            decomp = pylzma.decompress
        else:
            raise ValueError('unknown compressor: %r' % (bytes,))
        # Two newline-terminated decimal fields follow the magic: the
        # compressed header length and the uncompressed header length.
        eol = bytes.index('\n', 6)
        z_header_length = int(bytes[6:eol])
        start = eol + 1
        eol = bytes.index('\n', start)
        header_length = int(bytes[start:eol])
        if z_header_length == 0:
            # No label section: everything after the length lines is the
            # (compressed) content.
            if header_length != 0:
                raise ValueError('z_header_length 0, but header length != 0')
            zcontent = bytes[eol+1:]
            if zcontent:
                out._content = decomp(zcontent)
                out._size = len(out._content)
            return out
        start = eol + 1
        stop = start + z_header_length
        z_header_bytes = bytes[start:stop]
        if len(z_header_bytes) != z_header_length:
            raise ValueError('Wrong length of compressed header. %s != %s'
                             % (len(z_header_bytes), z_header_length))
        header_bytes = decomp(z_header_bytes)
        if len(header_bytes) != header_length:
            raise ValueError('Wrong length of header. %s != %s'
                             % (len(header_bytes), header_length))
        del z_header_bytes
        out._parse_header_bytes(header_bytes)
        del header_bytes
        zcontent = bytes[stop:]
        if zcontent:
            out._content = decomp(zcontent)
            out._size = header_length + len(out._content)
        return out

    def extract(self, key, index_memo, sha1=None):
        """Extract the text for a specific key.

        :param key: The label used for this content
        :param sha1: TODO (should we validate only when sha1 is supplied?)
        :return: The bytes for the content
        """
        if _NO_LABELS or not self._entries:
            start, end = index_memo[3:5]
            # The bytes are 'f' or 'd' for the type, then a variable-length
            # base128 integer for the content size, then the actual content.
            # We know that the variable-length integer won't be longer than
            # 10 bytes (it only takes 5 bytes to encode 2^32)
            c = self._content[start]
            if c == 'f':
                type = 'fulltext'
            else:
                if c != 'd':
                    raise ValueError('Unknown content control code: %s'
                                     % (c,))
                type = 'delta'
            entry = GroupCompressBlockEntry(key, type, sha1=None,
                                            start=start, length=end-start)
        else:
            entry = self._entries[key]
            # Cross-check the label's claimed type against the control byte
            # stored in the content stream.
            c = self._content[entry.start]
            if entry.type == 'fulltext':
                if c != 'f':
                    raise ValueError('Label claimed fulltext, byte claims: %s'
                                     % (c,))
            elif entry.type == 'delta':
                if c != 'd':
                    raise ValueError('Label claimed delta, byte claims: %s'
                                     % (c,))
            start = entry.start
        content_len, len_len = decode_base128_int(
                            self._content[entry.start + 1:entry.start + 11])
        content_start = entry.start + 1 + len_len
        end = entry.start + entry.length
        content = self._content[content_start:end]
        if c == 'f':
            bytes = content
        elif c == 'd':
            # Delta records expand against the whole block content.
            bytes = _groupcompress_pyx.apply_delta(self._content, content)
        if entry.sha1 is None:
            entry.sha1 = sha_string(bytes)
        return entry, bytes

    def add_entry(self, key, type, sha1, start, length):
        """Add new meta info about an entry.

        :param key: The key for the new content
        :param type: Whether this is a delta or fulltext entry (external?)
        :param sha1: sha1sum of the fulltext of this entry
        :param start: where the encoded bytes start
        :param length: total number of bytes in the encoded form
        :return: The entry?
        """
        entry = GroupCompressBlockEntry(key, type, sha1, start, length)
        if key in self._entries:
            raise ValueError('Duplicate key found: %s' % (key,))
        self._entries[key] = entry
        return entry

    def to_bytes(self, content=''):
        """Encode the information into a byte stream."""
        if _USE_LZMA:
            compress = pylzma.compress
        else:
            compress = zlib.compress
        chunks = []
        for key in sorted(self._entries):
            entry = self._entries[key]
            chunk = ('key:%s\n'
                     'sha1:%s\n'
                     'type:%s\n'
                     'start:%s\n'
                     'length:%s\n'
                     '\n'
                     ) % ('\x00'.join(entry.key),
                          entry.sha1,
                          entry.type,
                          entry.start,
                          entry.length,
                          )
            chunks.append(chunk)
        bytes = ''.join(chunks)
        info_len = len(bytes)
        z_bytes = []
        z_bytes.append(compress(bytes))
        del bytes
        # TODO: we may want to have the header compressed in the same chain
        #       as the data, or we may not, evaulate it
        #       having them compressed together is probably a win for
        #       revisions and the 'inv' portion of chk inventories. As the
        #       label in the header is duplicated in the text.
        #       For chk pages and real bytes, I would guess this is not
        #       true.
        z_len = sum(map(len, z_bytes))
        c_len = len(content)
        if _NO_LABELS:
            # Drop the label section entirely; only lengths of 0 are
            # written for it.
            z_bytes = []
            z_len = 0
            info_len = 0
        z_bytes.append(compress(content))
        if _USE_LZMA:
            header = self.GCB_LZ_HEADER
        else:
            header = self.GCB_HEADER
        chunks = [header,
                  '%d\n' % (z_len,),
                  '%d\n' % (info_len,),
                  #'%d\n' % (c_len,),
                 ]
        chunks.extend(z_bytes)
        return ''.join(chunks)
338
339
0.17.2 by Robert Collins
Core proof of concept working.
340
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
341
    """Produce a serialised group of compressed texts.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
342
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
343
    It contains code very similar to SequenceMatcher because of having a similar
344
    task. However some key differences apply:
345
     - there is no junk, we want a minimal edit not a human readable diff.
346
     - we don't filter very common lines (because we don't know where a good
347
       range will start, and after the first text we want to be emitting minmal
348
       edits only.
349
     - we chain the left side, not the right side
350
     - we incrementally update the adjacency matrix as new lines are provided.
351
     - we look for matches in all of the left side, so the routine which does
352
       the analagous task of find_longest_match does not need to filter on the
353
       left side.
354
    """
0.17.2 by Robert Collins
Core proof of concept working.
355
356
    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        # Consider seeding the lines with some sort of GC Start flag, or
        # putting it as part of the output stream, rather than in the
        # compressed bytes.
        self.num_keys = 0        # texts inserted so far
        self.input_bytes = 0     # total uncompressed bytes fed in
        self.endpoint = 0        # current byte offset in the output
        self.lines = []          # accumulated output chunks
        # key -> ((start_offset, start_chunk), (end_offset, end_chunk))
        self.labels_deltas = {}
        self._last = None
        self._delta_index = _groupcompress_pyx.DeltaIndex()
        self._block = GroupCompressBlock()
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
372
3735.31.12 by John Arbash Meinel
Push nostore_sha down through the stack.
373
    def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param bytes: The bytes to be compressed
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :param nostore_sha: If the computed sha1 sum matches, we will raise
            ExistingContent rather than adding the text.
        :param soft: Do a 'soft' compression. This means that we require larger
            ranges to match to be considered for a copy command.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        :seealso VersionedFiles.add_lines:
        """
        if not _FAST or expected_sha is None:
            sha1 = sha_string(bytes)
        else:
            # _FAST mode trusts the caller-supplied sha rather than hashing.
            sha1 = expected_sha
        if sha1 == nostore_sha:
            raise errors.ExistingContent()
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        input_len = len(bytes)
        # By having action/label/sha1/len, we can parse the group if the index
        # was ever destroyed, we have the key in 'label', we know the final
        # bytes are valid from sha1, and we know where to find the end of this
        # record because of 'len'. (the delta record itself will store the
        # total length for the expanded record)
        # 'len: %d\n' costs approximately 1% increase in total data
        # Having the labels at all costs us 9-10% increase, 38% increase for
        # inventory pages, and 5.8% increase for text pages
        # new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
        if self._delta_index._source_offset != self.endpoint:
            raise AssertionError('_source_offset != endpoint'
                ' somehow the DeltaIndex got out of sync with'
                ' the output lines')
        # A delta longer than half the fulltext is not worth storing.
        # (// keeps the result an int; same value as / on Python 2 ints.)
        max_delta_size = len(bytes) // 2
        delta = self._delta_index.make_delta(bytes, max_delta_size)
        if delta is None:
            # No usable delta: store a fulltext record,
            # 'f' + base128 length prefix + the raw bytes.
            type = 'fulltext'
            enc_length = encode_base128_int(len(bytes))
            len_mini_header = 1 + len(enc_length)
            length = len(bytes) + len_mini_header
            self._delta_index.add_source(bytes, len_mini_header)
            new_chunks = ['f', enc_length, bytes]
        else:
            # Store a delta record, 'd' + base128 length prefix + the delta.
            type = 'delta'
            enc_length = encode_base128_int(len(delta))
            len_mini_header = 1 + len(enc_length)
            length = len(delta) + len_mini_header
            new_chunks = ['d', enc_length, delta]
            if _FAST:
                # Skip indexing the delta bytes; just keep offsets aligned.
                self._delta_index._source_offset += length
            else:
                self._delta_index.add_delta_source(delta, len_mini_header)
        self._block.add_entry(key, type=type, sha1=sha1,
                              start=self.endpoint, length=length)
        delta_start = (self.endpoint, len(self.lines))
        self.num_keys += 1
        self.output_chunks(new_chunks)
        self.input_bytes += input_len
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        # Fixed: use != instead of `not ... ==`, and add the missing space
        # between the concatenated message strings ("syncwith" previously).
        if self._delta_index._source_offset != self.endpoint:
            raise AssertionError('the delta index is out of sync'
                ' with the output lines %s != %s'
                % (self._delta_index._source_offset, self.endpoint))
        return sha1, self.endpoint, type, length
0.17.2 by Robert Collins
Core proof of concept working.
446
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
447
    def extract(self, key):
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
448
        """Extract a key previously added to the compressor.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
449
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
450
        :param key: The key to extract.
451
        :return: An iterable over bytes and the sha1.
452
        """
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
453
        delta_details = self.labels_deltas[key]
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
454
        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
455
        stored_bytes = ''.join(delta_chunks)
456
        # TODO: Fix this, we shouldn't really be peeking here
457
        entry = self._block._entries[key]
458
        if entry.type == 'fulltext':
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
459
            if stored_bytes[0] != 'f':
460
                raise ValueError('Index claimed fulltext, but stored bytes'
461
                                 ' indicate %s' % (stored_bytes[0],))
0.17.36 by John Arbash Meinel
Adding a mini-len to the delta/fulltext bytes
462
            fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
463
            if fulltext_len + 1 + offset != len(stored_bytes):
464
                raise ValueError('Index claimed fulltext len, but stored bytes'
465
                                 ' claim %s != %s'
466
                                 % (len(stored_bytes),
467
                                    fulltext_len + 1 + offset))
0.17.36 by John Arbash Meinel
Adding a mini-len to the delta/fulltext bytes
468
            bytes = stored_bytes[offset + 1:]
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
469
        else:
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
470
            if entry.type != 'delta':
471
                raise ValueError('Unknown entry type: %s' % (entry.type,))
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
472
            # XXX: This is inefficient at best
473
            source = ''.join(self.lines)
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
474
            if stored_bytes[0] != 'd':
475
                raise ValueError('Entry type claims delta, bytes claim %s'
476
                                 % (stored_bytes[0],))
0.17.36 by John Arbash Meinel
Adding a mini-len to the delta/fulltext bytes
477
            delta_len, offset = decode_base128_int(stored_bytes[1:10])
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
478
            if delta_len + 1 + offset != len(stored_bytes):
479
                raise ValueError('Index claimed delta len, but stored bytes'
480
                                 ' claim %s != %s'
481
                                 % (len(stored_bytes),
482
                                    delta_len + 1 + offset))
0.17.36 by John Arbash Meinel
Adding a mini-len to the delta/fulltext bytes
483
            bytes = _groupcompress_pyx.apply_delta(source,
484
                                                   stored_bytes[offset + 1:])
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
485
        bytes_sha1 = sha_string(bytes)
486
        if entry.sha1 != bytes_sha1:
487
            raise ValueError('Recorded sha1 != measured %s != %s'
488
                             % (entry.sha1, bytes_sha1))
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
489
        return bytes, entry.sha1
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
490
491
    def output_chunks(self, new_chunks):
492
        """Output some chunks.
493
494
        :param new_chunks: The chunks to output.
495
        """
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
496
        self._last = (len(self.lines), self.endpoint)
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
497
        endpoint = self.endpoint
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
498
        self.lines.extend(new_chunks)
499
        endpoint += sum(map(len, new_chunks))
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
500
        self.endpoint = endpoint
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
501
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
502
    def pop_last(self):
503
        """Call this if you want to 'revoke' the last compression.
504
505
        After this, the data structures will be rolled back, but you cannot do
506
        more compression.
507
        """
508
        self._delta_index = None
509
        del self.lines[self._last[0]:]
510
        self.endpoint = self._last[1]
511
        self._last = None
512
0.17.2 by Robert Collins
Core proof of concept working.
513
    def ratio(self):
514
        """Return the overall compression ratio."""
515
        return float(self.input_bytes) / float(self.endpoint)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
516
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
517
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
518
def make_pack_factory(graph, delta, keylength):
519
    """Create a factory for creating a pack based groupcompress.
520
521
    This is only functional enough to run interface tests, it doesn't try to
522
    provide a full pack environment.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
523
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
524
    :param graph: Store a graph.
525
    :param delta: Delta compress contents.
526
    :param keylength: How long should keys be.
527
    """
528
    def factory(transport):
3735.32.2 by John Arbash Meinel
The 'delta' flag has no effect on the content (all GC is delta'd),
529
        parents = graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
530
        ref_length = 0
531
        if graph:
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
532
            ref_length = 1
0.17.7 by Robert Collins
Update for current index2 changes.
533
        graph_index = BTreeBuilder(reference_lists=ref_length,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
534
            key_elements=keylength)
535
        stream = transport.open_write_stream('newpack')
536
        writer = pack.ContainerWriter(stream.write)
537
        writer.begin()
538
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
0.17.9 by Robert Collins
Initial stab at repository format support.
539
            add_callback=graph_index.add_nodes)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
540
        access = _DirectPackAccess({})
541
        access.set_writer(writer, graph_index, (transport, 'newpack'))
0.17.2 by Robert Collins
Core proof of concept working.
542
        result = GroupCompressVersionedFiles(index, access, delta)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
543
        result.stream = stream
544
        result.writer = writer
545
        return result
546
    return factory
547
548
549
def cleanup_pack_group(versioned_files):
0.17.23 by Robert Collins
Only decompress as much of the zlib data as is needed to read the text recipe.
550
    versioned_files.writer.end()
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
551
    versioned_files.stream.close()
552
553
554
class GroupCompressVersionedFiles(VersionedFiles):
555
    """A group-compress based VersionedFiles implementation."""
556
0.17.2 by Robert Collins
Core proof of concept working.
557
    def __init__(self, index, access, delta=True):
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
558
        """Create a GroupCompressVersionedFiles object.
559
560
        :param index: The index object storing access and graph data.
561
        :param access: The access object storing raw data.
0.17.2 by Robert Collins
Core proof of concept working.
562
        :param delta: Whether to delta compress or just entropy compress.
563
        """
564
        self._index = index
565
        self._access = access
566
        self._delta = delta
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
567
        self._unadded_refs = {}
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
568
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
569
        self._fallback_vfs = []
0.17.2 by Robert Collins
Core proof of concept working.
570
571
    def add_lines(self, key, parents, lines, parent_texts=None,
572
        left_matching_blocks=None, nostore_sha=None, random_id=False,
573
        check_content=True):
574
        """Add a text to the store.
575
576
        :param key: The key tuple of the text to add.
577
        :param parents: The parents key tuples of the text to add.
578
        :param lines: A list of lines. Each line must be a bytestring. And all
579
            of them except the last must be terminated with \n and contain no
580
            other \n's. The last line may either contain no \n's or a single
581
            terminating \n. If the lines list does meet this constraint the add
582
            routine may error or may succeed - but you will be unable to read
583
            the data back accurately. (Checking the lines have been split
584
            correctly is expensive and extremely unlikely to catch bugs so it
585
            is not done at runtime unless check_content is True.)
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
586
        :param parent_texts: An optional dictionary containing the opaque
0.17.2 by Robert Collins
Core proof of concept working.
587
            representations of some or all of the parents of version_id to
588
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
589
            returned by add_lines or data corruption can be caused.
590
        :param left_matching_blocks: a hint about which areas are common
591
            between the text and its left-hand-parent.  The format is
592
            the SequenceMatcher.get_matching_blocks format.
593
        :param nostore_sha: Raise ExistingContent and do not add the lines to
594
            the versioned file if the digest of the lines matches this.
595
        :param random_id: If True a random id has been selected rather than
596
            an id determined by some deterministic process such as a converter
597
            from a foreign VCS. When True the backend may choose not to check
598
            for uniqueness of the resulting key within the versioned file, so
599
            this should only be done when the result is expected to be unique
600
            anyway.
601
        :param check_content: If True, the lines supplied are verified to be
602
            bytestrings that are correctly formed lines.
603
        :return: The text sha1, the number of bytes in the text, and an opaque
604
                 representation of the inserted version which can be provided
605
                 back to future add_lines calls in the parent_texts dictionary.
606
        """
607
        self._index._check_write_ok()
608
        self._check_add(key, lines, random_id, check_content)
609
        if parents is None:
610
            # The caller might pass None if there is no graph data, but kndx
611
            # indexes can't directly store that, so we give them
612
            # an empty tuple instead.
613
            parents = ()
614
        # double handling for now. Make it work until then.
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
615
        length = sum(map(len, lines))
616
        record = ChunkedContentFactory(key, parents, None, lines)
3735.31.12 by John Arbash Meinel
Push nostore_sha down through the stack.
617
        sha1 = list(self._insert_record_stream([record], random_id=random_id,
618
                                               nostore_sha=nostore_sha))[0]
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
619
        return sha1, length, None
0.17.2 by Robert Collins
Core proof of concept working.
620
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
621
    def add_fallback_versioned_files(self, a_versioned_files):
622
        """Add a source of texts for texts not present in this knit.
623
624
        :param a_versioned_files: A VersionedFiles object.
625
        """
626
        self._fallback_vfs.append(a_versioned_files)
627
0.17.4 by Robert Collins
Annotate.
628
    def annotate(self, key):
629
        """See VersionedFiles.annotate."""
630
        graph = Graph(self)
0.17.5 by Robert Collins
nograph tests completely passing.
631
        parent_map = self.get_parent_map([key])
632
        if not parent_map:
633
            raise errors.RevisionNotPresent(key, self)
634
        if parent_map[key] is not None:
635
            search = graph._make_breadth_first_searcher([key])
636
            keys = set()
637
            while True:
638
                try:
639
                    present, ghosts = search.next_with_ghosts()
640
                except StopIteration:
641
                    break
642
                keys.update(present)
643
            parent_map = self.get_parent_map(keys)
644
        else:
645
            keys = [key]
646
            parent_map = {key:()}
0.17.4 by Robert Collins
Annotate.
647
        head_cache = _mod_graph.FrozenHeadsCache(graph)
648
        parent_cache = {}
649
        reannotate = annotate.reannotate
650
        for record in self.get_record_stream(keys, 'topological', True):
651
            key = record.key
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
652
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
0.17.4 by Robert Collins
Annotate.
653
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
654
            parent_cache[key] = list(
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
655
                reannotate(parent_lines, chunks, key, None, head_cache))
0.17.4 by Robert Collins
Annotate.
656
        return parent_cache[key]
657
0.17.5 by Robert Collins
nograph tests completely passing.
658
    def check(self, progress_bar=None):
659
        """See VersionedFiles.check()."""
660
        keys = self.keys()
661
        for record in self.get_record_stream(keys, 'unordered', True):
662
            record.get_bytes_as('fulltext')
663
0.17.2 by Robert Collins
Core proof of concept working.
664
    def _check_add(self, key, lines, random_id, check_content):
665
        """check that version_id and lines are safe to add."""
666
        version_id = key[-1]
0.17.26 by Robert Collins
Working better --gc-plain-chk.
667
        if version_id is not None:
668
            if contains_whitespace(version_id):
3735.31.1 by John Arbash Meinel
Bring the groupcompress plugin into the brisbane-core branch.
669
                raise errors.InvalidRevisionId(version_id, self)
0.17.2 by Robert Collins
Core proof of concept working.
670
        self.check_not_reserved_id(version_id)
671
        # TODO: If random_id==False and the key is already present, we should
672
        # probably check that the existing content is identical to what is
673
        # being inserted, and otherwise raise an exception.  This would make
674
        # the bundle code simpler.
675
        if check_content:
676
            self._check_lines_not_unicode(lines)
677
            self._check_lines_are_lines(lines)
678
0.17.5 by Robert Collins
nograph tests completely passing.
679
    def get_parent_map(self, keys):
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
680
        """Get a map of the graph parents of keys.
0.17.5 by Robert Collins
nograph tests completely passing.
681
682
        :param keys: The keys to look up parents for.
683
        :return: A mapping from keys to parents. Absent keys are absent from
684
            the mapping.
685
        """
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
686
        return self._get_parent_map_with_sources(keys)[0]
687
688
    def _get_parent_map_with_sources(self, keys):
689
        """Get a map of the parents of keys.
690
691
        :param keys: The keys to look up parents for.
692
        :return: A tuple. The first element is a mapping from keys to parents.
693
            Absent keys are absent from the mapping. The second element is a
694
            list with the locations each key was found in. The first element
695
            is the in-this-knit parents, the second the first fallback source,
696
            and so on.
697
        """
0.17.5 by Robert Collins
nograph tests completely passing.
698
        result = {}
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
699
        sources = [self._index] + self._fallback_vfs
0.17.5 by Robert Collins
nograph tests completely passing.
700
        source_results = []
701
        missing = set(keys)
702
        for source in sources:
703
            if not missing:
704
                break
705
            new_result = source.get_parent_map(missing)
706
            source_results.append(new_result)
707
            result.update(new_result)
708
            missing.difference_update(set(new_result))
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
709
        return result, source_results
0.17.5 by Robert Collins
nograph tests completely passing.
710
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
711
    def _get_block(self, index_memo):
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
712
        read_memo = index_memo[0:3]
713
        # get the group:
714
        try:
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
715
            block = self._group_cache[read_memo]
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
716
        except KeyError:
717
            # read the group
718
            zdata = self._access.get_raw_records([read_memo]).next()
719
            # decompress - whole thing - this is not a bug, as it
720
            # permits caching. We might want to store the partially
721
            # decompresed group and decompress object, so that recent
722
            # texts are not penalised by big groups.
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
723
            block = GroupCompressBlock.from_bytes(zdata)
724
            self._group_cache[read_memo] = block
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
725
        # cheapo debugging:
726
        # print len(zdata), len(plain)
727
        # parse - requires split_lines, better to have byte offsets
728
        # here (but not by much - we only split the region for the
729
        # recipe, and we often want to end up with lines anyway.
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
730
        return block
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
731
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
732
    def get_missing_compression_parent_keys(self):
733
        """Return the keys of missing compression parents.
734
735
        Missing compression parents occur when a record stream was missing
736
        basis texts, or a index was scanned that had missing basis texts.
737
        """
738
        # GroupCompress cannot currently reference texts that are not in the
739
        # group, so this is valid for now
740
        return frozenset()
741
0.17.5 by Robert Collins
nograph tests completely passing.
742
    def get_record_stream(self, keys, ordering, include_delta_closure):
743
        """Get a stream of records for keys.
744
745
        :param keys: The keys to include.
746
        :param ordering: Either 'unordered' or 'topological'. A topologically
747
            sorted stream has compression parents strictly before their
748
            children.
749
        :param include_delta_closure: If True then the closure across any
750
            compression parents will be included (in the opaque data).
751
        :return: An iterator of ContentFactory objects, each of which is only
752
            valid until the iterator is advanced.
753
        """
754
        # keys might be a generator
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
755
        orig_keys = list(keys)
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
756
        keys = set(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
757
        if not keys:
758
            return
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
759
        if (not self._index.has_graph
3735.31.14 by John Arbash Meinel
Change the gc-optimal to 'groupcompress'
760
            and ordering in ('topological', 'groupcompress')):
0.17.5 by Robert Collins
nograph tests completely passing.
761
            # Cannot topological order when no graph has been stored.
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
762
            # but we allow 'as-requested' or 'unordered'
0.17.5 by Robert Collins
nograph tests completely passing.
763
            ordering = 'unordered'
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
764
765
        remaining_keys = keys
766
        while True:
767
            try:
768
                keys = set(remaining_keys)
769
                for content_factory in self._get_remaining_record_stream(keys,
770
                        orig_keys, ordering, include_delta_closure):
771
                    remaining_keys.discard(content_factory.key)
772
                    yield content_factory
773
                return
774
            except errors.RetryWithNewPacks, e:
775
                self._access.reload_or_raise(e)
776
777
    def _find_from_fallback(self, missing):
778
        """Find whatever keys you can from the fallbacks.
779
780
        :param missing: A set of missing keys. This set will be mutated as keys
781
            are found from a fallback_vfs
782
        :return: (parent_map, key_to_source_map, source_results)
783
            parent_map  the overall key => parent_keys
784
            key_to_source_map   a dict from {key: source}
785
            source_results      a list of (source: keys)
786
        """
787
        parent_map = {}
788
        key_to_source_map = {}
789
        source_results = []
790
        for source in self._fallback_vfs:
791
            if not missing:
792
                break
793
            source_parents = source.get_parent_map(missing)
794
            parent_map.update(source_parents)
795
            source_parents = list(source_parents)
796
            source_results.append((source, source_parents))
797
            key_to_source_map.update((key, source) for key in source_parents)
798
            missing.difference_update(source_parents)
799
        return parent_map, key_to_source_map, source_results
800
801
    def _get_ordered_source_keys(self, ordering, parent_map, key_to_source_map):
802
        """Get the (source, [keys]) list.
803
804
        The returned objects should be in the order defined by 'ordering',
805
        which can weave between different sources.
806
        :param ordering: Must be one of 'topological' or 'groupcompress'
807
        :return: List of [(source, [keys])] tuples, such that all keys are in
808
            the defined order, regardless of source.
809
        """
810
        if ordering == 'topological':
811
            present_keys = topo_sort(parent_map)
812
        else:
813
            # ordering == 'groupcompress'
814
            # XXX: This only optimizes for the target ordering. We may need
815
            #      to balance that with the time it takes to extract
816
            #      ordering, by somehow grouping based on
817
            #      locations[key][0:3]
818
            present_keys = sort_gc_optimal(parent_map)
819
        # Now group by source:
820
        source_keys = []
821
        current_source = None
822
        for key in present_keys:
823
            source = key_to_source_map.get(key, self)
824
            if source is not current_source:
825
                source_keys.append((source, []))
3735.2.151 by John Arbash Meinel
A the source grouping code needs to update current_source
826
                current_source = source
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
827
            source_keys[-1][1].append(key)
828
        return source_keys
829
830
    def _get_as_requested_source_keys(self, orig_keys, locations, unadded_keys,
831
                                      key_to_source_map):
832
        source_keys = []
833
        current_source = None
834
        for key in orig_keys:
835
            if key in locations or key in unadded_keys:
836
                source = self
837
            elif key in key_to_source_map:
838
                source = key_to_source_map[key]
839
            else: # absent
840
                continue
841
            if source is not current_source:
842
                source_keys.append((source, []))
3735.2.151 by John Arbash Meinel
A the source grouping code needs to update current_source
843
                current_source = source
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
844
            source_keys[-1][1].append(key)
845
        return source_keys
846
847
    def _get_io_ordered_source_keys(self, locations, unadded_keys,
848
                                    source_result):
849
        def get_group(key):
850
            # This is the group the bytes are stored in, followed by the
851
            # location in the group
852
            return locations[key][0]
853
        present_keys = sorted(locations.iterkeys(), key=get_group)
854
        # We don't have an ordering for keys in the in-memory object, but
855
        # lets process the in-memory ones first.
856
        present_keys = list(unadded_keys) + present_keys
857
        # Now grab all of the ones from other sources
858
        source_keys = [(self, present_keys)]
859
        source_keys.extend(source_result)
860
        return source_keys
861
862
    def _get_remaining_record_stream(self, keys, orig_keys, ordering,
863
                                     include_delta_closure):
864
        """Get a stream of records for keys.
865
866
        :param keys: The keys to include.
867
        :param ordering: one of 'unordered', 'topological', 'groupcompress' or
868
            'as-requested'
869
        :param include_delta_closure: If True then the closure across any
870
            compression parents will be included (in the opaque data).
871
        :return: An iterator of ContentFactory objects, each of which is only
872
            valid until the iterator is advanced.
873
        """
0.17.5 by Robert Collins
nograph tests completely passing.
874
        # Cheap: iterate
875
        locations = self._index.get_build_details(keys)
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
876
        unadded_keys = set(self._unadded_refs).intersection(keys)
877
        missing = keys.difference(locations)
878
        missing.difference_update(unadded_keys)
879
        (fallback_parent_map, key_to_source_map,
880
         source_result) = self._find_from_fallback(missing)
881
        if ordering in ('topological', 'groupcompress'):
0.17.5 by Robert Collins
nograph tests completely passing.
882
            # would be better to not globally sort initially but instead
883
            # start with one key, recurse to its oldest parent, then grab
884
            # everything in the same group, etc.
885
            parent_map = dict((key, details[2]) for key, details in
886
                locations.iteritems())
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
887
            for key in unadded_keys:
888
                parent_map[key] = self._unadded_refs[key]
889
            parent_map.update(fallback_parent_map)
890
            source_keys = self._get_ordered_source_keys(ordering, parent_map,
891
                                                        key_to_source_map)
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
892
        elif ordering == 'as-requested':
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
893
            source_keys = self._get_as_requested_source_keys(orig_keys,
894
                locations, unadded_keys, key_to_source_map)
0.17.5 by Robert Collins
nograph tests completely passing.
895
        else:
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
896
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
897
            # Otherwise we thrash the _group_cache and destroy performance
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
898
            source_keys = self._get_io_ordered_source_keys(locations,
899
                unadded_keys, source_result)
900
        for key in missing:
0.17.5 by Robert Collins
nograph tests completely passing.
901
            yield AbsentContentFactory(key)
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
902
        for source, keys in source_keys:
903
            if source is self:
904
                for key in keys:
905
                    if key in self._unadded_refs:
906
                        bytes, sha1 = self._compressor.extract(key)
907
                        parents = self._unadded_refs[key]
908
                    else:
909
                        index_memo, _, parents, (method, _) = locations[key]
910
                        block = self._get_block(index_memo)
911
                        entry, bytes = block.extract(key, index_memo)
912
                        sha1 = entry.sha1
913
                        # TODO: If we don't have labels, then the sha1 here is computed
914
                        #       from the data, so we don't want to re-sha the string.
915
                        if not _FAST and sha_string(bytes) != sha1:
916
                            raise AssertionError('sha1 sum did not match')
917
                    yield FulltextContentFactory(key, parents, sha1, bytes)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
918
            else:
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
919
                for record in source.get_record_stream(keys, ordering,
920
                                                       include_delta_closure):
921
                    yield record
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
922
0.17.5 by Robert Collins
nograph tests completely passing.
923
    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s().

        :param keys: The keys to look up sha1 sums for.
        :return: A dict mapping each present key to its sha1; absent keys are
            omitted from the result.
        """
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    # BUG FIX: this line previously used '==' (a no-op
                    # comparison), so keys whose records carried no sha1 were
                    # silently left out of the result.
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result
934
0.17.2 by Robert Collins
Core proof of concept working.
935
    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        inserted = self._insert_record_stream(stream)
        # _insert_record_stream is a generator yielding sha1s; this public
        # entry point only wants its insertion side effects, so exhaust it
        # and discard the yielded values.
        for _ignored in inserted:
            pass
0.17.2 by Robert Collins
Core proof of concept working.
944
3735.31.12 by John Arbash Meinel
Push nostore_sha down through the stack.
945
    def _insert_record_stream(self, stream, random_id=False, nostore_sha=None):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :param random_id: If True, the keys being added were randomly
            generated, so no duplicate check is performed when adding the
            index records.
        :param nostore_sha: If the sha1 of a given text matches nostore_sha,
            raise ExistingContent, rather than committing the new text.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        # Cache of storage_kind -> 'fulltext' adapters, so we build each one
        # at most once for the whole stream.
        adapters = {}
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        def flush():
            # Seal the current group: serialise the compressor's block,
            # append it as one raw record via _access, add index entries for
            # every key buffered in keys_to_add (recording the group's start
            # and length plus each key's offsets within it), then reset state
            # with a fresh compressor for the next group.
            bytes = self._compressor._block.to_bytes(
                ''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(bytes))], bytes)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
            self._unadded_refs = {}
            del keys_to_add[:]
            self._compressor = GroupCompressor(self._delta)

        # State for the group-break heuristics below.  'prefix' is
        # record.key[0] for multi-element keys (presumably the file-id —
        # texts sharing a prefix tend to compress well against each other),
        # so a prefix change is a natural point to start a new group.
        last_prefix = None
        last_fulltext_len = None
        max_fulltext_len = 0
        max_fulltext_prefix = None
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            try:
                bytes = record.get_bytes_as('fulltext')
            except errors.UnavailableRepresentation:
                # The record cannot hand us a fulltext directly; convert its
                # native representation through a registered adapter.
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record)
            if len(record.key) > 1:
                prefix = record.key[0]
                # Ask for a soft boundary while we stay within the same
                # prefix run.
                soft = (prefix == last_prefix)
            else:
                prefix = None
                soft = False
            if max_fulltext_len < len(bytes):
                max_fulltext_len = len(bytes)
                max_fulltext_prefix = prefix
            (found_sha1, end_point, type,
             length) = self._compressor.compress(record.key,
                bytes, record.sha1, soft=soft,
                nostore_sha=nostore_sha)
            # delta_ratio = float(len(bytes)) / length
            # Check if we want to continue to include that text
            if (prefix == max_fulltext_prefix
                and end_point < 2 * max_fulltext_len):
                # As long as we are on the same file_id, we will fill at least
                # 2 * max_fulltext_len
                start_new_block = False
            elif end_point > 4*1024*1024:
                # Hard cap: never let a group grow past 4MB.
                start_new_block = True
            elif (prefix is not None and prefix != last_prefix
                  and end_point > 2*1024*1024):
                # Switched prefix (file-id) with an already >2MB group: break.
                start_new_block = True
            else:
                start_new_block = False
            # NOTE: earlier, finer-grained break heuristics, kept for
            # reference while the thresholds above are being tuned:
            # if type == 'fulltext':
            #     # If this is the first text, we don't do anything
            #     if self._compressor.num_keys > 1:
            #         if prefix is not None and prefix != last_prefix:
            #             # We just inserted a fulltext for a different prefix
            #             # (aka file-id).
            #             if end_point > 512 * 1024:
            #                 start_new_block = True
            #             # TODO: Consider packing several small texts together
            #             #       maybe only flush if end_point > some threshold
            #             # if end_point > 512 * 1024 or len(bytes) <
            #             #     start_new_block = true
            #         else:
            #             # We just added a fulltext, part of the same file-id
            #             if (end_point > 2*1024*1024
            #                 and end_point > 5*max_fulltext_len):
            #                 start_new_block = True
            #     last_fulltext_len = len(bytes)
            # else:
            #     delta_ratio = float(len(bytes)) / length
            #     if delta_ratio < 3: # Not much compression
            #         if end_point > 1*1024*1024:
            #             start_new_block = True
            #     elif delta_ratio < 10: # 10:1 compression
            #         if end_point > 4*1024*1024:
            #             start_new_block = True
            last_prefix = prefix
            if start_new_block:
                # Undo the speculative compression of this record, flush the
                # finished group, then re-compress the record as the first
                # entry of the new group.  nostore_sha is not re-passed: the
                # check already ran (and passed) in the first compress call.
                self._compressor.pop_last()
                flush()
                basis_end = 0
                max_fulltext_len = len(bytes)
                (found_sha1, end_point, type,
                 length) = self._compressor.compress(record.key,
                    bytes, record.sha1)
                last_fulltext_len = length
            if record.key[-1] is None:
                # A trailing None in the key is a placeholder to be filled
                # with the content's sha1 (CHK-style naming).
                key = record.key[:-1] + ('sha1:' + found_sha1,)
            else:
                key = record.key
            self._unadded_refs[key] = record.parents
            yield found_sha1
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
        if len(keys_to_add):
            # Flush whatever is still buffered in the final, partial group.
            flush()
        self._compressor = None
0.17.5 by Robert Collins
nograph tests completely passing.
1075
1076
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that
        line is present in (not introduced in).

        Results come back in whatever order suits the underlying storage
        format, and within that, line order is arbitrary. Lines are
        normalised by the store, so every line carries a \n terminator.

        If a progress bar is supplied it may be used to report progress; the
        caller is responsible for cleaning it up (this is an iterator).

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        wanted = set(keys)
        num_keys = len(wanted)
        # Inclusions are the caller's concern; we only need to visit each
        # requested record once, in any order.
        stream = self.get_record_stream(wanted, 'unordered', True)
        for idx, record in enumerate(stream):
            # XXX: todo - optimise to use less than full texts.
            record_key = record.key
            pb.update('Walking content', idx, num_keys)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record_key, self)
            for line in split_lines(record.get_bytes_as('fulltext')):
                yield line, record_key
        pb.update('Walking content', num_keys, num_keys)
0.17.5 by Robert Collins
nograph tests completely passing.
1115
1116
    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        all_keys = set()
        # Union our own index's keys with those of every stacked fallback.
        for vf in [self._index] + self._fallback_vfs:
            all_keys.update(vf.keys())
        return all_keys
1125
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1126
1127
class _GCGraphIndex(object):
1128
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
1129
0.17.9 by Robert Collins
Initial stab at repository format support.
1130
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1131
        add_callback=None):
1132
        """Construct a _GCGraphIndex on a graph_index.
1133
1134
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1135
        :param is_locked: A callback, returns True if the index is locked and
1136
            thus usable.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1137
        :param parents: If True, record knits parents, if not do not record
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1138
            parents.
1139
        :param add_callback: If not None, allow additions to the index and call
1140
            this callback with a list of added GraphIndex nodes:
1141
            [(node, value, node_refs), ...]
1142
        """
1143
        self._add_callback = add_callback
1144
        self._graph_index = graph_index
1145
        self._parents = parents
1146
        self.has_graph = parents
1147
        self._is_locked = is_locked
1148
0.17.5 by Robert Collins
nograph tests completely passing.
1149
    def add_records(self, records, random_id=False):
1150
        """Add multiple records to the index.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1151
0.17.5 by Robert Collins
nograph tests completely passing.
1152
        This function does not insert data into the Immutable GraphIndex
1153
        backing the KnitGraphIndex, instead it prepares data for insertion by
1154
        the caller and checks that it is safe to insert then calls
1155
        self._add_callback with the prepared GraphIndex nodes.
1156
1157
        :param records: a list of tuples:
1158
                         (key, options, access_memo, parents).
1159
        :param random_id: If True the ids being added were randomly generated
1160
            and no check for existence will be performed.
1161
        """
1162
        if not self._add_callback:
1163
            raise errors.ReadOnlyError(self)
1164
        # we hope there are no repositories with inconsistent parentage
1165
        # anymore.
1166
1167
        changed = False
1168
        keys = {}
1169
        for (key, value, refs) in records:
1170
            if not self._parents:
1171
                if refs:
1172
                    for ref in refs:
1173
                        if ref:
1174
                            raise KnitCorrupt(self,
1175
                                "attempt to add node with parents "
1176
                                "in parentless index.")
1177
                    refs = ()
1178
                    changed = True
1179
            keys[key] = (value, refs)
1180
        # check for dups
1181
        if not random_id:
1182
            present_nodes = self._get_entries(keys)
1183
            for (index, key, value, node_refs) in present_nodes:
1184
                if node_refs != keys[key][1]:
1185
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
1186
                        ": %s %s" % ((value, node_refs), keys[key]))
1187
                del keys[key]
1188
                changed = True
1189
        if changed:
1190
            result = []
1191
            if self._parents:
1192
                for key, (value, node_refs) in keys.iteritems():
1193
                    result.append((key, value, node_refs))
1194
            else:
1195
                for key, (value, node_refs) in keys.iteritems():
1196
                    result.append((key, value))
1197
            records = result
1198
        self._add_callback(records)
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1199
0.17.5 by Robert Collins
nograph tests completely passing.
1200
    def _check_read(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1201
        """Raise an exception if reads are not permitted."""
0.17.5 by Robert Collins
nograph tests completely passing.
1202
        if not self._is_locked():
1203
            raise errors.ObjectNotLocked(self)
1204
0.17.2 by Robert Collins
Core proof of concept working.
1205
    def _check_write_ok(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1206
        """Raise an exception if writes are not permitted."""
0.17.2 by Robert Collins
Core proof of concept working.
1207
        if not self._is_locked():
1208
            raise errors.ObjectNotLocked(self)
1209
0.17.5 by Robert Collins
nograph tests completely passing.
1210
    def _get_entries(self, keys, check_present=False):
1211
        """Get the entries for keys.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1212
1213
        Note: Callers are responsible for checking that the index is locked
1214
        before calling this method.
1215
0.17.5 by Robert Collins
nograph tests completely passing.
1216
        :param keys: An iterable of index key tuples.
1217
        """
1218
        keys = set(keys)
1219
        found_keys = set()
1220
        if self._parents:
1221
            for node in self._graph_index.iter_entries(keys):
1222
                yield node
1223
                found_keys.add(node[1])
1224
        else:
1225
            # adapt parentless index to the rest of the code.
1226
            for node in self._graph_index.iter_entries(keys):
1227
                yield node[0], node[1], node[2], ()
1228
                found_keys.add(node[1])
1229
        if check_present:
1230
            missing_keys = keys.difference(found_keys)
1231
            if missing_keys:
1232
                raise RevisionNotPresent(missing_keys.pop(), self)
1233
1234
    def get_parent_map(self, keys):
1235
        """Get a map of the parents of keys.
1236
1237
        :param keys: The keys to look up parents for.
1238
        :return: A mapping from keys to parents. Absent keys are absent from
1239
            the mapping.
1240
        """
1241
        self._check_read()
1242
        nodes = self._get_entries(keys)
1243
        result = {}
1244
        if self._parents:
1245
            for node in nodes:
1246
                result[node[1]] = node[3][0]
1247
        else:
1248
            for node in nodes:
1249
                result[node[1]] = None
1250
        return result
1251
1252
    def get_build_details(self, keys):
1253
        """Get the various build details for keys.
1254
1255
        Ghosts are omitted from the result.
1256
1257
        :param keys: An iterable of keys.
1258
        :return: A dict of key:
1259
            (index_memo, compression_parent, parents, record_details).
1260
            index_memo
1261
                opaque structure to pass to read_records to extract the raw
1262
                data
1263
            compression_parent
1264
                Content that this record is built upon, may be None
1265
            parents
1266
                Logical parents of this node
1267
            record_details
1268
                extra information about the content which needs to be passed to
1269
                Factory.parse_record
1270
        """
1271
        self._check_read()
1272
        result = {}
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1273
        entries = self._get_entries(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
1274
        for entry in entries:
1275
            key = entry[1]
1276
            if not self._parents:
1277
                parents = None
1278
            else:
1279
                parents = entry[3][0]
1280
            method = 'group'
1281
            result[key] = (self._node_to_position(entry),
1282
                                  None, parents, (method, None))
1283
        return result
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1284
0.17.5 by Robert Collins
nograph tests completely passing.
1285
    def keys(self):
1286
        """Get all the keys in the collection.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1287
0.17.5 by Robert Collins
nograph tests completely passing.
1288
        The keys are not ordered.
1289
        """
1290
        self._check_read()
1291
        return [node[1] for node in self._graph_index.iter_all_entries()]
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1292
0.17.5 by Robert Collins
nograph tests completely passing.
1293
    def _node_to_position(self, node):
1294
        """Convert an index value to position details."""
1295
        bits = node[2].split(' ')
1296
        # It would be nice not to read the entire gzip.
1297
        start = int(bits[0])
1298
        stop = int(bits[1])
1299
        basis_end = int(bits[2])
1300
        delta_end = int(bits[3])
1301
        return node[0], start, stop, basis_end, delta_end
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1302
1303
1304
try:
    # Optional compiled (pyrex/C) extension with accelerated group-compress
    # routines.  Fall back silently to the pure-Python code paths when the
    # extension is not built (e.g. a source-only install).
    from bzrlib import _groupcompress_pyx
except ImportError:
    pass