/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1
# Copyright (C) 2008, 2009 Canonical Ltd
2
#
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
3
# This program is free software; you can redistribute it and/or modify
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
12
#
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
3735.36.3 by John Arbash Meinel
Add the new address for FSF to the new files.
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
16
17
"""Core compression logic for compressing streams of related files."""
18
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
19
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
20
from cStringIO import StringIO
0.25.3 by John Arbash Meinel
Add a encode/decode base128 functions.
21
import struct
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
22
import time
0.17.5 by Robert Collins
nograph tests completely passing.
23
import zlib
0.17.44 by John Arbash Meinel
Use the bit field to allow both lzma groups and zlib groups.
24
try:
25
    import pylzma
26
except ImportError:
27
    pylzma = None
0.17.5 by Robert Collins
nograph tests completely passing.
28
0.17.4 by Robert Collins
Annotate.
29
from bzrlib import (
30
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
31
    debug,
0.17.4 by Robert Collins
Annotate.
32
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
33
    errors,
0.17.4 by Robert Collins
Annotate.
34
    graph as _mod_graph,
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
35
    osutils,
0.17.4 by Robert Collins
Annotate.
36
    pack,
37
    patiencediff,
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
38
    trace,
0.17.4 by Robert Collins
Annotate.
39
    )
40
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
41
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
42
from bzrlib.osutils import (
43
    contains_whitespace,
44
    sha_string,
45
    split_lines,
46
    )
0.17.21 by Robert Collins
Update groupcompress to bzrlib 1.10.
47
from bzrlib.btree_index import BTreeBuilder
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
48
from bzrlib.lru_cache import LRUSizeCache
0.17.9 by Robert Collins
Initial stab at repository format support.
49
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
50
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
51
    adapter_registry,
52
    AbsentContentFactory,
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
53
    ChunkedContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
54
    FulltextContentFactory,
55
    VersionedFiles,
56
    )
57
0.17.44 by John Arbash Meinel
Use the bit field to allow both lzma groups and zlib groups.
58
_USE_LZMA = False and (pylzma is not None)
3735.32.17 by John Arbash Meinel
We now round-trip the wire_bytes.
59
_NO_LABELS = True
0.25.16 by John Arbash Meinel
Make sure we don't inter-pack to GCCHKBig repos.
60
_FAST = False
0.17.2 by Robert Collins
Core proof of concept working.
61
3735.2.162 by John Arbash Meinel
Change GroupCompressor.compress() to return the start_point.
62
# osutils.sha_string('')
63
_null_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
64
65
0.25.3 by John Arbash Meinel
Add a encode/decode base128 functions.
66
def encode_base128_int(val):
    """Convert an integer into a 7-bit lsb encoding.

    Each output byte carries 7 bits of the value, least-significant group
    first; the high bit (0x80) is set on every byte except the last.

    :param val: A non-negative integer to encode.
    :return: A str holding the encoded bytes.
    """
    # Fix: dropped an unused 'count = 0' local; renamed the accumulator so it
    # no longer shadows the 'bytes' builtin.
    encoded = []
    while val >= 0x80:
        encoded.append(chr((val | 0x80) & 0xFF))
        val >>= 7
    encoded.append(chr(val))
    return ''.join(encoded)
75
76
77
def decode_base128_int(bytes):
    """Decode an integer from a 7-bit lsb encoding.

    :param bytes: A string whose leading bytes hold a base128-encoded int.
    :return: A tuple (value, num_bytes_consumed).
    """
    val = 0
    shift = 0
    pos = 0
    while True:
        group = ord(bytes[pos])
        pos += 1
        if group < 0x80:
            # High bit clear: this is the final (most significant) group.
            val |= group << shift
            return val, pos
        val |= (group & 0x7F) << shift
        shift += 7
91
92
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
93
def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into groupcompress order.

    groupcompress is defined (currently) as reverse-topological order, grouped by
    the key prefix.

    :return: A sorted-list of keys
    """
    # Approximately reverse topological, grouped properly by the file-id
    # portion of the key.
    by_prefix = {}
    for item in parent_map.iteritems():
        key = item[0]
        prefix = '' if isinstance(key, str) or len(key) == 1 else key[0]
        by_prefix.setdefault(prefix, []).append(item)

    present_keys = []
    for prefix in sorted(by_prefix):
        present_keys.extend(reversed(topo_sort(by_prefix[prefix])))
    return present_keys
119
120
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
121
class GroupCompressBlockEntry(object):
    """Track the information about a single object inside a GC group.

    This is generally just the dumb data structure.
    """

    def __init__(self, key, type, sha1, start, length):
        # The label for this content
        self.key = key
        # delta, fulltext, external?
        self.type = type
        # Sha1 of content
        self.sha1 = sha1
        # Byte offset to start of data
        self.start = start
        # Length of content
        self.length = length

    @property
    def end(self):
        # Offset of the first byte just past this entry's data.
        return self.start + self.length

    def __repr__(self):
        fields = (self.key, self.type, self.sha1, self.start, self.length)
        return '%s(%s, %s, %s, %s, %s)' % (
            (self.__class__.__name__,) + fields)
143
3735.32.9 by John Arbash Meinel
Use a 32kB extension, since that is the max window size for zlib.
144
# The max zlib window size is 32kB, so if we set 'max_size' output of the
145
# decompressor to the requested bytes + 32kB, then we should guarantee
146
# num_bytes coming out.
147
_ZLIB_DECOMP_WINDOW = 32*1024
0.25.2 by John Arbash Meinel
First cut at meta-info as text form.
148
149
class GroupCompressBlock(object):
    """An object which maintains the internal structure of the compressed data.

    This tracks the meta info (start of text, length, type, etc.)
    """

    # Group Compress Block v1 Zlib
    GCB_HEADER = 'gcb1z\n'
    # Group Compress Block v1 LZMA
    GCB_LZ_HEADER = 'gcb1l\n'

    def __init__(self):
        # map by key? or just order in file?
        self._entries = {}
        # 'zlib' or 'lzma', set from the serialised header by from_bytes()
        self._compressor_name = None
        # Compressed / uncompressed lengths of the label header
        self._z_header_length = None
        self._header_length = None
        # Still-compressed label header bytes (None once consumed)
        self._z_header = None
        # Compressed content bytes, and a partially-driven decompressor when
        # we have only expanded a prefix of the content so far
        self._z_content = None
        self._z_content_decompressor = None
        self._z_content_length = None
        self._content_length = None
        # Fully/partially decompressed content (None until first needed)
        self._content = None

    def __len__(self):
        # Uncompressed size of content plus label header.
        return self._content_length + self._header_length

    def _parse_header(self):
        """Parse the header part of the block."""
        assert self._z_header is not None
        if self._z_header == '':
            # Nothing to process
            self._z_header = None
            return
        if self._compressor_name == 'lzma':
            header = pylzma.decompress(self._z_header)
        else:
            assert self._compressor_name == 'zlib'
            header = zlib.decompress(self._z_header)
        self._z_header = None # We have consumed the header
        lines = header.split('\n')
        del header
        # Records are newline-separated 'key:value' lines; a blank line
        # terminates each record (and a blank record ends the header).
        info_dict = {}
        for line in lines:
            if not line: #End of record
                if not info_dict:
                    break
                self.add_entry(**info_dict)
                info_dict = {}
                continue
            key, value = line.split(':', 1)
            if key == 'key':
                # Keys are \x00-joined tuples; intern the parts since the
                # same fragments recur across many entries.
                value = tuple(map(intern, value.split('\x00')))
            elif key in ('start', 'length'):
                value = int(value)
            elif key == 'type':
                value = intern(value)
            info_dict[key] = value

    def _ensure_content(self, num_bytes=None):
        """Make sure that content has been expanded enough.

        :param num_bytes: Ensure that we have extracted at least num_bytes of
            content. If None, consume everything
        """
        # TODO: If we re-use the same content block at different times during
        #       get_record_stream(), it is possible that the first pass will
        #       get inserted, triggering an extract/_ensure_content() which
        #       will get rid of _z_content. And then the next use of the block
        #       will try to access _z_content (to send it over the wire), and
        #       fail because it is already extracted. Consider never releasing
        #       _z_content because of this.
        if num_bytes is None:
            # NOTE: if _content_length is also None (older serialisation),
            #       num_bytes stays None and we decompress everything below.
            num_bytes = self._content_length
        if self._content_length is not None:
            assert num_bytes <= self._content_length
        if self._content is None:
            assert self._z_content is not None
            if self._z_content == '':
                self._content = ''
            elif self._compressor_name == 'lzma':
                # We don't do partial lzma decomp yet
                self._content = pylzma.decompress(self._z_content)
            else:
                # Start a zlib decompressor
                assert self._compressor_name == 'zlib'
                if num_bytes is None:
                    self._content = zlib.decompress(self._z_content)
                else:
                    self._z_content_decompressor = zlib.decompressobj()
                    # Seed the decompressor with the uncompressed bytes, so
                    # that the rest of the code is simplified
                    self._content = self._z_content_decompressor.decompress(
                        self._z_content, num_bytes + _ZLIB_DECOMP_WINDOW)
                # Any bytes remaining to be decompressed will be in the
                # decompressors 'unconsumed_tail'
        # Do we have enough bytes already?
        if num_bytes is not None and len(self._content) >= num_bytes:
            return
        if num_bytes is None and self._z_content_decompressor is None:
            # We must have already decompressed everything
            return
        # If we got this far, and don't have a decompressor, something is wrong
        assert self._z_content_decompressor is not None
        remaining_decomp = self._z_content_decompressor.unconsumed_tail
        if num_bytes is None:
            if remaining_decomp:
                # We don't know how much is left, but we'll decompress it all
                self._content += self._z_content_decompressor.decompress(
                    remaining_decomp)
                # Note: There what I consider a bug in zlib.decompressobj
                #       If you pass back in the entire unconsumed_tail, only
                #       this time you don't pass a max-size, it doesn't
                #       change the unconsumed_tail back to None/''.
                #       However, we know we are done with the whole stream
                self._z_content_decompressor = None
            self._content_length = len(self._content)
        else:
            # If we have nothing left to decomp, we ran out of decomp bytes
            assert remaining_decomp
            needed_bytes = num_bytes - len(self._content)
            # We always set max_size to 32kB over the minimum needed, so that
            # zlib will give us as much as we really want.
            # TODO: If this isn't good enough, we could make a loop here,
            #       that keeps expanding the request until we get enough
            self._content += self._z_content_decompressor.decompress(
                remaining_decomp, needed_bytes + _ZLIB_DECOMP_WINDOW)
            assert len(self._content) >= num_bytes
            if not self._z_content_decompressor.unconsumed_tail:
                # The stream is finished
                self._z_content_decompressor = None

    def _parse_bytes(self, bytes):
        """Read the various lengths from the header.

        This also populates the various 'compressed' buffers
        (_z_header, _z_content) and the length attributes; it returns None.
        """
        # At present, there are 4 lengths to be read, we have 2 integers for
        # the length of the compressed and uncompressed header, and 2 integers
        # for the compressed and uncompressed content
        # 14 bytes can represent > 1TB, so to avoid checking too far, cap the
        # search to 14 bytes.
        pos = bytes.index('\n', 6, 20)
        self._z_header_length = int(bytes[6:pos])
        pos += 1
        pos2 = bytes.index('\n', pos, pos + 14)
        self._header_length = int(bytes[pos:pos2])
        end_of_z_lengths = pos2
        pos2 += 1
        # Older versions don't have the content lengths, if we want to preserve
        # backwards compatibility, we could try/except over these, and allow
        # them to be skipped
        try:
            pos = bytes.index('\n', pos2, pos2 + 14)
            self._z_content_length = int(bytes[pos2:pos])
            pos += 1
            pos2 = bytes.index('\n', pos, pos + 14)
            self._content_length = int(bytes[pos:pos2])
            pos = pos2 + 1
            assert len(bytes) == (pos + self._z_header_length +
                                  self._z_content_length)
            pos2 = pos + self._z_header_length
            self._z_header = bytes[pos:pos2]
            self._z_content = bytes[pos2:]
            assert len(self._z_content) == self._z_content_length
        except ValueError:
            # This is the older form, which did not encode its content length
            pos = end_of_z_lengths + 1
            pos2 = pos + self._z_header_length
            self._z_header = bytes[pos:pos2]
            self._z_content = bytes[pos2:]
            self._z_content_length = len(self._z_content)

    @classmethod
    def from_bytes(cls, bytes):
        """Construct a GroupCompressBlock from its serialised byte form.

        :param bytes: The complete serialised block, starting with one of
            the GCB_* 6-byte headers.
        :raises ValueError: If the header is not a recognised GCB header.
        :return: A new GroupCompressBlock instance.
        """
        out = cls()
        if bytes[:6] not in (cls.GCB_HEADER, cls.GCB_LZ_HEADER):
            raise ValueError('bytes did not start with %r' % (cls.GCB_HEADER,))
        # Byte 4 of the magic distinguishes the compressor: 'z'lib or 'l'zma.
        if bytes[4] == 'z':
            out._compressor_name = 'zlib'
        elif bytes[4] == 'l':
            out._compressor_name = 'lzma'
        else:
            raise ValueError('unknown compressor: %r' % (bytes,))
        out._parse_bytes(bytes)
        if not _NO_LABELS:
            out._parse_header()
        return out

    def extract(self, key, start, end, sha1=None):
        """Extract the text for a specific key.

        :param key: The label used for this content
        :param sha1: TODO (should we validate only when sha1 is supplied?)
            NOTE(review): sha1 is currently ignored by this implementation.
        :return: The bytes for the content
        """
        if start == end == 0:
            return ''
        self._ensure_content(end)
        # The bytes are 'f' or 'd' for the type, then a variable-length
        # base128 integer for the content size, then the actual content
        # We know that the variable-length integer won't be longer than 5
        # bytes (it takes 5 bytes to encode 2^32)
        c = self._content[start]
        if c == 'f':
            type = 'fulltext'
        else:
            if c != 'd':
                raise ValueError('Unknown content control code: %s'
                                 % (c,))
            type = 'delta'
        # NOTE(review): 'type' is computed above but not used further here.
        content_len, len_len = decode_base128_int(
                            self._content[start + 1:start + 6])
        content_start = start + 1 + len_len
        if end != content_start + content_len:
            raise ValueError('end != len according to field header'
                ' %s != %s' % (end, content_start + content_len))
        content = self._content[content_start:end]
        if c == 'f':
            bytes = content
        elif c == 'd':
            # Deltas are applied against the whole expanded content so far.
            bytes = _groupcompress_pyx.apply_delta(self._content, content)
        return bytes

    def add_entry(self, key, type, sha1, start, length):
        """Add new meta info about an entry.

        :param key: The key for the new content
        :param type: Whether this is a delta or fulltext entry (external?)
        :param sha1: sha1sum of the fulltext of this entry
        :param start: where the encoded bytes start
        :param length: total number of bytes in the encoded form
        :return: The entry?
        :raises ValueError: If an entry with the same key already exists.
        """
        entry = GroupCompressBlockEntry(key, type, sha1, start, length)
        if key in self._entries:
            raise ValueError('Duplicate key found: %s' % (key,))
        self._entries[key] = entry
        return entry

    def set_content(self, content):
        """Set the content of this block."""
        self._content_length = len(content)
        self._content = content
        # Invalidate any previously serialised form; to_bytes() recompresses.
        self._z_content = None
        self._z_header_length = None

    def to_bytes(self):
        """Encode the information into a byte stream."""
        compress = zlib.compress
        if _USE_LZMA:
            compress = pylzma.compress
        # Serialise each entry as 'key:value' lines, blank-line terminated,
        # in sorted-key order (mirrors _parse_header).
        chunks = []
        for key in sorted(self._entries):
            entry = self._entries[key]
            chunk = ('key:%s\n'
                     'sha1:%s\n'
                     'type:%s\n'
                     'start:%s\n'
                     'length:%s\n'
                     '\n'
                     ) % ('\x00'.join(entry.key),
                          entry.sha1,
                          entry.type,
                          entry.start,
                          entry.length,
                          )
            chunks.append(chunk)
        bytes = ''.join(chunks)
        info_len = len(bytes)
        z_header_bytes = compress(bytes)
        del bytes, chunks
        z_header_len = len(z_header_bytes)
        # TODO: we may want to have the header compressed in the same chain
        #       as the data, or we may not, evaulate it
        #       having them compressed together is probably a win for
        #       revisions and the 'inv' portion of chk inventories. As the
        #       label in the header is duplicated in the text.
        #       For chk pages and real bytes, I would guess this is not
        #       true.
        if _NO_LABELS:
            # Label header suppressed entirely; lengths recorded as zero.
            z_header_bytes = ''
            z_header_len = 0
            info_len = 0
        if self._z_content is not None:
            # Content was never expanded (or already recompressed): reuse it.
            content_len = self._content_length
            z_content_len = self._z_content_length
            z_content_bytes = self._z_content
        else:
            assert self._content is not None
            content_len = self._content_length
            z_content_bytes = compress(self._content)
            self._z_content = z_content_bytes
            z_content_len = len(z_content_bytes)
            self._z_content_length = z_content_len
        if _USE_LZMA:
            header = self.GCB_LZ_HEADER
        else:
            header = self.GCB_HEADER
        # Layout: magic, 4 newline-terminated decimal lengths, then the
        # compressed header followed by the compressed content.
        chunks = [header,
                  '%d\n%d\n%d\n%d\n' % (z_header_len, info_len,
                                        z_content_len, content_len)
                 ]
        chunks.append(z_header_bytes)
        chunks.append(z_content_bytes)
        return ''.join(chunks)
456
457
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
458
class _LazyGroupCompressFactory(object):
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
459
    """Yield content from a GroupCompressBlock on demand."""
460
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
461
    def __init__(self, key, parents, manager, start, end, first):
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
462
        """Create a _LazyGroupCompressFactory
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
463
464
        :param key: The key of just this record
465
        :param parents: The parents of this key (possibly None)
466
        :param gc_block: A GroupCompressBlock object
467
        :param start: Offset of the first byte for this record in the
468
            uncompressd content
469
        :param end: Offset of the byte just after the end of this record
470
            (ie, bytes = content[start:end])
471
        :param first: Is this the first Factory for the given block?
472
        """
473
        self.key = key
474
        self.parents = parents
475
        self.sha1 = None
3735.32.15 by John Arbash Meinel
Change the GroupCompressBlock code to allow not recording 'end'.
476
        # Note: This attribute coupled with Manager._factories creates a
477
        #       reference cycle. Perhaps we would rather use a weakref(), or
478
        #       find an appropriate time to release the ref. After the first
479
        #       get_bytes_as call? After Manager.get_record_stream() returns
480
        #       the object?
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
481
        self._manager = manager
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
482
        self._bytes = None
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
483
        self.storage_kind = 'groupcompress-block'
484
        if not first:
485
            self.storage_kind = 'groupcompress-block-ref'
486
        self._first = first
487
        self._start = start
488
        self._end = end
489
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
490
    def __repr__(self):
491
        return '%s(%s, first=%s)' % (self.__class__.__name__,
492
            self.key, self._first)
493
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
494
    def get_bytes_as(self, storage_kind):
495
        if storage_kind == self.storage_kind:
496
            if self._first:
497
                # wire bytes, something...
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
498
                return self._manager._wire_bytes()
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
499
            else:
500
                return ''
501
        if storage_kind in ('fulltext', 'chunked'):
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
502
            if self._bytes is None:
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
503
                # Grab and cache the raw bytes for this entry
504
                # and break the ref-cycle with _manager since we don't need it
505
                # anymore
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
506
                self._manager._prepare_for_extract()
507
                block = self._manager._block
3735.34.2 by John Arbash Meinel
Merge brisbane-core tip, resolve differences.
508
                self._bytes = block.extract(self.key, self._start, self._end)
3735.2.163 by John Arbash Meinel
Merge bzr.dev 4187, and revert the change to fix refcycle issues.
509
                # XXX: It seems the smart fetch extracts inventories and chk
510
                #      pages as fulltexts to find the next chk pages, but then
511
                #      passes them down to be inserted as a
512
                #      groupcompress-block, so this is not safe to do. Perhaps
513
                #      we could just change the storage kind to "fulltext" at
514
                #      that point?
515
                # self._manager = None
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
516
            if storage_kind == 'fulltext':
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
517
                return self._bytes
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
518
            else:
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
519
                return [self._bytes]
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
520
        raise errors.UnavailableRepresentation(self.key, storage_kind,
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
521
                                               self.storage_kind)
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
522
523
3735.32.17 by John Arbash Meinel
We now round-trip the wire_bytes.
524
class _LazyGroupContentManager(object):
    """This manages a group of _LazyGroupCompressFactory objects."""

    def __init__(self, block):
        """Create a manager wrapping the given GroupCompressBlock."""
        self._block = block
        # We need to preserve the ordering
        self._factories = []
        self._last_byte = 0

    def add_factory(self, key, parents, start, end):
        """Add a record (identified by key) located at content[start:end]."""
        first = not self._factories
        # Note that this creates a reference cycle....
        factory = _LazyGroupCompressFactory(key, parents, self,
            start, end, first=first)
        self._last_byte = max(end, self._last_byte)
        self._factories.append(factory)

    def get_record_stream(self):
        """Get a record for all keys added so far."""
        for factory in self._factories:
            yield factory
            # Break the ref-cycle
            factory._bytes = None
            # XXX: this is not safe, the smart fetch code requests the content
            #      as both a 'fulltext', and then later on as a
            #      groupcompress-block. The iter_interesting_nodes code also is
            #      still buffering multiple records and returning them later.
            #      So that code would need to be updated to either re-fetch the
            #      original object, or buffer it somehow.
            # factory._manager = None
        # TODO: Consider setting self._factories = None after the above loop,
        #       as it will break the reference cycle

    def _trim_block(self, last_byte):
        """Create a new GroupCompressBlock, with just some of the content."""
        # None of the factories need to be adjusted, because the content is
        # located in an identical place. Just that some of the unreferenced
        # trailing bytes are stripped
        trace.mutter('stripping trailing bytes from groupcompress block'
                     ' %d => %d', self._block._content_length, last_byte)
        new_block = GroupCompressBlock()
        self._block._ensure_content(last_byte)
        new_block.set_content(self._block._content[:last_byte])
        self._block = new_block

    def _rebuild_block(self):
        """Create a new GroupCompressBlock with only the referenced texts."""
        compressor = GroupCompressor()
        tstart = time.time()
        old_length = self._block._content_length
        end_point = 0
        for factory in self._factories:
            data = factory.get_bytes_as('fulltext')
            (found_sha1, start_point, end_point, kind,
             length) = compressor.compress(factory.key, data, factory.sha1)
            # Now update this factory with the new offsets, etc
            factory.sha1 = found_sha1
            factory._start = start_point
            factory._end = end_point
        self._last_byte = end_point
        new_block = compressor.flush()
        # TODO: Should we check that new_block really *is* smaller than the old
        #       block? It seems hard to come up with a method that it would
        #       expand, since we do full compression again. Perhaps based on a
        #       request that ends up poorly ordered?
        delta = time.time() - tstart
        self._block = new_block
        trace.mutter('creating new compressed block on-the-fly in %.3fs'
                     ' %d bytes => %d bytes', delta, old_length,
                     self._block._content_length)

    def _prepare_for_extract(self):
        """A _LazyGroupCompressFactory is about to extract to fulltext."""
        # We expect that if one child is going to fulltext, all will be. This
        # helps prevent all of them from extracting a small amount at a time.
        # Which in itself isn't terribly expensive, but resizing 2MB 32kB at a
        # time (self._block._content) is a little expensive.
        self._block._ensure_content(self._last_byte)

    def _check_rebuild_block(self):
        """Check to see if our block should be repacked."""
        total_bytes_used = 0
        last_byte_used = 0
        for factory in self._factories:
            total_bytes_used += factory._end - factory._start
            last_byte_used = max(last_byte_used, factory._end)
        # If we are using most of the bytes from the block, we have nothing
        # else to check (currently more than 1/2)
        if total_bytes_used * 2 >= self._block._content_length:
            return
        # Can we just strip off the trailing bytes? If we are going to be
        # transmitting more than 50% of the front of the content, go ahead
        if total_bytes_used * 2 > last_byte_used:
            self._trim_block(last_byte_used)
            return

        # We are using a small amount of the data, and it isn't just packed
        # nicely at the front, so rebuild the content.
        # Note: This would be *nicer* as a strip-data-from-group, rather than
        #       building it up again from scratch
        #       It might be reasonable to consider the fulltext sizes for
        #       different bits when deciding this, too. As you may have a small
        #       fulltext, and a trivial delta, and you are just trading around
        #       for another fulltext. If we do a simple 'prune' you may end up
        #       expanding many deltas into fulltexts, as well.
        #       If we build a cheap enough 'strip', then we could try a strip,
        #       if that expands the content, we then rebuild.
        self._rebuild_block()

    def _wire_bytes(self):
        """Return a byte stream suitable for transmitting over the wire."""
        self._check_rebuild_block()
        # The outer block starts with:
        #   'groupcompress-block\n'
        #   <length of compressed key info>\n
        #   <length of uncompressed info>\n
        #   <length of gc block>\n
        #   <header bytes>
        #   <gc-block>
        lines = ['groupcompress-block\n']
        # The minimal info we need is the key, the start offset, and the
        # parents. The length and type are encoded in the record itself.
        # However, passing in the other bits makes it easier.  The list of
        # keys, and the start offset, the length
        # 1 line key
        # 1 line with parents, '' for ()
        # 1 line for start offset
        # 1 line for end byte
        header_lines = []
        for factory in self._factories:
            key_bytes = '\x00'.join(factory.key)
            parents = factory.parents
            if parents is None:
                parent_bytes = 'None:'
            else:
                parent_bytes = '\t'.join('\x00'.join(key) for key in parents)
            record_header = '%s\n%s\n%d\n%d\n' % (
                key_bytes, parent_bytes, factory._start, factory._end)
            header_lines.append(record_header)
        header_bytes = ''.join(header_lines)
        del header_lines
        header_bytes_len = len(header_bytes)
        z_header_bytes = zlib.compress(header_bytes)
        del header_bytes
        z_header_bytes_len = len(z_header_bytes)
        block_bytes = self._block.to_bytes()
        lines.append('%d\n%d\n%d\n' % (z_header_bytes_len, header_bytes_len,
                                       len(block_bytes)))
        lines.append(z_header_bytes)
        lines.append(block_bytes)
        del z_header_bytes, block_bytes
        return ''.join(lines)

    @classmethod
    def from_bytes(cls, bytes):
        """Parse bytes as generated by _wire_bytes() back into a manager.

        :param bytes: The serialised form: the 4-line outer header, the
            zlib-compressed per-record header, then the gc block itself.
        :return: A _LazyGroupContentManager with one factory per record.
        :raises ValueError: If the storage kind or any declared length does
            not match the supplied bytes.
        """
        # TODO: This does extra string copying, probably better to do it a
        #       different way
        (storage_kind, z_header_len, header_len,
         block_len, rest) = bytes.split('\n', 4)
        del bytes
        if storage_kind != 'groupcompress-block':
            raise ValueError('Unknown storage kind: %s' % (storage_kind,))
        z_header_len = int(z_header_len)
        if len(rest) < z_header_len:
            raise ValueError('Compressed header len shorter than all bytes')
        z_header = rest[:z_header_len]
        header_len = int(header_len)
        header = zlib.decompress(z_header)
        if len(header) != header_len:
            raise ValueError('invalid length for decompressed bytes')
        del z_header
        block_len = int(block_len)
        if len(rest) != z_header_len + block_len:
            raise ValueError('Invalid length for block')
        block_bytes = rest[z_header_len:]
        del rest
        # So now we have a valid GCB, we just need to parse the factories that
        # were sent to us
        header_lines = header.split('\n')
        del header
        last = header_lines.pop()
        if last != '':
            raise ValueError('header lines did not end with a trailing'
                             ' newline')
        if len(header_lines) % 4 != 0:
            raise ValueError('The header was not an even multiple of 4 lines')
        block = GroupCompressBlock.from_bytes(block_bytes)
        del block_bytes
        result = cls(block)
        # Each record contributes 4 header lines: key, parents, start, end.
        for start in xrange(0, len(header_lines), 4):
            # intern()?
            key = tuple(header_lines[start].split('\x00'))
            parents_line = header_lines[start+1]
            if parents_line == 'None:':
                parents = None
            else:
                parents = tuple([tuple(segment.split('\x00'))
                                 for segment in parents_line.split('\t')
                                 if segment])
            start_offset = int(header_lines[start+2])
            end_offset = int(header_lines[start+3])
            result.add_factory(key, parents, start_offset, end_offset)
        return result
730
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
731
3735.32.18 by John Arbash Meinel
We now support generating a network stream.
732
def network_block_to_records(storage_kind, bytes, line_end):
    """Convert serialised groupcompress-block bytes into a record stream.

    :param storage_kind: Must be 'groupcompress-block'.
    :param bytes: The wire bytes, as produced by
        _LazyGroupContentManager._wire_bytes().
    :param line_end: Unused; accepted for signature compatibility with the
        other network record parsers.
    :return: An iterator of _LazyGroupCompressFactory objects, one per record
        in the block.
    :raises ValueError: If storage_kind is not 'groupcompress-block'.
    """
    if storage_kind != 'groupcompress-block':
        raise ValueError('Unknown storage kind: %s' % (storage_kind,))
    manager = _LazyGroupContentManager.from_bytes(bytes)
    return manager.get_record_stream()
737
738
0.17.2 by Robert Collins
Core proof of concept working.
739
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
740
    """Produce a serialised group of compressed texts.
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
741
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
742
    It contains code very similar to SequenceMatcher because of having a similar
743
    task. However some key differences apply:
744
     - there is no junk, we want a minimal edit not a human readable diff.
745
     - we don't filter very common lines (because we don't know where a good
746
       range will start, and after the first text we want to be emitting minmal
747
       edits only.
748
     - we chain the left side, not the right side
749
     - we incrementally update the adjacency matrix as new lines are provided.
750
     - we look for matches in all of the left side, so the routine which does
751
       the analagous task of find_longest_match does not need to filter on the
752
       left side.
753
    """
0.17.2 by Robert Collins
Core proof of concept working.
754
3735.32.19 by John Arbash Meinel
Get rid of the 'delta' flag to GroupCompressor. It didn't do anything anyway.
755
    def __init__(self):
        """Create a GroupCompressor."""
        # NOTE: we could seed the output with some sort of GC Start flag, or
        # emit it as part of the output stream, rather than baking it into the
        # compressed bytes.
        self.lines = []             # accumulated output chunks
        self.endpoint = 0           # current offset into the output stream
        self.input_bytes = 0        # total uncompressed bytes fed in so far
        self.num_keys = 0
        # key -> ((start_offset, start_chunk_idx), (end_offset, end_chunk_idx))
        self.labels_deltas = {}
        self._last = None
        self._delta_index = _groupcompress_pyx.DeltaIndex()
        self._block = GroupCompressBlock()
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
768
3735.31.12 by John Arbash Meinel
Push nostore_sha down through the stack.
769
    def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param bytes: The bytes to be compressed
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :param nostore_sha: If the computed sha1 sum matches, we will raise
            ExistingContent rather than adding the text.
        :param soft: Do a 'soft' compression. This means that we require larger
            ranges to match to be considered for a copy command.
        :return: A tuple (sha1, start_point, end_point, kind, length): the
            sha1 of the text, the offsets of this record in the group output,
            the record kind ('fulltext' or 'delta'), and its stored length.
        :seealso VersionedFiles.add_lines:
        """
        if not bytes: # empty, like a dir entry, etc
            if nostore_sha == _null_sha1:
                raise errors.ExistingContent()
            self._block.add_entry(key, type='empty',
                                  sha1=None, start=0,
                                  length=0)
            return _null_sha1, 0, 0, 'fulltext', 0
        # we assume someone knew what they were doing when they passed it in
        if expected_sha is not None:
            sha1 = expected_sha
        else:
            sha1 = osutils.sha_string(bytes)
        if nostore_sha is not None:
            if sha1 == nostore_sha:
                raise errors.ExistingContent()
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        input_len = len(bytes)
        # By having action/label/sha1/len, we can parse the group if the index
        # was ever destroyed, we have the key in 'label', we know the final
        # bytes are valid from sha1, and we know where to find the end of this
        # record because of 'len'. (the delta record itself will store the
        # total length for the expanded record)
        # 'len: %d\n' costs approximately 1% increase in total data
        # Having the labels at all costs us 9-10% increase, 38% increase for
        # inventory pages, and 5.8% increase for text pages
        # new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
        if self._delta_index._source_offset != self.endpoint:
            raise AssertionError('_source_offset != endpoint'
                ' somehow the DeltaIndex got out of sync with'
                ' the output lines')
        # Only accept a delta if it saves at least half the fulltext size.
        max_delta_size = len(bytes) // 2
        delta = self._delta_index.make_delta(bytes, max_delta_size)
        if delta is None:
            kind = 'fulltext'
            enc_length = encode_base128_int(len(bytes))
            len_mini_header = 1 + len(enc_length)
            length = len(bytes) + len_mini_header
            self._delta_index.add_source(bytes, len_mini_header)
            new_chunks = ['f', enc_length, bytes]
        else:
            kind = 'delta'
            enc_length = encode_base128_int(len(delta))
            len_mini_header = 1 + len(enc_length)
            length = len(delta) + len_mini_header
            new_chunks = ['d', enc_length, delta]
            if _FAST:
                self._delta_index._source_offset += length
            else:
                self._delta_index.add_delta_source(delta, len_mini_header)
        self._block.add_entry(key, type=kind, sha1=sha1,
                              start=self.endpoint, length=length)
        start = self.endpoint
        delta_start = (self.endpoint, len(self.lines))
        self.num_keys += 1
        self.output_chunks(new_chunks)
        self.input_bytes += input_len
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        if self._delta_index._source_offset != self.endpoint:
            raise AssertionError('the delta index is out of sync'
                ' with the output lines %s != %s'
                % (self._delta_index._source_offset, self.endpoint))
        return sha1, start, self.endpoint, kind, length
0.17.2 by Robert Collins
Core proof of concept working.
852
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
853
    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        # Range recorded when the text was compressed; [0][1] and [1][1] are
        # the start/end indexes into self.lines for this text's chunks.
        delta_details = self.labels_deltas[key]
        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
        stored_bytes = ''.join(delta_chunks)
        # TODO: Fix this, we shouldn't really be peeking here
        entry = self._block._entries[key]
        if entry.type == 'fulltext':
            # Stored form: 'f' + base128(text length) + raw text bytes.
            if stored_bytes[0] != 'f':
                raise ValueError('Index claimed fulltext, but stored bytes'
                                 ' indicate %s' % (stored_bytes[0],))
            fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
            # Sanity check: label byte + length header + payload must account
            # for every stored byte.
            if fulltext_len + 1 + offset != len(stored_bytes):
                raise ValueError('Index claimed fulltext len, but stored bytes'
                                 ' claim %s != %s'
                                 % (len(stored_bytes),
                                    fulltext_len + 1 + offset))
            bytes = stored_bytes[offset + 1:]
        else:
            if entry.type != 'delta':
                raise ValueError('Unknown entry type: %s' % (entry.type,))
            # XXX: This is inefficient at best
            # The delta may copy from anywhere earlier in the group, so the
            # whole group written so far is the delta source.
            source = ''.join(self.lines)
            # Stored form: 'd' + base128(delta length) + delta bytes.
            if stored_bytes[0] != 'd':
                raise ValueError('Entry type claims delta, bytes claim %s'
                                 % (stored_bytes[0],))
            delta_len, offset = decode_base128_int(stored_bytes[1:10])
            if delta_len + 1 + offset != len(stored_bytes):
                raise ValueError('Index claimed delta len, but stored bytes'
                                 ' claim %s != %s'
                                 % (len(stored_bytes),
                                    delta_len + 1 + offset))
            bytes = _groupcompress_pyx.apply_delta(source,
                                                   stored_bytes[offset + 1:])
        # Verify the reconstructed text against the sha1 recorded at add time.
        bytes_sha1 = sha_string(bytes)
        if entry.sha1 != bytes_sha1:
            raise ValueError('Recorded sha1 != measured %s != %s'
                             % (entry.sha1, bytes_sha1))
        return bytes, entry.sha1
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
896
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
897
    def flush(self):
        """Finish this group, creating a formatted stream."""
        # Concatenate everything accumulated so far and hand it to the
        # block; self.lines is cleared so no further content can be added.
        data = ''.join(self.lines)
        self.lines = None
        self._block.set_content(data)
        return self._block
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
903
0.23.9 by John Arbash Meinel
We now basically have full support for using diff-delta as the compressor.
904
    def output_chunks(self, new_chunks):
        """Output some chunks.

        :param new_chunks: The chunks to output.
        """
        # Remember where this batch starts, so pop_last() can revoke it.
        self._last = (len(self.lines), self.endpoint)
        self.lines.extend(new_chunks)
        self.endpoint += sum(len(chunk) for chunk in new_chunks)
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
914
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
915
    def pop_last(self):
        """Call this if you want to 'revoke' the last compression.

        After this, the data structures will be rolled back, but you cannot do
        more compression.
        """
        trunc_at, old_endpoint = self._last
        # Dropping the delta index is what makes further compression
        # impossible.
        self._delta_index = None
        del self.lines[trunc_at:]
        self.endpoint = old_endpoint
        self._last = None
925
0.17.2 by Robert Collins
Core proof of concept working.
926
    def ratio(self):
        """Return the overall compression ratio."""
        # input bytes seen so far divided by bytes emitted so far.
        return float(self.input_bytes) / self.endpoint
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
929
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
930
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
931
def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests, it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        # One reference list is needed if (and only if) a graph is stored.
        if graph:
            ref_length = 1
        else:
            ref_length = 0
        builder = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(builder, lambda:True, parents=graph,
            add_callback=builder.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, builder, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        # Stashed so cleanup_pack_group() can end the writer and close the
        # stream when the caller is done.
        result.stream = stream
        result.writer = writer
        return result
    return factory
960
961
962
def cleanup_pack_group(versioned_files):
    # Finish the container writer first, then close the underlying stream
    # (the attributes attached by make_pack_factory's factory).
    versioned_files.writer.end()
    versioned_files.stream.close()
965
966
967
class GroupCompressVersionedFiles(VersionedFiles):
968
    """A group-compress based VersionedFiles implementation."""
969
0.17.2 by Robert Collins
Core proof of concept working.
970
    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        # Keys added but not yet flushed to a pack; key -> parents
        # (see extract()/get_record_stream usage) — confirm against callers.
        self._unadded_refs = {}
        # Cache of decompressed groups, bounded at 50MiB.
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)
        # Additional VersionedFiles searched, in order, for absent texts.
        self._fallback_vfs = []
0.17.2 by Robert Collins
Core proof of concept working.
983
984
    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. All
            lines except the last must be terminated with \n and contain no
            other \n's; the last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint
            the add routine may error or may succeed - but you will be unable
            to read the data back accurately. (Checking the lines have been
            split correctly is expensive and extremely unlikely to catch bugs
            so it is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so substitute an empty tuple.
            parents = ()
        # double handling for now. Make it work until then.
        num_bytes = sum(len(line) for line in lines)
        record = ChunkedContentFactory(key, parents, None, lines)
        inserted_shas = list(self._insert_record_stream(
            [record], random_id=random_id, nostore_sha=nostore_sha))
        return inserted_shas[0], num_bytes, None
0.17.2 by Robert Collins
Core proof of concept working.
1033
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
1034
    def add_fallback_versioned_files(self, a_versioned_files):
        """Add a source of texts for texts not present in this knit.

        :param a_versioned_files: A VersionedFiles object.
        """
        # Fallbacks are consulted in registration order.
        self._fallback_vfs.append(a_versioned_files)
1040
0.17.4 by Robert Collins
Annotate.
1041
    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            # Walk the whole ancestry of key, accumulating every present
            # ancestor; ghosts reported by the searcher are skipped.
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            # NOTE(review): parents of None appears to mean 'no graph data';
            # annotate the text as a root with no parents — confirm against
            # get_parent_map's contract.
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        # Topological order guarantees each text's parents have been
        # annotated (and cached) before the text itself is processed.
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, chunks, key, None, head_cache))
        # 'key' was rebound by the loop; the requested key heads the searched
        # ancestry, so topological order yields it last and 'key' now refers
        # to it.
        return parent_cache[key]
1070
0.17.5 by Robert Collins
nograph tests completely passing.
1071
    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        # Extracting every stored text as a fulltext forces full validation
        # of the underlying data.
        for record in self.get_record_stream(self.keys(), 'unordered', True):
            record.get_bytes_as('fulltext')
1076
0.17.2 by Robert Collins
Core proof of concept working.
1077
    def _check_add(self, key, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        version_id = key[-1]
        if version_id is not None and contains_whitespace(version_id):
            raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)
1091
0.17.5 by Robert Collins
nograph tests completely passing.
1092
    def get_parent_map(self, keys):
        """Get a map of the graph parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        # Discard the per-source breakdown; only the merged map is wanted.
        parent_map, _ = self._get_parent_map_with_sources(keys)
        return parent_map
1100
1101
    def _get_parent_map_with_sources(self, keys):
1102
        """Get a map of the parents of keys.
1103
1104
        :param keys: The keys to look up parents for.
1105
        :return: A tuple. The first element is a mapping from keys to parents.
1106
            Absent keys are absent from the mapping. The second element is a
1107
            list with the locations each key was found in. The first element
1108
            is the in-this-knit parents, the second the first fallback source,
1109
            and so on.
1110
        """
0.17.5 by Robert Collins
nograph tests completely passing.
1111
        result = {}
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
1112
        sources = [self._index] + self._fallback_vfs
0.17.5 by Robert Collins
nograph tests completely passing.
1113
        source_results = []
1114
        missing = set(keys)
1115
        for source in sources:
1116
            if not missing:
1117
                break
1118
            new_result = source.get_parent_map(missing)
1119
            source_results.append(new_result)
1120
            result.update(new_result)
1121
            missing.difference_update(set(new_result))
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
1122
        return result, source_results
0.17.5 by Robert Collins
nograph tests completely passing.
1123
0.25.6 by John Arbash Meinel
(tests broken) implement the basic ability to have a separate header
1124
    def _get_block(self, index_memo):
        """Fetch (via the group cache) the block holding an index memo.

        :param index_memo: The first three elements form the 'read memo' that
            locates the raw compressed group and serves as the cache key.
        :return: The (possibly cached) GroupCompressBlock.
        """
        read_memo = index_memo[0:3]
        # get the group:
        try:
            block = self._group_cache[read_memo]
        except KeyError:
            # read the group
            zdata = self._access.get_raw_records([read_memo]).next()
            # decompress - whole thing - this is not a bug, as it
            # permits caching. We might want to store the partially
            # decompressed group and decompress object, so that recent
            # texts are not penalised by big groups.
            block = GroupCompressBlock.from_bytes(zdata)
            self._group_cache[read_memo] = block
        # cheapo debugging:
        # print len(zdata), len(plain)
        # parse - requires split_lines, better to have byte offsets
        # here (but not by much - we only split the region for the
        # recipe, and we often want to end up with lines anyway.
        return block
0.20.14 by John Arbash Meinel
Factor out _get_group_and_delta_lines.
1144
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
1145
    def get_missing_compression_parent_keys(self):
        """Return the keys of missing compression parents.

        Missing compression parents occur when a record stream was missing
        basis texts, or a index was scanned that had missing basis texts.
        """
        # Group-compress deltas never reference texts outside their own
        # group, so at present nothing can be missing.
        return frozenset()
1154
0.17.5 by Robert Collins
nograph tests completely passing.
1155
    def get_record_stream(self, keys, ordering, include_delta_closure):
1156
        """Get a stream of records for keys.
1157
1158
        :param keys: The keys to include.
1159
        :param ordering: Either 'unordered' or 'topological'. A topologically
1160
            sorted stream has compression parents strictly before their
1161
            children.
1162
        :param include_delta_closure: If True then the closure across any
1163
            compression parents will be included (in the opaque data).
1164
        :return: An iterator of ContentFactory objects, each of which is only
1165
            valid until the iterator is advanced.
1166
        """
1167
        # keys might be a generator
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
1168
        orig_keys = list(keys)
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1169
        keys = set(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
1170
        if not keys:
1171
            return
0.20.23 by John Arbash Meinel
Add a progress indicator for chk pages.
1172
        if (not self._index.has_graph
3735.31.14 by John Arbash Meinel
Change the gc-optimal to 'groupcompress'
1173
            and ordering in ('topological', 'groupcompress')):
0.17.5 by Robert Collins
nograph tests completely passing.
1174
            # Cannot topological order when no graph has been stored.
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1175
            # but we allow 'as-requested' or 'unordered'
0.17.5 by Robert Collins
nograph tests completely passing.
1176
            ordering = 'unordered'
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1177
1178
        remaining_keys = keys
1179
        while True:
1180
            try:
1181
                keys = set(remaining_keys)
1182
                for content_factory in self._get_remaining_record_stream(keys,
1183
                        orig_keys, ordering, include_delta_closure):
1184
                    remaining_keys.discard(content_factory.key)
1185
                    yield content_factory
1186
                return
1187
            except errors.RetryWithNewPacks, e:
1188
                self._access.reload_or_raise(e)
1189
1190
    def _find_from_fallback(self, missing):
1191
        """Find whatever keys you can from the fallbacks.
1192
1193
        :param missing: A set of missing keys. This set will be mutated as keys
1194
            are found from a fallback_vfs
1195
        :return: (parent_map, key_to_source_map, source_results)
1196
            parent_map  the overall key => parent_keys
1197
            key_to_source_map   a dict from {key: source}
1198
            source_results      a list of (source: keys)
1199
        """
1200
        parent_map = {}
1201
        key_to_source_map = {}
1202
        source_results = []
1203
        for source in self._fallback_vfs:
1204
            if not missing:
1205
                break
1206
            source_parents = source.get_parent_map(missing)
1207
            parent_map.update(source_parents)
1208
            source_parents = list(source_parents)
1209
            source_results.append((source, source_parents))
1210
            key_to_source_map.update((key, source) for key in source_parents)
1211
            missing.difference_update(source_parents)
1212
        return parent_map, key_to_source_map, source_results
1213
1214
    def _get_ordered_source_keys(self, ordering, parent_map, key_to_source_map):
        """Get the (source, [keys]) list.

        The returned objects should be in the order defined by 'ordering',
        which can weave between different sources.
        :param ordering: Must be one of 'topological' or 'groupcompress'
        :return: List of [(source, [keys])] tuples, such that all keys are in
            the defined order, regardless of source.
        """
        if ordering == 'topological':
            present_keys = topo_sort(parent_map)
        else:
            # ordering == 'groupcompress'
            # XXX: This only optimizes for the target ordering. We may need
            #      to balance that with the time it takes to extract
            #      ordering, by somehow grouping based on
            #      locations[key][0:3]
            present_keys = sort_gc_optimal(parent_map)
        # Batch consecutive keys that resolve to the same source; keys with
        # no entry in key_to_source_map belong to self.
        batched = []
        last_source = None
        bucket = None
        for key in present_keys:
            source = key_to_source_map.get(key, self)
            if source is not last_source:
                bucket = []
                batched.append((source, bucket))
                last_source = source
            bucket.append(key)
        return batched
1242
1243
    def _get_as_requested_source_keys(self, orig_keys, locations, unadded_keys,
1244
                                      key_to_source_map):
1245
        source_keys = []
1246
        current_source = None
1247
        for key in orig_keys:
1248
            if key in locations or key in unadded_keys:
1249
                source = self
1250
            elif key in key_to_source_map:
1251
                source = key_to_source_map[key]
1252
            else: # absent
1253
                continue
1254
            if source is not current_source:
1255
                source_keys.append((source, []))
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1256
                current_source = source
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1257
            source_keys[-1][1].append(key)
1258
        return source_keys
1259
1260
    def _get_io_ordered_source_keys(self, locations, unadded_keys,
1261
                                    source_result):
1262
        def get_group(key):
1263
            # This is the group the bytes are stored in, followed by the
1264
            # location in the group
1265
            return locations[key][0]
1266
        present_keys = sorted(locations.iterkeys(), key=get_group)
1267
        # We don't have an ordering for keys in the in-memory object, but
1268
        # lets process the in-memory ones first.
1269
        present_keys = list(unadded_keys) + present_keys
1270
        # Now grab all of the ones from other sources
1271
        source_keys = [(self, present_keys)]
1272
        source_keys.extend(source_result)
1273
        return source_keys
1274
1275
    def _get_remaining_record_stream(self, keys, orig_keys, ordering,
1276
                                     include_delta_closure):
1277
        """Get a stream of records for keys.
1278
1279
        :param keys: The keys to include.
1280
        :param ordering: one of 'unordered', 'topological', 'groupcompress' or
1281
            'as-requested'
1282
        :param include_delta_closure: If True then the closure across any
1283
            compression parents will be included (in the opaque data).
1284
        :return: An iterator of ContentFactory objects, each of which is only
1285
            valid until the iterator is advanced.
1286
        """
0.17.5 by Robert Collins
nograph tests completely passing.
1287
        # Cheap: iterate
1288
        locations = self._index.get_build_details(keys)
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1289
        unadded_keys = set(self._unadded_refs).intersection(keys)
1290
        missing = keys.difference(locations)
1291
        missing.difference_update(unadded_keys)
1292
        (fallback_parent_map, key_to_source_map,
1293
         source_result) = self._find_from_fallback(missing)
1294
        if ordering in ('topological', 'groupcompress'):
0.17.5 by Robert Collins
nograph tests completely passing.
1295
            # would be better to not globally sort initially but instead
1296
            # start with one key, recurse to its oldest parent, then grab
1297
            # everything in the same group, etc.
1298
            parent_map = dict((key, details[2]) for key, details in
1299
                locations.iteritems())
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1300
            for key in unadded_keys:
1301
                parent_map[key] = self._unadded_refs[key]
1302
            parent_map.update(fallback_parent_map)
1303
            source_keys = self._get_ordered_source_keys(ordering, parent_map,
1304
                                                        key_to_source_map)
0.22.6 by John Arbash Meinel
Clustering chk pages properly makes a big difference.
1305
        elif ordering == 'as-requested':
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1306
            source_keys = self._get_as_requested_source_keys(orig_keys,
1307
                locations, unadded_keys, key_to_source_map)
0.17.5 by Robert Collins
nograph tests completely passing.
1308
        else:
0.20.10 by John Arbash Meinel
Change the extraction ordering for 'unordered'.
1309
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
1310
            # Otherwise we thrash the _group_cache and destroy performance
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1311
            source_keys = self._get_io_ordered_source_keys(locations,
1312
                unadded_keys, source_result)
1313
        for key in missing:
0.17.5 by Robert Collins
nograph tests completely passing.
1314
            yield AbsentContentFactory(key)
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1315
        manager = None
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1316
        last_read_memo = None
3735.32.15 by John Arbash Meinel
Change the GroupCompressBlock code to allow not recording 'end'.
1317
        # TODO: This works fairly well at batching up existing groups into a
1318
        #       streamable format, and possibly allowing for taking one big
1319
        #       group and splitting it when it isn't fully utilized.
1320
        #       However, it doesn't allow us to find under-utilized groups and
1321
        #       combine them into a bigger group on the fly.
1322
        #       (Consider the issue with how chk_map inserts texts
1323
        #       one-at-a-time.) This could be done at insert_record_stream()
1324
        #       time, but it probably would decrease the number of
1325
        #       bytes-on-the-wire for fetch.
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1326
        for source, keys in source_keys:
1327
            if source is self:
1328
                for key in keys:
1329
                    if key in self._unadded_refs:
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1330
                        if manager is not None:
1331
                            for factory in manager.get_record_stream():
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1332
                                yield factory
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1333
                            last_read_memo = manager = None
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1334
                        bytes, sha1 = self._compressor.extract(key)
1335
                        parents = self._unadded_refs[key]
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1336
                        yield FulltextContentFactory(key, parents, sha1, bytes)
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1337
                    else:
1338
                        index_memo, _, parents, (method, _) = locations[key]
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
1339
                        read_memo = index_memo[0:3]
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1340
                        if last_read_memo != read_memo:
1341
                            # We are starting a new block. If we have a
1342
                            # manager, we have found everything that fits for
1343
                            # now, so yield records
1344
                            if manager is not None:
1345
                                for factory in manager.get_record_stream():
1346
                                    yield factory
1347
                            # Now start a new manager
3735.34.1 by John Arbash Meinel
Some testing to see if we can decrease the peak memory consumption a bit.
1348
                            block = self._get_block(index_memo)
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1349
                            manager = _LazyGroupContentManager(block)
1350
                            last_read_memo = read_memo
3735.32.8 by John Arbash Meinel
Some tests for the LazyGroupCompressFactory
1351
                        start, end = index_memo[3:5]
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1352
                        manager.add_factory(key, parents, start, end)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
1353
            else:
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1354
                if manager is not None:
1355
                    for factory in manager.get_record_stream():
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1356
                        yield factory
3735.34.3 by John Arbash Meinel
Cleanup, in preparation for merging to brisbane-core.
1357
                    last_read_memo = manager = None
3735.31.18 by John Arbash Meinel
Implement stacking support across all ordering implementations.
1358
                for record in source.get_record_stream(keys, ordering,
1359
                                                       include_delta_closure):
1360
                    yield record
3735.32.14 by John Arbash Meinel
Move the tests over to testing the LazyGroupContentManager object.
1361
        if manager is not None:
1362
            for factory in manager.get_record_stream():
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1363
                yield factory
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
1364
0.17.5 by Robert Collins
nograph tests completely passing.
1365
    def get_sha1s(self, keys):
1366
        """See VersionedFiles.get_sha1s()."""
1367
        result = {}
1368
        for record in self.get_record_stream(keys, 'unordered', True):
1369
            if record.sha1 != None:
1370
                result[record.key] = record.sha1
1371
            else:
1372
                if record.storage_kind != 'absent':
3735.32.12 by John Arbash Meinel
Add groupcompress-block[-ref] as valid stream types.
1373
                    result[record.key] = sha_string(record.get_bytes_as(
0.17.5 by Robert Collins
nograph tests completely passing.
1374
                        'fulltext'))
1375
        return result
1376
0.17.2 by Robert Collins
Core proof of concept working.
1377
    def insert_record_stream(self, stream):
1378
        """Insert a record stream into this container.
1379
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1380
        :param stream: A stream of records to insert.
0.17.2 by Robert Collins
Core proof of concept working.
1381
        :return: None
1382
        :seealso VersionedFiles.get_record_stream:
1383
        """
0.17.5 by Robert Collins
nograph tests completely passing.
1384
        for _ in self._insert_record_stream(stream):
1385
            pass
0.17.2 by Robert Collins
Core proof of concept working.
1386
3735.32.21 by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al.
1387
    def _insert_record_stream(self, stream, random_id=False, nostore_sha=None,
1388
                              reuse_blocks=True):
0.17.2 by Robert Collins
Core proof of concept working.
1389
        """Internal core to insert a record stream into this container.
1390
1391
        This helper function has a different interface than insert_record_stream
1392
        to allow add_lines to be minimal, but still return the needed data.
1393
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1394
        :param stream: A stream of records to insert.
3735.31.12 by John Arbash Meinel
Push nostore_sha down through the stack.
1395
        :param nostore_sha: If the sha1 of a given text matches nostore_sha,
1396
            raise ExistingContent, rather than committing the new text.
3735.32.21 by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al.
1397
        :param reuse_blocks: If the source is streaming from
1398
            groupcompress-blocks, just insert the blocks as-is, rather than
1399
            expanding the texts and inserting again.
0.17.2 by Robert Collins
Core proof of concept working.
1400
        :return: An iterator over the sha1 of the inserted records.
1401
        :seealso insert_record_stream:
1402
        :seealso add_lines:
1403
        """
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1404
        adapters = {}
0.17.5 by Robert Collins
nograph tests completely passing.
1405
        def get_adapter(adapter_key):
1406
            try:
1407
                return adapters[adapter_key]
1408
            except KeyError:
1409
                adapter_factory = adapter_registry.get(adapter_key)
1410
                adapter = adapter_factory(self)
1411
                adapters[adapter_key] = adapter
1412
                return adapter
0.17.2 by Robert Collins
Core proof of concept working.
1413
        # This will go up to fulltexts for gc to gc fetching, which isn't
1414
        # ideal.
3735.32.19 by John Arbash Meinel
Get rid of the 'delta' flag to GroupCompressor. It didn't do anything anyway.
1415
        self._compressor = GroupCompressor()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
1416
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
1417
        keys_to_add = []
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
1418
        def flush():
3735.32.23 by John Arbash Meinel
Add a _LazyGroupContentManager._check_rebuild_block
1419
            bytes = self._compressor.flush().to_bytes()
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
1420
            index, start, length = self._access.add_raw_records(
0.25.7 by John Arbash Meinel
Have the GroupCompressBlock decide how to compress the header and content.
1421
                [(None, len(bytes))], bytes)[0]
0.17.6 by Robert Collins
Cap group size at 20MB internal buffer. (Probably way too big).
1422
            nodes = []
1423
            for key, reads, refs in keys_to_add:
1424
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
1425
            self._index.add_records(nodes, random_id=random_id)
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1426
            self._unadded_refs = {}
1427
            del keys_to_add[:]
3735.32.19 by John Arbash Meinel
Get rid of the 'delta' flag to GroupCompressor. It didn't do anything anyway.
1428
            self._compressor = GroupCompressor()
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1429
0.20.15 by John Arbash Meinel
Change so that regions that have lots of copies get converted back
1430
        last_prefix = None
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1431
        last_fulltext_len = None
1432
        max_fulltext_len = 0
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1433
        max_fulltext_prefix = None
3735.32.20 by John Arbash Meinel
groupcompress now copies the blocks exactly as they were given.
1434
        insert_manager = None
1435
        block_start = None
1436
        block_length = None
0.17.2 by Robert Collins
Core proof of concept working.
1437
        for record in stream:
0.17.5 by Robert Collins
nograph tests completely passing.
1438
            # Raise an error when a record is missing.
1439
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1440
                raise errors.RevisionNotPresent(record.key, self)
3735.32.21 by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al.
1441
            if reuse_blocks:
1442
                # If the reuse_blocks flag is set, check to see if we can just
1443
                # copy a groupcompress block as-is.
1444
                if record.storage_kind == 'groupcompress-block':
1445
                    # Insert the raw block into the target repo
1446
                    insert_manager = record._manager
3735.2.163 by John Arbash Meinel
Merge bzr.dev 4187, and revert the change to fix refcycle issues.
1447
                    insert_manager._check_rebuild_block()
3735.32.21 by John Arbash Meinel
We now have a 'reuse_blocks=False' flag for autopack et al.
1448
                    bytes = record._manager._block.to_bytes()
1449
                    _, start, length = self._access.add_raw_records(
1450
                        [(None, len(bytes))], bytes)[0]
1451
                    del bytes
1452
                    block_start = start
1453
                    block_length = length
1454
                if record.storage_kind in ('groupcompress-block',
1455
                                           'groupcompress-block-ref'):
1456
                    assert insert_manager is not None
1457
                    assert record._manager is insert_manager
1458
                    value = "%d %d %d %d" % (block_start, block_length,
1459
                                             record._start, record._end)
1460
                    nodes = [(record.key, value, (record.parents,))]
1461
                    self._index.add_records(nodes, random_id=random_id)
1462
                    continue
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
1463
            try:
0.23.52 by John Arbash Meinel
Use the max_delta flag.
1464
                bytes = record.get_bytes_as('fulltext')
0.20.18 by John Arbash Meinel
Implement new handling of get_bytes_as(), and get_missing_compression_parent_keys()
1465
            except errors.UnavailableRepresentation:
0.17.5 by Robert Collins
nograph tests completely passing.
1466
                adapter_key = record.storage_kind, 'fulltext'
1467
                adapter = get_adapter(adapter_key)
0.20.21 by John Arbash Meinel
Merge the chk sorting code.
1468
                bytes = adapter.get_bytes(record)
0.20.13 by John Arbash Meinel
Play around a bit.
1469
            if len(record.key) > 1:
1470
                prefix = record.key[0]
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1471
                soft = (prefix == last_prefix)
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1472
            else:
1473
                prefix = None
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1474
                soft = False
1475
            if max_fulltext_len < len(bytes):
1476
                max_fulltext_len = len(bytes)
1477
                max_fulltext_prefix = prefix
3735.2.162 by John Arbash Meinel
Change GroupCompressor.compress() to return the start_point.
1478
            (found_sha1, start_point, end_point, type,
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1479
             length) = self._compressor.compress(record.key,
3735.31.12 by John Arbash Meinel
Push nostore_sha down through the stack.
1480
                bytes, record.sha1, soft=soft,
1481
                nostore_sha=nostore_sha)
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1482
            # delta_ratio = float(len(bytes)) / length
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1483
            # Check if we want to continue to include that text
0.25.11 by John Arbash Meinel
Slightly different handling of large texts.
1484
            if (prefix == max_fulltext_prefix
1485
                and end_point < 2 * max_fulltext_len):
1486
                # As long as we are on the same file_id, we will fill at least
1487
                # 2 * max_fulltext_len
1488
                start_new_block = False
1489
            elif end_point > 4*1024*1024:
1490
                start_new_block = True
1491
            elif (prefix is not None and prefix != last_prefix
1492
                  and end_point > 2*1024*1024):
1493
                start_new_block = True
1494
            else:
1495
                start_new_block = False
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1496
            # if type == 'fulltext':
1497
            #     # If this is the first text, we don't do anything
1498
            #     if self._compressor.num_keys > 1:
1499
            #         if prefix is not None and prefix != last_prefix:
1500
            #             # We just inserted a fulltext for a different prefix
1501
            #             # (aka file-id).
1502
            #             if end_point > 512 * 1024:
1503
            #                 start_new_block = True
1504
            #             # TODO: Consider packing several small texts together
1505
            #             #       maybe only flush if end_point > some threshold
1506
            #             # if end_point > 512 * 1024 or len(bytes) <
1507
            #             #     start_new_block = true
1508
            #         else:
1509
            #             # We just added a fulltext, part of the same file-id
1510
            #             if (end_point > 2*1024*1024
1511
            #                 and end_point > 5*max_fulltext_len):
1512
            #                 start_new_block = True
1513
            #     last_fulltext_len = len(bytes)
1514
            # else:
1515
            #     delta_ratio = float(len(bytes)) / length
1516
            #     if delta_ratio < 3: # Not much compression
1517
            #         if end_point > 1*1024*1024:
1518
            #             start_new_block = True
1519
            #     elif delta_ratio < 10: # 10:1 compression
1520
            #         if end_point > 4*1024*1024:
1521
            #             start_new_block = True
1522
            last_prefix = prefix
1523
            if start_new_block:
1524
                self._compressor.pop_last()
1525
                flush()
1526
                max_fulltext_len = len(bytes)
3735.2.162 by John Arbash Meinel
Change GroupCompressor.compress() to return the start_point.
1527
                (found_sha1, start_point, end_point, type,
0.25.10 by John Arbash Meinel
Play around with detecting compression breaks.
1528
                 length) = self._compressor.compress(record.key,
1529
                    bytes, record.sha1)
1530
                last_fulltext_len = length
0.17.26 by Robert Collins
Working better --gc-plain-chk.
1531
            if record.key[-1] is None:
1532
                key = record.key[:-1] + ('sha1:' + found_sha1,)
1533
            else:
1534
                key = record.key
1535
            self._unadded_refs[key] = record.parents
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
1536
            yield found_sha1
3735.2.164 by John Arbash Meinel
Fix a critical bug that caused problems with the index entries.
1537
            keys_to_add.append((key, '%d %d' % (start_point, end_point),
0.17.5 by Robert Collins
nograph tests completely passing.
1538
                (record.parents,)))
0.17.8 by Robert Collins
Flush pending updates at the end of _insert_record_stream
1539
        if len(keys_to_add):
1540
            flush()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
1541
        self._compressor = None
0.17.5 by Robert Collins
nograph tests completely passing.
1542
1543
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
1544
        """Iterate over the lines in the versioned files from keys.
1545
1546
        This may return lines from other keys. Each item the returned
1547
        iterator yields is a tuple of a line and a text version that that line
1548
        is present in (not introduced in).
1549
1550
        Ordering of results is in whatever order is most suitable for the
1551
        underlying storage format.
1552
1553
        If a progress bar is supplied, it may be used to indicate progress.
1554
        The caller is responsible for cleaning up progress bars (because this
1555
        is an iterator).
1556
1557
        NOTES:
1558
         * Lines are normalised by the underlying store: they will all have \n
1559
           terminators.
1560
         * Lines are returned in arbitrary order.
1561
1562
        :return: An iterator over (line, key).
1563
        """
1564
        if pb is None:
1565
            pb = progress.DummyProgress()
1566
        keys = set(keys)
1567
        total = len(keys)
1568
        # we don't care about inclusions, the caller cares.
1569
        # but we need to setup a list of records to visit.
1570
        # we need key, position, length
1571
        for key_idx, record in enumerate(self.get_record_stream(keys,
1572
            'unordered', True)):
1573
            # XXX: todo - optimise to use less than full texts.
1574
            key = record.key
3735.32.1 by John Arbash Meinel
Fix the VF WalkingContent checks.
1575
            pb.update('Walking content', key_idx, total)
0.17.5 by Robert Collins
nograph tests completely passing.
1576
            if record.storage_kind == 'absent':
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1577
                raise errors.RevisionNotPresent(key, self)
0.17.5 by Robert Collins
nograph tests completely passing.
1578
            lines = split_lines(record.get_bytes_as('fulltext'))
1579
            for line in lines:
1580
                yield line, key
3735.32.1 by John Arbash Meinel
Fix the VF WalkingContent checks.
1581
        pb.update('Walking content', total, total)
0.17.5 by Robert Collins
nograph tests completely passing.
1582
1583
    def keys(self):
1584
        """See VersionedFiles.keys."""
1585
        if 'evil' in debug.debug_flags:
1586
            trace.mutter_callsite(2, "keys scales with size of history")
3735.31.7 by John Arbash Meinel
Start bringing in stacking support for Groupcompress repos.
1587
        sources = [self._index] + self._fallback_vfs
0.17.5 by Robert Collins
nograph tests completely passing.
1588
        result = set()
1589
        for source in sources:
1590
            result.update(source.keys())
1591
        return result
1592
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1593
1594
class _GCGraphIndex(object):
1595
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
1596
0.17.9 by Robert Collins
Initial stab at repository format support.
1597
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1598
        add_callback=None):
1599
        """Construct a _GCGraphIndex on a graph_index.
1600
1601
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1602
        :param is_locked: A callback, returns True if the index is locked and
1603
            thus usable.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1604
        :param parents: If True, record knits parents, if not do not record
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1605
            parents.
1606
        :param add_callback: If not None, allow additions to the index and call
1607
            this callback with a list of added GraphIndex nodes:
1608
            [(node, value, node_refs), ...]
1609
        """
1610
        self._add_callback = add_callback
1611
        self._graph_index = graph_index
1612
        self._parents = parents
1613
        self.has_graph = parents
1614
        self._is_locked = is_locked
1615
0.17.5 by Robert Collins
nograph tests completely passing.
1616
    def add_records(self, records, random_id=False):
1617
        """Add multiple records to the index.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1618
0.17.5 by Robert Collins
nograph tests completely passing.
1619
        This function does not insert data into the Immutable GraphIndex
1620
        backing the KnitGraphIndex, instead it prepares data for insertion by
1621
        the caller and checks that it is safe to insert then calls
1622
        self._add_callback with the prepared GraphIndex nodes.
1623
1624
        :param records: a list of tuples:
1625
                         (key, options, access_memo, parents).
1626
        :param random_id: If True the ids being added were randomly generated
1627
            and no check for existence will be performed.
1628
        """
1629
        if not self._add_callback:
1630
            raise errors.ReadOnlyError(self)
1631
        # we hope there are no repositories with inconsistent parentage
1632
        # anymore.
1633
1634
        changed = False
1635
        keys = {}
1636
        for (key, value, refs) in records:
1637
            if not self._parents:
1638
                if refs:
1639
                    for ref in refs:
1640
                        if ref:
1641
                            raise KnitCorrupt(self,
1642
                                "attempt to add node with parents "
1643
                                "in parentless index.")
1644
                    refs = ()
1645
                    changed = True
1646
            keys[key] = (value, refs)
1647
        # check for dups
1648
        if not random_id:
1649
            present_nodes = self._get_entries(keys)
1650
            for (index, key, value, node_refs) in present_nodes:
1651
                if node_refs != keys[key][1]:
1652
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
1653
                        ": %s %s" % ((value, node_refs), keys[key]))
1654
                del keys[key]
1655
                changed = True
1656
        if changed:
1657
            result = []
1658
            if self._parents:
1659
                for key, (value, node_refs) in keys.iteritems():
1660
                    result.append((key, value, node_refs))
1661
            else:
1662
                for key, (value, node_refs) in keys.iteritems():
1663
                    result.append((key, value))
1664
            records = result
1665
        self._add_callback(records)
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1666
0.17.5 by Robert Collins
nograph tests completely passing.
1667
    def _check_read(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1668
        """Raise an exception if reads are not permitted."""
0.17.5 by Robert Collins
nograph tests completely passing.
1669
        if not self._is_locked():
1670
            raise errors.ObjectNotLocked(self)
1671
0.17.2 by Robert Collins
Core proof of concept working.
1672
    def _check_write_ok(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1673
        """Raise an exception if writes are not permitted."""
0.17.2 by Robert Collins
Core proof of concept working.
1674
        if not self._is_locked():
1675
            raise errors.ObjectNotLocked(self)
1676
0.17.5 by Robert Collins
nograph tests completely passing.
1677
    def _get_entries(self, keys, check_present=False):
1678
        """Get the entries for keys.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1679
1680
        Note: Callers are responsible for checking that the index is locked
1681
        before calling this method.
1682
0.17.5 by Robert Collins
nograph tests completely passing.
1683
        :param keys: An iterable of index key tuples.
1684
        """
1685
        keys = set(keys)
1686
        found_keys = set()
1687
        if self._parents:
1688
            for node in self._graph_index.iter_entries(keys):
1689
                yield node
1690
                found_keys.add(node[1])
1691
        else:
1692
            # adapt parentless index to the rest of the code.
1693
            for node in self._graph_index.iter_entries(keys):
1694
                yield node[0], node[1], node[2], ()
1695
                found_keys.add(node[1])
1696
        if check_present:
1697
            missing_keys = keys.difference(found_keys)
1698
            if missing_keys:
1699
                raise RevisionNotPresent(missing_keys.pop(), self)
1700
1701
    def get_parent_map(self, keys):
1702
        """Get a map of the parents of keys.
1703
1704
        :param keys: The keys to look up parents for.
1705
        :return: A mapping from keys to parents. Absent keys are absent from
1706
            the mapping.
1707
        """
1708
        self._check_read()
1709
        nodes = self._get_entries(keys)
1710
        result = {}
1711
        if self._parents:
1712
            for node in nodes:
1713
                result[node[1]] = node[3][0]
1714
        else:
1715
            for node in nodes:
1716
                result[node[1]] = None
1717
        return result
1718
1719
    def get_build_details(self, keys):
1720
        """Get the various build details for keys.
1721
1722
        Ghosts are omitted from the result.
1723
1724
        :param keys: An iterable of keys.
1725
        :return: A dict of key:
1726
            (index_memo, compression_parent, parents, record_details).
1727
            index_memo
1728
                opaque structure to pass to read_records to extract the raw
1729
                data
1730
            compression_parent
1731
                Content that this record is built upon, may be None
1732
            parents
1733
                Logical parents of this node
1734
            record_details
1735
                extra information about the content which needs to be passed to
1736
                Factory.parse_record
1737
        """
1738
        self._check_read()
1739
        result = {}
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
1740
        entries = self._get_entries(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
1741
        for entry in entries:
1742
            key = entry[1]
1743
            if not self._parents:
1744
                parents = None
1745
            else:
1746
                parents = entry[3][0]
1747
            method = 'group'
1748
            result[key] = (self._node_to_position(entry),
1749
                                  None, parents, (method, None))
1750
        return result
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1751
0.17.5 by Robert Collins
nograph tests completely passing.
1752
    def keys(self):
1753
        """Get all the keys in the collection.
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1754
0.17.5 by Robert Collins
nograph tests completely passing.
1755
        The keys are not ordered.
1756
        """
1757
        self._check_read()
1758
        return [node[1] for node in self._graph_index.iter_all_entries()]
3735.31.2 by John Arbash Meinel
Cleanup trailing whitespace, get test_source to pass by removing asserts.
1759
0.17.5 by Robert Collins
nograph tests completely passing.
1760
    def _node_to_position(self, node):
1761
        """Convert an index value to position details."""
1762
        bits = node[2].split(' ')
1763
        # It would be nice not to read the entire gzip.
1764
        start = int(bits[0])
1765
        stop = int(bits[1])
1766
        basis_end = int(bits[2])
1767
        delta_end = int(bits[3])
1768
        return node[0], start, stop, basis_end, delta_end
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1769
1770
1771
try:
3735.31.1 by John Arbash Meinel
Bring the groupcompress plugin into the brisbane-core branch.
1772
    from bzrlib import _groupcompress_pyx
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1773
except ImportError:
1774
    pass