/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    osutils,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.btree_index import BTreeBuilder
from bzrlib.lru_cache import LRUSizeCache
from bzrlib.plugins.groupcompress import equivalence_table
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    ChunkedContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )


def parse(line_list):
    """Parse the lines of a text record into its label, sha1 and delta ops."""
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % lines)
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result
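
# Illustrative sketch (values invented, not part of the original module): the
# text record format that parse() consumes and GroupCompressor.compress()
# emits is a label line, a sha1 line, and then copy ('c') and insert ('i')
# instructions:
#
#   label: file-id\x00rev-id\n
#   sha1: <40 hex digits>\n
#   c,0,125\n              copy 125 bytes starting at byte 0 of the group
#   i,2\n                  insert the next 2 lines literally
#   first inserted line\n
#   second inserted line\n
#
# For that input parse() would return the label tuple, the sha1 string, and
# [('c', 0, 125, None),
#  ('i', None, 2, ['first inserted line\n', 'second inserted line\n'])].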


def apply_delta(basis, delta):
    """Apply delta to basis to reconstruct the lines of the target text."""
    lines = []
    last_offset = 0
    # eq ranges occur where gaps occur
    # start, end refer to offsets in basis
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines


def trim_encoding_newline(lines):
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]


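# Illustrative sketch (offsets chosen for this example only): copy
# instructions address byte ranges of the accumulated group text, while
# insert instructions carry their own lines.
#
#   basis = 'label: k\nsha1: s\nhello\nworld\n'
#   apply_delta(basis, [('c', 17, 12, None), ('i', None, 1, ['tail\n'])])
#   # -> ['hello\nworld\n', 'tail']   (the trailing newline is trimmed)

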
class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a similar
    task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting
       minimal edits only).
     - we chain the left side, not the right side
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    _equivalence_table_class = equivalence_table.EquivalenceTable

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        self.line_locations = self._equivalence_table_class([])
        self.lines = self.line_locations.lines
        self.labels_deltas = {}

    def get_matching_blocks(self, lines):
        """Return the ranges in lines which match self.lines.

        :param lines: lines to compress
        :return: A list of (old_start, new_start, length) tuples which reflect
            a region in self.lines that is present in lines.  The last element
            of the list is always (old_len, new_len, 0) to provide an end point
            for generating instructions from the matching blocks list.
        """
        result = []
        pos = 0
        line_locations = self.line_locations
        line_locations.set_right_lines(lines)
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse
        locations = None
        max_pos = len(lines)
        max_time = 0.0
        max_info = None
        result_append = result.append
        while pos < max_pos:
            block, pos, locations = _get_longest_match(line_locations, pos,
                                                       max_pos, locations)
            if block is not None:
                result_append(block)
        result_append((len(self.lines), len(lines), 0))
        return result
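
    # Illustrative sketch (invented values): with self.lines holding 10 lines
    # and a new 12-line text whose first 4 lines already appear at offset 6 of
    # self.lines, get_matching_blocks() would return something like
    #   [(6, 0, 4), (10, 12, 0)]
    # where the final (old_len, new_len, 0) entry is the sentinel end point.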

    def compress(self, key, lines, expected_sha):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        label = '\x00'.join(key)
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        new_lines = []
        new_lines.append('label: %s\n' % label)
        new_lines.append('sha1: %s\n' % sha1)
        index_lines = [False, False]
        pos = 0
        range_len = 0
        range_start = 0
        flush_range = self.flush_range
        copy_ends = None
        blocks = self.get_matching_blocks(lines)
        current_pos = 0
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse
        for old_start, new_start, range_len in blocks:
            if new_start != current_pos:
                # non-matching region
                flush_range(current_pos, None, new_start - current_pos,
                    lines, new_lines, index_lines)
            current_pos = new_start + range_len
            if not range_len:
                continue
            flush_range(new_start, old_start, range_len, lines,
                new_lines, index_lines)
        delta_start = (self.endpoint, len(self.lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: A tuple of the extracted lines and their sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        lines = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(lines)
        return lines, sha1
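
    # Usage sketch (illustrative, names invented): round-tripping a text
    # through the compressor.
    #
    #   compressor = GroupCompressor()
    #   sha1, end_point = compressor.compress(('file-id', 'rev-1'),
    #       ['hello\n', 'world\n'], None)
    #   lines, extracted_sha1 = compressor.extract(('file-id', 'rev-1'))
    #   group_bytes = ''.join(compressor.lines)   # what gets zlib-compressed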

    def flush_range(self, range_start, copy_start, range_len, lines,
                    new_lines, index_lines):
        """Emit a copy instruction or a literal insert for a range of lines.

        A copy is only emitted when copy_start is not None and the copy
        instruction encodes in fewer bytes than inserting the lines afresh.
        """
        insert_instruction = "i,%d\n" % range_len
        if copy_start is not None:
            # range stops, flush and start a new copy range
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        index_lines.extend([copy_start is None]*range_len)
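
    # Illustrative sketch (invented numbers): for a 3-line range already
    # present in the group at byte offsets 100-160, flush_range() emits
    # 'c,100,60\n' because re-inserting 60 bytes plus the 'i,3\n' header would
    # cost more than the copy instruction; a single 2-byte line, by contrast,
    # is cheaper to re-insert literally than to reference.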

    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
        #                          if val and new_lines[idx] == '\n']
        # if indexed_newlines:
        #     import pdb; pdb.set_trace()
        endpoint = self.endpoint
        self.line_locations.extend_lines(new_lines, index_lines)
        for line in new_lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        self.endpoint = endpoint

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)


def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests; it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda: True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory


def cleanup_pack_group(versioned_files):
    versioned_files.writer.end()
    versioned_files.stream.close()


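# Usage sketch (illustrative; assumes a bzrlib transport object is available):
# build a pack-backed GroupCompressVersionedFiles, add a text, then finish
# the pack.
#
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(transport)
#   vf.add_lines(('key-1',), (), ['some text\n'])
#   cleanup_pack_group(vf)

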
class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint
            the add routine may error or may succeed - but you will be unable
            to read the data back accurately. (Checking the lines have been
            split correctly is expensive and extremely unlikely to catch bugs
            so it is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        length = sum(map(len, lines))
        record = ChunkedContentFactory(key, parents, None, lines)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, length, None

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, chunks, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """Check that version_id and lines are safe to add."""
        version_id = key[-1]
        if version_id is not None:
            if contains_whitespace(version_id):
                raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        keys = set(keys)
        if not keys:
            return
        if not self._index.has_graph:
            # Cannot topologically order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                parent_map[key] = self._unadded_refs[key]
                locations[key] = None
            present_keys = topo_sort(parent_map)
            # Now group by source:
        else:
            present_keys = locations.keys()
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                present_keys.append(key)
                locations[key] = None
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                lines, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                read_memo = index_memo[0:3]
                # get the group:
                try:
                    plain = self._group_cache[read_memo]
                except KeyError:
                    # read the group
                    zdata = self._access.get_raw_records([read_memo]).next()
                    # decompress - whole thing - this is not a bug, as it
                    # permits caching. We might want to store the partially
                    # decompressed group and decompress object, so that recent
                    # texts are not penalised by big groups.
                    decomp = zlib.decompressobj()
                    plain = decomp.decompress(zdata) #, index_memo[4])
                    self._group_cache[read_memo] = plain
                # cheapo debugging:
                # print len(zdata), len(plain)
                # parse - requires split_lines, better to have byte offsets
                # here (but not by much - we only split the region for the
                # recipe, and we often want to end up with lines anyway).
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                lines = apply_delta(plain, delta)
            yield ChunkedContentFactory(key, parents, sha1, lines)
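
    # Note (derived from _GCGraphIndex._node_to_position below): index_memo is
    # the 5-tuple (index, start, stop, basis_end, delta_end); its first three
    # elements identify the zlib-compressed group for
    # self._access.get_raw_records(), while plain[index_memo[3]:index_memo[4]]
    # is this text's recipe within the decompressed group.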

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass

    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind in ('chunked', 'fulltext'):
                lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            else:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
                lines = osutils.split_lines(bytes)
            found_sha1, end_point = self._compressor.compress(record.key,
                lines, record.sha1)
            if record.key[-1] is None:
                key = record.key[:-1] + ('sha1:' + found_sha1,)
            else:
                key = record.key
            self._unadded_refs[key] = record.parents
            yield found_sha1
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}

    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback to check whether the object should answer
            queries; returns True if the index is locked and thus usable.
        :param parents: If True, record knit parents; if not, do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked

    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing this index, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of (key, value, node_refs) tuples.
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)

    def _check_read(self):
        """Raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result

    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            value = entry[2]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                                  None, parents, (method, None))
        return result

    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]

    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
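
    # Note (sketch; the numbers below are invented example values): the index
    # value written for each key by _insert_record_stream's flush() above is
    # the string "start length basis_end delta_end", e.g. "0 4096 312 417",
    # where start/length locate the zlib-compressed group in the pack and
    # basis_end/delta_end delimit this text's recipe within the decompressed
    # group.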


def _get_longest_match(equivalence_table, pos, max_pos, locations):
    """Get the longest possible match for the current position.

    :return: A tuple (block, pos, locations) where block is an
        (old_start, new_start, length) tuple or None if no match was found,
        pos is the position to resume searching from, and locations is any
        unconsumed match lookup to pass back into the next call.
    """
    range_start = pos
    range_len = 0
    copy_ends = None
    while pos < max_pos:
        if locations is None:
            locations = equivalence_table.get_idx_matches(pos)
        if locations is None:
            # No more matches, just return whatever we have, but we know that
            # this last position is not going to match anything
            pos += 1
            break
        else:
            if copy_ends is None:
                # We are starting a new range
                copy_ends = [loc + 1 for loc in locations]
                range_len = 1
                locations = None # Consumed
            else:
                # We are currently in the middle of a match
                next_locations = set(copy_ends).intersection(locations)
                if next_locations:
                    # range continues
                    copy_ends = [loc + 1 for loc in next_locations]
                    range_len += 1
                    locations = None # Consumed
                else:
                    # We are done with this match; the caller will start a new
                    # one. We pass back 'locations' so that we don't have to
                    # do another lookup.
                    break
        pos += 1
    if copy_ends is None:
        return None, pos, locations
    return (min(copy_ends) - range_len, range_start, range_len), pos, locations
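
# Illustrative sketch (invented offsets): for a two-line right-hand text whose
# first line matches the left-hand lines at offsets 6 and 20, and whose second
# line matches at offsets 7 and 30, the only surviving chain is 6 -> 7, so
# _get_longest_match(table, 0, 2, None) would return ((6, 0, 2), 2, None):
# a two-line match copied from offset 6.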


try:
    from bzrlib.plugins.groupcompress import _groupcompress_c
except ImportError:
    pass
else:
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
    _get_longest_match = _groupcompress_c._get_longest_match