# To get this branch, use:
#   bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.plugins.index2.btree_index import BTreeBuilder
from bzrlib.plugins.groupcompress import equivalence_table
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )


def parse(line_list):
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r, %r"
            % (label_line, sha1_line))
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result
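
# The record format parse() consumes (as produced by
# GroupCompressor.compress) looks like this; the label and sha1 shown are
# hypothetical:
#
#   label: file-id\x00revision-id\n
#   sha1: <40 hex digits>\n
#   c,0,11\n
#   i,1\n
#   a new line\n
#
# 'c,start,bytes' copies a byte range from the group output so far;
# 'i,count' inserts the next count lines verbatim.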


def apply_delta(basis, delta):
    """Apply delta to this object to become new_version_id."""
    lines = []
    last_offset = 0
    # eq ranges occur where gaps occur
    # start, end refer to offsets in basis
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines


def trim_encoding_newline(lines):
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]
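
# A minimal worked example of apply_delta (hypothetical data, using the
# 'c'/'i' opcodes described above):
#
#   >>> basis = 'hello\nworld\n'
#   >>> delta = [('c', 0, 6, None), ('i', None, 1, ['there\n'])]
#   >>> apply_delta(basis, delta)
#   ['hello\n', 'there']
#
# The trailing-newline encoding added by compress() is undone by
# trim_encoding_newline, which is why the last line comes back without '\n'.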


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a similar
    task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting
       minimal edits only).
     - we chain the left side, not the right side
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    _equivalence_table_class = equivalence_table.EquivalenceTable

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        self.line_locations = self._equivalence_table_class([])
        self.lines = self.line_locations.lines
        self.labels_deltas = {}

    def get_matching_blocks(self, lines):
        """Return the ranges in lines which match self.lines.

        :param lines: lines to compress
        :return: A list of (old_start, new_start, length) tuples which reflect
            a region in self.lines that is present in lines.  The last element
            of the list is always (old_len, new_len, 0) to provide an end point
            for generating instructions from the matching blocks list.
        """
        result = []
        pos = 0
        line_locations = self.line_locations
        line_locations.set_right_lines(lines)
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse the lines
        # already output, extending each match as far as possible.
        locations = None
        max_pos = len(lines)
        while pos < max_pos:
            block, pos, locations = _get_longest_match(line_locations, pos,
                                                       max_pos, locations)
            if block is not None:
                result.append(block)
        result.append((len(self.lines), len(lines), 0))
        return result
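
    # For example (hypothetical data): if self.lines holds 10 lines and
    # 'lines' is 5 lines whose lines 0-2 match self.lines 4-6, the result
    # would be [(4, 0, 3), (10, 5, 0)] - one real block plus the
    # (old_len, new_len, 0) sentinel.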
    def compress(self, key, lines, expected_sha):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        label = '\x00'.join(key)
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        new_lines = []
        new_lines.append('label: %s\n' % label)
        new_lines.append('sha1: %s\n' % sha1)
        index_lines = [False, False]
        flush_range = self.flush_range
        blocks = self.get_matching_blocks(lines)
        current_pos = 0
        # We either copy a range (while there are reusable lines) or we
        # insert new lines; get_matching_blocks has already found the
        # reusable ranges.
        for old_start, new_start, range_len in blocks:
            if new_start != current_pos:
                # non-matching region
                flush_range(current_pos, None, new_start - current_pos,
                    lines, new_lines, index_lines)
            current_pos = new_start + range_len
            if not range_len:
                continue
            flush_range(new_start, old_start, range_len, lines,
                new_lines, index_lines)
        delta_start = (self.endpoint, len(self.lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint
    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        lines = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(lines)
        return lines, sha1
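
    # Illustrative usage (a sketch, not part of the original test suite;
    # exact instruction choices depend on the EquivalenceTable in use):
    #
    #   >>> gc = GroupCompressor(True)
    #   >>> _, end1 = gc.compress(('key1',), ['common\n', 'text\n'], None)
    #   >>> _, end2 = gc.compress(('key2',), ['common\n', 'text\n'], None)
    #   >>> ''.join(gc.extract(('key2',))[0])
    #   'common\ntext\n'
    #
    # The second, identical text should encode almost entirely as a single
    # 'c' instruction, so the group grows far less than the raw text size.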
    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
        if not range_len:
            return
        insert_instruction = "i,%d\n" % range_len
        if copy_start is not None:
            # range stops, flush and start a new copy range
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        index_lines.extend([copy_start is None]*range_len)
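
    # Cost check example (hypothetical numbers): copying 4 bytes at offset
    # 1000 costs len("c,1000,4\n") == 9 bytes, while inserting them costs
    # len("i,1\n") + 4 == 8 bytes, so flush_range falls through to the
    # insert branch; a 100-byte range at the same offset would be copied.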
    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        endpoint = self.endpoint
        self.line_locations.extend_lines(new_lines, index_lines)
        for line in new_lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        self.endpoint = endpoint
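
    # After output_lines, self.line_offsets[i] is the byte offset one past
    # output line i; flush_range relies on this invariant to convert line
    # ranges into the byte ranges used by 'c' instructions.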
    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)


def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack-based groupcompress.

    This is only functional enough to run interface tests; it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long keys should be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory
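
# Illustrative use (a sketch; 'transport' stands in for any writable bzrlib
# transport, e.g. one obtained from bzrlib.transport.get_transport()):
#
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(transport)
#   vf.add_lines(('rev-1',), (), ['a line\n'])
#   cleanup_pack_group(vf)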


def cleanup_pack_group(versioned_files):
    versioned_files.stream.close()
    versioned_files.writer.end()


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint
            the add routine may error or may succeed - but you will be unable
            to read the data back accurately. (Checking the lines have been
            split correctly is expensive and extremely unlikely to catch bugs
            so it is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        bytes = ''.join(lines)
        record = FulltextContentFactory(key, parents, None, bytes)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, len(bytes), None
    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            fulltext = split_lines(record.get_bytes_as('fulltext'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, fulltext, key, None, head_cache))
        return parent_cache[key]
    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')
    def _check_add(self, key, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        version_id = key[-1]
        if contains_whitespace(version_id):
            raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)
    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result
    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        keys = set(keys)
        if not keys:
            return
        if not self._index.has_graph:
            # Cannot topological order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                parent_map[key] = self._unadded_refs[key]
                locations[key] = None
            present_keys = topo_sort(parent_map)
            # Now group by source:
        else:
            present_keys = locations.keys()
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                present_keys.append(key)
                locations[key] = None
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                lines, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                # read
                read_memo = index_memo[0:3]
                zdata = self._access.get_raw_records([read_memo]).next()
                # decompress
                plain = zlib.decompress(zdata)
                # parse
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                basis = plain[:index_memo[3]]
                # basis = StringIO(basis).readlines()
                #basis = split_lines(plain[:last_end])
                lines = apply_delta(basis, delta)
            bytes = ''.join(lines)
            yield FulltextContentFactory(key, parents, sha1, bytes)
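
    # Read path summary: index_memo is (index, start, stop, basis_end,
    # delta_end).  start:stop locates the zlib-compressed group in the pack;
    # once decompressed, plain[basis_end:delta_end] is this record's own
    # serialised delta and plain[:basis_end] is the basis it may copy from.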
    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result
    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass
    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind == 'fulltext':
                bytes = record.get_bytes_as('fulltext')
            else:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
            found_sha1, end_point = self._compressor.compress(record.key,
                split_lines(bytes), record.sha1)
            self._unadded_refs[record.key] = record.parents
            yield found_sha1
            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}
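
    # Each flushed group is stored as one raw zlib-compressed record; every
    # key in the group gets an index value "start length basis_end delta_end"
    # which _GCGraphIndex._node_to_position later parses back into the
    # index_memo used by get_record_stream.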
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)
    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback, returns True if the index is locked and
            thus usable.
        :param parents: If True, record knits parents, if not do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked
    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing the KnitGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, options, access_memo, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)
    def _check_read(self):
        """Raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)
    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)
    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)
    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result
    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            value = entry[2]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                                  None, parents, (method, None))
        return result
    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]
    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
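
    # For example (hypothetical value): a node carrying "0 4096 0 512" yields
    # (index, 0, 4096, 0, 512): read bytes 0:4096 of the pack, decompress,
    # and find this record's serialised delta at plain[0:512].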


def _get_longest_match(equivalence_table, pos, max_pos, locations):
    """Get the longest possible match for the current position."""
    range_start = pos
    range_len = 0
    copy_ends = None
    while pos < max_pos:
        if locations is None:
            locations = equivalence_table.get_idx_matches(pos)
        if locations is None:
            # No more matches, just return whatever we have, but we know that
            # this last position is not going to match anything
            pos += 1
            break
        else:
            if copy_ends is None:
                # We are starting a new range
                copy_ends = [loc + 1 for loc in locations]
                range_len = 1
                locations = None # Consumed
            else:
                # We are currently in the middle of a match
                next_locations = set(copy_ends).intersection(locations)
                if len(next_locations):
                    # range continues
                    copy_ends = [loc + 1 for loc in next_locations]
                    range_len += 1
                    locations = None # Consumed
                else:
                    # But we are done with this match, we should be
                    # starting a new one, though. We will pass back 'locations'
                    # so that we don't have to do another lookup.
                    break
        pos += 1
    if copy_ends is None:
        return None, pos, locations
    return (min(copy_ends) - range_len, range_start, range_len), pos, locations
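
# A small worked example (hypothetical): with left lines ['a\n', 'b\n', 'c\n']
# indexed at positions 0-2 and right lines ['b\n', 'c\n', 'x\n'], the walk at
# pos=0 matches 'b\n' at location 1 (copy_ends=[2]), extends through 'c\n'
# (copy_ends=[3], range_len=2), fails on 'x\n', and returns the block
# (1, 0, 2): copy 2 left lines starting at 1 to cover right lines 0-1.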


try:
    from bzrlib.plugins.groupcompress import _groupcompress_c
except ImportError:
    pass
else:
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable