/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.plugins.index2.btree_index import BTreeBuilder
from bzrlib.plugins.groupcompress import equivalence_table
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )


def parse(line_list):
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % lines)
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result
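

# Illustrative sketch, not part of the original file: a serialised record as
# consumed by parse() looks roughly like the following, assuming a one-element
# key ('rev-1',) (multi-element keys are joined with NUL bytes in the label)
# and one copy plus one insert instruction:
#
#   label: rev-1
#   sha1: <40 hex characters>
#   c,0,11        <- copy 11 bytes starting at byte offset 0 of the group
#   i,1           <- insert the next 1 line verbatim
#   a new line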


def apply_delta(basis, delta):
    """Apply delta to this object to become new_version_id."""
    lines = []
    last_offset = 0
    # eq ranges occur where gaps occur
    # start, end refer to offsets in basis
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines


def trim_encoding_newline(lines):
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]
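

# Illustrative sketch, not part of the original file: a 'c' op appends a byte
# slice of the basis, an 'i' op supplies its lines directly, and the trailing
# newline added by compress() is trimmed off again:
#
#   apply_delta("0123456789", [('c', 0, 4, None), ('i', None, 1, ['tail\n'])])
#   # -> ['0123', 'tail']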


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because it has a similar
    task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting
       minimal edits only).
     - we chain the left side, not the right side.
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        self.line_locations = equivalence_table.EquivalenceTable([])
        self.lines = self.line_locations._left_lines
        self.labels_deltas = {}

    def compress(self, key, lines, expected_sha):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        label = '\x00'.join(key)
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        new_lines = []
        new_lines.append('label: %s\n' % label)
        new_lines.append('sha1: %s\n' % sha1)
        index_lines = [False, False]
        pos = 0
        line_locations = self.line_locations
        line_locations.set_right_lines(lines)
        accumulator = []
        copying = False
        range_len = 0
        range_start = 0
        flush_range = self.flush_range
        copy_ends = None
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse the new text,
        # looking each line up in the lines already output for the group.
        while pos < len(lines):
            # line = lines[pos]
            matching_locs = line_locations.get_left_matches(pos)
            if not matching_locs:
                if copying:
                    flush_range(copying, range_start, copy_ends, range_len,
                        lines, new_lines, index_lines)
                    copying = False
                    range_start = pos
                    range_len = 1
                else:
                    range_len += 1
            else:
                if copying:
                    # TODO: We could do a merge sort here if we keep copy_ends
                    #       and matching_locs always sorted.
                    next_locations = set(copy_ends).intersection(matching_locs)
                    if len(next_locations):
                        # range continues
                        range_len += 1
                        copy_ends = [loc + 1 for loc in next_locations]
                        pos += 1
                        continue
                # New copy range starts here:
                flush_range(copying, range_start, copy_ends, range_len, lines,
                    new_lines, index_lines)
                range_len = 1
                copy_ends = [loc + 1 for loc in matching_locs]
                range_start = pos
                copying = True
            pos += 1
        flush_range(copying, range_start, copy_ends, range_len, lines,
            new_lines, index_lines)
        delta_start = (self.endpoint, len(self.line_locations._left_lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(line_locations._left_lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        lines = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(lines)
        return lines, sha1

    def flush_range(self, copying, range_start, copy_ends, range_len, lines,
                    new_lines, index_lines):
        if not range_len:
            return
        insert_instruction = "i,%d\n" % range_len
        if copying:
            # range stops, flush and start a new copy range
            copy_start = min(copy_ends) - range_len
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        index_lines.extend([not copying]*range_len)
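
    # Illustrative sketch, not part of the original file: flush_range() only
    # emits a copy instruction when copying is actually cheaper than
    # re-inserting the text.  For example, with line_offsets = [10, 20, 30]
    # and a matched range ending at left line 3 with range_len = 2
    # (copy_ends = [3]), the copy control "c,10,20\n" is 8 bytes against
    # 20 bytes of text plus a 4-byte "i,2\n" header, so the copy wins; a match
    # covering only a couple of bytes is emitted as a fresh insert instead,
    # because the copy instruction itself would be at least as long.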

    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        endpoint = self.endpoint
        self.line_locations.extend_left_lines(new_lines, index_lines)
        for line in new_lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        self.endpoint = endpoint

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)
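

# Illustrative usage sketch, not part of the original file: compress two
# related texts into one group and read the second one back out again.  The
# keys and line contents here are made up.
#
#   compressor = GroupCompressor(delta=True)
#   compressor.compress(('rev-1',), ['common line\n'], None)
#   compressor.compress(('rev-2',), ['common line\n', 'added line\n'], None)
#   lines, sha1 = compressor.extract(('rev-2',))
#   print compressor.ratio()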


def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack-based groupcompress.

    This is only functional enough to run interface tests; it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long keys should be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda: True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory


def cleanup_pack_group(versioned_files):
    versioned_files.stream.close()
    versioned_files.writer.end()
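

# Illustrative sketch, not part of the original file: make_pack_factory() is
# how the interface tests build a VersionedFiles instance on a transport; the
# `transport` object below is assumed to come from bzrlib.transport.
#
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(transport)
#   vf.add_lines(('rev-1',), (), ['a line of text\n'])
#   cleanup_pack_group(vf)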


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint
            the add routine may error or may succeed - but you will be unable
            to read the data back accurately. (Checking the lines have been
            split correctly is expensive and extremely unlikely to catch bugs
            so it is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        bytes = ''.join(lines)
        record = FulltextContentFactory(key, parents, None, bytes)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, len(bytes), None

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            fulltext = split_lines(record.get_bytes_as('fulltext'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, fulltext, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """Check that version_id and lines are safe to add."""
        version_id = key[-1]
        if contains_whitespace(version_id):
            raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)
    def get_parent_map(self, keys):
413
        """Get a map of the parents of keys.
414
415
        :param keys: The keys to look up parents for.
416
        :return: A mapping from keys to parents. Absent keys are absent from
417
            the mapping.
418
        """
419
        result = {}
420
        sources = [self._index]
421
        source_results = []
422
        missing = set(keys)
423
        for source in sources:
424
            if not missing:
425
                break
426
            new_result = source.get_parent_map(missing)
427
            source_results.append(new_result)
428
            result.update(new_result)
429
            missing.difference_update(set(new_result))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
430
        if self._unadded_refs:
431
            for key in missing:
432
                if key in self._unadded_refs:
433
                    result[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
434
        return result
435
436
    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        keys = set(keys)
        if not keys:
            return
        if not self._index.has_graph:
            # Cannot topologically order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                parent_map[key] = self._unadded_refs[key]
                locations[key] = None
            present_keys = topo_sort(parent_map)
            # Now group by source:
        else:
            present_keys = locations.keys()
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                present_keys.append(key)
                locations[key] = None
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                lines, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                # read
                read_memo = index_memo[0:3]
                zdata = self._access.get_raw_records([read_memo]).next()
                # decompress
                plain = zlib.decompress(zdata)
                # parse
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                basis = plain[:index_memo[3]]
                # basis = StringIO(basis).readlines()
                # basis = split_lines(plain[:last_end])
                lines = apply_delta(basis, delta)
            bytes = ''.join(lines)
            yield FulltextContentFactory(key, parents, sha1, bytes)

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass

    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind == 'fulltext':
                bytes = record.get_bytes_as('fulltext')
            else:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
            found_sha1, end_point = self._compressor.compress(record.key,
                split_lines(bytes), record.sha1)
            self._unadded_refs[record.key] = record.parents
            yield found_sha1
            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}

    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback to check whether the object should answer
            queries; it returns True if the index is locked and thus usable.
        :param parents: If True, record knit parents; if not, do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked

    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the immutable GraphIndex
        backing the _GCGraphIndex; instead it prepares data for insertion by
        the caller, checks that it is safe to insert, then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, value, node_refs).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)

    def _check_read(self):
        """Raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result

    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            value = entry[2]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                                  None, parents, (method, None))
        return result

    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]

    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
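
    # Illustrative sketch, not part of the original file: the index value
    # decoded here is the string written by _insert_record_stream's flush(),
    # i.e. "<offset of the zlib block> <its length> <start of this text's
    # delta within the expanded group> <end of that delta>", so a node value
    # of "0 4096 128 256" decodes to (node[0], 0, 4096, 128, 256).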