/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1
# groupcompress, a bzr plugin providing new compression logic.
2
# Copyright (C) 2008 Canonical Limited.
3
# 
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License version 2 as published
6
# by the Free Software Foundation.
7
# 
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
# 
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
16
# 
17
18
"""Core compression logic for compressing streams of related files."""
19
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
20
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
21
from cStringIO import StringIO
22
import zlib
23
0.17.4 by Robert Collins
Annotate.
24
from bzrlib import (
25
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
26
    debug,
0.17.4 by Robert Collins
Annotate.
27
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
28
    errors,
0.17.4 by Robert Collins
Annotate.
29
    graph as _mod_graph,
30
    pack,
31
    patiencediff,
32
    )
33
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
34
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
35
from bzrlib.osutils import (
36
    contains_whitespace,
37
    contains_linebreaks,
38
    sha_string,
39
    sha_strings,
40
    split_lines,
41
    )
0.17.7 by Robert Collins
Update for current index2 changes.
42
from bzrlib.plugins.index2.btree_index import BTreeBuilder
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
43
from bzrlib.plugins.groupcompress import equivalence_table
0.17.9 by Robert Collins
Initial stab at repository format support.
44
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
45
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
46
    adapter_registry,
47
    AbsentContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
48
    FulltextContentFactory,
49
    VersionedFiles,
50
    )
51
52
0.17.5 by Robert Collins
nograph tests completely passing.
53
def parse(line_list):
0.17.2 by Robert Collins
Core proof of concept working.
54
    result = []
0.17.5 by Robert Collins
nograph tests completely passing.
55
    lines = iter(line_list)
0.17.2 by Robert Collins
Core proof of concept working.
56
    next = lines.next
0.17.5 by Robert Collins
nograph tests completely passing.
57
    label_line = lines.next()
58
    sha1_line = lines.next()
59
    if (not label_line.startswith('label: ') or
60
        not sha1_line.startswith('sha1: ')):
61
        raise AssertionError("bad text record %r" % lines)
62
    label = tuple(label_line[7:-1].split('\x00'))
63
    sha1 = sha1_line[6:-1]
0.17.2 by Robert Collins
Core proof of concept working.
64
    for header in lines:
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
65
        op = header[0]
66
        numbers = header[2:]
67
        numbers = [int(n) for n in header[2:].split(',')]
68
        if op == 'c':
69
            result.append((op, numbers[0], numbers[1], None))
70
        else:
71
            contents = [next() for i in xrange(numbers[0])]
72
            result.append((op, None, numbers[0], contents))
0.17.5 by Robert Collins
nograph tests completely passing.
73
    return label, sha1, result
0.17.2 by Robert Collins
Core proof of concept working.
74
75
def apply_delta(basis, delta):
76
    """Apply delta to this object to become new_version_id."""
77
    lines = []
78
    last_offset = 0
79
    # eq ranges occur where gaps occur
80
    # start, end refer to offsets in basis
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
81
    for op, start, count, delta_lines in delta:
82
        if op == 'c':
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
83
            lines.append(basis[start:start+count])
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
84
        else:
85
            lines.extend(delta_lines)
0.17.2 by Robert Collins
Core proof of concept working.
86
    trim_encoding_newline(lines)
87
    return lines
88
89
90
def trim_encoding_newline(lines):
91
    if lines[-1] == '\n':
92
        del lines[-1]
93
    else:
94
        lines[-1] = lines[-1][:-1]
95
96
97
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
98
    """Produce a serialised group of compressed texts.
99
    
100
    It contains code very similar to SequenceMatcher because of having a similar
101
    task. However some key differences apply:
102
     - there is no junk, we want a minimal edit not a human readable diff.
103
     - we don't filter very common lines (because we don't know where a good
104
       range will start, and after the first text we want to be emitting minmal
105
       edits only.
106
     - we chain the left side, not the right side
107
     - we incrementally update the adjacency matrix as new lines are provided.
108
     - we look for matches in all of the left side, so the routine which does
109
       the analagous task of find_longest_match does not need to filter on the
110
       left side.
111
    """
0.17.2 by Robert Collins
Core proof of concept working.
112
113
    def __init__(self, delta=True):
114
        """Create a GroupCompressor.
115
116
        :paeam delta: If False, do not compress records.
117
        """
118
        self._delta = delta
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
119
        self.line_offsets = []
0.17.2 by Robert Collins
Core proof of concept working.
120
        self.endpoint = 0
121
        self.input_bytes = 0
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
122
        self.line_locations = equivalence_table.EquivalenceTable([])
123
        self.lines = self.line_locations._left_lines
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
124
        self.labels_deltas = {}
0.17.2 by Robert Collins
Core proof of concept working.
125
126
    def compress(self, key, lines, expected_sha):
127
        """Compress lines with label key.
128
129
        :param key: A key tuple. It is stored in the output
130
            for identification of the text during decompression.
131
        :param lines: The lines to be compressed. Must be split
132
            on \n, with the \n preserved.'
133
        :param expected_sha: If non-None, the sha the lines are blieved to
134
            have. During compression the sha is calculated; a mismatch will
135
            cause an error.
136
        :return: The sha1 of lines, and the number of bytes accumulated in
137
            the group output so far.
138
        """
139
        sha1 = sha_strings(lines)
140
        label = '\x00'.join(key)
141
        # setup good encoding for trailing \n support.
142
        if not lines or lines[-1].endswith('\n'):
143
            lines.append('\n')
144
        else:
145
            lines[-1] = lines[-1] + '\n'
146
        new_lines = []
147
        new_lines.append('label: %s\n' % label)
148
        new_lines.append('sha1: %s\n' % sha1)
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
149
        index_lines = [False, False]
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
150
        pos = 0
151
        line_locations = self.line_locations
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
152
        line_locations.set_right_lines(lines)
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
153
        accumulator = []
154
        copying = False
0.17.14 by Robert Collins
Cleaner code.
155
        range_len = 0
156
        range_start = 0
157
        flush_range = self.flush_range
158
        copy_ends = None
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
159
        # We either copy a range (while there are reusable lines) or we 
160
        # insert new lines. To find reusable lines we traverse 
161
        while pos < len(lines):
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
162
            # line = lines[pos]
163
            matching_locs = line_locations.get_left_matches(pos)
164
            if not matching_locs:
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
165
                if copying:
0.17.14 by Robert Collins
Cleaner code.
166
                    flush_range(copying, range_start, copy_ends, range_len,
167
                        lines, new_lines, index_lines)
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
168
                    copying = False
0.17.14 by Robert Collins
Cleaner code.
169
                    range_start = pos
170
                    range_len = 1
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
171
                else:
0.17.14 by Robert Collins
Cleaner code.
172
                    range_len += 1
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
173
            else:
174
                if copying:
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
175
                    next_locations = copy_ends.intersection(matching_locs)
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
176
                    if len(next_locations):
177
                        # range continues
0.17.14 by Robert Collins
Cleaner code.
178
                        range_len += 1
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
179
                        copy_ends = set([loc + 1 for loc in next_locations])
0.17.14 by Robert Collins
Cleaner code.
180
                        pos += 1
181
                        continue
182
                # New copy range starts here:
183
                flush_range(copying, range_start, copy_ends, range_len, lines,
184
                    new_lines, index_lines)
185
                range_len = 1
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
186
                copy_ends = set([loc + 1 for loc in matching_locs])
0.17.14 by Robert Collins
Cleaner code.
187
                range_start = pos
188
                copying = True
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
189
            pos += 1
0.17.14 by Robert Collins
Cleaner code.
190
        flush_range(copying, range_start, copy_ends, range_len, lines,
191
            new_lines, index_lines)
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
192
        delta_start = (self.endpoint, len(self.line_locations._left_lines))
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
193
        self.output_lines(new_lines, index_lines)
0.17.2 by Robert Collins
Core proof of concept working.
194
        trim_encoding_newline(lines)
195
        self.input_bytes += sum(map(len, lines))
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
196
        delta_end = (self.endpoint, len(line_locations._left_lines))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
197
        self.labels_deltas[key] = (delta_start, delta_end)
0.17.2 by Robert Collins
Core proof of concept working.
198
        return sha1, self.endpoint
199
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
200
    def extract(self, key):
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
201
        """Extract a key previously added to the compressor.
202
        
203
        :param key: The key to extract.
204
        :return: An iterable over bytes and the sha1.
205
        """
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
206
        delta_details = self.labels_deltas[key]
207
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
208
        label, sha1, delta = parse(delta_lines)
209
        if label != key:
210
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
211
        # Perhaps we want to keep the line offsets too in memory at least?
212
        lines = apply_delta(''.join(self.lines), delta)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
213
        sha1 = sha_strings(lines)
214
        return lines, sha1
215
0.17.14 by Robert Collins
Cleaner code.
216
    def flush_range(self, copying, range_start, copy_ends, range_len, lines, new_lines, index_lines):
217
        if not range_len:
218
            return
219
        insert_instruction = "i,%d\n" % range_len
220
        if copying:
221
            # range stops, flush and start a new copy range
222
            copy_start = min(copy_ends) - range_len
223
            stop_byte = self.line_offsets[copy_start + range_len - 1]
224
            if copy_start == 0:
225
                start_byte = 0
226
            else:
227
                start_byte = self.line_offsets[copy_start - 1]
228
            bytes = stop_byte - start_byte
229
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
230
            if (bytes + len(insert_instruction) >
231
                len(copy_control_instruction)):
232
                new_lines.append(copy_control_instruction)
233
                index_lines.append(False)
234
                return
235
        # not copying, or inserting is shorter than copying, so insert.
236
        new_lines.append(insert_instruction)
237
        new_lines.extend(lines[range_start:range_start+range_len])
238
        index_lines.append(False)
239
        index_lines.extend([not copying]*range_len)
240
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
241
    def output_lines(self, new_lines, index_lines):
242
        """Output some lines.
243
244
        :param new_lines: The lines to output.
245
        :param index_lines: A boolean flag for each line - when True, index
246
            that line.
247
        """
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
248
        endpoint = self.endpoint
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
249
        self.line_locations.extend_left_lines(new_lines, index_lines)
250
        for line in new_lines:
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
251
            endpoint += len(line)
252
            self.line_offsets.append(endpoint)
253
        self.endpoint = endpoint
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
254
0.17.2 by Robert Collins
Core proof of concept working.
255
    def ratio(self):
256
        """Return the overall compression ratio."""
257
        return float(self.input_bytes) / float(self.endpoint)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
258
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
259
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
260
def make_pack_factory(graph, delta, keylength):
261
    """Create a factory for creating a pack based groupcompress.
262
263
    This is only functional enough to run interface tests, it doesn't try to
264
    provide a full pack environment.
265
    
266
    :param graph: Store a graph.
267
    :param delta: Delta compress contents.
268
    :param keylength: How long should keys be.
269
    """
270
    def factory(transport):
271
        parents = graph or delta
272
        ref_length = 0
273
        if graph:
274
            ref_length += 1
0.17.7 by Robert Collins
Update for current index2 changes.
275
        graph_index = BTreeBuilder(reference_lists=ref_length,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
276
            key_elements=keylength)
277
        stream = transport.open_write_stream('newpack')
278
        writer = pack.ContainerWriter(stream.write)
279
        writer.begin()
280
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
0.17.9 by Robert Collins
Initial stab at repository format support.
281
            add_callback=graph_index.add_nodes)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
282
        access = _DirectPackAccess({})
283
        access.set_writer(writer, graph_index, (transport, 'newpack'))
0.17.2 by Robert Collins
Core proof of concept working.
284
        result = GroupCompressVersionedFiles(index, access, delta)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
285
        result.stream = stream
286
        result.writer = writer
287
        return result
288
    return factory
289
290
291
def cleanup_pack_group(versioned_files):
292
    versioned_files.stream.close()
293
    versioned_files.writer.end()
294
295
296
class GroupCompressVersionedFiles(VersionedFiles):
297
    """A group-compress based VersionedFiles implementation."""
298
0.17.2 by Robert Collins
Core proof of concept working.
299
    def __init__(self, index, access, delta=True):
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
300
        """Create a GroupCompressVersionedFiles object.
301
302
        :param index: The index object storing access and graph data.
303
        :param access: The access object storing raw data.
0.17.2 by Robert Collins
Core proof of concept working.
304
        :param delta: Whether to delta compress or just entropy compress.
305
        """
306
        self._index = index
307
        self._access = access
308
        self._delta = delta
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
309
        self._unadded_refs = {}
0.17.2 by Robert Collins
Core proof of concept working.
310
311
    def add_lines(self, key, parents, lines, parent_texts=None,
312
        left_matching_blocks=None, nostore_sha=None, random_id=False,
313
        check_content=True):
314
        """Add a text to the store.
315
316
        :param key: The key tuple of the text to add.
317
        :param parents: The parents key tuples of the text to add.
318
        :param lines: A list of lines. Each line must be a bytestring. And all
319
            of them except the last must be terminated with \n and contain no
320
            other \n's. The last line may either contain no \n's or a single
321
            terminating \n. If the lines list does meet this constraint the add
322
            routine may error or may succeed - but you will be unable to read
323
            the data back accurately. (Checking the lines have been split
324
            correctly is expensive and extremely unlikely to catch bugs so it
325
            is not done at runtime unless check_content is True.)
326
        :param parent_texts: An optional dictionary containing the opaque 
327
            representations of some or all of the parents of version_id to
328
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
329
            returned by add_lines or data corruption can be caused.
330
        :param left_matching_blocks: a hint about which areas are common
331
            between the text and its left-hand-parent.  The format is
332
            the SequenceMatcher.get_matching_blocks format.
333
        :param nostore_sha: Raise ExistingContent and do not add the lines to
334
            the versioned file if the digest of the lines matches this.
335
        :param random_id: If True a random id has been selected rather than
336
            an id determined by some deterministic process such as a converter
337
            from a foreign VCS. When True the backend may choose not to check
338
            for uniqueness of the resulting key within the versioned file, so
339
            this should only be done when the result is expected to be unique
340
            anyway.
341
        :param check_content: If True, the lines supplied are verified to be
342
            bytestrings that are correctly formed lines.
343
        :return: The text sha1, the number of bytes in the text, and an opaque
344
                 representation of the inserted version which can be provided
345
                 back to future add_lines calls in the parent_texts dictionary.
346
        """
347
        self._index._check_write_ok()
348
        self._check_add(key, lines, random_id, check_content)
349
        if parents is None:
350
            # The caller might pass None if there is no graph data, but kndx
351
            # indexes can't directly store that, so we give them
352
            # an empty tuple instead.
353
            parents = ()
354
        # double handling for now. Make it work until then.
355
        bytes = ''.join(lines)
356
        record = FulltextContentFactory(key, parents, None, bytes)
0.17.5 by Robert Collins
nograph tests completely passing.
357
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
0.17.2 by Robert Collins
Core proof of concept working.
358
        return sha1, len(bytes), None
359
0.17.4 by Robert Collins
Annotate.
360
    def annotate(self, key):
361
        """See VersionedFiles.annotate."""
362
        graph = Graph(self)
0.17.5 by Robert Collins
nograph tests completely passing.
363
        parent_map = self.get_parent_map([key])
364
        if not parent_map:
365
            raise errors.RevisionNotPresent(key, self)
366
        if parent_map[key] is not None:
367
            search = graph._make_breadth_first_searcher([key])
368
            keys = set()
369
            while True:
370
                try:
371
                    present, ghosts = search.next_with_ghosts()
372
                except StopIteration:
373
                    break
374
                keys.update(present)
375
            parent_map = self.get_parent_map(keys)
376
        else:
377
            keys = [key]
378
            parent_map = {key:()}
0.17.4 by Robert Collins
Annotate.
379
        head_cache = _mod_graph.FrozenHeadsCache(graph)
380
        parent_cache = {}
381
        reannotate = annotate.reannotate
382
        for record in self.get_record_stream(keys, 'topological', True):
383
            key = record.key
384
            fulltext = split_lines(record.get_bytes_as('fulltext'))
385
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
386
            parent_cache[key] = list(
387
                reannotate(parent_lines, fulltext, key, None, head_cache))
388
        return parent_cache[key]
389
0.17.5 by Robert Collins
nograph tests completely passing.
390
    def check(self, progress_bar=None):
391
        """See VersionedFiles.check()."""
392
        keys = self.keys()
393
        for record in self.get_record_stream(keys, 'unordered', True):
394
            record.get_bytes_as('fulltext')
395
0.17.2 by Robert Collins
Core proof of concept working.
396
    def _check_add(self, key, lines, random_id, check_content):
397
        """check that version_id and lines are safe to add."""
398
        version_id = key[-1]
399
        if contains_whitespace(version_id):
400
            raise InvalidRevisionId(version_id, self)
401
        self.check_not_reserved_id(version_id)
402
        # TODO: If random_id==False and the key is already present, we should
403
        # probably check that the existing content is identical to what is
404
        # being inserted, and otherwise raise an exception.  This would make
405
        # the bundle code simpler.
406
        if check_content:
407
            self._check_lines_not_unicode(lines)
408
            self._check_lines_are_lines(lines)
409
0.17.5 by Robert Collins
nograph tests completely passing.
410
    def get_parent_map(self, keys):
411
        """Get a map of the parents of keys.
412
413
        :param keys: The keys to look up parents for.
414
        :return: A mapping from keys to parents. Absent keys are absent from
415
            the mapping.
416
        """
417
        result = {}
418
        sources = [self._index]
419
        source_results = []
420
        missing = set(keys)
421
        for source in sources:
422
            if not missing:
423
                break
424
            new_result = source.get_parent_map(missing)
425
            source_results.append(new_result)
426
            result.update(new_result)
427
            missing.difference_update(set(new_result))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
428
        if self._unadded_refs:
429
            for key in missing:
430
                if key in self._unadded_refs:
431
                    result[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
432
        return result
433
434
    def get_record_stream(self, keys, ordering, include_delta_closure):
435
        """Get a stream of records for keys.
436
437
        :param keys: The keys to include.
438
        :param ordering: Either 'unordered' or 'topological'. A topologically
439
            sorted stream has compression parents strictly before their
440
            children.
441
        :param include_delta_closure: If True then the closure across any
442
            compression parents will be included (in the opaque data).
443
        :return: An iterator of ContentFactory objects, each of which is only
444
            valid until the iterator is advanced.
445
        """
446
        # keys might be a generator
447
        keys = set(keys)
448
        if not keys:
449
            return
450
        if not self._index.has_graph:
451
            # Cannot topological order when no graph has been stored.
452
            ordering = 'unordered'
453
        # Cheap: iterate
454
        locations = self._index.get_build_details(keys)
455
        if ordering == 'topological':
456
            # would be better to not globally sort initially but instead
457
            # start with one key, recurse to its oldest parent, then grab
458
            # everything in the same group, etc.
459
            parent_map = dict((key, details[2]) for key, details in
460
                locations.iteritems())
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
461
            local = frozenset(keys).intersection(set(self._unadded_refs))
462
            for key in local:
463
                parent_map[key] = self._unadded_refs[key]
464
                locations[key] = None
0.17.5 by Robert Collins
nograph tests completely passing.
465
            present_keys = topo_sort(parent_map)
466
            # Now group by source:
467
        else:
468
            present_keys = locations.keys()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
469
            local = frozenset(keys).intersection(set(self._unadded_refs))
470
            for key in local:
471
                present_keys.append(key)
472
                locations[key] = None
0.17.5 by Robert Collins
nograph tests completely passing.
473
        absent_keys = keys.difference(set(locations))
474
        for key in absent_keys:
475
            yield AbsentContentFactory(key)
476
        for key in present_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
477
            if key in self._unadded_refs:
478
                lines, sha1 = self._compressor.extract(key)
479
                parents = self._unadded_refs[key]
480
            else:
481
                index_memo, _, parents, (method, _) = locations[key]
482
                # read
483
                read_memo = index_memo[0:3]
484
                zdata = self._access.get_raw_records([read_memo]).next()
485
                # decompress
486
                plain = zlib.decompress(zdata)
487
                # parse
488
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
489
                label, sha1, delta = parse(delta_lines)
490
                if label != key:
491
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
492
                basis = plain[:index_memo[3]]
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
493
                # basis = StringIO(basis).readlines()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
494
                #basis = split_lines(plain[:last_end])
495
                lines = apply_delta(basis, delta)
0.17.5 by Robert Collins
nograph tests completely passing.
496
            bytes = ''.join(lines)
497
            yield FulltextContentFactory(key, parents, sha1, bytes)
498
            
499
    def get_sha1s(self, keys):
500
        """See VersionedFiles.get_sha1s()."""
501
        result = {}
502
        for record in self.get_record_stream(keys, 'unordered', True):
503
            if record.sha1 != None:
504
                result[record.key] = record.sha1
505
            else:
506
                if record.storage_kind != 'absent':
507
                    result[record.key] == sha_string(record.get_bytes_as(
508
                        'fulltext'))
509
        return result
510
0.17.2 by Robert Collins
Core proof of concept working.
511
    def insert_record_stream(self, stream):
512
        """Insert a record stream into this container.
513
514
        :param stream: A stream of records to insert. 
515
        :return: None
516
        :seealso VersionedFiles.get_record_stream:
517
        """
0.17.5 by Robert Collins
nograph tests completely passing.
518
        for _ in self._insert_record_stream(stream):
519
            pass
0.17.2 by Robert Collins
Core proof of concept working.
520
0.17.5 by Robert Collins
nograph tests completely passing.
521
    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        NOTE: this is a generator; records are only compressed (and the final
        group flushed to the index) as the caller iterates the yielded sha1s.

        :param stream: A stream of records to insert. 
        :param random_id: If True the record keys are assumed new; no
            duplicate check is performed when adding index entries.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            # Lazily construct and cache one adapter per
            # (storage_kind, 'fulltext') pair.
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        # Parent refs for records compressed but not yet flushed to the
        # index; lets in-flight keys be answered mid-insert.
        self._unadded_refs = {}
        # (key, "basis_end end_point", (parents,)) tuples pending flush().
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            # Compress the accumulated group with zlib, store it as a single
            # raw record, then add index rows whose values are
            # "start length basis_end end_point" (all byte offsets).
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind == 'fulltext':
                bytes = record.get_bytes_as('fulltext')
            else:
                # Any other storage kind is converted to fulltext via an
                # adapter before compression.
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
            found_sha1, end_point = self._compressor.compress(record.key,
                split_lines(bytes), record.sha1)
            self._unadded_refs[record.key] = record.parents
            yield found_sha1
            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            # Cap the uncompressed group buffer at 20MB, then start a fresh
            # group.  Rebinding keys_to_add is safe: flush() reads the
            # current binding via closure at call time.
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1    # NOTE(review): counted but never read here.
        # Flush the final (partial) group, if any.
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
585
586
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        wanted = set(keys)
        total = len(wanted)
        # We only need key, position, length, but currently fetch full texts.
        stream = self.get_record_stream(wanted, 'unordered', True)
        for position, record in enumerate(stream):
            # XXX: todo - optimise to use less than full texts.
            pb.update('Walking content.', position, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            for line in split_lines(record.get_bytes_as('fulltext')):
                yield line, record.key
        pb.update('Walking content.', total, total)
625
626
    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        # Only one backing index today; kept as a loop so more sources can
        # be folded in later.
        result = set()
        for source in [self._index]:
            result.update(source.keys())
        return result
635
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
636
637
class _GCGraphIndex(object):
638
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
639
0.17.9 by Robert Collins
Initial stab at repository format support.
640
    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback, returns True if the index is locked and
            thus usable.
        :param parents: If True, record knits parents, if not do not record 
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._graph_index = graph_index
        self._is_locked = is_locked
        self._add_callback = add_callback
        self._parents = parents
        # Graph support is exactly parent-recording support for this index.
        self.has_graph = parents
660
0.17.5 by Robert Collins
nograph tests completely passing.
661
    def add_records(self, records, random_id=False):
        """Add multiple records to the index.
        
        This function does not insert data into the Immutable GraphIndex
        backing the KnitGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, options, access_memo, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        :raises errors.ReadOnlyError: If no add_callback was configured.
        :raises errors.KnitCorrupt: If parents are supplied to a parentless
            index, or a duplicate key has inconsistent details.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            # BUGFIX: was a bare ``KnitCorrupt`` (NameError at
                            # runtime); qualify via the errors module, as the
                            # duplicate-check below already does.
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    # All refs were empty: strip them so the node matches the
                    # parentless index shape.
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                # Already present with identical details: skip re-adding.
                del keys[key]
                changed = True
        if changed:
            # Rebuild the node list from the filtered/normalised keys dict.
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)
711
        
712
    def _check_read(self):
        """Raise ObjectNotLocked unless reads are currently permitted."""
        if self._is_locked():
            return
        raise errors.ObjectNotLocked(self)
716
0.17.2 by Robert Collins
Core proof of concept working.
717
    def _check_write_ok(self):
        """Raise ObjectNotLocked unless writes are currently permitted."""
        locked = self._is_locked()
        if not locked:
            raise errors.ObjectNotLocked(self)
721
0.17.5 by Robert Collins
nograph tests completely passing.
722
    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.
        
        :param keys: An iterable of index key tuples.
        :param check_present: If True, raise RevisionNotPresent when any
            requested key is missing from the index.
        :return: An iterator of (index, key, value, refs) tuples; for a
            parentless index the refs element is always ().
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                # BUGFIX: was a bare ``RevisionNotPresent`` (NameError at
                # runtime); qualify via the errors module, consistent with
                # the rest of this file.
                raise errors.RevisionNotPresent(missing_keys.pop(), self)
742
743
    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        if self._parents:
            # node layout: (index, key, value, (parents, ...)).
            return dict((node[1], node[3][0]) for node in nodes)
        # Parentless index: every present key maps to None.
        return dict((node[1], None) for node in nodes)
760
761
    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if self._parents:
                parents = entry[3][0]
            else:
                parents = None
            # No compression parent in a group compress index: every record
            # is rebuilt from its group, hence the constant 'group' method.
            # (Removed dead local ``value = entry[2]`` — it was never used.)
            result[key] = (self._node_to_position(entry),
                                  None, parents, ('group', None))
        return result
794
    
795
    def keys(self):
        """Get all the keys in the collection.
        
        The keys are not ordered.
        """
        self._check_read()
        result = []
        for node in self._graph_index.iter_all_entries():
            result.append(node[1])
        return result
802
    
803
    def _node_to_position(self, node):
        """Convert an index value to position details.

        The value string is "start stop basis_end delta_end" — byte offsets
        into the (compressed) raw record and the uncompressed group.
        """
        # It would be nice not to read the entire gzip.
        bits = [int(bit) for bit in node[2].split(' ')]
        return (node[0], bits[0], bits[1], bits[2], bits[3])