/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.plugins.index2.btree_index import BTreeBuilder
from bzrlib.plugins.groupcompress import equivalence_table
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )


def parse(line_list):
    """Parse an encoded record into its label, sha1 and delta operations."""
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % lines)
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result
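
# For reference, an encoded record as consumed by parse() and produced by
# GroupCompressor.compress() looks like this (the values are illustrative,
# not taken from real data):
#
#   label: file-id\x00revision-id
#   sha1: <40-char hex sha1 of the fulltext>
#   c,0,120      <- copy 120 bytes starting at byte 0 of the group so far
#   i,2          <- insert the next 2 lines verbatim
#   first inserted line
#   second inserted line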
def apply_delta(basis, delta):
    """Apply delta to basis and return the resulting lines."""
    lines = []
    # eq ranges occur where gaps occur
    # start, end refer to offsets in basis
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines
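
# For example (illustrative values): with basis "hello world\n", the delta
# [('c', 0, 6, None), ('i', None, 1, ['there\n'])] copies the 6 bytes
# "hello " and then inserts "there\n"; trim_encoding_newline() then removes
# the trailing newline that compress() appended for encoding purposes.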


def trim_encoding_newline(lines):
    """Remove the trailing newline appended by compress() for encoding."""
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a similar
    task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting
       minimal edits only).
     - we chain the left side, not the right side
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        self.line_locations = equivalence_table.EquivalenceTable([])
        self.lines = self.line_locations._left_lines
        self.labels_deltas = {}

    def compress(self, key, lines, expected_sha):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        # XXX: expected_sha is not currently compared against sha1.
        label = '\x00'.join(key)
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        new_lines = []
        new_lines.append('label: %s\n' % label)
        new_lines.append('sha1: %s\n' % sha1)
        index_lines = [False, False]
        pos = 0
        line_locations = self.line_locations
        copying = False
        range_len = 0
        range_start = 0
        flush_range = self.flush_range
        copy_ends = None
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we look each new line
        # up in the table of lines already output.
        while pos < len(lines):
            line = lines[pos]
            matching_locs = line_locations.get_left_matches(line)
            if not matching_locs:
                if copying:
                    flush_range(copying, range_start, copy_ends, range_len,
                        lines, new_lines, index_lines)
                    copying = False
                    range_start = pos
                    range_len = 1
                else:
                    range_len += 1
            else:
                if copying:
                    # TODO: We could do a merge sort here if we keep copy_ends
                    #       and matching_locs always sorted.
                    next_locations = set(copy_ends).intersection(matching_locs)
                    if len(next_locations):
                        # range continues
                        range_len += 1
                        copy_ends = [loc + 1 for loc in next_locations]
                        pos += 1
                        continue
                # New copy range starts here:
                flush_range(copying, range_start, copy_ends, range_len, lines,
                    new_lines, index_lines)
                range_len = 1
                copy_ends = [loc + 1 for loc in matching_locs]
                range_start = pos
                copying = True
            pos += 1
        flush_range(copying, range_start, copy_ends, range_len, lines,
            new_lines, index_lines)
        delta_start = (self.endpoint, len(self.line_locations._left_lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(line_locations._left_lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint
    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: The lines of the extracted text, and the sha1 of those lines.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        lines = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(lines)
        return lines, sha1
    def flush_range(self, copying, range_start, copy_ends, range_len,
                    lines, new_lines, index_lines):
        """Emit the pending copy or insert instruction into new_lines."""
        if not range_len:
            return
        insert_instruction = "i,%d\n" % range_len
        if copying:
            # range stops, flush and start a new copy range
            copy_start = min(copy_ends) - range_len
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        index_lines.extend([not copying]*range_len)
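
    # Cost example (illustrative): for a one-line matched range of 5 bytes
    # at byte 0 of the group, an insert would cost len("i,1\n") + 5 = 9
    # bytes while the copy instruction "c,0,5\n" costs 6, so the copy is
    # emitted; a one-line match of a single byte (4 + 1 = 5 vs. 6) is
    # cheaper to re-insert, so no copy instruction is output for it.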
    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        endpoint = self.endpoint
        self.line_locations.extend_left_lines(new_lines, index_lines)
        for line in new_lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        self.endpoint = endpoint
    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)
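
    # Example usage (an illustrative sketch, not part of the original
    # module): compress two related texts into one group, then extract
    # the second back out of the in-progress group.
    #
    #   compressor = GroupCompressor()
    #   compressor.compress(('rev-1',), ['common line\n'], None)
    #   compressor.compress(('rev-2',), ['common line\n', 'new line\n'], None)
    #   lines, sha1 = compressor.extract(('rev-2',))
    #   print compressor.ratio()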


def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests; it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory
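
# Example usage (an illustrative sketch; the path is hypothetical):
#
#   from bzrlib import transport
#   t = transport.get_transport('/tmp/gc-test')
#   vf = make_pack_factory(graph=True, delta=True, keylength=1)(t)
#   # ... add texts with vf.add_lines or vf.insert_record_stream ...
#   cleanup_pack_group(vf)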


def cleanup_pack_group(versioned_files):
    versioned_files.stream.close()
    versioned_files.writer.end()


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint the
            add routine may error or may succeed - but you will be unable to
            read the data back accurately. (Checking the lines have been split
            correctly is expensive and extremely unlikely to catch bugs so it
            is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        bytes = ''.join(lines)
        record = FulltextContentFactory(key, parents, None, bytes)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, len(bytes), None
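
    # Example (illustrative): adding a parentless text and getting its
    # sha1 back.
    #
    #   sha1, num_bytes, _ = vf.add_lines(('rev-1',), (), ['some content\n'])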
    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            fulltext = split_lines(record.get_bytes_as('fulltext'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, fulltext, key, None, head_cache))
        return parent_cache[key]
    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')
    def _check_add(self, key, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        version_id = key[-1]
        if contains_whitespace(version_id):
            raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)
    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result
    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        keys = set(keys)
        if not keys:
            return
        if not self._index.has_graph:
            # Cannot topologically order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                parent_map[key] = self._unadded_refs[key]
                locations[key] = None
            present_keys = topo_sort(parent_map)
            # Now group by source:
        else:
            present_keys = locations.keys()
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                present_keys.append(key)
                locations[key] = None
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                lines, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                # read
                read_memo = index_memo[0:3]
                zdata = self._access.get_raw_records([read_memo]).next()
                # decompress
                plain = zlib.decompress(zdata)
                # parse
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                basis = plain[:index_memo[3]]
                # basis = StringIO(basis).readlines()
                #basis = split_lines(plain[:last_end])
                lines = apply_delta(basis, delta)
            bytes = ''.join(lines)
            yield FulltextContentFactory(key, parents, sha1, bytes)
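
    # Example (illustrative): stream every stored text back out as a
    # fulltext, parents before children.
    #
    #   for record in vf.get_record_stream(vf.keys(), 'topological', True):
    #       if record.storage_kind != 'absent':
    #           text = record.get_bytes_as('fulltext')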
    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result
    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass
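
    # Example (illustrative): insert a single fulltext record directly.
    #
    #   record = FulltextContentFactory(('rev-1',), (), None, 'some content\n')
    #   vf.insert_record_stream([record])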
    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind == 'fulltext':
                bytes = record.get_bytes_as('fulltext')
            else:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
            found_sha1, end_point = self._compressor.compress(record.key,
                split_lines(bytes), record.sha1)
            self._unadded_refs[record.key] = record.parents
            yield found_sha1
            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)
    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback to check whether the object should answer
            queries; returns True if the index is locked and thus usable.
        :param parents: If True, record knits parents, if not do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked
    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing the KnitGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, options, access_memo, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)
    def _check_read(self):
        """raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)
    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)
    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)
    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result
    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            value = entry[2]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                                  None, parents, (method, None))
        return result
    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]
    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
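
    # The value decoded here is written by _insert_record_stream's flush()
    # as "start length basis_end delta_end". An illustrative value of
    # "0 4096 120 340" would mean: the zlib-compressed group starts at byte
    # 0 of the pack and is 4096 bytes long; within the uncompressed group,
    # this text's delta occupies bytes 120:340 and everything before byte
    # 120 is the basis that the delta's copy instructions refer to.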