To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.plugins.index2.btree_index import BTreeBuilder
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )


def parse(line_list):
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % lines)
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result
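
# For example (a sketch with a hypothetical record): parsing
#   ['label: rev-1\n', 'sha1: da39...\n', 'c,0,4\n', 'i,1\n', 'x\n']
# returns (('rev-1',), 'da39...',
#          [('c', 0, 4, None), ('i', None, 1, ['x\n'])]).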

def apply_delta(basis, delta):
    """Apply a parsed delta to a basis string, returning the text's lines."""
    lines = []
    # A delta is a sequence of (op, start, count, insert_lines) tuples:
    # 'c' ops copy count bytes from offset start in basis; 'i' ops insert
    # the given lines.
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines


def trim_encoding_newline(lines):
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]
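
# A worked example (a sketch): with basis = 'one\ntwo\n', the delta
# [('c', 0, 4, None), ('i', None, 1, ['extra\n']), ('c', 4, 4, None)]
# copies bytes [0, 4) ('one\n'), inserts 'extra\n', copies bytes [4, 8)
# ('two\n'), and trim_encoding_newline then drops the encoding-support
# newline, giving ['one\n', 'extra\n', 'two'].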


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a
    similar task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting
       minimal edits only).
     - we chain the left side, not the right side
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.lines = []
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        # line: set(locations it appears at), set(N+1 for N in locations)
        self.line_locations = {}
        self.labels_deltas = {}

    def compress(self, key, lines, expected_sha):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        label = '\x00'.join(key)
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        new_lines = []
        new_lines.append('label: %s\n' % label)
        new_lines.append('sha1: %s\n' % sha1)
        index_lines = [False, False]
        pos = 0
        line_locations = self.line_locations
        copying = False
        range_len = 0
        range_start = 0
        flush_range = self.flush_range
        copy_ends = None
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we consult the
        # line_locations index built up from previously output lines.
        while pos < len(lines):
            line = lines[pos]
            if line not in line_locations:
                if copying:
                    flush_range(copying, range_start, copy_ends, range_len,
                        lines, new_lines, index_lines)
                    copying = False
                    range_start = pos
                    range_len = 1
                else:
                    range_len += 1
            else:
                locations, next = line_locations[line]
                if copying:
                    next_locations = locations.intersection(copy_ends)
                    if len(next_locations):
                        # range continues
                        range_len += 1
                        copy_ends = set(loc + 1 for loc in next_locations)
                        pos += 1
                        continue
                # New copy range starts here:
                flush_range(copying, range_start, copy_ends, range_len, lines,
                    new_lines, index_lines)
                range_len = 1
                copy_ends = next
                range_start = pos
                copying = True
            pos += 1
        flush_range(copying, range_start, copy_ends, range_len, lines,
            new_lines, index_lines)
        delta_start = (self.endpoint, len(self.lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: The lines of the stored text and its sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets in memory as well?
        lines = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(lines)
        return lines, sha1

    def flush_range(self, copying, range_start, copy_ends, range_len, lines,
        new_lines, index_lines):
        if not range_len:
            return
        insert_instruction = "i,%d\n" % range_len
        if copying:
            # A copy range stops here: emit it, but only if it is cheaper
            # to encode than a fresh insert of the same bytes.
            copy_start = min(copy_ends) - range_len
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        index_lines.extend([not copying]*range_len)
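
    # For example (a sketch): a single 100-byte line at byte 0 of the group
    # encodes as 'c,0,100\n' (8 bytes), much cheaper than 'i,1\n' plus the
    # 100 raw bytes; a single 3-byte line at byte 123456 would encode as
    # 'c,123456,3\n' (11 bytes), which costs more than 'i,1\n' plus 3 raw
    # bytes (7), so the copy is skipped and the line is inserted afresh.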

    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        endpoint = self.endpoint
        offset = len(self.lines)
        for (pos, line), index in izip(enumerate(new_lines), index_lines):
            self.lines.append(line)
            endpoint += len(line)
            self.line_offsets.append(endpoint)
            if index:
                indices, next_lines = self.line_locations.setdefault(line,
                    (set(), set()))
                indices.add(pos + offset)
                next_lines.add(pos + offset + 1)
        self.endpoint = endpoint

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)
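
# A minimal usage sketch (hypothetical keys and texts): later texts reuse
# matching lines from earlier texts in the same group.
#
#   compressor = GroupCompressor()
#   compressor.compress(('rev-1',), ['shared line\n', 'old tail\n'], None)
#   compressor.compress(('rev-2',), ['shared line\n', 'new tail\n'], None)
#   lines, sha1 = compressor.extract(('rev-2',))
#   print compressor.ratio()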


def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests; it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long keys should be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory


def cleanup_pack_group(versioned_files):
    # End the container writer before closing the stream it writes to.
    versioned_files.writer.end()
    versioned_files.stream.close()
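
# Usage sketch (assuming an existing bzrlib transport, e.g. one obtained
# from bzrlib.transport.get_transport('.')):
#
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(transport)
#   vf.add_lines(('rev-1',), (), ['some\n', 'content\n'])
#   cleanup_pack_group(vf)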


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parent key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint
            the add routine may error or may succeed - but you will be unable
            to read the data back accurately. (Checking the lines have been
            split correctly is expensive and extremely unlikely to catch bugs
            so it is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # Double handling for now: build a fulltext record and push it
        # through the record-stream insertion path.
        bytes = ''.join(lines)
        record = FulltextContentFactory(key, parents, None, bytes)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, len(bytes), None

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            fulltext = split_lines(record.get_bytes_as('fulltext'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, fulltext, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """Check that version_id and lines are safe to add."""
        version_id = key[-1]
        if contains_whitespace(version_id):
            raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        keys = set(keys)
        if not keys:
            return
        if not self._index.has_graph:
            # Cannot topologically order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                parent_map[key] = self._unadded_refs[key]
                locations[key] = None
            present_keys = topo_sort(parent_map)
            # Now group by source:
        else:
            present_keys = locations.keys()
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                present_keys.append(key)
                locations[key] = None
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                lines, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                # read
                read_memo = index_memo[0:3]
                zdata = self._access.get_raw_records([read_memo]).next()
                # decompress
                plain = zlib.decompress(zdata)
                # parse
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                basis = plain[:index_memo[3]]
                lines = apply_delta(basis, delta)
            bytes = ''.join(lines)
            yield FulltextContentFactory(key, parents, sha1, bytes)

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass

    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface from
        insert_record_stream to allow add_lines to be minimal, but still
        return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1s of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind == 'fulltext':
                bytes = record.get_bytes_as('fulltext')
            else:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
            found_sha1, end_point = self._compressor.compress(record.key,
                split_lines(bytes), record.sha1)
            self._unadded_refs[record.key] = record.parents
            yield found_sha1
            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 20:
                # Cap the group at a 20MB internal buffer: flush and start
                # a fresh compressor.
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}

    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result
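
# A usage sketch (hypothetical keys; vf is a GroupCompressVersionedFiles,
# e.g. one created via make_pack_factory above):
#
#   vf.add_lines(('rev-1',), (), ['a\n', 'b\n'])
#   vf.add_lines(('rev-2',), (('rev-1',),), ['a\n', 'c\n'])
#   for record in vf.get_record_stream([('rev-2',)], 'unordered', True):
#       bytes = record.get_bytes_as('fulltext')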


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback that returns True if the index is locked
            and thus usable.
        :param parents: If True, record knit parents; if False, do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked

    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the immutable GraphIndex
        backing this _GCGraphIndex; instead it prepares data for insertion by
        the caller and checks that it is safe to insert, then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, value, node_refs).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)

    def _check_read(self):
        """Raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)
    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result

    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                                  None, parents, (method, None))
        return result

    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]

    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
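
# Index value format sketch (inferred from flush() and _node_to_position,
# not a formal specification): each node's value appears to be
# '<start> <length> <basis_end> <delta_end>'.  For a hypothetical value
# '0 4096 0 512': 4096 bytes of zlib-compressed group data start at byte 0
# of the pack; within the decompressed group, the record's basis ends at
# byte 0 (no basis) and its delta record occupies bytes [0, 512).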