/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1
# groupcompress, a bzr plugin providing new compression logic.
2
# Copyright (C) 2008 Canonical Limited.
3
# 
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License version 2 as published
6
# by the Free Software Foundation.
7
# 
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
# 
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
16
# 
17
18
"""Core compression logic for compressing streams of related files."""
19
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
20
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
21
from cStringIO import StringIO
22
import zlib
23
0.17.4 by Robert Collins
Annotate.
24
from bzrlib import (
25
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
26
    debug,
0.17.4 by Robert Collins
Annotate.
27
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
28
    errors,
0.17.4 by Robert Collins
Annotate.
29
    graph as _mod_graph,
30
    pack,
31
    patiencediff,
32
    )
33
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
34
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
35
from bzrlib.osutils import (
36
    contains_whitespace,
37
    contains_linebreaks,
38
    sha_string,
39
    sha_strings,
40
    split_lines,
41
    )
0.17.7 by Robert Collins
Update for current index2 changes.
42
from bzrlib.plugins.index2.btree_index import BTreeBuilder
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
43
from bzrlib.plugins.groupcompress import equivalence_table
0.17.9 by Robert Collins
Initial stab at repository format support.
44
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
45
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
46
    adapter_registry,
47
    AbsentContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
48
    FulltextContentFactory,
49
    VersionedFiles,
50
    )
51
52
0.17.5 by Robert Collins
nograph tests completely passing.
53
def parse(line_list):
0.17.2 by Robert Collins
Core proof of concept working.
54
    result = []
0.17.5 by Robert Collins
nograph tests completely passing.
55
    lines = iter(line_list)
0.17.2 by Robert Collins
Core proof of concept working.
56
    next = lines.next
0.17.5 by Robert Collins
nograph tests completely passing.
57
    label_line = lines.next()
58
    sha1_line = lines.next()
59
    if (not label_line.startswith('label: ') or
60
        not sha1_line.startswith('sha1: ')):
61
        raise AssertionError("bad text record %r" % lines)
62
    label = tuple(label_line[7:-1].split('\x00'))
63
    sha1 = sha1_line[6:-1]
0.17.2 by Robert Collins
Core proof of concept working.
64
    for header in lines:
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
65
        op = header[0]
66
        numbers = header[2:]
67
        numbers = [int(n) for n in header[2:].split(',')]
68
        if op == 'c':
69
            result.append((op, numbers[0], numbers[1], None))
70
        else:
71
            contents = [next() for i in xrange(numbers[0])]
72
            result.append((op, None, numbers[0], contents))
0.17.5 by Robert Collins
nograph tests completely passing.
73
    return label, sha1, result
0.17.2 by Robert Collins
Core proof of concept working.
74
75
def apply_delta(basis, delta):
76
    """Apply delta to this object to become new_version_id."""
77
    lines = []
78
    last_offset = 0
79
    # eq ranges occur where gaps occur
80
    # start, end refer to offsets in basis
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
81
    for op, start, count, delta_lines in delta:
82
        if op == 'c':
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
83
            lines.append(basis[start:start+count])
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
84
        else:
85
            lines.extend(delta_lines)
0.17.2 by Robert Collins
Core proof of concept working.
86
    trim_encoding_newline(lines)
87
    return lines
88
89
90
def trim_encoding_newline(lines):
91
    if lines[-1] == '\n':
92
        del lines[-1]
93
    else:
94
        lines[-1] = lines[-1][:-1]
95
96
97
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
98
    """Produce a serialised group of compressed texts.
99
    
100
    It contains code very similar to SequenceMatcher because of having a similar
101
    task. However some key differences apply:
102
     - there is no junk, we want a minimal edit not a human readable diff.
103
     - we don't filter very common lines (because we don't know where a good
104
       range will start, and after the first text we want to be emitting minimal
105
       edits only).
106
     - we chain the left side, not the right side
107
     - we incrementally update the adjacency matrix as new lines are provided.
108
     - we look for matches in all of the left side, so the routine which does
109
       the analogous task of find_longest_match does not need to filter on the
110
       left side.
111
    """
0.17.2 by Robert Collins
Core proof of concept working.
112
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
113
    _equivalence_table_class = equivalence_table.EquivalenceTable
114
0.17.2 by Robert Collins
Core proof of concept working.
115
    def __init__(self, delta=True):
116
        """Create a GroupCompressor.
117
118
        :param delta: If False, do not compress records.
119
        """
120
        self._delta = delta
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
121
        self.line_offsets = []
0.17.2 by Robert Collins
Core proof of concept working.
122
        self.endpoint = 0
123
        self.input_bytes = 0
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
124
        self.line_locations = self._equivalence_table_class([])
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
125
        self.lines = self.line_locations.lines
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
126
        self.labels_deltas = {}
0.17.2 by Robert Collins
Core proof of concept working.
127
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
128
    def get_matching_blocks(self, lines):
129
        """Return an the ranges in lines which match self.lines.
130
131
        :param lines: lines to compress
132
        :return: A list of (old_start, new_start, length) tuples which reflect
133
            a region in self.lines that is present in lines.  The last element
134
            of the list is always (old_len, new_len, 0) to provide an end point
135
            for generating instructions from the matching blocks list.
136
        """
137
        result = []
138
        pos = 0
139
        line_locations = self.line_locations
0.18.11 by John Arbash Meinel
Convert back into grabbing a right-lines ahead of time.
140
        line_locations.set_right_lines(lines)
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
141
        # We either copy a range (while there are reusable lines) or we 
142
        # insert new lines. To find reusable lines we traverse 
0.18.24 by John Arbash Meinel
Factor out the most compute intensive portion, with plans to turn it into a compiled func.
143
        locations = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
144
        max_pos = len(lines)
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
145
        max_time = 0.0
146
        max_info = None
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
147
        result_append = result.append
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
148
        while pos < max_pos:
0.18.35 by John Arbash Meinel
remove the timing calls
149
            block, pos, locations = _get_longest_match(line_locations, pos,
150
                                                       max_pos, locations)
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
151
            if block is not None:
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
152
                result_append(block)
153
        result_append((len(self.lines), len(lines), 0))
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
154
        return result
155
0.17.2 by Robert Collins
Core proof of concept working.
156
    def compress(self, key, lines, expected_sha):
157
        """Compress lines with label key.
158
159
        :param key: A key tuple. It is stored in the output
160
            for identification of the text during decompression.
161
        :param lines: The lines to be compressed. Must be split
162
            on \n, with the \n preserved.'
163
        :param expected_sha: If non-None, the sha the lines are believed to
164
            have. During compression the sha is calculated; a mismatch will
165
            cause an error.
166
        :return: The sha1 of lines, and the number of bytes accumulated in
167
            the group output so far.
168
        """
169
        sha1 = sha_strings(lines)
170
        label = '\x00'.join(key)
171
        # setup good encoding for trailing \n support.
172
        if not lines or lines[-1].endswith('\n'):
173
            lines.append('\n')
174
        else:
175
            lines[-1] = lines[-1] + '\n'
176
        new_lines = []
177
        new_lines.append('label: %s\n' % label)
178
        new_lines.append('sha1: %s\n' % sha1)
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
179
        index_lines = [False, False]
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
180
        pos = 0
181
        line_locations = self.line_locations
0.17.14 by Robert Collins
Cleaner code.
182
        range_len = 0
183
        range_start = 0
184
        flush_range = self.flush_range
185
        copy_ends = None
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
186
        blocks = self.get_matching_blocks(lines)
187
        current_pos = 0
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
188
        # We either copy a range (while there are reusable lines) or we 
189
        # insert new lines. To find reusable lines we traverse 
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
190
        for old_start, new_start, range_len in blocks:
191
            if new_start != current_pos:
192
                # non-matching region
0.19.1 by Robert Collins
Start to simplify flush_range.
193
                flush_range(current_pos, None, new_start - current_pos,
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
194
                    lines, new_lines, index_lines)
195
            current_pos = new_start + range_len
196
            if not range_len:
197
                continue
0.19.1 by Robert Collins
Start to simplify flush_range.
198
            flush_range(new_start, old_start, range_len, lines,
199
                new_lines, index_lines)
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
200
        delta_start = (self.endpoint, len(self.lines))
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
201
        self.output_lines(new_lines, index_lines)
0.17.2 by Robert Collins
Core proof of concept working.
202
        trim_encoding_newline(lines)
203
        self.input_bytes += sum(map(len, lines))
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
204
        delta_end = (self.endpoint, len(self.lines))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
205
        self.labels_deltas[key] = (delta_start, delta_end)
0.17.2 by Robert Collins
Core proof of concept working.
206
        return sha1, self.endpoint
207
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
208
    def extract(self, key):
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
209
        """Extract a key previously added to the compressor.
210
        
211
        :param key: The key to extract.
212
        :return: An iterable over bytes and the sha1.
213
        """
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
214
        delta_details = self.labels_deltas[key]
215
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
216
        label, sha1, delta = parse(delta_lines)
217
        if label != key:
218
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
219
        # Perhaps we want to keep the line offsets too in memory at least?
220
        lines = apply_delta(''.join(self.lines), delta)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
221
        sha1 = sha_strings(lines)
222
        return lines, sha1
223
0.19.1 by Robert Collins
Start to simplify flush_range.
224
    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
0.17.14 by Robert Collins
Cleaner code.
225
        if not range_len:
226
            return
227
        insert_instruction = "i,%d\n" % range_len
0.19.1 by Robert Collins
Start to simplify flush_range.
228
        if copy_start is not None:
0.17.14 by Robert Collins
Cleaner code.
229
            # range stops, flush and start a new copy range
230
            stop_byte = self.line_offsets[copy_start + range_len - 1]
231
            if copy_start == 0:
232
                start_byte = 0
233
            else:
234
                start_byte = self.line_offsets[copy_start - 1]
235
            bytes = stop_byte - start_byte
236
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
237
            if (bytes + len(insert_instruction) >
238
                len(copy_control_instruction)):
239
                new_lines.append(copy_control_instruction)
240
                index_lines.append(False)
241
                return
242
        # not copying, or inserting is shorter than copying, so insert.
243
        new_lines.append(insert_instruction)
244
        new_lines.extend(lines[range_start:range_start+range_len])
245
        index_lines.append(False)
0.19.1 by Robert Collins
Start to simplify flush_range.
246
        index_lines.extend([copy_start is None]*range_len)
0.17.14 by Robert Collins
Cleaner code.
247
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
248
    def output_lines(self, new_lines, index_lines):
249
        """Output some lines.
250
251
        :param new_lines: The lines to output.
252
        :param index_lines: A boolean flag for each line - when True, index
253
            that line.
254
        """
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
255
        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
256
        #                          if val and new_lines[idx] == '\n']
257
        # if indexed_newlines:
258
        #     import pdb; pdb.set_trace()
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
259
        endpoint = self.endpoint
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
260
        self.line_locations.extend_lines(new_lines, index_lines)
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
261
        for line in new_lines:
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
262
            endpoint += len(line)
263
            self.line_offsets.append(endpoint)
264
        self.endpoint = endpoint
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
265
0.17.2 by Robert Collins
Core proof of concept working.
266
    def ratio(self):
267
        """Return the overall compression ratio."""
268
        return float(self.input_bytes) / float(self.endpoint)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
269
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
270
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
271
def make_pack_factory(graph, delta, keylength):
272
    """Create a factory for creating a pack based groupcompress.
273
274
    This is only functional enough to run interface tests, it doesn't try to
275
    provide a full pack environment.
276
    
277
    :param graph: Store a graph.
278
    :param delta: Delta compress contents.
279
    :param keylength: How long should keys be.
280
    """
281
    def factory(transport):
282
        parents = graph or delta
283
        ref_length = 0
284
        if graph:
285
            ref_length += 1
0.17.7 by Robert Collins
Update for current index2 changes.
286
        graph_index = BTreeBuilder(reference_lists=ref_length,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
287
            key_elements=keylength)
288
        stream = transport.open_write_stream('newpack')
289
        writer = pack.ContainerWriter(stream.write)
290
        writer.begin()
291
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
0.17.9 by Robert Collins
Initial stab at repository format support.
292
            add_callback=graph_index.add_nodes)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
293
        access = _DirectPackAccess({})
294
        access.set_writer(writer, graph_index, (transport, 'newpack'))
0.17.2 by Robert Collins
Core proof of concept working.
295
        result = GroupCompressVersionedFiles(index, access, delta)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
296
        result.stream = stream
297
        result.writer = writer
298
        return result
299
    return factory
300
301
302
def cleanup_pack_group(versioned_files):
303
    versioned_files.stream.close()
304
    versioned_files.writer.end()
305
306
307
class GroupCompressVersionedFiles(VersionedFiles):
308
    """A group-compress based VersionedFiles implementation."""
309
0.17.2 by Robert Collins
Core proof of concept working.
310
    def __init__(self, index, access, delta=True):
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
311
        """Create a GroupCompressVersionedFiles object.
312
313
        :param index: The index object storing access and graph data.
314
        :param access: The access object storing raw data.
0.17.2 by Robert Collins
Core proof of concept working.
315
        :param delta: Whether to delta compress or just entropy compress.
316
        """
317
        self._index = index
318
        self._access = access
319
        self._delta = delta
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
320
        self._unadded_refs = {}
0.17.2 by Robert Collins
Core proof of concept working.
321
322
    def add_lines(self, key, parents, lines, parent_texts=None,
323
        left_matching_blocks=None, nostore_sha=None, random_id=False,
324
        check_content=True):
325
        """Add a text to the store.
326
327
        :param key: The key tuple of the text to add.
328
        :param parents: The parents key tuples of the text to add.
329
        :param lines: A list of lines. Each line must be a bytestring. And all
330
            of them except the last must be terminated with \n and contain no
331
            other \n's. The last line may either contain no \n's or a single
332
            terminating \n. If the lines list does meet this constraint the add
333
            routine may error or may succeed - but you will be unable to read
334
            the data back accurately. (Checking the lines have been split
335
            correctly is expensive and extremely unlikely to catch bugs so it
336
            is not done at runtime unless check_content is True.)
337
        :param parent_texts: An optional dictionary containing the opaque 
338
            representations of some or all of the parents of version_id to
339
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
340
            returned by add_lines or data corruption can be caused.
341
        :param left_matching_blocks: a hint about which areas are common
342
            between the text and its left-hand-parent.  The format is
343
            the SequenceMatcher.get_matching_blocks format.
344
        :param nostore_sha: Raise ExistingContent and do not add the lines to
345
            the versioned file if the digest of the lines matches this.
346
        :param random_id: If True a random id has been selected rather than
347
            an id determined by some deterministic process such as a converter
348
            from a foreign VCS. When True the backend may choose not to check
349
            for uniqueness of the resulting key within the versioned file, so
350
            this should only be done when the result is expected to be unique
351
            anyway.
352
        :param check_content: If True, the lines supplied are verified to be
353
            bytestrings that are correctly formed lines.
354
        :return: The text sha1, the number of bytes in the text, and an opaque
355
                 representation of the inserted version which can be provided
356
                 back to future add_lines calls in the parent_texts dictionary.
357
        """
358
        self._index._check_write_ok()
359
        self._check_add(key, lines, random_id, check_content)
360
        if parents is None:
361
            # The caller might pass None if there is no graph data, but kndx
362
            # indexes can't directly store that, so we give them
363
            # an empty tuple instead.
364
            parents = ()
365
        # double handling for now. Make it work until then.
366
        bytes = ''.join(lines)
367
        record = FulltextContentFactory(key, parents, None, bytes)
0.17.5 by Robert Collins
nograph tests completely passing.
368
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
0.17.2 by Robert Collins
Core proof of concept working.
369
        return sha1, len(bytes), None
370
0.17.4 by Robert Collins
Annotate.
371
    def annotate(self, key):
372
        """See VersionedFiles.annotate."""
373
        graph = Graph(self)
0.17.5 by Robert Collins
nograph tests completely passing.
374
        parent_map = self.get_parent_map([key])
375
        if not parent_map:
376
            raise errors.RevisionNotPresent(key, self)
377
        if parent_map[key] is not None:
378
            search = graph._make_breadth_first_searcher([key])
379
            keys = set()
380
            while True:
381
                try:
382
                    present, ghosts = search.next_with_ghosts()
383
                except StopIteration:
384
                    break
385
                keys.update(present)
386
            parent_map = self.get_parent_map(keys)
387
        else:
388
            keys = [key]
389
            parent_map = {key:()}
0.17.4 by Robert Collins
Annotate.
390
        head_cache = _mod_graph.FrozenHeadsCache(graph)
391
        parent_cache = {}
392
        reannotate = annotate.reannotate
393
        for record in self.get_record_stream(keys, 'topological', True):
394
            key = record.key
395
            fulltext = split_lines(record.get_bytes_as('fulltext'))
396
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
397
            parent_cache[key] = list(
398
                reannotate(parent_lines, fulltext, key, None, head_cache))
399
        return parent_cache[key]
400
0.17.5 by Robert Collins
nograph tests completely passing.
401
    def check(self, progress_bar=None):
402
        """See VersionedFiles.check()."""
403
        keys = self.keys()
404
        for record in self.get_record_stream(keys, 'unordered', True):
405
            record.get_bytes_as('fulltext')
406
0.17.2 by Robert Collins
Core proof of concept working.
407
    def _check_add(self, key, lines, random_id, check_content):
408
        """check that version_id and lines are safe to add."""
409
        version_id = key[-1]
410
        if contains_whitespace(version_id):
411
            raise InvalidRevisionId(version_id, self)
412
        self.check_not_reserved_id(version_id)
413
        # TODO: If random_id==False and the key is already present, we should
414
        # probably check that the existing content is identical to what is
415
        # being inserted, and otherwise raise an exception.  This would make
416
        # the bundle code simpler.
417
        if check_content:
418
            self._check_lines_not_unicode(lines)
419
            self._check_lines_are_lines(lines)
420
0.17.5 by Robert Collins
nograph tests completely passing.
421
    def get_parent_map(self, keys):
422
        """Get a map of the parents of keys.
423
424
        :param keys: The keys to look up parents for.
425
        :return: A mapping from keys to parents. Absent keys are absent from
426
            the mapping.
427
        """
428
        result = {}
429
        sources = [self._index]
430
        source_results = []
431
        missing = set(keys)
432
        for source in sources:
433
            if not missing:
434
                break
435
            new_result = source.get_parent_map(missing)
436
            source_results.append(new_result)
437
            result.update(new_result)
438
            missing.difference_update(set(new_result))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
439
        if self._unadded_refs:
440
            for key in missing:
441
                if key in self._unadded_refs:
442
                    result[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
443
        return result
444
445
    def get_record_stream(self, keys, ordering, include_delta_closure):
446
        """Get a stream of records for keys.
447
448
        :param keys: The keys to include.
449
        :param ordering: Either 'unordered' or 'topological'. A topologically
450
            sorted stream has compression parents strictly before their
451
            children.
452
        :param include_delta_closure: If True then the closure across any
453
            compression parents will be included (in the opaque data).
454
        :return: An iterator of ContentFactory objects, each of which is only
455
            valid until the iterator is advanced.
456
        """
457
        # keys might be a generator
458
        keys = set(keys)
459
        if not keys:
460
            return
461
        if not self._index.has_graph:
462
            # Cannot topological order when no graph has been stored.
463
            ordering = 'unordered'
464
        # Cheap: iterate
465
        locations = self._index.get_build_details(keys)
466
        if ordering == 'topological':
467
            # would be better to not globally sort initially but instead
468
            # start with one key, recurse to its oldest parent, then grab
469
            # everything in the same group, etc.
470
            parent_map = dict((key, details[2]) for key, details in
471
                locations.iteritems())
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
472
            local = frozenset(keys).intersection(set(self._unadded_refs))
473
            for key in local:
474
                parent_map[key] = self._unadded_refs[key]
475
                locations[key] = None
0.17.5 by Robert Collins
nograph tests completely passing.
476
            present_keys = topo_sort(parent_map)
477
            # Now group by source:
478
        else:
479
            present_keys = locations.keys()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
480
            local = frozenset(keys).intersection(set(self._unadded_refs))
481
            for key in local:
482
                present_keys.append(key)
483
                locations[key] = None
0.17.5 by Robert Collins
nograph tests completely passing.
484
        absent_keys = keys.difference(set(locations))
485
        for key in absent_keys:
486
            yield AbsentContentFactory(key)
487
        for key in present_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
488
            if key in self._unadded_refs:
489
                lines, sha1 = self._compressor.extract(key)
490
                parents = self._unadded_refs[key]
491
            else:
492
                index_memo, _, parents, (method, _) = locations[key]
493
                # read
494
                read_memo = index_memo[0:3]
495
                zdata = self._access.get_raw_records([read_memo]).next()
496
                # decompress
497
                plain = zlib.decompress(zdata)
498
                # parse
499
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
500
                label, sha1, delta = parse(delta_lines)
501
                if label != key:
502
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
503
                basis = plain[:index_memo[3]]
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
504
                # basis = StringIO(basis).readlines()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
505
                #basis = split_lines(plain[:last_end])
506
                lines = apply_delta(basis, delta)
0.17.5 by Robert Collins
nograph tests completely passing.
507
            bytes = ''.join(lines)
508
            yield FulltextContentFactory(key, parents, sha1, bytes)
509
            
510
    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s().

        :param keys: The keys to look sha1s up for.
        :return: A dict of key -> sha1; absent keys are omitted.
        """
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    # Bug fix: this previously used '==' (a no-op comparison)
                    # instead of '=', so keys whose records carried no cached
                    # sha1 were silently missing from the result.
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result
521
0.17.2 by Robert Collins
Core proof of concept working.
522
    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert. 
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        # _insert_record_stream is a generator yielding sha1s; we do not
        # need them here, so simply drain it to drive the insertion.
        sha1_iterator = self._insert_record_stream(stream)
        for _unused_sha1 in sha1_iterator:
            pass
0.17.2 by Robert Collins
Core proof of concept working.
531
0.17.5 by Robert Collins
nograph tests completely passing.
532
    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert. 
        :param random_id: If True, the record keys are taken to be randomly
            generated, so the index skips its duplicate-existence check.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            # Lazily create and memoise record adapters, keyed by
            # (source storage_kind, 'fulltext').
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        # key -> parents for texts compressed into the current group but not
        # yet flushed to the index; consulted so just-added texts can still
        # be extracted (see the extraction path in get_record_stream).
        self._unadded_refs = {}
        keys_to_add = []
        # Offset of the next text within the current (pre-zlib) group.
        basis_end = 0
        groups = 1
        def flush():
            # Write out the current group: zlib-compress the accumulated
            # lines, store them as one raw record, then index every key in
            # the group.  Note: reads the enclosing keys_to_add binding at
            # call time, so the rebinds below take effect for later flushes.
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind == 'fulltext':
                bytes = record.get_bytes_as('fulltext')
            else:
                # Adapt any other storage kind to a fulltext before
                # handing it to the compressor.
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
            found_sha1, end_point = self._compressor.compress(record.key,
                split_lines(bytes), record.sha1)
            self._unadded_refs[record.key] = record.parents
            yield found_sha1
            # Only the byte range inside the group is known now; the group's
            # own (start, length) is filled in by flush().
            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            # Cap the group at 20MB of pre-zlib content, then start a new
            # group (per the 0.17.6 commit: "probably way too big").
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        # Flush whatever is pending at end of stream.
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
596
597
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        wanted = set(keys)
        key_count = len(wanted)
        # Inclusions are the caller's concern; we only need each requested
        # text.  XXX: todo - optimise to use less than full texts.
        stream = self.get_record_stream(wanted, 'unordered', True)
        for done_count, record in enumerate(stream):
            pb.update('Walking content.', done_count, key_count)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            record_key = record.key
            for text_line in split_lines(record.get_bytes_as('fulltext')):
                yield text_line, record_key
        pb.update('Walking content.', key_count, key_count)
636
637
    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        # Only one source today, but keep the union shape so fallback
        # sources can be appended later.
        all_keys = set()
        for key_source in [self._index]:
            all_keys.update(key_source.keys())
        return all_keys
646
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
647
648
class _GCGraphIndex(object):
649
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
650
0.17.9 by Robert Collins
Initial stab at repository format support.
651
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
652
        add_callback=None):
653
        """Construct a _GCGraphIndex on a graph_index.
654
655
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
656
        :param is_locked: A callback to check whether the object should answer
657
            queries.
658
        :param parents: If True, record knits parents, if not do not record 
659
            parents.
660
        :param add_callback: If not None, allow additions to the index and call
661
            this callback with a list of added GraphIndex nodes:
662
            [(node, value, node_refs), ...]
663
        :param is_locked: A callback, returns True if the index is locked and
664
            thus usable.
665
        """
666
        self._add_callback = add_callback
667
        self._graph_index = graph_index
668
        self._parents = parents
669
        self.has_graph = parents
670
        self._is_locked = is_locked
671
0.17.5 by Robert Collins
nograph tests completely passing.
672
    def add_records(self, records, random_id=False):
673
        """Add multiple records to the index.
674
        
675
        This function does not insert data into the Immutable GraphIndex
676
        backing the KnitGraphIndex, instead it prepares data for insertion by
677
        the caller and checks that it is safe to insert then calls
678
        self._add_callback with the prepared GraphIndex nodes.
679
680
        :param records: a list of tuples:
681
                         (key, options, access_memo, parents).
682
        :param random_id: If True the ids being added were randomly generated
683
            and no check for existence will be performed.
684
        """
685
        if not self._add_callback:
686
            raise errors.ReadOnlyError(self)
687
        # we hope there are no repositories with inconsistent parentage
688
        # anymore.
689
690
        changed = False
691
        keys = {}
692
        for (key, value, refs) in records:
693
            if not self._parents:
694
                if refs:
695
                    for ref in refs:
696
                        if ref:
697
                            raise KnitCorrupt(self,
698
                                "attempt to add node with parents "
699
                                "in parentless index.")
700
                    refs = ()
701
                    changed = True
702
            keys[key] = (value, refs)
703
        # check for dups
704
        if not random_id:
705
            present_nodes = self._get_entries(keys)
706
            for (index, key, value, node_refs) in present_nodes:
707
                if node_refs != keys[key][1]:
708
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
709
                        ": %s %s" % ((value, node_refs), keys[key]))
710
                del keys[key]
711
                changed = True
712
        if changed:
713
            result = []
714
            if self._parents:
715
                for key, (value, node_refs) in keys.iteritems():
716
                    result.append((key, value, node_refs))
717
            else:
718
                for key, (value, node_refs) in keys.iteritems():
719
                    result.append((key, value))
720
            records = result
721
        self._add_callback(records)
722
        
723
    def _check_read(self):
724
        """raise if reads are not permitted."""
725
        if not self._is_locked():
726
            raise errors.ObjectNotLocked(self)
727
0.17.2 by Robert Collins
Core proof of concept working.
728
    def _check_write_ok(self):
729
        """Assert if writes are not permitted."""
730
        if not self._is_locked():
731
            raise errors.ObjectNotLocked(self)
732
0.17.5 by Robert Collins
nograph tests completely passing.
733
    def _get_entries(self, keys, check_present=False):
734
        """Get the entries for keys.
735
        
736
        :param keys: An iterable of index key tuples.
737
        """
738
        keys = set(keys)
739
        found_keys = set()
740
        if self._parents:
741
            for node in self._graph_index.iter_entries(keys):
742
                yield node
743
                found_keys.add(node[1])
744
        else:
745
            # adapt parentless index to the rest of the code.
746
            for node in self._graph_index.iter_entries(keys):
747
                yield node[0], node[1], node[2], ()
748
                found_keys.add(node[1])
749
        if check_present:
750
            missing_keys = keys.difference(found_keys)
751
            if missing_keys:
752
                raise RevisionNotPresent(missing_keys.pop(), self)
753
754
    def get_parent_map(self, keys):
755
        """Get a map of the parents of keys.
756
757
        :param keys: The keys to look up parents for.
758
        :return: A mapping from keys to parents. Absent keys are absent from
759
            the mapping.
760
        """
761
        self._check_read()
762
        nodes = self._get_entries(keys)
763
        result = {}
764
        if self._parents:
765
            for node in nodes:
766
                result[node[1]] = node[3][0]
767
        else:
768
            for node in nodes:
769
                result[node[1]] = None
770
        return result
771
772
    def get_build_details(self, keys):
773
        """Get the various build details for keys.
774
775
        Ghosts are omitted from the result.
776
777
        :param keys: An iterable of keys.
778
        :return: A dict of key:
779
            (index_memo, compression_parent, parents, record_details).
780
            index_memo
781
                opaque structure to pass to read_records to extract the raw
782
                data
783
            compression_parent
784
                Content that this record is built upon, may be None
785
            parents
786
                Logical parents of this node
787
            record_details
788
                extra information about the content which needs to be passed to
789
                Factory.parse_record
790
        """
791
        self._check_read()
792
        result = {}
793
        entries = self._get_entries(keys, False)
794
        for entry in entries:
795
            key = entry[1]
796
            if not self._parents:
797
                parents = None
798
            else:
799
                parents = entry[3][0]
800
            value = entry[2]
801
            method = 'group'
802
            result[key] = (self._node_to_position(entry),
803
                                  None, parents, (method, None))
804
        return result
805
    
806
    def keys(self):
807
        """Get all the keys in the collection.
808
        
809
        The keys are not ordered.
810
        """
811
        self._check_read()
812
        return [node[1] for node in self._graph_index.iter_all_entries()]
813
    
814
    def _node_to_position(self, node):
815
        """Convert an index value to position details."""
816
        bits = node[2].split(' ')
817
        # It would be nice not to read the entire gzip.
818
        start = int(bits[0])
819
        stop = int(bits[1])
820
        basis_end = int(bits[2])
821
        delta_end = int(bits[3])
822
        return node[0], start, stop, basis_end, delta_end
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
823
824
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
825
def _get_longest_match(equivalence_table, pos, max_pos, locations):
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
826
    """Get the longest possible match for the current position."""
827
    range_start = pos
828
    range_len = 0
829
    copy_ends = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
830
    while pos < max_pos:
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
831
        if locations is None:
832
            locations = equivalence_table.get_idx_matches(pos)
833
        if locations is None:
834
            # No more matches, just return whatever we have, but we know that
835
            # this last position is not going to match anything
836
            pos += 1
837
            break
838
        else:
839
            if copy_ends is None:
840
                # We are starting a new range
841
                copy_ends = [loc + 1 for loc in locations]
842
                range_len = 1
843
                locations = None # Consumed
844
            else:
845
                # We are currently in the middle of a match
846
                next_locations = set(copy_ends).intersection(locations)
847
                if len(next_locations):
848
                    # range continues
849
                    copy_ends = [loc + 1 for loc in next_locations]
850
                    range_len += 1
851
                    locations = None # Consumed
852
                else:
853
                    # But we are done with this match, we should be
854
                    # starting a new one, though. We will pass back 'locations'
855
                    # so that we don't have to do another lookup.
856
                    break
857
        pos += 1
858
    if copy_ends is None:
859
        return None, pos, locations
860
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
861
862
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
863
# Use the compiled (Pyrex/C) extension when it is available: it supplies a
# faster EquivalenceTable and replaces the pure-Python _get_longest_match
# defined above.  Falling back silently keeps the plugin pure-Python capable.
try:
    from bzrlib.plugins.groupcompress import _groupcompress_c
except ImportError:
    pass
else:
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
    _get_longest_match = _groupcompress_c._get_longest_match