/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.btree_index import BTreeBuilder
from bzrlib.plugins.groupcompress import equivalence_table
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )


def parse(line_list):
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % lines)
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result

def apply_delta(basis, delta):
    """Apply delta to basis, returning the resulting lines."""
    lines = []
    last_offset = 0
    # eq ranges occur where gaps occur
    # start, end refer to offsets in basis
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines


def trim_encoding_newline(lines):
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]
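

# Illustrative sketch (not part of the original plugin): the record format
# that GroupCompressor emits and that parse()/apply_delta() consume is a
# 'label:' line, a 'sha1:' line, then 'c,<byte offset>,<byte count>' copy
# instructions and 'i,<line count>' insert instructions followed by the
# inserted lines. The _example_* name and the literal record below are
# hypothetical.
def _example_parse_and_apply():
    basis = 'common first line\ncommon second line\n'
    record_lines = [
        'label: file-id\x00rev-1\n',
        'sha1: 0000000000000000000000000000000000000000\n',  # dummy sha1
        'c,0,18\n',    # copy the first 18 bytes of the basis
        'i,2\n',       # then insert two fresh lines
        'a brand new line\n',
        '\n',          # trailing-newline marker, trimmed again on decode
    ]
    label, sha1, delta = parse(record_lines)
    # label == ('file-id', 'rev-1')
    # delta == [('c', 0, 18, None), ('i', None, 2, ['a brand new line\n', '\n'])]
    return apply_delta(basis, delta)
    # -> ['common first line\n', 'a brand new line\n']

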
class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a similar
    task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting minimal
       edits only).
     - we chain the left side, not the right side
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    _equivalence_table_class = equivalence_table.EquivalenceTable

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        self.line_locations = self._equivalence_table_class([])
        self.lines = self.line_locations.lines
        self.labels_deltas = {}

    def get_matching_blocks(self, lines):
        """Return the ranges in lines which match self.lines.

        :param lines: lines to compress
        :return: A list of (old_start, new_start, length) tuples which reflect
            a region in self.lines that is present in lines.  The last element
            of the list is always (old_len, new_len, 0) to provide an end point
            for generating instructions from the matching blocks list.
        """
        result = []
        pos = 0
        line_locations = self.line_locations
        line_locations.set_right_lines(lines)
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse
        locations = None
        max_pos = len(lines)
        max_time = 0.0
        max_info = None
        result_append = result.append
        while pos < max_pos:
            block, pos, locations = _get_longest_match(line_locations, pos,
                                                       max_pos, locations)
            if block is not None:
                result_append(block)
        result_append((len(self.lines), len(lines), 0))
        return result

    def compress(self, key, lines, expected_sha):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        label = '\x00'.join(key)
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        new_lines = []
        new_lines.append('label: %s\n' % label)
        new_lines.append('sha1: %s\n' % sha1)
        index_lines = [False, False]
        pos = 0
        range_len = 0
        range_start = 0
        flush_range = self.flush_range
        copy_ends = None
        blocks = self.get_matching_blocks(lines)
        current_pos = 0
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we traverse
        for old_start, new_start, range_len in blocks:
            if new_start != current_pos:
                # non-matching region
                flush_range(current_pos, None, new_start - current_pos,
                    lines, new_lines, index_lines)
            current_pos = new_start + range_len
            if not range_len:
                continue
            flush_range(new_start, old_start, range_len, lines,
                new_lines, index_lines)
        delta_start = (self.endpoint, len(self.lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        lines = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(lines)
        return lines, sha1

    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
        insert_instruction = "i,%d\n" % range_len
        if copy_start is not None:
            # range stops, flush and start a new copy range
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        index_lines.extend([copy_start is None]*range_len)

    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
        #                          if val and new_lines[idx] == '\n']
        # if indexed_newlines:
        #     import pdb; pdb.set_trace()
        endpoint = self.endpoint
        self.line_locations.extend_lines(new_lines, index_lines)
        for line in new_lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        self.endpoint = endpoint

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)
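

# Illustrative sketch (not part of the original plugin): a rough example of
# how the class above is typically driven (compare _insert_record_stream
# further down). Each text is fed to compress(), can be pulled back out of
# the in-progress group with extract(), and ratio() reports how well the
# group is compressing. The _example_* name is hypothetical.
def _example_group_compressor():
    compressor = GroupCompressor(delta=True)
    compressor.compress(('file-id', 'rev-1'), ['one\n', 'two\n'], None)
    compressor.compress(('file-id', 'rev-2'), ['one\n', 'two\n', 'three\n'], None)
    # The second text mostly copies byte ranges already present in the group,
    # so the group should grow by much less than the raw text size.
    lines, sha1 = compressor.extract(('file-id', 'rev-2'))
    return ''.join(lines), compressor.ratio()

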
def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests; it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long keys should be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory


def cleanup_pack_group(versioned_files):
    versioned_files.stream.close()
    versioned_files.writer.end()
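

# Illustrative sketch (not part of the original plugin): wiring the factory
# above to a transport. It assumes bzrlib's in-memory transport is available
# via bzrlib.transport.get_transport('memory:///'); as the docstring says,
# the resulting store is only complete enough for interface tests. The
# _example_* name is hypothetical.
def _example_pack_group():
    from bzrlib import transport
    t = transport.get_transport('memory:///')
    factory = make_pack_factory(graph=True, delta=True, keylength=1)
    vf = factory(t)
    vf.add_lines(('rev-1',), (), ['some\n', 'content\n'])
    vf.add_lines(('rev-2',), (('rev-1',),), ['some\n', 'content\n', 'more\n'])
    texts = {}
    for record in vf.get_record_stream([('rev-1',), ('rev-2',)],
                                       'topological', True):
        texts[record.key] = record.get_bytes_as('fulltext')
    cleanup_pack_group(vf)
    return texts

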
class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint the
            add routine may error or may succeed - but you will be unable to
            read the data back accurately. (Checking the lines have been split
            correctly is expensive and extremely unlikely to catch bugs so it
            is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        bytes = ''.join(lines)
        record = FulltextContentFactory(key, parents, None, bytes)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, len(bytes), None

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            fulltext = split_lines(record.get_bytes_as('fulltext'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, fulltext, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """Check that version_id and lines are safe to add."""
        version_id = key[-1]
        if contains_whitespace(version_id):
            raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        keys = set(keys)
        if not keys:
            return
        if not self._index.has_graph:
            # Cannot topologically order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                parent_map[key] = self._unadded_refs[key]
                locations[key] = None
            present_keys = topo_sort(parent_map)
            # Now group by source:
        else:
            present_keys = locations.keys()
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                present_keys.append(key)
                locations[key] = None
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                lines, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                # read
                read_memo = index_memo[0:3]
                zdata = self._access.get_raw_records([read_memo]).next()
                # decompress
                plain = zlib.decompress(zdata)
                # parse
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                basis = plain[:index_memo[3]]
                # basis = StringIO(basis).readlines()
                #basis = split_lines(plain[:last_end])
                lines = apply_delta(basis, delta)
            bytes = ''.join(lines)
            yield FulltextContentFactory(key, parents, sha1, bytes)

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass

    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind == 'fulltext':
                bytes = record.get_bytes_as('fulltext')
            else:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
            found_sha1, end_point = self._compressor.compress(record.key,
                split_lines(bytes), record.sha1)
            self._unadded_refs[record.key] = record.parents
            yield found_sha1
            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}

    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result
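

# Illustrative sketch (not part of the original plugin): insert_record_stream()
# is the bulk path; any number of fulltext records can be streamed in and are
# accumulated into zlib-compressed groups (flushed roughly every 20MB of
# input, see _insert_record_stream above). 'vf' is assumed to be a
# GroupCompressVersionedFiles, e.g. one built by make_pack_factory(). The
# _example_* name is hypothetical.
def _example_stream_texts(vf, texts_by_key):
    records = [FulltextContentFactory(key, (), None, text)
               for key, text in sorted(texts_by_key.items())]
    vf.insert_record_stream(records)
    return vf.get_sha1s(texts_by_key.keys())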


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback to check whether the object should answer
            queries; returns True if the index is locked and thus usable.
        :param parents: If True, record knit parents; if not, do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked

    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing this index; instead it prepares data for insertion by
        the caller and checks that it is safe to insert, then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples: (key, value, node_refs).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)

    def _check_read(self):
        """Raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result

    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            value = entry[2]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                                  None, parents, (method, None))
        return result

    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]

    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire gzip.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
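

# Illustrative sketch (not part of the original plugin): the value string
# stored per key by _insert_record_stream's flush() is four space-separated
# integers, "start length basis_end delta_end", and _node_to_position() above
# simply splits them apart again. A hypothetical stand-alone equivalent:
def _example_parse_index_value(value):
    # e.g. value == '0 4096 17 392'
    start, length, basis_end, delta_end = [int(bit) for bit in value.split(' ')]
    return start, length, basis_end, delta_end

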
def _get_longest_match(equivalence_table, pos, max_pos, locations):
    """Get the longest possible match for the current position."""
    range_start = pos
    range_len = 0
    copy_ends = None
    while pos < max_pos:
        if locations is None:
            locations = equivalence_table.get_idx_matches(pos)
        if locations is None:
            # No more matches, just return whatever we have, but we know that
            # this last position is not going to match anything
            pos += 1
            break
        else:
            if copy_ends is None:
                # We are starting a new range
                copy_ends = [loc + 1 for loc in locations]
                range_len = 1
                locations = None # Consumed
            else:
                # We are currently in the middle of a match
                next_locations = set(copy_ends).intersection(locations)
                if len(next_locations):
                    # range continues
                    copy_ends = [loc + 1 for loc in next_locations]
                    range_len += 1
                    locations = None # Consumed
                else:
                    # This match has ended; the caller should start a new one.
                    # We pass back 'locations' so that the next call does not
                    # have to do another lookup.
                    break
        pos += 1
    if copy_ends is None:
        return None, pos, locations
    return (min(copy_ends) - range_len, range_start, range_len), pos, locations
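

# Illustrative sketch (not part of the original plugin): _get_longest_match
# only needs an object with a get_idx_matches(pos) method returning the
# left-side line indices matching right-side position 'pos' (or None), which
# is what the plugin's EquivalenceTable provides via get_matching_blocks
# above. The hypothetical stand-in below mimics just enough of that to show
# the scan producing (left_start, right_start, length) blocks. It assumes the
# pure-Python _get_longest_match defined above; the optional C implementation
# installed at the end of the module expects the compiled EquivalenceTable.
class _ExampleMatches(object):

    def __init__(self, left_lines, right_lines):
        self._left = left_lines
        self._right = right_lines

    def get_idx_matches(self, pos):
        line = self._right[pos]
        matches = [idx for idx, text in enumerate(self._left) if text == line]
        return matches or None


def _example_longest_match(left_lines, right_lines):
    matcher = _ExampleMatches(left_lines, right_lines)
    pos = 0
    locations = None
    blocks = []
    while pos < len(right_lines):
        block, pos, locations = _get_longest_match(matcher, pos,
                                                   len(right_lines), locations)
        if block is not None:
            blocks.append(block)
    return blocks

# e.g. _example_longest_match(['a\n', 'b\n', 'c\n'], ['b\n', 'c\n', 'x\n'])
# returns [(1, 0, 2)]: left lines 1-2 match right lines 0-1.

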
try:
    from bzrlib.plugins.groupcompress import _groupcompress_c
except ImportError:
    pass
else:
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
    _get_longest_match = _groupcompress_c._get_longest_match