/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1
# groupcompress, a bzr plugin providing new compression logic.
2
# Copyright (C) 2008 Canonical Limited.
3
# 
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License version 2 as published
6
# by the Free Software Foundation.
7
# 
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
# 
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
16
# 
17
18
"""Core compression logic for compressing streams of related files."""
19
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
20
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
21
from cStringIO import StringIO
22
import zlib
23
0.17.4 by Robert Collins
Annotate.
24
from bzrlib import (
25
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
26
    debug,
0.17.4 by Robert Collins
Annotate.
27
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
28
    errors,
0.17.4 by Robert Collins
Annotate.
29
    graph as _mod_graph,
30
    pack,
31
    patiencediff,
32
    )
33
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
34
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
35
from bzrlib.osutils import (
36
    contains_whitespace,
37
    contains_linebreaks,
38
    sha_string,
39
    sha_strings,
40
    split_lines,
41
    )
0.17.7 by Robert Collins
Update for current index2 changes.
42
from bzrlib.plugins.index2.btree_index import BTreeBuilder
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
43
from bzrlib.plugins.groupcompress import equivalence_table
0.17.9 by Robert Collins
Initial stab at repository format support.
44
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
45
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
46
    adapter_registry,
47
    AbsentContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
48
    FulltextContentFactory,
49
    VersionedFiles,
50
    )
51
52
0.17.5 by Robert Collins
nograph tests completely passing.
53
def parse(line_list):
0.17.2 by Robert Collins
Core proof of concept working.
54
    result = []
0.17.5 by Robert Collins
nograph tests completely passing.
55
    lines = iter(line_list)
0.17.2 by Robert Collins
Core proof of concept working.
56
    next = lines.next
0.17.5 by Robert Collins
nograph tests completely passing.
57
    label_line = lines.next()
58
    sha1_line = lines.next()
59
    if (not label_line.startswith('label: ') or
60
        not sha1_line.startswith('sha1: ')):
61
        raise AssertionError("bad text record %r" % lines)
62
    label = tuple(label_line[7:-1].split('\x00'))
63
    sha1 = sha1_line[6:-1]
0.17.2 by Robert Collins
Core proof of concept working.
64
    for header in lines:
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
65
        op = header[0]
66
        numbers = header[2:]
67
        numbers = [int(n) for n in header[2:].split(',')]
68
        if op == 'c':
69
            result.append((op, numbers[0], numbers[1], None))
70
        else:
71
            contents = [next() for i in xrange(numbers[0])]
72
            result.append((op, None, numbers[0], contents))
0.17.5 by Robert Collins
nograph tests completely passing.
73
    return label, sha1, result
0.17.2 by Robert Collins
Core proof of concept working.
74
75
def apply_delta(basis, delta):
76
    """Apply delta to this object to become new_version_id."""
77
    lines = []
78
    last_offset = 0
79
    # eq ranges occur where gaps occur
80
    # start, end refer to offsets in basis
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
81
    for op, start, count, delta_lines in delta:
82
        if op == 'c':
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
83
            lines.append(basis[start:start+count])
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
84
        else:
85
            lines.extend(delta_lines)
0.17.2 by Robert Collins
Core proof of concept working.
86
    trim_encoding_newline(lines)
87
    return lines
88
89
90
def trim_encoding_newline(lines):
91
    if lines[-1] == '\n':
92
        del lines[-1]
93
    else:
94
        lines[-1] = lines[-1][:-1]
95
96
97
class GroupCompressor(object):
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
98
    """Produce a serialised group of compressed texts.
99
    
100
    It contains code very similar to SequenceMatcher because of having a similar
101
    task. However some key differences apply:
102
     - there is no junk, we want a minimal edit not a human readable diff.
103
     - we don't filter very common lines (because we don't know where a good
104
       range will start, and after the first text we want to be emitting minimal
105
       edits only.
106
     - we chain the left side, not the right side
107
     - we incrementally update the adjacency matrix as new lines are provided.
108
     - we look for matches in all of the left side, so the routine which does
109
       the analogous task of find_longest_match does not need to filter on the
110
       left side.
111
    """
0.17.2 by Robert Collins
Core proof of concept working.
112
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
113
    _equivalence_table_class = equivalence_table.EquivalenceTable
114
0.17.2 by Robert Collins
Core proof of concept working.
115
    def __init__(self, delta=True):
116
        """Create a GroupCompressor.
117
118
        :param delta: If False, do not compress records.
119
        """
120
        self._delta = delta
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
121
        self.line_offsets = []
0.17.2 by Robert Collins
Core proof of concept working.
122
        self.endpoint = 0
123
        self.input_bytes = 0
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
124
        self.line_locations = self._equivalence_table_class([])
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
125
        self.lines = self.line_locations.lines
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
126
        self.labels_deltas = {}
0.17.2 by Robert Collins
Core proof of concept working.
127
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
128
    def get_matching_blocks(self, lines):
129
        """Return an the ranges in lines which match self.lines.
130
131
        :param lines: lines to compress
132
        :return: A list of (old_start, new_start, length) tuples which reflect
133
            a region in self.lines that is present in lines.  The last element
134
            of the list is always (old_len, new_len, 0) to provide an end point
135
            for generating instructions from the matching blocks list.
136
        """
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
137
        import time
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
138
        result = []
139
        pos = 0
140
        line_locations = self.line_locations
0.18.11 by John Arbash Meinel
Convert back into grabbing a right-lines ahead of time.
141
        line_locations.set_right_lines(lines)
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
142
        # We either copy a range (while there are reusable lines) or we 
143
        # insert new lines. To find reusable lines we traverse 
0.18.24 by John Arbash Meinel
Factor out the most compute intensive portion, with plans to turn it into a compiled func.
144
        locations = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
145
        max_pos = len(lines)
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
146
        timer = time.clock
147
        max_time = 0.0
148
        max_info = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
149
        while pos < max_pos:
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
150
            tstart = timer()
151
            block, next_pos, locations = _get_longest_match(line_locations, pos,
152
                                                            max_pos, locations)
153
            tdelta = timer() - tstart
154
            if tdelta > max_time:
155
                max_time = tdelta
156
                max_info = tdelta, pos, block, next_pos, locations
157
            pos = next_pos
158
            
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
159
            if block is not None:
0.18.24 by John Arbash Meinel
Factor out the most compute intensive portion, with plans to turn it into a compiled func.
160
                result.append(block)
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
161
        result.append((len(self.lines), len(lines), 0))
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
162
        # if max_time > 0.01:
163
        #     print max_info[:-1]
164
        #     import pdb; pdb.set_trace()
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
165
        return result
166
0.17.2 by Robert Collins
Core proof of concept working.
167
    def compress(self, key, lines, expected_sha):
168
        """Compress lines with label key.
169
170
        :param key: A key tuple. It is stored in the output
171
            for identification of the text during decompression.
172
        :param lines: The lines to be compressed. Must be split
173
            on \n, with the \n preserved.
174
        :param expected_sha: If non-None, the sha the lines are believed to
175
            have. During compression the sha is calculated; a mismatch will
176
            cause an error.
177
        :return: The sha1 of lines, and the number of bytes accumulated in
178
            the group output so far.
179
        """
180
        sha1 = sha_strings(lines)
181
        label = '\x00'.join(key)
182
        # setup good encoding for trailing \n support.
183
        if not lines or lines[-1].endswith('\n'):
184
            lines.append('\n')
185
        else:
186
            lines[-1] = lines[-1] + '\n'
187
        new_lines = []
188
        new_lines.append('label: %s\n' % label)
189
        new_lines.append('sha1: %s\n' % sha1)
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
190
        index_lines = [False, False]
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
191
        pos = 0
192
        line_locations = self.line_locations
0.17.14 by Robert Collins
Cleaner code.
193
        range_len = 0
194
        range_start = 0
195
        flush_range = self.flush_range
196
        copy_ends = None
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
197
        blocks = self.get_matching_blocks(lines)
198
        current_pos = 0
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
199
        # We either copy a range (while there are reusable lines) or we 
200
        # insert new lines. To find reusable lines we traverse 
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
201
        for old_start, new_start, range_len in blocks:
202
            if new_start != current_pos:
203
                # non-matching region
0.19.1 by Robert Collins
Start to simplify flush_range.
204
                flush_range(current_pos, None, new_start - current_pos,
0.17.15 by Robert Collins
Factor out a get_matching_blocks style function.
205
                    lines, new_lines, index_lines)
206
            current_pos = new_start + range_len
207
            if not range_len:
208
                continue
0.19.1 by Robert Collins
Start to simplify flush_range.
209
            flush_range(new_start, old_start, range_len, lines,
210
                new_lines, index_lines)
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
211
        delta_start = (self.endpoint, len(self.lines))
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
212
        self.output_lines(new_lines, index_lines)
0.17.2 by Robert Collins
Core proof of concept working.
213
        trim_encoding_newline(lines)
214
        self.input_bytes += sum(map(len, lines))
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
215
        delta_end = (self.endpoint, len(self.lines))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
216
        self.labels_deltas[key] = (delta_start, delta_end)
0.17.2 by Robert Collins
Core proof of concept working.
217
        return sha1, self.endpoint
218
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
219
    def extract(self, key):
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
220
        """Extract a key previously added to the compressor.
221
        
222
        :param key: The key to extract.
223
        :return: An iterable over bytes and the sha1.
224
        """
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
225
        delta_details = self.labels_deltas[key]
226
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
227
        label, sha1, delta = parse(delta_lines)
228
        if label != key:
229
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
230
        # Perhaps we want to keep the line offsets too in memory at least?
231
        lines = apply_delta(''.join(self.lines), delta)
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
232
        sha1 = sha_strings(lines)
233
        return lines, sha1
234
0.19.1 by Robert Collins
Start to simplify flush_range.
235
    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
0.17.14 by Robert Collins
Cleaner code.
236
        if not range_len:
237
            return
238
        insert_instruction = "i,%d\n" % range_len
0.19.1 by Robert Collins
Start to simplify flush_range.
239
        if copy_start is not None:
0.17.14 by Robert Collins
Cleaner code.
240
            # range stops, flush and start a new copy range
241
            stop_byte = self.line_offsets[copy_start + range_len - 1]
242
            if copy_start == 0:
243
                start_byte = 0
244
            else:
245
                start_byte = self.line_offsets[copy_start - 1]
246
            bytes = stop_byte - start_byte
247
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
248
            if (bytes + len(insert_instruction) >
249
                len(copy_control_instruction)):
250
                new_lines.append(copy_control_instruction)
251
                index_lines.append(False)
252
                return
253
        # not copying, or inserting is shorter than copying, so insert.
254
        new_lines.append(insert_instruction)
255
        new_lines.extend(lines[range_start:range_start+range_len])
256
        index_lines.append(False)
0.19.1 by Robert Collins
Start to simplify flush_range.
257
        index_lines.extend([copy_start is None]*range_len)
0.17.14 by Robert Collins
Cleaner code.
258
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
259
    def output_lines(self, new_lines, index_lines):
260
        """Output some lines.
261
262
        :param new_lines: The lines to output.
263
        :param index_lines: A boolean flag for each line - when True, index
264
            that line.
265
        """
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
266
        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
267
        #                          if val and new_lines[idx] == '\n']
268
        # if indexed_newlines:
269
        #     import pdb; pdb.set_trace()
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
270
        endpoint = self.endpoint
0.18.9 by John Arbash Meinel
If we are going to do it this way, we don't need to explicitly distinguish left and right
271
        self.line_locations.extend_lines(new_lines, index_lines)
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
272
        for line in new_lines:
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
273
            endpoint += len(line)
274
            self.line_offsets.append(endpoint)
275
        self.endpoint = endpoint
0.17.3 by Robert Collins
new encoder, allows non monotonically increasing sequence matches for moar compression.
276
0.17.2 by Robert Collins
Core proof of concept working.
277
    def ratio(self):
278
        """Return the overall compression ratio."""
279
        return float(self.input_bytes) / float(self.endpoint)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
280
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
281
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
282
def make_pack_factory(graph, delta, keylength):
283
    """Create a factory for creating a pack based groupcompress.
284
285
    This is only functional enough to run interface tests, it doesn't try to
286
    provide a full pack environment.
287
    
288
    :param graph: Store a graph.
289
    :param delta: Delta compress contents.
290
    :param keylength: How long should keys be.
291
    """
292
    def factory(transport):
293
        parents = graph or delta
294
        ref_length = 0
295
        if graph:
296
            ref_length += 1
0.17.7 by Robert Collins
Update for current index2 changes.
297
        graph_index = BTreeBuilder(reference_lists=ref_length,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
298
            key_elements=keylength)
299
        stream = transport.open_write_stream('newpack')
300
        writer = pack.ContainerWriter(stream.write)
301
        writer.begin()
302
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
0.17.9 by Robert Collins
Initial stab at repository format support.
303
            add_callback=graph_index.add_nodes)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
304
        access = _DirectPackAccess({})
305
        access.set_writer(writer, graph_index, (transport, 'newpack'))
0.17.2 by Robert Collins
Core proof of concept working.
306
        result = GroupCompressVersionedFiles(index, access, delta)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
307
        result.stream = stream
308
        result.writer = writer
309
        return result
310
    return factory
311
312
313
def cleanup_pack_group(versioned_files):
314
    versioned_files.stream.close()
315
    versioned_files.writer.end()
316
317
318
class GroupCompressVersionedFiles(VersionedFiles):
319
    """A group-compress based VersionedFiles implementation."""
320
0.17.2 by Robert Collins
Core proof of concept working.
321
    def __init__(self, index, access, delta=True):
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
322
        """Create a GroupCompressVersionedFiles object.
323
324
        :param index: The index object storing access and graph data.
325
        :param access: The access object storing raw data.
0.17.2 by Robert Collins
Core proof of concept working.
326
        :param delta: Whether to delta compress or just entropy compress.
327
        """
328
        self._index = index
329
        self._access = access
330
        self._delta = delta
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
331
        self._unadded_refs = {}
0.17.2 by Robert Collins
Core proof of concept working.
332
333
    def add_lines(self, key, parents, lines, parent_texts=None,
334
        left_matching_blocks=None, nostore_sha=None, random_id=False,
335
        check_content=True):
336
        """Add a text to the store.
337
338
        :param key: The key tuple of the text to add.
339
        :param parents: The parents key tuples of the text to add.
340
        :param lines: A list of lines. Each line must be a bytestring. And all
341
            of them except the last must be terminated with \n and contain no
342
            other \n's. The last line may either contain no \n's or a single
343
            terminating \n. If the lines list does meet this constraint the add
344
            routine may error or may succeed - but you will be unable to read
345
            the data back accurately. (Checking the lines have been split
346
            correctly is expensive and extremely unlikely to catch bugs so it
347
            is not done at runtime unless check_content is True.)
348
        :param parent_texts: An optional dictionary containing the opaque 
349
            representations of some or all of the parents of version_id to
350
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
351
            returned by add_lines or data corruption can be caused.
352
        :param left_matching_blocks: a hint about which areas are common
353
            between the text and its left-hand-parent.  The format is
354
            the SequenceMatcher.get_matching_blocks format.
355
        :param nostore_sha: Raise ExistingContent and do not add the lines to
356
            the versioned file if the digest of the lines matches this.
357
        :param random_id: If True a random id has been selected rather than
358
            an id determined by some deterministic process such as a converter
359
            from a foreign VCS. When True the backend may choose not to check
360
            for uniqueness of the resulting key within the versioned file, so
361
            this should only be done when the result is expected to be unique
362
            anyway.
363
        :param check_content: If True, the lines supplied are verified to be
364
            bytestrings that are correctly formed lines.
365
        :return: The text sha1, the number of bytes in the text, and an opaque
366
                 representation of the inserted version which can be provided
367
                 back to future add_lines calls in the parent_texts dictionary.
368
        """
369
        self._index._check_write_ok()
370
        self._check_add(key, lines, random_id, check_content)
371
        if parents is None:
372
            # The caller might pass None if there is no graph data, but kndx
373
            # indexes can't directly store that, so we give them
374
            # an empty tuple instead.
375
            parents = ()
376
        # double handling for now. Make it work until then.
377
        bytes = ''.join(lines)
378
        record = FulltextContentFactory(key, parents, None, bytes)
0.17.5 by Robert Collins
nograph tests completely passing.
379
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
0.17.2 by Robert Collins
Core proof of concept working.
380
        return sha1, len(bytes), None
381
0.17.4 by Robert Collins
Annotate.
382
    def annotate(self, key):
383
        """See VersionedFiles.annotate."""
384
        graph = Graph(self)
0.17.5 by Robert Collins
nograph tests completely passing.
385
        parent_map = self.get_parent_map([key])
386
        if not parent_map:
387
            raise errors.RevisionNotPresent(key, self)
388
        if parent_map[key] is not None:
389
            search = graph._make_breadth_first_searcher([key])
390
            keys = set()
391
            while True:
392
                try:
393
                    present, ghosts = search.next_with_ghosts()
394
                except StopIteration:
395
                    break
396
                keys.update(present)
397
            parent_map = self.get_parent_map(keys)
398
        else:
399
            keys = [key]
400
            parent_map = {key:()}
0.17.4 by Robert Collins
Annotate.
401
        head_cache = _mod_graph.FrozenHeadsCache(graph)
402
        parent_cache = {}
403
        reannotate = annotate.reannotate
404
        for record in self.get_record_stream(keys, 'topological', True):
405
            key = record.key
406
            fulltext = split_lines(record.get_bytes_as('fulltext'))
407
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
408
            parent_cache[key] = list(
409
                reannotate(parent_lines, fulltext, key, None, head_cache))
410
        return parent_cache[key]
411
0.17.5 by Robert Collins
nograph tests completely passing.
412
    def check(self, progress_bar=None):
413
        """See VersionedFiles.check()."""
414
        keys = self.keys()
415
        for record in self.get_record_stream(keys, 'unordered', True):
416
            record.get_bytes_as('fulltext')
417
0.17.2 by Robert Collins
Core proof of concept working.
418
    def _check_add(self, key, lines, random_id, check_content):
419
        """check that version_id and lines are safe to add."""
420
        version_id = key[-1]
421
        if contains_whitespace(version_id):
422
            raise InvalidRevisionId(version_id, self)
423
        self.check_not_reserved_id(version_id)
424
        # TODO: If random_id==False and the key is already present, we should
425
        # probably check that the existing content is identical to what is
426
        # being inserted, and otherwise raise an exception.  This would make
427
        # the bundle code simpler.
428
        if check_content:
429
            self._check_lines_not_unicode(lines)
430
            self._check_lines_are_lines(lines)
431
0.17.5 by Robert Collins
nograph tests completely passing.
432
    def get_parent_map(self, keys):
433
        """Get a map of the parents of keys.
434
435
        :param keys: The keys to look up parents for.
436
        :return: A mapping from keys to parents. Absent keys are absent from
437
            the mapping.
438
        """
439
        result = {}
440
        sources = [self._index]
441
        source_results = []
442
        missing = set(keys)
443
        for source in sources:
444
            if not missing:
445
                break
446
            new_result = source.get_parent_map(missing)
447
            source_results.append(new_result)
448
            result.update(new_result)
449
            missing.difference_update(set(new_result))
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
450
        if self._unadded_refs:
451
            for key in missing:
452
                if key in self._unadded_refs:
453
                    result[key] = self._unadded_refs[key]
0.17.5 by Robert Collins
nograph tests completely passing.
454
        return result
455
456
    def get_record_stream(self, keys, ordering, include_delta_closure):
457
        """Get a stream of records for keys.
458
459
        :param keys: The keys to include.
460
        :param ordering: Either 'unordered' or 'topological'. A topologically
461
            sorted stream has compression parents strictly before their
462
            children.
463
        :param include_delta_closure: If True then the closure across any
464
            compression parents will be included (in the opaque data).
465
        :return: An iterator of ContentFactory objects, each of which is only
466
            valid until the iterator is advanced.
467
        """
468
        # keys might be a generator
469
        keys = set(keys)
470
        if not keys:
471
            return
472
        if not self._index.has_graph:
473
            # Cannot topological order when no graph has been stored.
474
            ordering = 'unordered'
475
        # Cheap: iterate
476
        locations = self._index.get_build_details(keys)
477
        if ordering == 'topological':
478
            # would be better to not globally sort initially but instead
479
            # start with one key, recurse to its oldest parent, then grab
480
            # everything in the same group, etc.
481
            parent_map = dict((key, details[2]) for key, details in
482
                locations.iteritems())
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
483
            local = frozenset(keys).intersection(set(self._unadded_refs))
484
            for key in local:
485
                parent_map[key] = self._unadded_refs[key]
486
                locations[key] = None
0.17.5 by Robert Collins
nograph tests completely passing.
487
            present_keys = topo_sort(parent_map)
488
            # Now group by source:
489
        else:
490
            present_keys = locations.keys()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
491
            local = frozenset(keys).intersection(set(self._unadded_refs))
492
            for key in local:
493
                present_keys.append(key)
494
                locations[key] = None
0.17.5 by Robert Collins
nograph tests completely passing.
495
        absent_keys = keys.difference(set(locations))
496
        for key in absent_keys:
497
            yield AbsentContentFactory(key)
498
        for key in present_keys:
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
499
            if key in self._unadded_refs:
500
                lines, sha1 = self._compressor.extract(key)
501
                parents = self._unadded_refs[key]
502
            else:
503
                index_memo, _, parents, (method, _) = locations[key]
504
                # read
505
                read_memo = index_memo[0:3]
506
                zdata = self._access.get_raw_records([read_memo]).next()
507
                # decompress
508
                plain = zlib.decompress(zdata)
509
                # parse
510
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
511
                label, sha1, delta = parse(delta_lines)
512
                if label != key:
513
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
514
                basis = plain[:index_memo[3]]
0.17.12 by Robert Collins
Encode copy ranges as bytes not lines, halves decode overhead.
515
                # basis = StringIO(basis).readlines()
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
516
                #basis = split_lines(plain[:last_end])
517
                lines = apply_delta(basis, delta)
0.17.5 by Robert Collins
nograph tests completely passing.
518
            bytes = ''.join(lines)
519
            yield FulltextContentFactory(key, parents, sha1, bytes)
520
            
521
    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s().

        :param keys: The keys to look up sha1 strings for.
        :return: A dict of key -> sha1 for each key that is present.  Absent
            keys are omitted from the result.
        """
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            elif record.storage_kind != 'absent':
                # The index did not record a sha1 for this key; compute it
                # from the fulltext.  (The original code used '==' here,
                # which compared and silently discarded the computed hash
                # instead of storing it.)
                result[record.key] = sha_string(record.get_bytes_as(
                    'fulltext'))
        return result
532
0.17.2 by Robert Collins
Core proof of concept working.
533
    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert. 
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        # _insert_record_stream is a generator yielding sha1s; it must be
        # drained for the insertion side effects to actually happen.
        for unused_sha1 in self._insert_record_stream(stream):
            pass
0.17.2 by Robert Collins
Core proof of concept working.
542
0.17.5 by Robert Collins
nograph tests completely passing.
543
    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        NOTE: this is a generator; the caller must consume it for the
        insertion side effects (compression, index updates) to happen.

        :param stream: A stream of records to insert. 
        :param random_id: If True, the keys being added are assumed unique
            and no existence check is made when adding index records.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            # Lazily create and memoize record adapters (storage_kind ->
            # 'fulltext'), keyed by the (source, target) kind pair.
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        # basis_end is the offset within the current group where the next
        # record's compressed output will start.
        basis_end = 0
        # groups counts compression groups; it is incremented on flush but
        # not otherwise used within this method.
        groups = 1
        def flush():
            # Write out the current compression group and index all records
            # accumulated in keys_to_add.  As a closure this reads
            # keys_to_add/random_id from the enclosing scope at call time,
            # so rebinding keys_to_add below starts a fresh batch.
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind == 'fulltext':
                bytes = record.get_bytes_as('fulltext')
            else:
                # Adapt anything else (e.g. knit deltas) up to a fulltext
                # before compressing it into the group.
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
            found_sha1, end_point = self._compressor.compress(record.key,
                split_lines(bytes), record.sha1)
            # Track parents of keys not yet flushed to the index;
            # get_record_stream consults _unadded_refs for such keys.
            self._unadded_refs[record.key] = record.parents
            yield found_sha1
            keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            # Cap group size at 20MB internal buffer, then start a new group.
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        # Flush any pending records from a partially filled final group.
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}
0.17.5 by Robert Collins
nograph tests completely passing.
607
608
    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            progress_bar = progress.DummyProgress()
        else:
            progress_bar = pb
        wanted = set(keys)
        key_count = len(wanted)
        # Inclusions are the caller's concern; we just visit every requested
        # record once, in whatever order the store prefers.
        stream = self.get_record_stream(wanted, 'unordered', True)
        for done, record in enumerate(stream):
            # XXX: todo - optimise to use less than full texts.
            progress_bar.update('Walking content.', done, key_count)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            for line in split_lines(record.get_bytes_as('fulltext')):
                yield line, record.key
        progress_bar.update('Walking content.', key_count, key_count)
647
648
    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        # Only the index supplies keys today, but aggregate over a list so
        # additional sources can be appended later.
        all_keys = set()
        for key_source in [self._index]:
            all_keys.update(key_source.keys())
        return all_keys
657
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
658
659
class _GCGraphIndex(object):
660
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
661
0.17.9 by Robert Collins
Initial stab at repository format support.
662
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
663
        add_callback=None):
664
        """Construct a _GCGraphIndex on a graph_index.
665
666
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
667
        :param is_locked: A callback to check whether the object should answer
668
            queries.
669
        :param parents: If True, record knits parents, if not do not record 
670
            parents.
671
        :param add_callback: If not None, allow additions to the index and call
672
            this callback with a list of added GraphIndex nodes:
673
            [(node, value, node_refs), ...]
674
        :param is_locked: A callback, returns True if the index is locked and
675
            thus usable.
676
        """
677
        self._add_callback = add_callback
678
        self._graph_index = graph_index
679
        self._parents = parents
680
        self.has_graph = parents
681
        self._is_locked = is_locked
682
0.17.5 by Robert Collins
nograph tests completely passing.
683
    def add_records(self, records, random_id=False):
684
        """Add multiple records to the index.
685
        
686
        This function does not insert data into the Immutable GraphIndex
687
        backing the KnitGraphIndex, instead it prepares data for insertion by
688
        the caller and checks that it is safe to insert then calls
689
        self._add_callback with the prepared GraphIndex nodes.
690
691
        :param records: a list of tuples:
692
                         (key, options, access_memo, parents).
693
        :param random_id: If True the ids being added were randomly generated
694
            and no check for existence will be performed.
695
        """
696
        if not self._add_callback:
697
            raise errors.ReadOnlyError(self)
698
        # we hope there are no repositories with inconsistent parentage
699
        # anymore.
700
701
        changed = False
702
        keys = {}
703
        for (key, value, refs) in records:
704
            if not self._parents:
705
                if refs:
706
                    for ref in refs:
707
                        if ref:
708
                            raise KnitCorrupt(self,
709
                                "attempt to add node with parents "
710
                                "in parentless index.")
711
                    refs = ()
712
                    changed = True
713
            keys[key] = (value, refs)
714
        # check for dups
715
        if not random_id:
716
            present_nodes = self._get_entries(keys)
717
            for (index, key, value, node_refs) in present_nodes:
718
                if node_refs != keys[key][1]:
719
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
720
                        ": %s %s" % ((value, node_refs), keys[key]))
721
                del keys[key]
722
                changed = True
723
        if changed:
724
            result = []
725
            if self._parents:
726
                for key, (value, node_refs) in keys.iteritems():
727
                    result.append((key, value, node_refs))
728
            else:
729
                for key, (value, node_refs) in keys.iteritems():
730
                    result.append((key, value))
731
            records = result
732
        self._add_callback(records)
733
        
734
    def _check_read(self):
735
        """raise if reads are not permitted."""
736
        if not self._is_locked():
737
            raise errors.ObjectNotLocked(self)
738
0.17.2 by Robert Collins
Core proof of concept working.
739
    def _check_write_ok(self):
740
        """Assert if writes are not permitted."""
741
        if not self._is_locked():
742
            raise errors.ObjectNotLocked(self)
743
0.17.5 by Robert Collins
nograph tests completely passing.
744
    def _get_entries(self, keys, check_present=False):
745
        """Get the entries for keys.
746
        
747
        :param keys: An iterable of index key tuples.
748
        """
749
        keys = set(keys)
750
        found_keys = set()
751
        if self._parents:
752
            for node in self._graph_index.iter_entries(keys):
753
                yield node
754
                found_keys.add(node[1])
755
        else:
756
            # adapt parentless index to the rest of the code.
757
            for node in self._graph_index.iter_entries(keys):
758
                yield node[0], node[1], node[2], ()
759
                found_keys.add(node[1])
760
        if check_present:
761
            missing_keys = keys.difference(found_keys)
762
            if missing_keys:
763
                raise RevisionNotPresent(missing_keys.pop(), self)
764
765
    def get_parent_map(self, keys):
766
        """Get a map of the parents of keys.
767
768
        :param keys: The keys to look up parents for.
769
        :return: A mapping from keys to parents. Absent keys are absent from
770
            the mapping.
771
        """
772
        self._check_read()
773
        nodes = self._get_entries(keys)
774
        result = {}
775
        if self._parents:
776
            for node in nodes:
777
                result[node[1]] = node[3][0]
778
        else:
779
            for node in nodes:
780
                result[node[1]] = None
781
        return result
782
783
    def get_build_details(self, keys):
784
        """Get the various build details for keys.
785
786
        Ghosts are omitted from the result.
787
788
        :param keys: An iterable of keys.
789
        :return: A dict of key:
790
            (index_memo, compression_parent, parents, record_details).
791
            index_memo
792
                opaque structure to pass to read_records to extract the raw
793
                data
794
            compression_parent
795
                Content that this record is built upon, may be None
796
            parents
797
                Logical parents of this node
798
            record_details
799
                extra information about the content which needs to be passed to
800
                Factory.parse_record
801
        """
802
        self._check_read()
803
        result = {}
804
        entries = self._get_entries(keys, False)
805
        for entry in entries:
806
            key = entry[1]
807
            if not self._parents:
808
                parents = None
809
            else:
810
                parents = entry[3][0]
811
            value = entry[2]
812
            method = 'group'
813
            result[key] = (self._node_to_position(entry),
814
                                  None, parents, (method, None))
815
        return result
816
    
817
    def keys(self):
818
        """Get all the keys in the collection.
819
        
820
        The keys are not ordered.
821
        """
822
        self._check_read()
823
        return [node[1] for node in self._graph_index.iter_all_entries()]
824
    
825
    def _node_to_position(self, node):
826
        """Convert an index value to position details."""
827
        bits = node[2].split(' ')
828
        # It would be nice not to read the entire gzip.
829
        start = int(bits[0])
830
        stop = int(bits[1])
831
        basis_end = int(bits[2])
832
        delta_end = int(bits[3])
833
        return node[0], start, stop, basis_end, delta_end
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
834
835
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
836
def _get_longest_match(equivalence_table, pos, max_pos, locations):
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
837
    """Get the longest possible match for the current position."""
838
    range_start = pos
839
    range_len = 0
840
    copy_ends = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
841
    while pos < max_pos:
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
842
        if locations is None:
843
            locations = equivalence_table.get_idx_matches(pos)
844
        if locations is None:
845
            # No more matches, just return whatever we have, but we know that
846
            # this last position is not going to match anything
847
            pos += 1
848
            break
849
        else:
850
            if copy_ends is None:
851
                # We are starting a new range
852
                copy_ends = [loc + 1 for loc in locations]
853
                range_len = 1
854
                locations = None # Consumed
855
            else:
856
                # We are currently in the middle of a match
857
                next_locations = set(copy_ends).intersection(locations)
858
                if len(next_locations):
859
                    # range continues
860
                    copy_ends = [loc + 1 for loc in next_locations]
861
                    range_len += 1
862
                    locations = None # Consumed
863
                else:
864
                    # But we are done with this match, we should be
865
                    # starting a new one, though. We will pass back 'locations'
866
                    # so that we don't have to do another lookup.
867
                    break
868
        pos += 1
869
    if copy_ends is None:
870
        return None, pos, locations
871
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
872
873
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
874
# Prefer the compiled (Pyrex/C) implementations when the extension module is
# available; otherwise fall back silently to the pure-Python versions
# defined above.
try:
    from bzrlib.plugins.groupcompress import _groupcompress_c
except ImportError:
    pass
else:
    # Replace the equivalence table and the longest-match search with their
    # C counterparts.
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
    _get_longest_match = _groupcompress_c._get_longest_match