# To get this branch, use:
#   bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

# groupcompress, a bzr plugin providing new compression logic.
# Copyright (C) 2008 Canonical Limited.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#

"""Core compression logic for compressing streams of related files."""

from itertools import izip
from cStringIO import StringIO
import zlib

from bzrlib import (
    annotate,
    debug,
    diff,
    errors,
    graph as _mod_graph,
    osutils,
    pack,
    patiencediff,
    progress,
    trace,
    )
from bzrlib.graph import Graph
from bzrlib.knit import _DirectPackAccess
from bzrlib.osutils import (
    contains_whitespace,
    contains_linebreaks,
    sha_string,
    sha_strings,
    split_lines,
    )
from bzrlib.btree_index import BTreeBuilder
from bzrlib.lru_cache import LRUSizeCache
from bzrlib.plugins.groupcompress import equivalence_table
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import (
    adapter_registry,
    AbsentContentFactory,
    ChunkedContentFactory,
    FulltextContentFactory,
    VersionedFiles,
    )


total_copy_neg = 0

def parse(line_list):
    """Parse a serialised group-compress text record.

    :param line_list: The lines of one record: a label line, a sha1 line and
        then the copy/insert instructions with their inserted lines.
    :return: A (label, sha1, instruction_list) tuple.
    """
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = lines.next()
    sha1_line = lines.next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % lines)
    label = tuple(label_line[7:-1].split('\x00'))
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            result.append((op, numbers[0], numbers[1], None))
        else:
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result
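
# Editor's note, an illustrative sketch (not part of the original plugin): a
# serialised record, as written by GroupCompressor.compress() and read back by
# parse(), looks like this (every line ends with '\n'):
#
#   label: file-id\x00revision-id
#   sha1: <40 hex characters>
#   c,0,120
#   i,2
#   first inserted line
#   second inserted line
#
# 'c,<start>,<bytes>' copies bytes from the group output so far, 'i,<count>'
# is followed by <count> literal lines, and parse() returns the label tuple,
# the sha1 and the instruction list
#   [('c', 0, 120, None), ('i', None, 2, [...])].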

def apply_delta(basis, delta, delta_bytes_start):
    """Apply delta to basis to regenerate the stored lines."""
    lines = []
    last_offset = 0
    # eq ranges occur where gaps occur
    # start, end refer to offsets in basis
    for op, start, count, delta_lines in delta:
        if op == 'c':
            if start < 0:
                start = delta_bytes_start + start
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines


def trim_encoding_newline(lines):
    if lines[-1] == '\n':
        del lines[-1]
    else:
        lines[-1] = lines[-1][:-1]
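
# Editor's illustration with assumed values (not original code): applying a
# parsed delta to a basis string,
#
#   apply_delta("abc\ndef\n", [('c', 0, 4, None), ('i', None, 1, ['xyz\n'])], 0)
#
# copies the first four bytes of the basis ('abc\n'), inserts one literal
# line, and trim_encoding_newline() then strips the trailing encoding
# newline, giving ['abc\n', 'xyz'].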


class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a similar
    task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting minimal
       edits only).
     - we chain the left side, not the right side
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    _equivalence_table_class = equivalence_table.EquivalenceTable

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        self.line_offsets = []
        self.endpoint = 0
        self.input_bytes = 0
        self.line_locations = self._equivalence_table_class([])
        self.lines = self.line_locations.lines
        self.labels_deltas = {}

    def get_matching_blocks(self, lines):
        """Return the ranges in lines which match self.lines.

        :param lines: lines to compress
        :return: A list of (old_start, new_start, length) tuples which reflect
            a region in self.lines that is present in lines.  The last element
            of the list is always (old_len, new_len, 0) to provide an end point
            for generating instructions from the matching blocks list.
        """
        result = []
        pos = 0
        line_locations = self.line_locations
        line_locations.set_right_lines(lines)
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we look them up in the
        # equivalence table of lines already output.
        locations = None
        max_pos = len(lines)
        max_time = 0.0
        max_info = None
        result_append = result.append
        while pos < max_pos:
            block, pos, locations = _get_longest_match(line_locations, pos,
                                                       max_pos, locations)
            if block is not None:
                result_append(block)
        result_append((len(self.lines), len(lines), 0))
        return result
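
    # Editor's note on get_matching_blocks(), a simplified illustration with
    # hypothetical line values (not original code): if self.lines were
    # ['a\n', 'b\n', 'c\n'] and lines were ['b\n', 'c\n', 'd\n'], the result
    # would be [(1, 0, 2), (3, 3, 0)]: a two-line match copied from offset 1
    # of self.lines, followed by the (old_len, new_len, 0) sentinel.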

    def compress(self, key, lines, expected_sha):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        sha1 = sha_strings(lines)
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        label = '\x00'.join(key)
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        new_lines = []
        new_lines.append('label: %s\n' % label)
        new_lines.append('sha1: %s\n' % sha1)
        index_lines = [False, False]
        pos = 0
        range_len = 0
        range_start = 0
        flush_range = self.flush_range
        copy_ends = None
        blocks = self.get_matching_blocks(lines)
        current_pos = 0
        # We either copy a range (while there are reusable lines) or we
        # insert new lines. To find reusable lines we look them up in the
        # equivalence table of lines already output.
        for old_start, new_start, range_len in blocks:
            if new_start != current_pos:
                # non-matching region
                flush_range(current_pos, None, new_start - current_pos,
                    lines, new_lines, index_lines)
            current_pos = new_start + range_len
            if not range_len:
                continue
            flush_range(new_start, old_start, range_len, lines,
                new_lines, index_lines)
        delta_start = (self.endpoint, len(self.lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        delta_details = self.labels_deltas[key]
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        # The delta starts at the byte offset recorded when it was output.
        lines = apply_delta(''.join(self.lines), delta, delta_details[0][0])
        sha1 = sha_strings(lines)
        return lines, sha1

    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
        """Emit a copy or insert instruction for one range of lines."""
        insert_instruction = "i,%d\n" % range_len
        if copy_start is not None:
            # range stops, flush and start a new copy range
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            neg_offset = start_byte - self.endpoint
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            assert neg_offset < 0
            copy_neg = "c,%d,%d\n" % (neg_offset, bytes)
            if len(copy_neg) < len(copy_control_instruction):
                global total_copy_neg
                total_copy_neg += len(copy_neg) - len(copy_control_instruction)
                # copy_control_instruction = copy_neg
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        index_lines.extend([copy_start is None]*range_len)
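
    # Editor's note on flush_range() (illustrative arithmetic, not original
    # code): a copy instruction is only emitted when it is cheaper than
    # inserting the matched bytes directly.  A single-line, 20-byte match
    # starting at byte 100 costs len('c,100,20\n') == 9 bytes as a copy but
    # 20 + len('i,1\n') == 24 bytes as an insert, so the copy wins; a 2-byte
    # match at byte 5 costs 6 bytes either way, so the insert is used.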

    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
        #                          if val and new_lines[idx] == '\n']
        # if indexed_newlines:
        #     import pdb; pdb.set_trace()
        endpoint = self.endpoint
        self.line_locations.extend_lines(new_lines, index_lines)
        for line in new_lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        self.endpoint = endpoint

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)
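
# Editor's usage sketch (hypothetical keys and lines, not original code):
#
#   compressor = GroupCompressor()
#   sha1_1, end = compressor.compress(('file-id', 'rev-1'),
#                                     ['one\n', 'two\n'], None)
#   sha1_2, end = compressor.compress(('file-id', 'rev-2'),
#                                     ['one\n', 'two\n', 'three\n'], None)
#   lines, sha1 = compressor.extract(('file-id', 'rev-2'))
#
# The second text is stored largely as copy instructions against the output
# already in the group, and compressor.ratio() reports input bytes over the
# bytes emitted so far.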


def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests; it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long keys should be.
    """
    def factory(transport):
        parents = graph or delta
        ref_length = 0
        if graph:
            ref_length += 1
        graph_index = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(graph_index, lambda:True, parents=parents,
            add_callback=graph_index.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        result.writer = writer
        return result
    return factory


def cleanup_pack_group(versioned_files):
    versioned_files.writer.end()
    versioned_files.stream.close()
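
# Editor's usage sketch (the transport object is whatever the caller already
# has; names here are hypothetical):
#
#   factory = make_pack_factory(graph=True, delta=True, keylength=1)
#   vf = factory(transport)
#   vf.add_lines(('key-1',), (), ['some\n', 'content\n'])
#   cleanup_pack_group(vf)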


class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        self._unadded_refs = {}
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does not meet this constraint the
            add routine may error or may succeed - but you will be unable to
            read the data back accurately. (Checking the lines have been split
            correctly is expensive and extremely unlikely to catch bugs so it
            is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        length = sum(map(len, lines))
        record = ChunkedContentFactory(key, parents, None, lines)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, length, None

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, chunks, key, None, head_cache))
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """Check that version_id and lines are safe to add."""
        version_id = key[-1]
        if version_id is not None:
            if contains_whitespace(version_id):
                raise errors.InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        keys = set(keys)
        if not keys:
            return
        if not self._index.has_graph:
            # Cannot topologically order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                parent_map[key] = self._unadded_refs[key]
                locations[key] = None
            present_keys = topo_sort(parent_map)
            # Now group by source:
        else:
            present_keys = locations.keys()
            local = frozenset(keys).intersection(set(self._unadded_refs))
            for key in local:
                present_keys.append(key)
                locations[key] = None
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                lines, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                read_memo = index_memo[0:3]
                # get the group:
                try:
                    plain = self._group_cache[read_memo]
                except KeyError:
                    # read the group
                    zdata = self._access.get_raw_records([read_memo]).next()
                    # decompress - whole thing - this is not a bug, as it
                    # permits caching. We might want to store the partially
                    # decompressed group and decompress object, so that recent
                    # texts are not penalised by big groups.
                    decomp = zlib.decompressobj()
                    plain = decomp.decompress(zdata) #, index_memo[4])
                    self._group_cache[read_memo] = plain
                # cheapo debugging:
                # print len(zdata), len(plain)
                # parse - requires split_lines, better to have byte offsets
                # here (but not by much - we only split the region for the
                # recipe, and we often want to end up with lines anyway).
                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                lines = apply_delta(plain, delta, index_memo[3])
            yield ChunkedContentFactory(key, parents, sha1, lines)
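
    # Editor's note on get_record_stream() (hypothetical caller, not original
    # code): a typical consumer looks like
    #
    #   for factory in vf.get_record_stream(keys, 'topological', True):
    #       if factory.storage_kind == 'absent':
    #           raise errors.RevisionNotPresent(factory.key, vf)
    #       chunks = factory.get_bytes_as('chunked')
    #
    # where each ChunkedContentFactory is only valid until the iterator is
    # advanced.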

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert.
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        for _ in self._insert_record_stream(stream):
            pass

    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert.
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        def get_adapter(adapter_key):
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        adapters = {}
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            print '\ntotal neg %s' % (total_copy_neg,)
            self._index.add_records(nodes, random_id=random_id)
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent([record.key], self)
            elif record.storage_kind in ('chunked', 'fulltext'):
                lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            else:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record,
                    record.get_bytes_as(record.storage_kind))
                lines = osutils.split_lines(bytes)
            found_sha1, end_point = self._compressor.compress(record.key,
                lines, record.sha1)
            if record.key[-1] is None:
                key = record.key[:-1] + ('sha1:' + found_sha1,)
            else:
                key = record.key
            self._unadded_refs[key] = record.parents
            yield found_sha1
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            if basis_end > 1024 * 1024 * 20:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}

    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result


class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
        add_callback=None):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param is_locked: A callback, returns True if the index is locked and
            thus usable.
        :param parents: If True, record knits parents, if not do not record
            parents.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        """
        self._add_callback = add_callback
        self._graph_index = graph_index
        self._parents = parents
        self.has_graph = parents
        self._is_locked = is_locked

    def add_records(self, records, random_id=False):
        """Add multiple records to the index.

        This function does not insert data into the Immutable GraphIndex
        backing the KnitGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param records: a list of tuples:
                         (key, options, access_memo, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        changed = False
        keys = {}
        for (key, value, refs) in records:
            if not self._parents:
                if refs:
                    for ref in refs:
                        if ref:
                            raise errors.KnitCorrupt(self,
                                "attempt to add node with parents "
                                "in parentless index.")
                    refs = ()
                    changed = True
            keys[key] = (value, refs)
        # check for dups
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if node_refs != keys[key][1]:
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
                changed = True
        if changed:
            result = []
            if self._parents:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value, node_refs))
            else:
                for key, (value, node_refs) in keys.iteritems():
                    result.append((key, value))
            records = result
        self._add_callback(records)

    def _check_read(self):
        """Raise if reads are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _check_write_ok(self):
        """Raise if writes are not permitted."""
        if not self._is_locked():
            raise errors.ObjectNotLocked(self)

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index key tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise errors.RevisionNotPresent(missing_keys.pop(), self)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        self._check_read()
        nodes = self._get_entries(keys)
        result = {}
        if self._parents:
            for node in nodes:
                result[node[1]] = node[3][0]
        else:
            for node in nodes:
                result[node[1]] = None
        return result

    def get_build_details(self, keys):
        """Get the various build details for keys.

        Ghosts are omitted from the result.

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).
            index_memo
                opaque structure to pass to read_records to extract the raw
                data
            compression_parent
                Content that this record is built upon, may be None
            parents
                Logical parents of this node
            record_details
                extra information about the content which needs to be passed to
                Factory.parse_record
        """
        self._check_read()
        result = {}
        entries = self._get_entries(keys, False)
        for entry in entries:
            key = entry[1]
            if not self._parents:
                parents = None
            else:
                parents = entry[3][0]
            value = entry[2]
            method = 'group'
            result[key] = (self._node_to_position(entry),
                                  None, parents, (method, None))
        return result

    def keys(self):
        """Get all the keys in the collection.

        The keys are not ordered.
        """
        self._check_read()
        return [node[1] for node in self._graph_index.iter_all_entries()]

    def _node_to_position(self, node):
        """Convert an index value to position details."""
        bits = node[2].split(' ')
        # It would be nice not to read the entire compressed group.
        start = int(bits[0])
        stop = int(bits[1])
        basis_end = int(bits[2])
        delta_end = int(bits[3])
        return node[0], start, stop, basis_end, delta_end
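
    # Editor's note (example value is hypothetical): the string stored for
    # each key is written by flush() in _insert_record_stream() as
    # '<group start> <group length> <delta start> <delta end>', e.g.
    # '0 4096 0 512', which _node_to_position() turns back into
    # (index, 0, 4096, 0, 512) with the owning index object prepended.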


def _get_longest_match(equivalence_table, pos, max_pos, locations):
    """Get the longest possible match for the current position."""
    range_start = pos
    range_len = 0
    copy_ends = None
    while pos < max_pos:
        if locations is None:
            locations = equivalence_table.get_idx_matches(pos)
        if locations is None:
            # No more matches, just return whatever we have, but we know that
            # this last position is not going to match anything
            pos += 1
            break
        else:
            if copy_ends is None:
                # We are starting a new range
                copy_ends = [loc + 1 for loc in locations]
                range_len = 1
                locations = None # Consumed
            else:
                # We are currently in the middle of a match
                next_locations = set(copy_ends).intersection(locations)
                if len(next_locations):
                    # range continues
                    copy_ends = [loc + 1 for loc in next_locations]
                    range_len += 1
                    locations = None # Consumed
                else:
                    # But we are done with this match, we should be
                    # starting a new one, though. We will pass back 'locations'
                    # so that we don't have to do another lookup.
                    break
        pos += 1
    if copy_ends is None:
        return None, pos, locations
    return (min(copy_ends) - range_len, range_start, range_len), pos, locations


try:
    from bzrlib.plugins.groupcompress import _groupcompress_c
except ImportError:
    pass
else:
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
    _get_longest_match = _groupcompress_c._get_longest_match