/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
1
# groupcompress, a bzr plugin providing new compression logic.
2
# Copyright (C) 2008 Canonical Limited.
3
# 
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License version 2 as published
6
# by the Free Software Foundation.
7
# 
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
# 
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
16
# 
17
18
"""Core compression logic for compressing streams of related files."""
19
0.17.13 by Robert Collins
Do not output copy instructions which take more to encode than a fresh insert. (But do not refer to those insertions when finding ranges to copy: they are not interesting).
20
from itertools import izip
0.17.5 by Robert Collins
nograph tests completely passing.
21
from cStringIO import StringIO
22
import zlib
23
0.17.4 by Robert Collins
Annotate.
24
from bzrlib import (
25
    annotate,
0.17.5 by Robert Collins
nograph tests completely passing.
26
    debug,
0.17.4 by Robert Collins
Annotate.
27
    diff,
0.17.5 by Robert Collins
nograph tests completely passing.
28
    errors,
0.17.4 by Robert Collins
Annotate.
29
    graph as _mod_graph,
0.20.2 by John Arbash Meinel
Teach groupcompress about 'chunked' encoding
30
    osutils,
0.17.4 by Robert Collins
Annotate.
31
    pack,
32
    patiencediff,
33
    )
34
from bzrlib.graph import Graph
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
35
from bzrlib.knit import _DirectPackAccess
0.17.2 by Robert Collins
Core proof of concept working.
36
from bzrlib.osutils import (
37
    contains_whitespace,
38
    contains_linebreaks,
39
    sha_string,
40
    sha_strings,
41
    split_lines,
42
    )
0.17.21 by Robert Collins
Update groupcompress to bzrlib 1.10.
43
from bzrlib.btree_index import BTreeBuilder
0.17.24 by Robert Collins
Add a group cache to decompression, 5 times faster than knit at decompression when accessing everything in a group.
44
from bzrlib.lru_cache import LRUSizeCache
0.18.6 by John Arbash Meinel
Use the new EquivalenceTable to track the lines.
45
from bzrlib.plugins.groupcompress import equivalence_table
0.17.9 by Robert Collins
Initial stab at repository format support.
46
from bzrlib.tsort import topo_sort
0.17.2 by Robert Collins
Core proof of concept working.
47
from bzrlib.versionedfile import (
0.17.5 by Robert Collins
nograph tests completely passing.
48
    adapter_registry,
49
    AbsentContentFactory,
0.20.5 by John Arbash Meinel
Finish the Fulltext => Chunked conversions so that we work in the more-efficient Chunks.
50
    ChunkedContentFactory,
0.17.2 by Robert Collins
Core proof of concept working.
51
    FulltextContentFactory,
52
    VersionedFiles,
53
    )
54
55
0.17.5 by Robert Collins
nograph tests completely passing.
56
def parse(line_list):
    """Parse a serialised group-compress text record.

    :param line_list: The lines of one record: a 'label: ' line, a
        'sha1: ' line, then instruction headers ('c,start,count' or
        'i,count'), with the payload lines of each insert following its
        header.
    :return: A tuple (label, sha1, delta) where label is the key tuple,
        sha1 is the stored sha1 string, and delta is a list of
        (op, start, count, lines) instructions suitable for apply_delta.
    :raises AssertionError: If the record does not start with the expected
        label/sha1 header lines.
    """
    result = []
    lines = iter(line_list)
    next = lines.next
    label_line = next()
    sha1_line = next()
    if (not label_line.startswith('label: ') or
        not sha1_line.startswith('sha1: ')):
        raise AssertionError("bad text record %r" % lines)
    # Strip the 'label: ' prefix and the trailing newline.
    label = tuple(label_line[7:-1].split('\x00'))
    # Strip the 'sha1: ' prefix and the trailing newline.
    sha1 = sha1_line[6:-1]
    for header in lines:
        op = header[0]
        numbers = [int(n) for n in header[2:].split(',')]
        if op == 'c':
            # Copy instruction: (op, start_byte, byte_count, None).
            result.append((op, numbers[0], numbers[1], None))
        else:
            # Insert instruction: consume the next numbers[0] payload lines.
            contents = [next() for i in xrange(numbers[0])]
            result.append((op, None, numbers[0], contents))
    return label, sha1, result
0.17.2 by Robert Collins
Core proof of concept working.
78
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
79
0.20.9 by John Arbash Meinel
Revert previous change.
80
def apply_delta(basis, delta):
    """Apply delta against basis to regenerate the original chunks.

    :param basis: The raw byte string of the group the delta was made
        against; 'c' instruction offsets refer into it.
    :param delta: A list of (op, start, count, lines) instructions as
        produced by parse().
    :return: A list of chunks - copied byte ranges out of basis for 'c'
        instructions, literal payload lines for inserts - with the
        encoding-forced trailing newline trimmed off.
    """
    lines = []
    # 'c' copies count bytes starting at byte offset start in basis;
    # any other op is an insert carrying its payload in delta_lines.
    for op, start, count, delta_lines in delta:
        if op == 'c':
            lines.append(basis[start:start+count])
        else:
            lines.extend(delta_lines)
    trim_encoding_newline(lines)
    return lines
93
94
95
def trim_encoding_newline(lines):
    """Undo the trailing-newline padding added for encoding, in place.

    If the final line is exactly '\\n' it was pure padding, so drop it;
    otherwise the padding character was glued onto the last real line, so
    strip the final character from it.
    """
    last = lines[-1]
    if last == '\n':
        lines.pop()
    else:
        lines[-1] = last[:-1]
100
101
0.20.11 by John Arbash Meinel
start experimenting with gc-optimal ordering.
102
103
def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into gc-optimal order.

    gc-optimal is defined (currently) as reverse-topological order, grouped by
    the key prefix.

    :return: A sorted-list of keys
    """
    # Approximately reverse topological, properly grouped by file-id (the
    # first key element) so related texts end up adjacent in the output.
    groups = {}
    for item in parent_map.iteritems():
        key = item[0]
        # Simple (string or length-1 tuple) keys all share one anonymous
        # '' group.
        if isinstance(key, str) or len(key) == 1:
            prefix = ''
        else:
            prefix = key[0]
        groups.setdefault(prefix, []).append(item)

    ordered_keys = []
    for prefix in sorted(groups):
        ordered_keys.extend(reversed(topo_sort(groups[prefix])))
    return ordered_keys
129
130
0.17.2 by Robert Collins
Core proof of concept working.
131
class GroupCompressor(object):
    """Produce a serialised group of compressed texts.

    It contains code very similar to SequenceMatcher because of having a
    similar task. However some key differences apply:
     - there is no junk, we want a minimal edit not a human readable diff.
     - we don't filter very common lines (because we don't know where a good
       range will start, and after the first text we want to be emitting
       minimal edits only.
     - we chain the left side, not the right side
     - we incrementally update the adjacency matrix as new lines are provided.
     - we look for matches in all of the left side, so the routine which does
       the analogous task of find_longest_match does not need to filter on the
       left side.
    """

    # Pluggable so an alternative (e.g. compiled) equivalence table can be
    # substituted by subclasses or tests.
    _equivalence_table_class = equivalence_table.EquivalenceTable

    def __init__(self, delta=True):
        """Create a GroupCompressor.

        :param delta: If False, do not compress records.
        """
        self._delta = delta
        # Cumulative byte offset of the end of each emitted line; used by
        # flush_range to translate line ranges into byte ranges.
        self.line_offsets = []
        # Total bytes emitted into the group so far.
        self.endpoint = 0
        # Total uncompressed input bytes seen so far (feeds ratio()).
        self.input_bytes = 0
        # Indexes every line emitted so far so later texts can find copy
        # ranges in earlier ones.
        self.line_locations = self._equivalence_table_class([])
        self.lines = self.line_locations.lines
        # key -> ((start_byte, start_line), (end_byte, end_line)) locating
        # each text's delta inside the group; used by extract().
        self.labels_deltas = {}
        self._present_prefixes = set()

    def get_matching_blocks(self, lines, soft=False):
        """Return the ranges in lines which match self.lines.

        :param lines: lines to compress
        :param soft: If True, demand much larger matches (min_match_bytes of
            200 rather than 10) before emitting a copy.
        :return: A list of (old_start, new_start, length) tuples which reflect
            a region in self.lines that is present in lines.  The last element
            of the list is always (old_len, new_len, 0) to provide a end point
            for generating instructions from the matching blocks list.
        """
        result = []
        pos = 0
        line_locations = self.line_locations
        line_locations.set_right_lines(lines)
        # We either copy a range (while there are reusable lines) or we
        # insert new lines; walk the new text finding the longest reusable
        # match at each position.
        locations = None
        max_pos = len(lines)
        result_append = result.append
        min_match_bytes = 10
        if soft:
            min_match_bytes = 200
        while pos < max_pos:
            block, pos, locations = _get_longest_match(line_locations, pos,
                                                       max_pos, locations)
            if block is not None:
                # Check to see if we are matching fewer than 5 characters,
                # which is turned into a simple 'insert', rather than a copy
                # If we have more than 5 lines, we definitely have more than 5
                # chars
                if block[-1] < min_match_bytes:
                    # This block may be a 'short' block, check
                    old_start, new_start, range_len = block
                    matched_bytes = sum(map(len,
                        lines[new_start:new_start + range_len]))
                    if matched_bytes < min_match_bytes:
                        block = None
            if block is not None:
                result_append(block)
        # Sentinel end block so consumers always see a final boundary.
        result_append((len(self.lines), len(lines), 0))
        return result

    def compress(self, key, lines, expected_sha, soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is 'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param lines: The lines to be compressed. Must be split
            on \n, with the \n preserved.
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will
            cause an error.
        :param soft: Do a 'soft' compression. This means that we require larger
            ranges to match to be considered for a copy command.
        :return: The sha1 of lines, and the number of bytes accumulated in
            the group output so far.
        """
        # NOTE(review): expected_sha is never actually compared against the
        # computed sha1 in this body - the mismatch error described above is
        # not implemented here.
        sha1 = sha_strings(lines)
        if key[-1] is None:
            key = key[:-1] + ('sha1:' + sha1,)
        label = '\x00'.join(key)
        new_lines = ['label: %s\n' % label,
                     'sha1: %s\n' % sha1,
                    ]
        # The two header lines are never indexed for matching.
        index_lines = [False, False]
        # setup good encoding for trailing \n support.
        if not lines or lines[-1].endswith('\n'):
            lines.append('\n')
        else:
            lines[-1] = lines[-1] + '\n'
        # NOTE(review): pos, range_start and copy_ends are leftovers from an
        # earlier implementation and are unused below.
        pos = 0
        range_len = 0
        range_start = 0
        flush_range = self.flush_range
        copy_ends = None
        blocks = self.get_matching_blocks(lines, soft=soft)
        current_pos = 0
        # We either copy a range (while there are reusable lines) or we
        # insert new lines for each gap between matches.
        for old_start, new_start, range_len in blocks:
            if new_start != current_pos:
                # non-matching region
                flush_range(current_pos, None, new_start - current_pos,
                    lines, new_lines, index_lines)
            current_pos = new_start + range_len
            if not range_len:
                continue
            flush_range(new_start, old_start, range_len,
                        lines, new_lines, index_lines)
        # Record where this text's delta starts/ends (byte and line
        # coordinates) so extract() can find it again.
        delta_start = (self.endpoint, len(self.lines))
        self.output_lines(new_lines, index_lines)
        trim_encoding_newline(lines)
        self.input_bytes += sum(map(len, lines))
        delta_end = (self.endpoint, len(self.lines))
        self.labels_deltas[key] = (delta_start, delta_end)
        return sha1, self.endpoint

    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over bytes and the sha1.
        """
        delta_details = self.labels_deltas[key]
        # Slice by line coordinates recorded at compress() time.
        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
        label, sha1, delta = parse(delta_lines)
        if label != key:
            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
        # Perhaps we want to keep the line offsets too in memory at least?
        chunks = apply_delta(''.join(self.lines), delta)
        sha1 = sha_strings(chunks)
        return chunks, sha1

    def flush_multi(self, instructions, lines, new_lines, index_lines):
        """Flush a bunch of different ranges out.

        This should only be called with data that are "pure" copies.

        :param instructions: A list of (new_start, old_start, range_len)
            copy descriptions.
        """
        flush_range = self.flush_range
        if len(instructions) > 2:
            # This is the number of lines to be copied
            total_copy_range = sum(i[2] for i in instructions)
            if len(instructions) > 0.5 * total_copy_range:
                # We are copying N lines, but taking more than N/2
                # copy instructions to do so. We will go ahead and expand this
                # text so that other code is able to match against it
                flush_range(instructions[0][0], None, total_copy_range,
                            lines, new_lines, index_lines)
                return
        for ns, os, rl in instructions:
            flush_range(ns, os, rl, lines, new_lines, index_lines)

    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
        """Emit one copy or insert instruction for range_len lines.

        :param range_start: Start line of the range within lines.
        :param copy_start: Start line within the already-output group to
            copy from, or None to force an insert.
        :param lines: The new text being compressed.
        :param new_lines: Output list the instruction (and any insert
            payload) is appended to.
        :param index_lines: Parallel output list of booleans - True for
            lines that should be indexed for future matching.
        """
        insert_instruction = "i,%d\n" % range_len
        if copy_start is not None:
            # range stops, flush and start a new copy range
            stop_byte = self.line_offsets[copy_start + range_len - 1]
            if copy_start == 0:
                start_byte = 0
            else:
                start_byte = self.line_offsets[copy_start - 1]
            bytes = stop_byte - start_byte
            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
            # Only emit the copy if it encodes smaller than the equivalent
            # fresh insert (header plus the raw bytes).
            if (bytes + len(insert_instruction) >
                len(copy_control_instruction)):
                new_lines.append(copy_control_instruction)
                index_lines.append(False)
                return
        # not copying, or inserting is shorter than copying, so insert.
        new_lines.append(insert_instruction)
        new_lines.extend(lines[range_start:range_start+range_len])
        index_lines.append(False)
        # Inserted payload is indexable for later matches; copied payload
        # is not (it already exists earlier in the group).
        index_lines.extend([copy_start is None]*range_len)

    def output_lines(self, new_lines, index_lines):
        """Output some lines.

        :param new_lines: The lines to output.
        :param index_lines: A boolean flag for each line - when True, index
            that line.
        """
        endpoint = self.endpoint
        self.line_locations.extend_lines(new_lines, index_lines)
        # Maintain the cumulative end-of-line byte offsets used by
        # flush_range to turn line ranges into byte ranges.
        for line in new_lines:
            endpoint += len(line)
            self.line_offsets.append(endpoint)
        self.endpoint = endpoint

    def ratio(self):
        """Return the overall compression ratio."""
        return float(self.input_bytes) / float(self.endpoint)
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
352
0.17.11 by Robert Collins
Add extraction of just-compressed texts to support converting from knits.
353
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
354
def make_pack_factory(graph, delta, keylength):
    """Create a factory for creating a pack based groupcompress.

    This is only functional enough to run interface tests, it doesn't try to
    provide a full pack environment.

    :param graph: Store a graph.
    :param delta: Delta compress contents.
    :param keylength: How long should keys be.
    """
    def factory(transport):
        # Parent tracking is needed when a graph is stored or deltas are
        # used.
        parents = graph or delta
        if graph:
            ref_length = 1
        else:
            ref_length = 0
        builder = BTreeBuilder(reference_lists=ref_length,
            key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        writer.begin()
        index = _GCGraphIndex(builder, lambda:True, parents=parents,
            add_callback=builder.add_nodes)
        access = _DirectPackAccess({})
        access.set_writer(writer, builder, (transport, 'newpack'))
        vf = GroupCompressVersionedFiles(index, access, delta)
        # Expose the raw resources so cleanup_pack_group can close them.
        vf.stream = stream
        vf.writer = writer
        return vf
    return factory
383
384
385
def cleanup_pack_group(versioned_files):
    """Finalise and release the pack resources behind versioned_files.

    The container writer is ended before the stream is closed so the pack
    trailer reaches the underlying transport.
    """
    vf = versioned_files
    vf.writer.end()
    vf.stream.close()
388
389
390
class GroupCompressVersionedFiles(VersionedFiles):
    """A group-compress based VersionedFiles implementation."""

    def __init__(self, index, access, delta=True):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        """
        self._index = index
        self._access = access
        self._delta = delta
        # Keys inserted into the current (not yet flushed) group, mapped to
        # their parents; consulted by get_parent_map/get_record_stream.
        self._unadded_refs = {}
        # Cache of decompressed group bytes, bounded by total size.
        self._group_cache = LRUSizeCache(max_size=50*1024*1024)

    def add_lines(self, key, parents, lines, parent_texts=None,
        left_matching_blocks=None, nostore_sha=None, random_id=False,
        check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \n and contain no
            other \n's. The last line may either contain no \n's or a single
            terminating \n. If the lines list does meet this constraint the add
            routine may error or may succeed - but you will be unable to read
            the data back accurately. (Checking the lines have been split
            correctly is expensive and extremely unlikely to catch bugs so it
            is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque 
            representations of some or all of the parents of version_id to
            allow delta optimisations.  VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent.  The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique
            anyway.
        :param check_content: If True, the lines supplied are verified to be
            bytestrings that are correctly formed lines.
        :return: The text sha1, the number of bytes in the text, and an opaque
                 representation of the inserted version which can be provided
                 back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        self._check_add(key, lines, random_id, check_content)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()
        # double handling for now. Make it work until then.
        length = sum(map(len, lines))
        record = ChunkedContentFactory(key, parents, None, lines)
        sha1 = list(self._insert_record_stream([record], random_id=random_id))[0]
        return sha1, length, None

    def annotate(self, key):
        """See VersionedFiles.annotate."""
        graph = Graph(self)
        parent_map = self.get_parent_map([key])
        if not parent_map:
            raise errors.RevisionNotPresent(key, self)
        if parent_map[key] is not None:
            # Walk the whole ancestry of key so every parent text needed by
            # reannotate is available (ghosts are simply skipped).
            search = graph._make_breadth_first_searcher([key])
            keys = set()
            while True:
                try:
                    present, ghosts = search.next_with_ghosts()
                except StopIteration:
                    break
                keys.update(present)
            parent_map = self.get_parent_map(keys)
        else:
            keys = [key]
            parent_map = {key:()}
        head_cache = _mod_graph.FrozenHeadsCache(graph)
        parent_cache = {}
        reannotate = annotate.reannotate
        for record in self.get_record_stream(keys, 'topological', True):
            key = record.key
            chunks = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            parent_lines = [parent_cache[parent] for parent in parent_map[key]]
            parent_cache[key] = list(
                reannotate(parent_lines, chunks, key, None, head_cache))
        # 'key' is the last key yielded, which topological order makes the
        # requested key.
        return parent_cache[key]

    def check(self, progress_bar=None):
        """See VersionedFiles.check()."""
        keys = self.keys()
        # Extracting every fulltext verifies the stored sha1 sums as a side
        # effect (get_record_stream raises on mismatch).
        for record in self.get_record_stream(keys, 'unordered', True):
            record.get_bytes_as('fulltext')

    def _check_add(self, key, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        version_id = key[-1]
        if version_id is not None:
            if contains_whitespace(version_id):
                raise InvalidRevisionId(version_id, self)
        self.check_not_reserved_id(version_id)
        # TODO: If random_id==False and the key is already present, we should
        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception.  This would make
        # the bundle code simpler.
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def get_parent_map(self, keys):
        """Get a map of the parents of keys.

        :param keys: The keys to look up parents for.
        :return: A mapping from keys to parents. Absent keys are absent from
            the mapping.
        """
        result = {}
        sources = [self._index]
        source_results = []
        missing = set(keys)
        for source in sources:
            if not missing:
                break
            new_result = source.get_parent_map(missing)
            source_results.append(new_result)
            result.update(new_result)
            missing.difference_update(set(new_result))
        # Keys buffered in the current group are not yet in the index; serve
        # them from _unadded_refs.
        if self._unadded_refs:
            for key in missing:
                if key in self._unadded_refs:
                    result[key] = self._unadded_refs[key]
        return result

    def _get_delta_lines(self, key):
        """Helper function for manual debugging.

        This is a convenience function that shouldn't be used in production
        code.
        """
        build_details = self._index.get_build_details([key])[key]
        index_memo = build_details[0]
        group, delta_lines = self._get_group_and_delta_lines(index_memo)
        return delta_lines

    def _get_group_and_delta_lines(self, index_memo):
        """Return (plain_group_bytes, delta_lines) for an index memo.

        :param index_memo: A 5-tuple; [0:3] locate the compressed group,
            [3:4] are the start/end offsets of this record within the
            decompressed group.
        """
        read_memo = index_memo[0:3]
        # get the group:
        try:
            plain = self._group_cache[read_memo]
        except KeyError:
            # read the group
            zdata = self._access.get_raw_records([read_memo]).next()
            # decompress - whole thing - this is not a bug, as it
            # permits caching. We might want to store the partially
            # decompresed group and decompress object, so that recent
            # texts are not penalised by big groups.
            plain = zlib.decompress(zdata) #, index_memo[4])
            self._group_cache[read_memo] = plain
        # cheapo debugging:
        # print len(zdata), len(plain)
        # parse - requires split_lines, better to have byte offsets
        # here (but not by much - we only split the region for the
        # recipe, and we often want to end up with lines anyway.
        return plain, split_lines(plain[index_memo[3]:index_memo[4]])

    def get_missing_compression_parent_keys(self):
        """Return the keys of missing compression parents.

        Missing compression parents occur when a record stream was missing
        basis texts, or a index was scanned that had missing basis texts.
        """
        # GroupCompress cannot currently reference texts that are not in the
        # group, so this is valid for now
        return frozenset()

    def get_record_stream(self, keys, ordering, include_delta_closure):
        """Get a stream of records for keys.

        :param keys: The keys to include.
        :param ordering: Either 'unordered' or 'topological'. A topologically
            sorted stream has compression parents strictly before their
            children.
        :param include_delta_closure: If True then the closure across any
            compression parents will be included (in the opaque data).
        :return: An iterator of ContentFactory objects, each of which is only
            valid until the iterator is advanced.
        """
        # keys might be a generator
        orig_keys = list(keys)
        keys = set(orig_keys)
        if not keys:
            return
        if (not self._index.has_graph
            and ordering in ('topological', 'gc-optimal')):
            # Cannot topological order when no graph has been stored.
            ordering = 'unordered'
        # Cheap: iterate
        locations = self._index.get_build_details(keys)
        local_keys = frozenset(keys).intersection(set(self._unadded_refs))
        if ordering == 'topological':
            # would be better to not globally sort initially but instead
            # start with one key, recurse to its oldest parent, then grab
            # everything in the same group, etc.
            parent_map = dict((key, details[2]) for key, details in
                locations.iteritems())
            for key in local_keys:
                parent_map[key] = self._unadded_refs[key]
            present_keys = topo_sort(parent_map)
            # Now group by source:
        elif ordering == 'gc-optimal':
            parent_map = dict((key, details[2]) for key, details in
                              locations.iteritems())
            for key in local_keys:
                parent_map[key] = self._unadded_refs[key]
            # XXX: This only optimizes for the target ordering. We may need to
            #      balance that with the time it takes to extract ordering, by
            #      somehow grouping based on locations[key][0:3]
            present_keys = sort_gc_optimal(parent_map)
        elif ordering == 'as-requested':
            present_keys = [key for key in orig_keys if key in locations
                            or key in local_keys]
        else:
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
            # Otherwise we thrash the _group_cache and destroy performance
            def get_group(key):
                # This is the group the bytes are stored in, followed by the
                # location in the group
                return locations[key][0]
            present_keys = sorted(locations.iterkeys(), key=get_group)
            # We don't have an ordering for keys in the in-memory object, but
            # lets process the in-memory ones first.
            present_keys = list(local_keys) + present_keys
        locations.update((key, None) for key in local_keys)
        absent_keys = keys.difference(set(locations))
        for key in absent_keys:
            yield AbsentContentFactory(key)
        for key in present_keys:
            if key in self._unadded_refs:
                # Still buffered in the active compressor; extract directly.
                chunks, sha1 = self._compressor.extract(key)
                parents = self._unadded_refs[key]
            else:
                index_memo, _, parents, (method, _) = locations[key]
                plain, delta_lines = self._get_group_and_delta_lines(index_memo)
                label, sha1, delta = parse(delta_lines)
                if label != key:
                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
                chunks = apply_delta(plain, delta)
                if sha_strings(chunks) != sha1:
                    raise AssertionError('sha1 sum did not match')
            yield ChunkedContentFactory(key, parents, sha1, chunks)

    def get_sha1s(self, keys):
        """See VersionedFiles.get_sha1s()."""
        result = {}
        for record in self.get_record_stream(keys, 'unordered', True):
            if record.sha1 is not None:
                result[record.key] = record.sha1
            else:
                if record.storage_kind != 'absent':
                    # Bug fix: this previously used '==' (a no-op comparison)
                    # instead of '=', so keys whose records carried no sha1
                    # were silently missing from the result.
                    result[record.key] = sha_string(record.get_bytes_as(
                        'fulltext'))
        return result

    def insert_record_stream(self, stream):
        """Insert a record stream into this container.

        :param stream: A stream of records to insert. 
        :return: None
        :seealso VersionedFiles.get_record_stream:
        """
        # Drain the generator purely for its insertion side effects.
        for _ in self._insert_record_stream(stream):
            pass

    def _insert_record_stream(self, stream, random_id=False):
        """Internal core to insert a record stream into this container.

        This helper function has a different interface than insert_record_stream
        to allow add_lines to be minimal, but still return the needed data.

        :param stream: A stream of records to insert. 
        :return: An iterator over the sha1 of the inserted records.
        :seealso insert_record_stream:
        :seealso add_lines:
        """
        adapters = {}
        def get_adapter(adapter_key):
            # Lazily create and memoize storage-kind adapters.
            try:
                return adapters[adapter_key]
            except KeyError:
                adapter_factory = adapter_registry.get(adapter_key)
                adapter = adapter_factory(self)
                adapters[adapter_key] = adapter
                return adapter
        # This will go up to fulltexts for gc to gc fetching, which isn't
        # ideal.
        self._compressor = GroupCompressor(self._delta)
        self._unadded_refs = {}
        keys_to_add = []
        basis_end = 0
        groups = 1
        def flush():
            # Compress the current group, write it out, and index its keys.
            compressed = zlib.compress(''.join(self._compressor.lines))
            index, start, length = self._access.add_raw_records(
                [(None, len(compressed))], compressed)[0]
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, "%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
        last_prefix = None
        for record in stream:
            # Raise an error when a record is missing.
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(record.key, self)
            try:
                lines = osutils.chunks_to_lines(record.get_bytes_as('chunked'))
            except errors.UnavailableRepresentation:
                adapter_key = record.storage_kind, 'fulltext'
                adapter = get_adapter(adapter_key)
                bytes = adapter.get_bytes(record)
                lines = osutils.split_lines(bytes)
            soft = False
            if len(record.key) > 1:
                prefix = record.key[0]
                if (last_prefix is not None and prefix != last_prefix):
                    # Crossing a file-id boundary: soft-break the group, and
                    # start a new one if the current group is big enough.
                    soft = True
                    if basis_end > 1024 * 1024 * 4:
                        flush()
                        self._compressor = GroupCompressor(self._delta)
                        self._unadded_refs = {}
                        keys_to_add = []
                        basis_end = 0
                        groups += 1
                last_prefix = prefix
            found_sha1, end_point = self._compressor.compress(record.key,
                lines, record.sha1, soft=soft)
            if record.key[-1] is None:
                # CHK-style keys: substitute the content sha1 for the
                # placeholder last element.
                key = record.key[:-1] + ('sha1:' + found_sha1,)
            else:
                key = record.key
            self._unadded_refs[key] = record.parents
            yield found_sha1
            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
                (record.parents,)))
            basis_end = end_point
            # Hard cap on group size.
            if basis_end > 1024 * 1024 * 8:
                flush()
                self._compressor = GroupCompressor(self._delta)
                self._unadded_refs = {}
                keys_to_add = []
                basis_end = 0
                groups += 1
        if len(keys_to_add):
            flush()
        self._compressor = None
        self._unadded_refs = {}

    def iter_lines_added_or_present_in_keys(self, keys, pb=None):
        """Iterate over the lines in the versioned files from keys.

        This may return lines from other keys. Each item the returned
        iterator yields is a tuple of a line and a text version that that line
        is present in (not introduced in).

        Ordering of results is in whatever order is most suitable for the
        underlying storage format.

        If a progress bar is supplied, it may be used to indicate progress.
        The caller is responsible for cleaning up progress bars (because this
        is an iterator).

        NOTES:
         * Lines are normalised by the underlying store: they will all have \n
           terminators.
         * Lines are returned in arbitrary order.

        :return: An iterator over (line, key).
        """
        if pb is None:
            pb = progress.DummyProgress()
        keys = set(keys)
        total = len(keys)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
            'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            pb.update('Walking content.', key_idx + 1, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(key, self)
            lines = split_lines(record.get_bytes_as('fulltext'))
            for line in lines:
                yield line, key
        pb.update('Walking content.', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index]
        result = set()
        for source in sources:
            result.update(source.keys())
        return result
802
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
803
804
class _GCGraphIndex(object):
805
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""
806
0.17.9 by Robert Collins
Initial stab at repository format support.
807
    def __init__(self, graph_index, is_locked, parents=True,
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
808
        add_callback=None):
809
        """Construct a _GCGraphIndex on a graph_index.
810
811
        :param graph_index: An implementation of bzrlib.index.GraphIndex.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
812
        :param is_locked: A callback, returns True if the index is locked and
813
            thus usable.
0.17.1 by Robert Collins
Starting point. Interface tests hooked up and failing.
814
        :param parents: If True, record knits parents, if not do not record 
815
            parents.
816
        :param add_callback: If not None, allow additions to the index and call
817
            this callback with a list of added GraphIndex nodes:
818
            [(node, value, node_refs), ...]
819
        """
820
        self._add_callback = add_callback
821
        self._graph_index = graph_index
822
        self._parents = parents
823
        self.has_graph = parents
824
        self._is_locked = is_locked
825
0.17.5 by Robert Collins
nograph tests completely passing.
826
    def add_records(self, records, random_id=False):
827
        """Add multiple records to the index.
828
        
829
        This function does not insert data into the Immutable GraphIndex
830
        backing the KnitGraphIndex, instead it prepares data for insertion by
831
        the caller and checks that it is safe to insert then calls
832
        self._add_callback with the prepared GraphIndex nodes.
833
834
        :param records: a list of tuples:
835
                         (key, options, access_memo, parents).
836
        :param random_id: If True the ids being added were randomly generated
837
            and no check for existence will be performed.
838
        """
839
        if not self._add_callback:
840
            raise errors.ReadOnlyError(self)
841
        # we hope there are no repositories with inconsistent parentage
842
        # anymore.
843
844
        changed = False
845
        keys = {}
846
        for (key, value, refs) in records:
847
            if not self._parents:
848
                if refs:
849
                    for ref in refs:
850
                        if ref:
851
                            raise KnitCorrupt(self,
852
                                "attempt to add node with parents "
853
                                "in parentless index.")
854
                    refs = ()
855
                    changed = True
856
            keys[key] = (value, refs)
857
        # check for dups
858
        if not random_id:
859
            present_nodes = self._get_entries(keys)
860
            for (index, key, value, node_refs) in present_nodes:
861
                if node_refs != keys[key][1]:
862
                    raise errors.KnitCorrupt(self, "inconsistent details in add_records"
863
                        ": %s %s" % ((value, node_refs), keys[key]))
864
                del keys[key]
865
                changed = True
866
        if changed:
867
            result = []
868
            if self._parents:
869
                for key, (value, node_refs) in keys.iteritems():
870
                    result.append((key, value, node_refs))
871
            else:
872
                for key, (value, node_refs) in keys.iteritems():
873
                    result.append((key, value))
874
            records = result
875
        self._add_callback(records)
876
        
877
    def _check_read(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
878
        """Raise an exception if reads are not permitted."""
0.17.5 by Robert Collins
nograph tests completely passing.
879
        if not self._is_locked():
880
            raise errors.ObjectNotLocked(self)
881
0.17.2 by Robert Collins
Core proof of concept working.
882
    def _check_write_ok(self):
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
883
        """Raise an exception if writes are not permitted."""
0.17.2 by Robert Collins
Core proof of concept working.
884
        if not self._is_locked():
885
            raise errors.ObjectNotLocked(self)
886
0.17.5 by Robert Collins
nograph tests completely passing.
887
    def _get_entries(self, keys, check_present=False):
888
        """Get the entries for keys.
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
889
890
        Note: Callers are responsible for checking that the index is locked
891
        before calling this method.
892
0.17.5 by Robert Collins
nograph tests completely passing.
893
        :param keys: An iterable of index key tuples.
894
        """
895
        keys = set(keys)
896
        found_keys = set()
897
        if self._parents:
898
            for node in self._graph_index.iter_entries(keys):
899
                yield node
900
                found_keys.add(node[1])
901
        else:
902
            # adapt parentless index to the rest of the code.
903
            for node in self._graph_index.iter_entries(keys):
904
                yield node[0], node[1], node[2], ()
905
                found_keys.add(node[1])
906
        if check_present:
907
            missing_keys = keys.difference(found_keys)
908
            if missing_keys:
909
                raise RevisionNotPresent(missing_keys.pop(), self)
910
911
    def get_parent_map(self, keys):
912
        """Get a map of the parents of keys.
913
914
        :param keys: The keys to look up parents for.
915
        :return: A mapping from keys to parents. Absent keys are absent from
916
            the mapping.
917
        """
918
        self._check_read()
919
        nodes = self._get_entries(keys)
920
        result = {}
921
        if self._parents:
922
            for node in nodes:
923
                result[node[1]] = node[3][0]
924
        else:
925
            for node in nodes:
926
                result[node[1]] = None
927
        return result
928
929
    def get_build_details(self, keys):
930
        """Get the various build details for keys.
931
932
        Ghosts are omitted from the result.
933
934
        :param keys: An iterable of keys.
935
        :return: A dict of key:
936
            (index_memo, compression_parent, parents, record_details).
937
            index_memo
938
                opaque structure to pass to read_records to extract the raw
939
                data
940
            compression_parent
941
                Content that this record is built upon, may be None
942
            parents
943
                Logical parents of this node
944
            record_details
945
                extra information about the content which needs to be passed to
946
                Factory.parse_record
947
        """
948
        self._check_read()
949
        result = {}
0.20.29 by Ian Clatworthy
groupcompress.py code cleanups
950
        entries = self._get_entries(keys)
0.17.5 by Robert Collins
nograph tests completely passing.
951
        for entry in entries:
952
            key = entry[1]
953
            if not self._parents:
954
                parents = None
955
            else:
956
                parents = entry[3][0]
957
            method = 'group'
958
            result[key] = (self._node_to_position(entry),
959
                                  None, parents, (method, None))
960
        return result
961
    
962
    def keys(self):
963
        """Get all the keys in the collection.
964
        
965
        The keys are not ordered.
966
        """
967
        self._check_read()
968
        return [node[1] for node in self._graph_index.iter_all_entries()]
969
    
970
    def _node_to_position(self, node):
971
        """Convert an index value to position details."""
972
        bits = node[2].split(' ')
973
        # It would be nice not to read the entire gzip.
974
        start = int(bits[0])
975
        stop = int(bits[1])
976
        basis_end = int(bits[2])
977
        delta_end = int(bits[3])
978
        return node[0], start, stop, basis_end, delta_end
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
979
980
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
981
def _get_longest_match(equivalence_table, pos, max_pos, locations):
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
982
    """Get the longest possible match for the current position."""
983
    range_start = pos
984
    range_len = 0
985
    copy_ends = None
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
986
    while pos < max_pos:
0.18.25 by John Arbash Meinel
Factor the get_longest_match into a helper func
987
        if locations is None:
988
            locations = equivalence_table.get_idx_matches(pos)
989
        if locations is None:
990
            # No more matches, just return whatever we have, but we know that
991
            # this last position is not going to match anything
992
            pos += 1
993
            break
994
        else:
995
            if copy_ends is None:
996
                # We are starting a new range
997
                copy_ends = [loc + 1 for loc in locations]
998
                range_len = 1
999
                locations = None # Consumed
1000
            else:
1001
                # We are currently in the middle of a match
1002
                next_locations = set(copy_ends).intersection(locations)
1003
                if len(next_locations):
1004
                    # range continues
1005
                    copy_ends = [loc + 1 for loc in next_locations]
1006
                    range_len += 1
1007
                    locations = None # Consumed
1008
                else:
1009
                    # But we are done with this match, we should be
1010
                    # starting a new one, though. We will pass back 'locations'
1011
                    # so that we don't have to do another lookup.
1012
                    break
1013
        pos += 1
1014
    if copy_ends is None:
1015
        return None, pos, locations
1016
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
1017
1018
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
1019
try:
    from bzrlib.plugins.groupcompress import _groupcompress_c
except ImportError:
    # Compiled extension not built/installed: keep the pure-Python
    # implementations defined above.
    pass
else:
    # Extension available: swap in the C implementations for speed.
    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
    _get_longest_match = _groupcompress_c._get_longest_match