bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
1  | 
# Copyright (C) 2009 Canonical Ltd
 | 
2  | 
#
 | 
|
3  | 
# This program is free software; you can redistribute it and/or modify
 | 
|
4  | 
# it under the terms of the GNU General Public License as published by
 | 
|
5  | 
# the Free Software Foundation; either version 2 of the License, or
 | 
|
6  | 
# (at your option) any later version.
 | 
|
7  | 
#
 | 
|
8  | 
# This program is distributed in the hope that it will be useful,
 | 
|
9  | 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
|
10  | 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
|
11  | 
# GNU General Public License for more details.
 | 
|
12  | 
#
 | 
|
13  | 
# You should have received a copy of the GNU General Public License
 | 
|
14  | 
# along with this program; if not, write to the Free Software
 | 
|
15  | 
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | 
|
16  | 
||
17  | 
"""Python version of compiled extensions for doing compression.
 | 
|
18  | 
||
19  | 
We separate the implementation from the groupcompress.py to avoid importing
 | 
|
20  | 
useless stuff.
 | 
|
21  | 
"""
 | 
|
22  | 
||
| 
3735.40.7
by John Arbash Meinel
 Move even more functionality into EquivalenceTable.  | 
23  | 
from bzrlib import osutils  | 
24  | 
||
25  | 
||
| 
4300.1.2
by John Arbash Meinel
 Change the pure-python compressor a bit.  | 
26  | 
class _OutputHandler(object):  | 
27  | 
"""A simple class which just tracks how to split up an insert request."""  | 
|
28  | 
||
29  | 
def __init__(self, out_lines, index_lines, min_len_to_index):  | 
|
30  | 
self.out_lines = out_lines  | 
|
31  | 
self.index_lines = index_lines  | 
|
32  | 
self.min_len_to_index = min_len_to_index  | 
|
33  | 
self.cur_insert_lines = []  | 
|
34  | 
self.cur_insert_len = 0  | 
|
35  | 
||
36  | 
def add_copy(self, start_byte, end_byte):  | 
|
37  | 
        # The data stream allows >64kB in a copy, but to match the compiled
 | 
|
38  | 
        # code, we will also limit it to a 64kB copy
 | 
|
39  | 
for start_byte in xrange(start_byte, end_byte, 64*1024):  | 
|
40  | 
num_bytes = min(64*1024, end_byte - start_byte)  | 
|
41  | 
copy_bytes = encode_copy_instruction(start_byte, num_bytes)  | 
|
42  | 
self.out_lines.append(copy_bytes)  | 
|
43  | 
self.index_lines.append(False)  | 
|
44  | 
||
45  | 
def _flush_insert(self):  | 
|
46  | 
if not self.cur_insert_lines:  | 
|
47  | 
            return
 | 
|
| 
4300.1.7
by John Arbash Meinel
 Bring in the other test cases.  | 
48  | 
if self.cur_insert_len > 127:  | 
49  | 
raise AssertionError('We cannot insert more than 127 bytes'  | 
|
50  | 
' at a time.')  | 
|
| 
4300.1.2
by John Arbash Meinel
 Change the pure-python compressor a bit.  | 
51  | 
self.out_lines.append(chr(self.cur_insert_len))  | 
52  | 
self.index_lines.append(False)  | 
|
53  | 
self.out_lines.extend(self.cur_insert_lines)  | 
|
54  | 
if self.cur_insert_len < self.min_len_to_index:  | 
|
55  | 
self.index_lines.extend([False]*len(self.cur_insert_lines))  | 
|
56  | 
else:  | 
|
57  | 
self.index_lines.extend([True]*len(self.cur_insert_lines))  | 
|
58  | 
self.cur_insert_lines = []  | 
|
59  | 
self.cur_insert_len = 0  | 
|
60  | 
||
61  | 
def _insert_long_line(self, line):  | 
|
62  | 
        # Flush out anything pending
 | 
|
63  | 
self._flush_insert()  | 
|
64  | 
line_len = len(line)  | 
|
65  | 
for start_index in xrange(0, line_len, 127):  | 
|
66  | 
next_len = min(127, line_len - start_index)  | 
|
67  | 
self.out_lines.append(chr(next_len))  | 
|
68  | 
self.index_lines.append(False)  | 
|
69  | 
self.out_lines.append(line[start_index:start_index+next_len])  | 
|
70  | 
            # We don't index long lines, because we won't be able to match
 | 
|
71  | 
            # a line split across multiple inserts anway
 | 
|
72  | 
self.index_lines.append(False)  | 
|
73  | 
||
74  | 
def add_insert(self, lines):  | 
|
| 
4300.1.7
by John Arbash Meinel
 Bring in the other test cases.  | 
75  | 
if self.cur_insert_lines != []:  | 
76  | 
raise AssertionError('self.cur_insert_lines must be empty when'  | 
|
77  | 
' adding a new insert')  | 
|
| 
4300.1.2
by John Arbash Meinel
 Change the pure-python compressor a bit.  | 
78  | 
for line in lines:  | 
79  | 
if len(line) > 127:  | 
|
80  | 
self._insert_long_line(line)  | 
|
81  | 
else:  | 
|
82  | 
next_len = len(line) + self.cur_insert_len  | 
|
83  | 
if next_len > 127:  | 
|
84  | 
                    # Adding this line would overflow, so flush, and start over
 | 
|
85  | 
self._flush_insert()  | 
|
86  | 
self.cur_insert_lines = [line]  | 
|
87  | 
self.cur_insert_len = len(line)  | 
|
88  | 
else:  | 
|
89  | 
self.cur_insert_lines.append(line)  | 
|
90  | 
self.cur_insert_len = next_len  | 
|
91  | 
self._flush_insert()  | 
|
92  | 
||
93  | 
||
| 
3735.40.13
by John Arbash Meinel
 Rename EquivalenceTable to LinesDeltaIndex.  | 
94  | 
class LinesDeltaIndex(object):  | 
95  | 
"""This class indexes matches between strings.  | 
|
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
96  | 
|
97  | 
    :ivar lines: The 'static' lines that will be preserved between runs.
 | 
|
| 
3735.40.13
by John Arbash Meinel
 Rename EquivalenceTable to LinesDeltaIndex.  | 
98  | 
    :ivar _matching_lines: A dict of {line:[matching offsets]}
 | 
99  | 
    :ivar line_offsets: The byte offset for the end of each line, used to
 | 
|
100  | 
        quickly map between a matching line number and the byte location
 | 
|
101  | 
    :ivar endpoint: The total number of bytes in self.line_offsets
 | 
|
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
102  | 
    """
 | 
103  | 
||
| 
4300.1.2
by John Arbash Meinel
 Change the pure-python compressor a bit.  | 
104  | 
_MIN_MATCH_BYTES = 10  | 
105  | 
_SOFT_MIN_MATCH_BYTES = 200  | 
|
106  | 
||
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
107  | 
def __init__(self, lines):  | 
| 
3735.40.13
by John Arbash Meinel
 Rename EquivalenceTable to LinesDeltaIndex.  | 
108  | 
self.lines = []  | 
| 
3735.40.6
by John Arbash Meinel
 Move more information down into EquivalenceTable.  | 
109  | 
self.line_offsets = []  | 
| 
3735.40.13
by John Arbash Meinel
 Rename EquivalenceTable to LinesDeltaIndex.  | 
110  | 
self.endpoint = 0  | 
111  | 
self._matching_lines = {}  | 
|
112  | 
self.extend_lines(lines, [True]*len(lines))  | 
|
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
113  | 
|
114  | 
def _update_matching_lines(self, new_lines, index):  | 
|
115  | 
matches = self._matching_lines  | 
|
116  | 
start_idx = len(self.lines)  | 
|
| 
3735.40.15
by John Arbash Meinel
 Some cleanup passes over the LinesDeltaIndex code.  | 
117  | 
if len(new_lines) != len(index):  | 
118  | 
raise AssertionError('The number of lines to be indexed does'  | 
|
119  | 
' not match the index/don\'t index flags: %d != %d'  | 
|
120  | 
% (len(new_lines), len(index)))  | 
|
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
121  | 
for idx, do_index in enumerate(index):  | 
122  | 
if not do_index:  | 
|
123  | 
                continue
 | 
|
| 
4300.1.4
by John Arbash Meinel
 Change self._matching_lines to use a set rather than a list.  | 
124  | 
line = new_lines[idx]  | 
125  | 
try:  | 
|
126  | 
matches[line].add(start_idx + idx)  | 
|
127  | 
except KeyError:  | 
|
128  | 
matches[line] = set([start_idx + idx])  | 
|
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
129  | 
|
130  | 
def get_matches(self, line):  | 
|
131  | 
"""Return the lines which match the line in right."""  | 
|
132  | 
try:  | 
|
133  | 
return self._matching_lines[line]  | 
|
134  | 
except KeyError:  | 
|
135  | 
return None  | 
|
136  | 
||
| 
4300.1.5
by John Arbash Meinel
 A couple more cleanups on the pure-python implementation.  | 
137  | 
def _get_longest_match(self, lines, pos):  | 
| 
3735.40.15
by John Arbash Meinel
 Some cleanup passes over the LinesDeltaIndex code.  | 
138  | 
"""Look at all matches for the current line, return the longest.  | 
139  | 
||
140  | 
        :param lines: The lines we are matching against
 | 
|
141  | 
        :param pos: The current location we care about
 | 
|
142  | 
        :param locations: A list of lines that matched the current location.
 | 
|
143  | 
            This may be None, but often we'll have already found matches for
 | 
|
144  | 
            this line.
 | 
|
145  | 
        :return: (start_in_self, start_in_lines, num_lines)
 | 
|
146  | 
            All values are the offset in the list (aka the line number)
 | 
|
147  | 
            If start_in_self is None, then we have no matches, and this line
 | 
|
148  | 
            should be inserted in the target.
 | 
|
149  | 
        """
 | 
|
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
150  | 
range_start = pos  | 
151  | 
range_len = 0  | 
|
| 
4300.1.5
by John Arbash Meinel
 A couple more cleanups on the pure-python implementation.  | 
152  | 
prev_locations = None  | 
| 
3735.40.15
by John Arbash Meinel
 Some cleanup passes over the LinesDeltaIndex code.  | 
153  | 
max_pos = len(lines)  | 
| 
4300.1.4
by John Arbash Meinel
 Change self._matching_lines to use a set rather than a list.  | 
154  | 
matching = self._matching_lines  | 
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
155  | 
while pos < max_pos:  | 
| 
4300.1.5
by John Arbash Meinel
 A couple more cleanups on the pure-python implementation.  | 
156  | 
try:  | 
157  | 
locations = matching[lines[pos]]  | 
|
158  | 
except KeyError:  | 
|
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
159  | 
                # No more matches, just return whatever we have, but we know
 | 
160  | 
                # that this last position is not going to match anything
 | 
|
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
161  | 
pos += 1  | 
162  | 
                break
 | 
|
| 
4300.1.5
by John Arbash Meinel
 A couple more cleanups on the pure-python implementation.  | 
163  | 
            # We have a match
 | 
164  | 
if prev_locations is None:  | 
|
165  | 
                # This is the first match in a range
 | 
|
166  | 
prev_locations = locations  | 
|
167  | 
range_len = 1  | 
|
168  | 
locations = None # Consumed  | 
|
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
169  | 
else:  | 
| 
4300.1.5
by John Arbash Meinel
 A couple more cleanups on the pure-python implementation.  | 
170  | 
                # We have a match started, compare to see if any of the
 | 
171  | 
                # current matches can be continued
 | 
|
172  | 
next_locations = locations.intersection([loc + 1 for loc  | 
|
173  | 
in prev_locations])  | 
|
174  | 
if next_locations:  | 
|
175  | 
                    # At least one of the regions continues to match
 | 
|
176  | 
prev_locations = set(next_locations)  | 
|
177  | 
range_len += 1  | 
|
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
178  | 
locations = None # Consumed  | 
179  | 
else:  | 
|
| 
4300.1.5
by John Arbash Meinel
 A couple more cleanups on the pure-python implementation.  | 
180  | 
                    # All current regions no longer match.
 | 
181  | 
                    # This line does still match something, just not at the
 | 
|
182  | 
                    # end of the previous matches. We will return locations
 | 
|
183  | 
                    # so that we can avoid another _matching_lines lookup.
 | 
|
184  | 
                    break
 | 
|
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
185  | 
pos += 1  | 
| 
4300.1.5
by John Arbash Meinel
 A couple more cleanups on the pure-python implementation.  | 
186  | 
if prev_locations is None:  | 
| 
3735.40.15
by John Arbash Meinel
 Some cleanup passes over the LinesDeltaIndex code.  | 
187  | 
            # We have no matches, this is a pure insert
 | 
| 
4300.1.5
by John Arbash Meinel
 A couple more cleanups on the pure-python implementation.  | 
188  | 
return None, pos  | 
189  | 
smallest = min(prev_locations)  | 
|
190  | 
return (smallest - range_len + 1, range_start, range_len), pos  | 
|
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
191  | 
|
192  | 
def get_matching_blocks(self, lines, soft=False):  | 
|
193  | 
"""Return the ranges in lines which match self.lines.  | 
|
194  | 
||
195  | 
        :param lines: lines to compress
 | 
|
196  | 
        :return: A list of (old_start, new_start, length) tuples which reflect
 | 
|
197  | 
            a region in self.lines that is present in lines.  The last element
 | 
|
198  | 
            of the list is always (old_len, new_len, 0) to provide a end point
 | 
|
199  | 
            for generating instructions from the matching blocks list.
 | 
|
200  | 
        """
 | 
|
| 
3735.40.15
by John Arbash Meinel
 Some cleanup passes over the LinesDeltaIndex code.  | 
201  | 
        # In this code, we iterate over multiple _get_longest_match calls, to
 | 
202  | 
        # find the next longest copy, and possible insert regions. We then
 | 
|
203  | 
        # convert that to the simple matching_blocks representation, since
 | 
|
204  | 
        # otherwise inserting 10 lines in a row would show up as 10
 | 
|
205  | 
        # instructions.
 | 
|
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
206  | 
result = []  | 
207  | 
pos = 0  | 
|
208  | 
max_pos = len(lines)  | 
|
209  | 
result_append = result.append  | 
|
| 
4300.1.2
by John Arbash Meinel
 Change the pure-python compressor a bit.  | 
210  | 
min_match_bytes = self._MIN_MATCH_BYTES  | 
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
211  | 
if soft:  | 
| 
4300.1.2
by John Arbash Meinel
 Change the pure-python compressor a bit.  | 
212  | 
min_match_bytes = self._SOFT_MIN_MATCH_BYTES  | 
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
213  | 
while pos < max_pos:  | 
| 
4300.1.5
by John Arbash Meinel
 A couple more cleanups on the pure-python implementation.  | 
214  | 
block, pos = self._get_longest_match(lines, pos)  | 
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
215  | 
if block is not None:  | 
| 
3735.40.14
by John Arbash Meinel
 Get rid of the self._right_lines state. It doesn't matter anymore.  | 
216  | 
                # Check to see if we match fewer than min_match_bytes. As we
 | 
217  | 
                # will turn this into a pure 'insert', rather than a copy.
 | 
|
218  | 
                # block[-1] is the number of lines. A quick check says if we
 | 
|
219  | 
                # have more lines than min_match_bytes, then we know we have
 | 
|
220  | 
                # enough bytes.
 | 
|
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
221  | 
if block[-1] < min_match_bytes:  | 
222  | 
                    # This block may be a 'short' block, check
 | 
|
223  | 
old_start, new_start, range_len = block  | 
|
224  | 
matched_bytes = sum(map(len,  | 
|
225  | 
lines[new_start:new_start + range_len]))  | 
|
226  | 
if matched_bytes < min_match_bytes:  | 
|
227  | 
block = None  | 
|
228  | 
if block is not None:  | 
|
229  | 
result_append(block)  | 
|
230  | 
result_append((len(self.lines), len(lines), 0))  | 
|
231  | 
return result  | 
|
232  | 
||
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
233  | 
def extend_lines(self, lines, index):  | 
234  | 
"""Add more lines to the left-lines list.  | 
|
235  | 
||
236  | 
        :param lines: A list of lines to add
 | 
|
237  | 
        :param index: A True/False for each node to define if it should be
 | 
|
238  | 
            indexed.
 | 
|
239  | 
        """
 | 
|
240  | 
self._update_matching_lines(lines, index)  | 
|
241  | 
self.lines.extend(lines)  | 
|
| 
3735.40.6
by John Arbash Meinel
 Move more information down into EquivalenceTable.  | 
242  | 
endpoint = self.endpoint  | 
243  | 
for line in lines:  | 
|
244  | 
endpoint += len(line)  | 
|
245  | 
self.line_offsets.append(endpoint)  | 
|
| 
3735.40.15
by John Arbash Meinel
 Some cleanup passes over the LinesDeltaIndex code.  | 
246  | 
if len(self.line_offsets) != len(self.lines):  | 
247  | 
raise AssertionError('Somehow the line offset indicator'  | 
|
248  | 
' got out of sync with the line counter.')  | 
|
| 
3735.40.6
by John Arbash Meinel
 Move more information down into EquivalenceTable.  | 
249  | 
self.endpoint = endpoint  | 
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
250  | 
|
251  | 
def _flush_insert(self, start_linenum, end_linenum,  | 
|
252  | 
new_lines, out_lines, index_lines):  | 
|
253  | 
"""Add an 'insert' request to the data stream."""  | 
|
254  | 
bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])  | 
|
255  | 
insert_length = len(bytes_to_insert)  | 
|
256  | 
        # Each insert instruction is at most 127 bytes long
 | 
|
257  | 
for start_byte in xrange(0, insert_length, 127):  | 
|
258  | 
insert_count = min(insert_length - start_byte, 127)  | 
|
259  | 
out_lines.append(chr(insert_count))  | 
|
260  | 
            # Don't index the 'insert' instruction
 | 
|
261  | 
index_lines.append(False)  | 
|
262  | 
insert = bytes_to_insert[start_byte:start_byte+insert_count]  | 
|
263  | 
as_lines = osutils.split_lines(insert)  | 
|
264  | 
out_lines.extend(as_lines)  | 
|
265  | 
index_lines.extend([True]*len(as_lines))  | 
|
266  | 
||
267  | 
def _flush_copy(self, old_start_linenum, num_lines,  | 
|
268  | 
out_lines, index_lines):  | 
|
269  | 
if old_start_linenum == 0:  | 
|
270  | 
first_byte = 0  | 
|
271  | 
else:  | 
|
272  | 
first_byte = self.line_offsets[old_start_linenum - 1]  | 
|
273  | 
stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]  | 
|
274  | 
num_bytes = stop_byte - first_byte  | 
|
275  | 
        # The data stream allows >64kB in a copy, but to match the compiled
 | 
|
276  | 
        # code, we will also limit it to a 64kB copy
 | 
|
277  | 
for start_byte in xrange(first_byte, stop_byte, 64*1024):  | 
|
| 
4241.20.1
by John Arbash Meinel
 Cherrypick the python groupcompress fix for bzr-1.14-final  | 
278  | 
num_bytes = min(64*1024, stop_byte - start_byte)  | 
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
279  | 
copy_bytes = encode_copy_instruction(start_byte, num_bytes)  | 
280  | 
out_lines.append(copy_bytes)  | 
|
281  | 
index_lines.append(False)  | 
|
282  | 
||
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
283  | 
def make_delta(self, new_lines, bytes_length=None, soft=False):  | 
| 
3735.40.7
by John Arbash Meinel
 Move even more functionality into EquivalenceTable.  | 
284  | 
"""Compute the delta for this content versus the original content."""  | 
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
285  | 
if bytes_length is None:  | 
286  | 
bytes_length = sum(map(len, new_lines))  | 
|
287  | 
        # reserved for content type, content length
 | 
|
288  | 
out_lines = ['', '', encode_base128_int(bytes_length)]  | 
|
| 
3735.40.10
by John Arbash Meinel
 Merge in the new delta format code.  | 
289  | 
index_lines = [False, False, False]  | 
| 
4300.1.2
by John Arbash Meinel
 Change the pure-python compressor a bit.  | 
290  | 
output_handler = _OutputHandler(out_lines, index_lines,  | 
291  | 
self._MIN_MATCH_BYTES)  | 
|
| 
3735.40.7
by John Arbash Meinel
 Move even more functionality into EquivalenceTable.  | 
292  | 
blocks = self.get_matching_blocks(new_lines, soft=soft)  | 
293  | 
current_line_num = 0  | 
|
294  | 
        # We either copy a range (while there are reusable lines) or we
 | 
|
295  | 
        # insert new lines. To find reusable lines we traverse
 | 
|
296  | 
for old_start, new_start, range_len in blocks:  | 
|
297  | 
if new_start != current_line_num:  | 
|
298  | 
                # non-matching region, insert the content
 | 
|
| 
4300.1.2
by John Arbash Meinel
 Change the pure-python compressor a bit.  | 
299  | 
output_handler.add_insert(new_lines[current_line_num:new_start])  | 
| 
3735.40.7
by John Arbash Meinel
 Move even more functionality into EquivalenceTable.  | 
300  | 
current_line_num = new_start + range_len  | 
301  | 
if range_len:  | 
|
| 
4300.1.2
by John Arbash Meinel
 Change the pure-python compressor a bit.  | 
302  | 
                # Convert the line based offsets into byte based offsets
 | 
303  | 
if old_start == 0:  | 
|
304  | 
first_byte = 0  | 
|
305  | 
else:  | 
|
306  | 
first_byte = self.line_offsets[old_start - 1]  | 
|
307  | 
last_byte = self.line_offsets[old_start + range_len - 1]  | 
|
308  | 
output_handler.add_copy(first_byte, last_byte)  | 
|
| 
3735.40.7
by John Arbash Meinel
 Move even more functionality into EquivalenceTable.  | 
309  | 
return out_lines, index_lines  | 
310  | 
||
311  | 
||
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
312  | 
def encode_base128_int(val):  | 
313  | 
"""Convert an integer into a 7-bit lsb encoding."""  | 
|
314  | 
bytes = []  | 
|
315  | 
count = 0  | 
|
316  | 
while val >= 0x80:  | 
|
317  | 
bytes.append(chr((val | 0x80) & 0xFF))  | 
|
318  | 
val >>= 7  | 
|
319  | 
bytes.append(chr(val))  | 
|
320  | 
return ''.join(bytes)  | 
|
321  | 
||
322  | 
||
323  | 
def decode_base128_int(bytes):  | 
|
324  | 
"""Decode an integer from a 7-bit lsb encoding."""  | 
|
325  | 
offset = 0  | 
|
326  | 
val = 0  | 
|
327  | 
shift = 0  | 
|
328  | 
bval = ord(bytes[offset])  | 
|
329  | 
while bval >= 0x80:  | 
|
330  | 
val |= (bval & 0x7F) << shift  | 
|
331  | 
shift += 7  | 
|
332  | 
offset += 1  | 
|
333  | 
bval = ord(bytes[offset])  | 
|
334  | 
val |= bval << shift  | 
|
335  | 
offset += 1  | 
|
336  | 
return val, offset  | 
|
337  | 
||
338  | 
||
| 
3735.40.7
by John Arbash Meinel
 Move even more functionality into EquivalenceTable.  | 
339  | 
def encode_copy_instruction(offset, length):  | 
340  | 
"""Convert this offset into a control code and bytes."""  | 
|
341  | 
copy_command = 0x80  | 
|
342  | 
copy_bytes = [None]  | 
|
343  | 
||
344  | 
for copy_bit in (0x01, 0x02, 0x04, 0x08):  | 
|
345  | 
base_byte = offset & 0xff  | 
|
346  | 
if base_byte:  | 
|
347  | 
copy_command |= copy_bit  | 
|
348  | 
copy_bytes.append(chr(base_byte))  | 
|
349  | 
offset >>= 8  | 
|
350  | 
if length is None:  | 
|
| 
4300.2.1
by John Arbash Meinel
 Fix bug #364900, properly remove the 64kB that was just encoded in the copy.  | 
351  | 
raise ValueError("cannot supply a length of None")  | 
| 
3735.40.7
by John Arbash Meinel
 Move even more functionality into EquivalenceTable.  | 
352  | 
if length > 0x10000:  | 
353  | 
raise ValueError("we don't emit copy records for lengths > 64KiB")  | 
|
354  | 
if length == 0:  | 
|
355  | 
raise ValueError("We cannot emit a copy of length 0")  | 
|
356  | 
if length != 0x10000:  | 
|
357  | 
        # A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
 | 
|
358  | 
        # since that saves bytes for large chained copies
 | 
|
359  | 
for copy_bit in (0x10, 0x20):  | 
|
360  | 
base_byte = length & 0xff  | 
|
361  | 
if base_byte:  | 
|
362  | 
copy_command |= copy_bit  | 
|
363  | 
copy_bytes.append(chr(base_byte))  | 
|
364  | 
length >>= 8  | 
|
365  | 
copy_bytes[0] = chr(copy_command)  | 
|
366  | 
return ''.join(copy_bytes)  | 
|
367  | 
||
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
368  | 
|
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
369  | 
def decode_copy_instruction(bytes, cmd, pos):  | 
370  | 
"""Decode a copy instruction from the next few bytes.  | 
|
371  | 
||
372  | 
    A copy instruction is a variable number of bytes, so we will parse the
 | 
|
373  | 
    bytes we care about, and return the new position, as well as the offset and
 | 
|
374  | 
    length referred to in the bytes.
 | 
|
375  | 
||
376  | 
    :param bytes: A string of bytes
 | 
|
377  | 
    :param cmd: The command code
 | 
|
378  | 
    :param pos: The position in bytes right after the copy command
 | 
|
379  | 
    :return: (offset, length, newpos)
 | 
|
380  | 
        The offset of the copy start, the number of bytes to copy, and the
 | 
|
381  | 
        position after the last byte of the copy
 | 
|
382  | 
    """
 | 
|
383  | 
if cmd & 0x80 != 0x80:  | 
|
384  | 
raise ValueError('copy instructions must have bit 0x80 set')  | 
|
385  | 
offset = 0  | 
|
386  | 
length = 0  | 
|
387  | 
if (cmd & 0x01):  | 
|
388  | 
offset = ord(bytes[pos])  | 
|
389  | 
pos += 1  | 
|
390  | 
if (cmd & 0x02):  | 
|
391  | 
offset = offset | (ord(bytes[pos]) << 8)  | 
|
392  | 
pos += 1  | 
|
393  | 
if (cmd & 0x04):  | 
|
394  | 
offset = offset | (ord(bytes[pos]) << 16)  | 
|
395  | 
pos += 1  | 
|
396  | 
if (cmd & 0x08):  | 
|
397  | 
offset = offset | (ord(bytes[pos]) << 24)  | 
|
398  | 
pos += 1  | 
|
399  | 
if (cmd & 0x10):  | 
|
400  | 
length = ord(bytes[pos])  | 
|
401  | 
pos += 1  | 
|
402  | 
if (cmd & 0x20):  | 
|
403  | 
length = length | (ord(bytes[pos]) << 8)  | 
|
404  | 
pos += 1  | 
|
405  | 
if (cmd & 0x40):  | 
|
406  | 
length = length | (ord(bytes[pos]) << 16)  | 
|
407  | 
pos += 1  | 
|
408  | 
if length == 0:  | 
|
409  | 
length = 65536  | 
|
410  | 
return (offset, length, pos)  | 
|
411  | 
||
| 
3735.40.5
by John Arbash Meinel
 Start adding permutation tests for _groupcompress_py and _groupcompress_pyx  | 
412  | 
|
413  | 
def make_delta(source_bytes, target_bytes):  | 
|
414  | 
"""Create a delta from source to target."""  | 
|
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
415  | 
if type(source_bytes) is not str:  | 
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
416  | 
raise TypeError('source is not a str')  | 
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
417  | 
if type(target_bytes) is not str:  | 
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
418  | 
raise TypeError('target is not a str')  | 
| 
3735.40.13
by John Arbash Meinel
 Rename EquivalenceTable to LinesDeltaIndex.  | 
419  | 
line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))  | 
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
420  | 
delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),  | 
421  | 
bytes_length=len(target_bytes))  | 
|
422  | 
return ''.join(delta)  | 
|
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
423  | 
|
424  | 
||
425  | 
def apply_delta(basis, delta):  | 
|
426  | 
"""Apply delta to this object to become new_version_id."""  | 
|
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
427  | 
if type(basis) is not str:  | 
428  | 
raise TypeError('basis is not a str')  | 
|
429  | 
if type(delta) is not str:  | 
|
430  | 
raise TypeError('delta is not a str')  | 
|
431  | 
target_length, pos = decode_base128_int(delta)  | 
|
| 
4241.6.6
by Robert Collins, John Arbash Meinel, Ian Clathworthy, Vincent Ladeuil
 Groupcompress from brisbane-core.  | 
432  | 
lines = []  | 
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
433  | 
len_delta = len(delta)  | 
434  | 
while pos < len_delta:  | 
|
435  | 
cmd = ord(delta[pos])  | 
|
436  | 
pos += 1  | 
|
437  | 
if cmd & 0x80:  | 
|
438  | 
offset, length, pos = decode_copy_instruction(delta, cmd, pos)  | 
|
| 
3735.40.19
by John Arbash Meinel
 Implement apply_delta_to_source which doesn't have to malloc another string.  | 
439  | 
last = offset + length  | 
440  | 
if last > len(basis):  | 
|
441  | 
raise ValueError('data would copy bytes past the'  | 
|
442  | 
'end of source')  | 
|
443  | 
lines.append(basis[offset:last])  | 
|
| 
3735.40.11
by John Arbash Meinel
 Implement make_delta and apply_delta.  | 
444  | 
else: # Insert of 'cmd' bytes  | 
445  | 
if cmd == 0:  | 
|
446  | 
raise ValueError('Command == 0 not supported yet')  | 
|
447  | 
lines.append(delta[pos:pos+cmd])  | 
|
448  | 
pos += cmd  | 
|
449  | 
bytes = ''.join(lines)  | 
|
450  | 
if len(bytes) != target_length:  | 
|
451  | 
raise ValueError('Delta claimed to be %d long, but ended up'  | 
|
452  | 
' %d long' % (target_length, len(bytes)))  | 
|
453  | 
return bytes  | 
|
| 
3735.40.19
by John Arbash Meinel
 Implement apply_delta_to_source which doesn't have to malloc another string.  | 
454  | 
|
455  | 
||
456  | 
def apply_delta_to_source(source, delta_start, delta_end):  | 
|
457  | 
"""Extract a delta from source bytes, and apply it."""  | 
|
458  | 
source_size = len(source)  | 
|
459  | 
if delta_start >= source_size:  | 
|
460  | 
raise ValueError('delta starts after source')  | 
|
461  | 
if delta_end > source_size:  | 
|
462  | 
raise ValueError('delta ends after source')  | 
|
463  | 
if delta_start >= delta_end:  | 
|
464  | 
raise ValueError('delta starts after it ends')  | 
|
465  | 
delta_bytes = source[delta_start:delta_end]  | 
|
466  | 
return apply_delta(source, delta_bytes)  |