1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Python version of compiled extensions for doing compression.
19
We separate the implementation from the groupcompress.py to avoid importing
23
from bzrlib import osutils
26
class _OutputHandler(object):
27
"""A simple class which just tracks how to split up an insert request."""
29
def __init__(self, out_lines, index_lines, min_len_to_index):
30
self.out_lines = out_lines
31
self.index_lines = index_lines
32
self.min_len_to_index = min_len_to_index
33
self.cur_insert_lines = []
34
self.cur_insert_len = 0
36
def add_copy(self, start_byte, end_byte):
37
# The data stream allows >64kB in a copy, but to match the compiled
38
# code, we will also limit it to a 64kB copy
39
for start_byte in xrange(start_byte, end_byte, 64*1024):
40
num_bytes = min(64*1024, end_byte - start_byte)
41
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
42
self.out_lines.append(copy_bytes)
43
self.index_lines.append(False)
45
def _flush_insert(self):
46
if not self.cur_insert_lines:
48
if self.cur_insert_len > 127:
49
raise AssertionError('We cannot insert more than 127 bytes'
51
self.out_lines.append(chr(self.cur_insert_len))
52
self.index_lines.append(False)
53
self.out_lines.extend(self.cur_insert_lines)
54
if self.cur_insert_len < self.min_len_to_index:
55
self.index_lines.extend([False]*len(self.cur_insert_lines))
57
self.index_lines.extend([True]*len(self.cur_insert_lines))
58
self.cur_insert_lines = []
59
self.cur_insert_len = 0
61
def _insert_long_line(self, line):
62
# Flush out anything pending
65
for start_index in xrange(0, line_len, 127):
66
next_len = min(127, line_len - start_index)
67
self.out_lines.append(chr(next_len))
68
self.index_lines.append(False)
69
self.out_lines.append(line[start_index:start_index+next_len])
70
# We don't index long lines, because we won't be able to match
71
# a line split across multiple inserts anway
72
self.index_lines.append(False)
74
def add_insert(self, lines):
75
if self.cur_insert_lines != []:
76
raise AssertionError('self.cur_insert_lines must be empty when'
77
' adding a new insert')
80
self._insert_long_line(line)
82
next_len = len(line) + self.cur_insert_len
84
# Adding this line would overflow, so flush, and start over
86
self.cur_insert_lines = [line]
87
self.cur_insert_len = len(line)
89
self.cur_insert_lines.append(line)
90
self.cur_insert_len = next_len
94
class LinesDeltaIndex(object):
95
"""This class indexes matches between strings.
97
:ivar lines: The 'static' lines that will be preserved between runs.
98
:ivar _matching_lines: A dict of {line:[matching offsets]}
99
:ivar line_offsets: The byte offset for the end of each line, used to
100
quickly map between a matching line number and the byte location
101
:ivar endpoint: The total number of bytes in self.line_offsets
104
_MIN_MATCH_BYTES = 10
105
_SOFT_MIN_MATCH_BYTES = 200
107
def __init__(self, lines):
109
self.line_offsets = []
111
self._matching_lines = {}
112
self.extend_lines(lines, [True]*len(lines))
114
def _update_matching_lines(self, new_lines, index):
115
matches = self._matching_lines
116
start_idx = len(self.lines)
117
if len(new_lines) != len(index):
118
raise AssertionError('The number of lines to be indexed does'
119
' not match the index/don\'t index flags: %d != %d'
120
% (len(new_lines), len(index)))
121
for idx, do_index in enumerate(index):
124
line = new_lines[idx]
126
matches[line].add(start_idx + idx)
128
matches[line] = set([start_idx + idx])
130
def get_matches(self, line):
131
"""Return the lines which match the line in right."""
133
return self._matching_lines[line]
137
def _get_longest_match(self, lines, pos):
138
"""Look at all matches for the current line, return the longest.
140
:param lines: The lines we are matching against
141
:param pos: The current location we care about
142
:param locations: A list of lines that matched the current location.
143
This may be None, but often we'll have already found matches for
145
:return: (start_in_self, start_in_lines, num_lines)
146
All values are the offset in the list (aka the line number)
147
If start_in_self is None, then we have no matches, and this line
148
should be inserted in the target.
152
prev_locations = None
154
matching = self._matching_lines
157
locations = matching[lines[pos]]
159
# No more matches, just return whatever we have, but we know
160
# that this last position is not going to match anything
164
if prev_locations is None:
165
# This is the first match in a range
166
prev_locations = locations
168
locations = None # Consumed
170
# We have a match started, compare to see if any of the
171
# current matches can be continued
172
next_locations = locations.intersection([loc + 1 for loc
175
# At least one of the regions continues to match
176
prev_locations = set(next_locations)
178
locations = None # Consumed
180
# All current regions no longer match.
181
# This line does still match something, just not at the
182
# end of the previous matches. We will return locations
183
# so that we can avoid another _matching_lines lookup.
186
if prev_locations is None:
187
# We have no matches, this is a pure insert
189
smallest = min(prev_locations)
190
return (smallest - range_len + 1, range_start, range_len), pos
192
def get_matching_blocks(self, lines, soft=False):
193
"""Return the ranges in lines which match self.lines.
195
:param lines: lines to compress
196
:return: A list of (old_start, new_start, length) tuples which reflect
197
a region in self.lines that is present in lines. The last element
198
of the list is always (old_len, new_len, 0) to provide a end point
199
for generating instructions from the matching blocks list.
201
# In this code, we iterate over multiple _get_longest_match calls, to
202
# find the next longest copy, and possible insert regions. We then
203
# convert that to the simple matching_blocks representation, since
204
# otherwise inserting 10 lines in a row would show up as 10
209
result_append = result.append
210
min_match_bytes = self._MIN_MATCH_BYTES
212
min_match_bytes = self._SOFT_MIN_MATCH_BYTES
214
block, pos = self._get_longest_match(lines, pos)
215
if block is not None:
216
# Check to see if we match fewer than min_match_bytes. As we
217
# will turn this into a pure 'insert', rather than a copy.
218
# block[-1] is the number of lines. A quick check says if we
219
# have more lines than min_match_bytes, then we know we have
221
if block[-1] < min_match_bytes:
222
# This block may be a 'short' block, check
223
old_start, new_start, range_len = block
224
matched_bytes = sum(map(len,
225
lines[new_start:new_start + range_len]))
226
if matched_bytes < min_match_bytes:
228
if block is not None:
230
result_append((len(self.lines), len(lines), 0))
233
def extend_lines(self, lines, index):
234
"""Add more lines to the left-lines list.
236
:param lines: A list of lines to add
237
:param index: A True/False for each node to define if it should be
240
self._update_matching_lines(lines, index)
241
self.lines.extend(lines)
242
endpoint = self.endpoint
244
endpoint += len(line)
245
self.line_offsets.append(endpoint)
246
if len(self.line_offsets) != len(self.lines):
247
raise AssertionError('Somehow the line offset indicator'
248
' got out of sync with the line counter.')
249
self.endpoint = endpoint
251
def make_delta(self, new_lines, bytes_length=None, soft=False):
252
"""Compute the delta for this content versus the original content."""
253
if bytes_length is None:
254
bytes_length = sum(map(len, new_lines))
255
# reserved for content type, content length
256
out_lines = ['', '', encode_base128_int(bytes_length)]
257
index_lines = [False, False, False]
258
output_handler = _OutputHandler(out_lines, index_lines,
259
self._MIN_MATCH_BYTES)
260
blocks = self.get_matching_blocks(new_lines, soft=soft)
262
# We either copy a range (while there are reusable lines) or we
263
# insert new lines. To find reusable lines we traverse
264
for old_start, new_start, range_len in blocks:
265
if new_start != current_line_num:
266
# non-matching region, insert the content
267
output_handler.add_insert(new_lines[current_line_num:new_start])
268
current_line_num = new_start + range_len
270
# Convert the line based offsets into byte based offsets
274
first_byte = self.line_offsets[old_start - 1]
275
last_byte = self.line_offsets[old_start + range_len - 1]
276
output_handler.add_copy(first_byte, last_byte)
277
return out_lines, index_lines
280
def encode_base128_int(val):
281
"""Convert an integer into a 7-bit lsb encoding."""
285
bytes.append(chr((val | 0x80) & 0xFF))
287
bytes.append(chr(val))
288
return ''.join(bytes)
291
def decode_base128_int(bytes):
292
"""Decode an integer from a 7-bit lsb encoding."""
296
bval = ord(bytes[offset])
298
val |= (bval & 0x7F) << shift
301
bval = ord(bytes[offset])
307
def encode_copy_instruction(offset, length):
308
"""Convert this offset into a control code and bytes."""
312
for copy_bit in (0x01, 0x02, 0x04, 0x08):
313
base_byte = offset & 0xff
315
copy_command |= copy_bit
316
copy_bytes.append(chr(base_byte))
319
raise ValueError("cannot supply a length of None")
321
raise ValueError("we don't emit copy records for lengths > 64KiB")
323
raise ValueError("We cannot emit a copy of length 0")
324
if length != 0x10000:
325
# A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
326
# since that saves bytes for large chained copies
327
for copy_bit in (0x10, 0x20):
328
base_byte = length & 0xff
330
copy_command |= copy_bit
331
copy_bytes.append(chr(base_byte))
333
copy_bytes[0] = chr(copy_command)
334
return ''.join(copy_bytes)
337
def decode_copy_instruction(bytes, cmd, pos):
338
"""Decode a copy instruction from the next few bytes.
340
A copy instruction is a variable number of bytes, so we will parse the
341
bytes we care about, and return the new position, as well as the offset and
342
length referred to in the bytes.
344
:param bytes: A string of bytes
345
:param cmd: The command code
346
:param pos: The position in bytes right after the copy command
347
:return: (offset, length, newpos)
348
The offset of the copy start, the number of bytes to copy, and the
349
position after the last byte of the copy
351
if cmd & 0x80 != 0x80:
352
raise ValueError('copy instructions must have bit 0x80 set')
356
offset = ord(bytes[pos])
359
offset = offset | (ord(bytes[pos]) << 8)
362
offset = offset | (ord(bytes[pos]) << 16)
365
offset = offset | (ord(bytes[pos]) << 24)
368
length = ord(bytes[pos])
371
length = length | (ord(bytes[pos]) << 8)
374
length = length | (ord(bytes[pos]) << 16)
378
return (offset, length, pos)
381
def make_delta(source_bytes, target_bytes):
382
"""Create a delta from source to target."""
383
if type(source_bytes) is not str:
384
raise TypeError('source is not a str')
385
if type(target_bytes) is not str:
386
raise TypeError('target is not a str')
387
line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))
388
delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),
389
bytes_length=len(target_bytes))
390
return ''.join(delta)
393
def apply_delta(basis, delta):
394
"""Apply delta to this object to become new_version_id."""
395
if type(basis) is not str:
396
raise TypeError('basis is not a str')
397
if type(delta) is not str:
398
raise TypeError('delta is not a str')
399
target_length, pos = decode_base128_int(delta)
401
len_delta = len(delta)
402
while pos < len_delta:
403
cmd = ord(delta[pos])
406
offset, length, pos = decode_copy_instruction(delta, cmd, pos)
407
last = offset + length
408
if last > len(basis):
409
raise ValueError('data would copy bytes past the'
411
lines.append(basis[offset:last])
412
else: # Insert of 'cmd' bytes
414
raise ValueError('Command == 0 not supported yet')
415
lines.append(delta[pos:pos+cmd])
417
bytes = ''.join(lines)
418
if len(bytes) != target_length:
419
raise ValueError('Delta claimed to be %d long, but ended up'
420
' %d long' % (target_length, len(bytes)))
424
def apply_delta_to_source(source, delta_start, delta_end):
425
"""Extract a delta from source bytes, and apply it."""
426
source_size = len(source)
427
if delta_start >= source_size:
428
raise ValueError('delta starts after source')
429
if delta_end > source_size:
430
raise ValueError('delta ends after source')
431
if delta_start >= delta_end:
432
raise ValueError('delta starts after it ends')
433
delta_bytes = source[delta_start:delta_end]
434
return apply_delta(source, delta_bytes)