1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Python version of compiled extensions for doing compression.
19
We separate the implementation from the groupcompress.py to avoid importing
23
from __future__ import absolute_import
25
from .. import osutils
26
from ..sixish import (
33
class _OutputHandler(object):
34
"""A simple class which just tracks how to split up an insert request."""
36
def __init__(self, out_lines, index_lines, min_len_to_index):
37
self.out_lines = out_lines
38
self.index_lines = index_lines
39
self.min_len_to_index = min_len_to_index
40
self.cur_insert_lines = []
41
self.cur_insert_len = 0
43
def add_copy(self, start_byte, end_byte):
44
# The data stream allows >64kB in a copy, but to match the compiled
45
# code, we will also limit it to a 64kB copy
46
for start_byte in range(start_byte, end_byte, 64 * 1024):
47
num_bytes = min(64 * 1024, end_byte - start_byte)
48
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
49
self.out_lines.append(copy_bytes)
50
self.index_lines.append(False)
52
def _flush_insert(self):
53
if not self.cur_insert_lines:
55
if self.cur_insert_len > 127:
56
raise AssertionError('We cannot insert more than 127 bytes'
58
self.out_lines.append(int2byte(self.cur_insert_len))
59
self.index_lines.append(False)
60
self.out_lines.extend(self.cur_insert_lines)
61
if self.cur_insert_len < self.min_len_to_index:
62
self.index_lines.extend([False] * len(self.cur_insert_lines))
64
self.index_lines.extend([True] * len(self.cur_insert_lines))
65
self.cur_insert_lines = []
66
self.cur_insert_len = 0
68
def _insert_long_line(self, line):
69
# Flush out anything pending
72
for start_index in range(0, line_len, 127):
73
next_len = min(127, line_len - start_index)
74
self.out_lines.append(int2byte(next_len))
75
self.index_lines.append(False)
76
self.out_lines.append(line[start_index:start_index + next_len])
77
# We don't index long lines, because we won't be able to match
78
# a line split across multiple inserts anway
79
self.index_lines.append(False)
81
def add_insert(self, lines):
82
if self.cur_insert_lines != []:
83
raise AssertionError('self.cur_insert_lines must be empty when'
84
' adding a new insert')
87
self._insert_long_line(line)
89
next_len = len(line) + self.cur_insert_len
91
# Adding this line would overflow, so flush, and start over
93
self.cur_insert_lines = [line]
94
self.cur_insert_len = len(line)
96
self.cur_insert_lines.append(line)
97
self.cur_insert_len = next_len
101
class LinesDeltaIndex(object):
102
"""This class indexes matches between strings.
104
:ivar lines: The 'static' lines that will be preserved between runs.
105
:ivar _matching_lines: A dict of {line:[matching offsets]}
106
:ivar line_offsets: The byte offset for the end of each line, used to
107
quickly map between a matching line number and the byte location
108
:ivar endpoint: The total number of bytes in self.line_offsets
111
_MIN_MATCH_BYTES = 10
112
_SOFT_MIN_MATCH_BYTES = 200
114
def __init__(self, lines):
116
self.line_offsets = []
118
self._matching_lines = {}
119
self.extend_lines(lines, [True] * len(lines))
121
def _update_matching_lines(self, new_lines, index):
122
matches = self._matching_lines
123
start_idx = len(self.lines)
124
if len(new_lines) != len(index):
125
raise AssertionError('The number of lines to be indexed does'
126
' not match the index/don\'t index flags: %d != %d'
127
% (len(new_lines), len(index)))
128
for idx, do_index in enumerate(index):
131
line = new_lines[idx]
133
matches[line].add(start_idx + idx)
135
matches[line] = {start_idx + idx}
137
def get_matches(self, line):
138
"""Return the lines which match the line in right."""
140
return self._matching_lines[line]
144
def _get_longest_match(self, lines, pos):
145
"""Look at all matches for the current line, return the longest.
147
:param lines: The lines we are matching against
148
:param pos: The current location we care about
149
:param locations: A list of lines that matched the current location.
150
This may be None, but often we'll have already found matches for
152
:return: (start_in_self, start_in_lines, num_lines)
153
All values are the offset in the list (aka the line number)
154
If start_in_self is None, then we have no matches, and this line
155
should be inserted in the target.
159
prev_locations = None
161
matching = self._matching_lines
164
locations = matching[lines[pos]]
166
# No more matches, just return whatever we have, but we know
167
# that this last position is not going to match anything
171
if prev_locations is None:
172
# This is the first match in a range
173
prev_locations = locations
175
locations = None # Consumed
177
# We have a match started, compare to see if any of the
178
# current matches can be continued
179
next_locations = locations.intersection([loc + 1 for loc
182
# At least one of the regions continues to match
183
prev_locations = set(next_locations)
185
locations = None # Consumed
187
# All current regions no longer match.
188
# This line does still match something, just not at the
189
# end of the previous matches. We will return locations
190
# so that we can avoid another _matching_lines lookup.
193
if prev_locations is None:
194
# We have no matches, this is a pure insert
196
smallest = min(prev_locations)
197
return (smallest - range_len + 1, range_start, range_len), pos
199
def get_matching_blocks(self, lines, soft=False):
200
"""Return the ranges in lines which match self.lines.
202
:param lines: lines to compress
203
:return: A list of (old_start, new_start, length) tuples which reflect
204
a region in self.lines that is present in lines. The last element
205
of the list is always (old_len, new_len, 0) to provide a end point
206
for generating instructions from the matching blocks list.
208
# In this code, we iterate over multiple _get_longest_match calls, to
209
# find the next longest copy, and possible insert regions. We then
210
# convert that to the simple matching_blocks representation, since
211
# otherwise inserting 10 lines in a row would show up as 10
216
result_append = result.append
217
min_match_bytes = self._MIN_MATCH_BYTES
219
min_match_bytes = self._SOFT_MIN_MATCH_BYTES
221
block, pos = self._get_longest_match(lines, pos)
222
if block is not None:
223
# Check to see if we match fewer than min_match_bytes. As we
224
# will turn this into a pure 'insert', rather than a copy.
225
# block[-1] is the number of lines. A quick check says if we
226
# have more lines than min_match_bytes, then we know we have
228
if block[-1] < min_match_bytes:
229
# This block may be a 'short' block, check
230
old_start, new_start, range_len = block
231
matched_bytes = sum(map(len,
232
lines[new_start:new_start + range_len]))
233
if matched_bytes < min_match_bytes:
235
if block is not None:
237
result_append((len(self.lines), len(lines), 0))
240
def extend_lines(self, lines, index):
241
"""Add more lines to the left-lines list.
243
:param lines: A list of lines to add
244
:param index: A True/False for each node to define if it should be
247
self._update_matching_lines(lines, index)
248
self.lines.extend(lines)
249
endpoint = self.endpoint
251
endpoint += len(line)
252
self.line_offsets.append(endpoint)
253
if len(self.line_offsets) != len(self.lines):
254
raise AssertionError('Somehow the line offset indicator'
255
' got out of sync with the line counter.')
256
self.endpoint = endpoint
258
def _flush_insert(self, start_linenum, end_linenum,
259
new_lines, out_lines, index_lines):
260
"""Add an 'insert' request to the data stream."""
261
bytes_to_insert = b''.join(new_lines[start_linenum:end_linenum])
262
insert_length = len(bytes_to_insert)
263
# Each insert instruction is at most 127 bytes long
264
for start_byte in range(0, insert_length, 127):
265
insert_count = min(insert_length - start_byte, 127)
266
out_lines.append(int2byte(insert_count))
267
# Don't index the 'insert' instruction
268
index_lines.append(False)
269
insert = bytes_to_insert[start_byte:start_byte + insert_count]
270
as_lines = osutils.split_lines(insert)
271
out_lines.extend(as_lines)
272
index_lines.extend([True] * len(as_lines))
274
def _flush_copy(self, old_start_linenum, num_lines,
275
out_lines, index_lines):
276
if old_start_linenum == 0:
279
first_byte = self.line_offsets[old_start_linenum - 1]
280
stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]
281
num_bytes = stop_byte - first_byte
282
# The data stream allows >64kB in a copy, but to match the compiled
283
# code, we will also limit it to a 64kB copy
284
for start_byte in range(first_byte, stop_byte, 64 * 1024):
285
num_bytes = min(64 * 1024, stop_byte - start_byte)
286
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
287
out_lines.append(copy_bytes)
288
index_lines.append(False)
290
def make_delta(self, new_lines, bytes_length=None, soft=False):
291
"""Compute the delta for this content versus the original content."""
292
if bytes_length is None:
293
bytes_length = sum(map(len, new_lines))
294
# reserved for content type, content length
295
out_lines = [b'', b'', encode_base128_int(bytes_length)]
296
index_lines = [False, False, False]
297
output_handler = _OutputHandler(out_lines, index_lines,
298
self._MIN_MATCH_BYTES)
299
blocks = self.get_matching_blocks(new_lines, soft=soft)
301
# We either copy a range (while there are reusable lines) or we
302
# insert new lines. To find reusable lines we traverse
303
for old_start, new_start, range_len in blocks:
304
if new_start != current_line_num:
305
# non-matching region, insert the content
306
output_handler.add_insert(
307
new_lines[current_line_num:new_start])
308
current_line_num = new_start + range_len
310
# Convert the line based offsets into byte based offsets
314
first_byte = self.line_offsets[old_start - 1]
315
last_byte = self.line_offsets[old_start + range_len - 1]
316
output_handler.add_copy(first_byte, last_byte)
317
return out_lines, index_lines
320
def encode_base128_int(val):
321
"""Convert an integer into a 7-bit lsb encoding."""
325
data.append((val | 0x80) & 0xFF)
331
def decode_base128_int(data):
332
"""Decode an integer from a 7-bit lsb encoding."""
336
bval = indexbytes(data, offset)
338
val |= (bval & 0x7F) << shift
341
bval = indexbytes(data, offset)
347
def encode_copy_instruction(offset, length):
348
"""Convert this offset into a control code and bytes."""
352
for copy_bit in (0x01, 0x02, 0x04, 0x08):
353
base_byte = offset & 0xff
355
copy_command |= copy_bit
356
copy_bytes.append(int2byte(base_byte))
359
raise ValueError("cannot supply a length of None")
361
raise ValueError("we don't emit copy records for lengths > 64KiB")
363
raise ValueError("We cannot emit a copy of length 0")
364
if length != 0x10000:
365
# A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
366
# since that saves bytes for large chained copies
367
for copy_bit in (0x10, 0x20):
368
base_byte = length & 0xff
370
copy_command |= copy_bit
371
copy_bytes.append(int2byte(base_byte))
373
copy_bytes[0] = int2byte(copy_command)
374
return b''.join(copy_bytes)
377
def decode_copy_instruction(bytes, cmd, pos):
378
"""Decode a copy instruction from the next few bytes.
380
A copy instruction is a variable number of bytes, so we will parse the
381
bytes we care about, and return the new position, as well as the offset and
382
length referred to in the bytes.
384
:param bytes: A string of bytes
385
:param cmd: The command code
386
:param pos: The position in bytes right after the copy command
387
:return: (offset, length, newpos)
388
The offset of the copy start, the number of bytes to copy, and the
389
position after the last byte of the copy
391
if cmd & 0x80 != 0x80:
392
raise ValueError('copy instructions must have bit 0x80 set')
396
offset = indexbytes(bytes, pos)
399
offset = offset | (indexbytes(bytes, pos) << 8)
402
offset = offset | (indexbytes(bytes, pos) << 16)
405
offset = offset | (indexbytes(bytes, pos) << 24)
408
length = indexbytes(bytes, pos)
411
length = length | (indexbytes(bytes, pos) << 8)
414
length = length | (indexbytes(bytes, pos) << 16)
418
return (offset, length, pos)
421
def make_delta(source_bytes, target_bytes):
422
"""Create a delta from source to target."""
423
if not isinstance(source_bytes, bytes):
424
raise TypeError('source is not bytes')
425
if not isinstance(target_bytes, bytes):
426
raise TypeError('target is not bytes')
427
line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))
428
delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),
429
bytes_length=len(target_bytes))
430
return b''.join(delta)
433
def apply_delta(basis, delta):
434
"""Apply delta to this object to become new_version_id."""
435
if not isinstance(basis, bytes):
436
raise TypeError('basis is not bytes')
437
if not isinstance(delta, bytes):
438
raise TypeError('delta is not bytes')
439
target_length, pos = decode_base128_int(delta)
441
len_delta = len(delta)
442
while pos < len_delta:
443
cmd = indexbytes(delta, pos)
446
offset, length, pos = decode_copy_instruction(delta, cmd, pos)
447
last = offset + length
448
if last > len(basis):
449
raise ValueError('data would copy bytes past the'
451
lines.append(basis[offset:last])
452
else: # Insert of 'cmd' bytes
454
raise ValueError('Command == 0 not supported yet')
455
lines.append(delta[pos:pos + cmd])
457
data = b''.join(lines)
458
if len(data) != target_length:
459
raise ValueError('Delta claimed to be %d long, but ended up'
460
' %d long' % (target_length, len(bytes)))
464
def apply_delta_to_source(source, delta_start, delta_end):
465
"""Extract a delta from source bytes, and apply it."""
466
source_size = len(source)
467
if delta_start >= source_size:
468
raise ValueError('delta starts after source')
469
if delta_end > source_size:
470
raise ValueError('delta ends after source')
471
if delta_start >= delta_end:
472
raise ValueError('delta starts after it ends')
473
delta_bytes = source[delta_start:delta_end]
474
return apply_delta(source, delta_bytes)