1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Python version of compiled extensions for doing compression.
19
We separate the implementation from the groupcompress.py to avoid importing
23
from __future__ import absolute_import
25
from .. import osutils
26
from ..sixish import (
31
class _OutputHandler(object):
32
"""A simple class which just tracks how to split up an insert request."""
34
def __init__(self, out_lines, index_lines, min_len_to_index):
35
self.out_lines = out_lines
36
self.index_lines = index_lines
37
self.min_len_to_index = min_len_to_index
38
self.cur_insert_lines = []
39
self.cur_insert_len = 0
41
def add_copy(self, start_byte, end_byte):
42
# The data stream allows >64kB in a copy, but to match the compiled
43
# code, we will also limit it to a 64kB copy
44
for start_byte in range(start_byte, end_byte, 64*1024):
45
num_bytes = min(64*1024, end_byte - start_byte)
46
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
47
self.out_lines.append(copy_bytes)
48
self.index_lines.append(False)
50
def _flush_insert(self):
51
if not self.cur_insert_lines:
53
if self.cur_insert_len > 127:
54
raise AssertionError('We cannot insert more than 127 bytes'
56
self.out_lines.append(chr(self.cur_insert_len))
57
self.index_lines.append(False)
58
self.out_lines.extend(self.cur_insert_lines)
59
if self.cur_insert_len < self.min_len_to_index:
60
self.index_lines.extend([False]*len(self.cur_insert_lines))
62
self.index_lines.extend([True]*len(self.cur_insert_lines))
63
self.cur_insert_lines = []
64
self.cur_insert_len = 0
66
def _insert_long_line(self, line):
67
# Flush out anything pending
70
for start_index in range(0, line_len, 127):
71
next_len = min(127, line_len - start_index)
72
self.out_lines.append(chr(next_len))
73
self.index_lines.append(False)
74
self.out_lines.append(line[start_index:start_index+next_len])
75
# We don't index long lines, because we won't be able to match
76
# a line split across multiple inserts anway
77
self.index_lines.append(False)
79
def add_insert(self, lines):
80
if self.cur_insert_lines != []:
81
raise AssertionError('self.cur_insert_lines must be empty when'
82
' adding a new insert')
85
self._insert_long_line(line)
87
next_len = len(line) + self.cur_insert_len
89
# Adding this line would overflow, so flush, and start over
91
self.cur_insert_lines = [line]
92
self.cur_insert_len = len(line)
94
self.cur_insert_lines.append(line)
95
self.cur_insert_len = next_len
99
class LinesDeltaIndex(object):
100
"""This class indexes matches between strings.
102
:ivar lines: The 'static' lines that will be preserved between runs.
103
:ivar _matching_lines: A dict of {line:[matching offsets]}
104
:ivar line_offsets: The byte offset for the end of each line, used to
105
quickly map between a matching line number and the byte location
106
:ivar endpoint: The total number of bytes in self.line_offsets
109
_MIN_MATCH_BYTES = 10
110
_SOFT_MIN_MATCH_BYTES = 200
112
def __init__(self, lines):
114
self.line_offsets = []
116
self._matching_lines = {}
117
self.extend_lines(lines, [True]*len(lines))
119
def _update_matching_lines(self, new_lines, index):
120
matches = self._matching_lines
121
start_idx = len(self.lines)
122
if len(new_lines) != len(index):
123
raise AssertionError('The number of lines to be indexed does'
124
' not match the index/don\'t index flags: %d != %d'
125
% (len(new_lines), len(index)))
126
for idx, do_index in enumerate(index):
129
line = new_lines[idx]
131
matches[line].add(start_idx + idx)
133
matches[line] = {start_idx + idx}
135
def get_matches(self, line):
136
"""Return the lines which match the line in right."""
138
return self._matching_lines[line]
142
def _get_longest_match(self, lines, pos):
143
"""Look at all matches for the current line, return the longest.
145
:param lines: The lines we are matching against
146
:param pos: The current location we care about
147
:param locations: A list of lines that matched the current location.
148
This may be None, but often we'll have already found matches for
150
:return: (start_in_self, start_in_lines, num_lines)
151
All values are the offset in the list (aka the line number)
152
If start_in_self is None, then we have no matches, and this line
153
should be inserted in the target.
157
prev_locations = None
159
matching = self._matching_lines
162
locations = matching[lines[pos]]
164
# No more matches, just return whatever we have, but we know
165
# that this last position is not going to match anything
169
if prev_locations is None:
170
# This is the first match in a range
171
prev_locations = locations
173
locations = None # Consumed
175
# We have a match started, compare to see if any of the
176
# current matches can be continued
177
next_locations = locations.intersection([loc + 1 for loc
180
# At least one of the regions continues to match
181
prev_locations = set(next_locations)
183
locations = None # Consumed
185
# All current regions no longer match.
186
# This line does still match something, just not at the
187
# end of the previous matches. We will return locations
188
# so that we can avoid another _matching_lines lookup.
191
if prev_locations is None:
192
# We have no matches, this is a pure insert
194
smallest = min(prev_locations)
195
return (smallest - range_len + 1, range_start, range_len), pos
197
def get_matching_blocks(self, lines, soft=False):
198
"""Return the ranges in lines which match self.lines.
200
:param lines: lines to compress
201
:return: A list of (old_start, new_start, length) tuples which reflect
202
a region in self.lines that is present in lines. The last element
203
of the list is always (old_len, new_len, 0) to provide a end point
204
for generating instructions from the matching blocks list.
206
# In this code, we iterate over multiple _get_longest_match calls, to
207
# find the next longest copy, and possible insert regions. We then
208
# convert that to the simple matching_blocks representation, since
209
# otherwise inserting 10 lines in a row would show up as 10
214
result_append = result.append
215
min_match_bytes = self._MIN_MATCH_BYTES
217
min_match_bytes = self._SOFT_MIN_MATCH_BYTES
219
block, pos = self._get_longest_match(lines, pos)
220
if block is not None:
221
# Check to see if we match fewer than min_match_bytes. As we
222
# will turn this into a pure 'insert', rather than a copy.
223
# block[-1] is the number of lines. A quick check says if we
224
# have more lines than min_match_bytes, then we know we have
226
if block[-1] < min_match_bytes:
227
# This block may be a 'short' block, check
228
old_start, new_start, range_len = block
229
matched_bytes = sum(map(len,
230
lines[new_start:new_start + range_len]))
231
if matched_bytes < min_match_bytes:
233
if block is not None:
235
result_append((len(self.lines), len(lines), 0))
238
def extend_lines(self, lines, index):
239
"""Add more lines to the left-lines list.
241
:param lines: A list of lines to add
242
:param index: A True/False for each node to define if it should be
245
self._update_matching_lines(lines, index)
246
self.lines.extend(lines)
247
endpoint = self.endpoint
249
endpoint += len(line)
250
self.line_offsets.append(endpoint)
251
if len(self.line_offsets) != len(self.lines):
252
raise AssertionError('Somehow the line offset indicator'
253
' got out of sync with the line counter.')
254
self.endpoint = endpoint
256
def _flush_insert(self, start_linenum, end_linenum,
257
new_lines, out_lines, index_lines):
258
"""Add an 'insert' request to the data stream."""
259
bytes_to_insert = ''.join(new_lines[start_linenum:end_linenum])
260
insert_length = len(bytes_to_insert)
261
# Each insert instruction is at most 127 bytes long
262
for start_byte in range(0, insert_length, 127):
263
insert_count = min(insert_length - start_byte, 127)
264
out_lines.append(chr(insert_count))
265
# Don't index the 'insert' instruction
266
index_lines.append(False)
267
insert = bytes_to_insert[start_byte:start_byte+insert_count]
268
as_lines = osutils.split_lines(insert)
269
out_lines.extend(as_lines)
270
index_lines.extend([True]*len(as_lines))
272
def _flush_copy(self, old_start_linenum, num_lines,
273
out_lines, index_lines):
274
if old_start_linenum == 0:
277
first_byte = self.line_offsets[old_start_linenum - 1]
278
stop_byte = self.line_offsets[old_start_linenum + num_lines - 1]
279
num_bytes = stop_byte - first_byte
280
# The data stream allows >64kB in a copy, but to match the compiled
281
# code, we will also limit it to a 64kB copy
282
for start_byte in range(first_byte, stop_byte, 64*1024):
283
num_bytes = min(64*1024, stop_byte - start_byte)
284
copy_bytes = encode_copy_instruction(start_byte, num_bytes)
285
out_lines.append(copy_bytes)
286
index_lines.append(False)
288
def make_delta(self, new_lines, bytes_length=None, soft=False):
289
"""Compute the delta for this content versus the original content."""
290
if bytes_length is None:
291
bytes_length = sum(map(len, new_lines))
292
# reserved for content type, content length
293
out_lines = ['', '', encode_base128_int(bytes_length)]
294
index_lines = [False, False, False]
295
output_handler = _OutputHandler(out_lines, index_lines,
296
self._MIN_MATCH_BYTES)
297
blocks = self.get_matching_blocks(new_lines, soft=soft)
299
# We either copy a range (while there are reusable lines) or we
300
# insert new lines. To find reusable lines we traverse
301
for old_start, new_start, range_len in blocks:
302
if new_start != current_line_num:
303
# non-matching region, insert the content
304
output_handler.add_insert(new_lines[current_line_num:new_start])
305
current_line_num = new_start + range_len
307
# Convert the line based offsets into byte based offsets
311
first_byte = self.line_offsets[old_start - 1]
312
last_byte = self.line_offsets[old_start + range_len - 1]
313
output_handler.add_copy(first_byte, last_byte)
314
return out_lines, index_lines
317
def encode_base128_int(val):
318
"""Convert an integer into a 7-bit lsb encoding."""
322
bytes.append(chr((val | 0x80) & 0xFF))
324
bytes.append(chr(val))
325
return ''.join(bytes)
328
def decode_base128_int(bytes):
329
"""Decode an integer from a 7-bit lsb encoding."""
333
bval = ord(bytes[offset])
335
val |= (bval & 0x7F) << shift
338
bval = ord(bytes[offset])
344
def encode_copy_instruction(offset, length):
345
"""Convert this offset into a control code and bytes."""
349
for copy_bit in (0x01, 0x02, 0x04, 0x08):
350
base_byte = offset & 0xff
352
copy_command |= copy_bit
353
copy_bytes.append(chr(base_byte))
356
raise ValueError("cannot supply a length of None")
358
raise ValueError("we don't emit copy records for lengths > 64KiB")
360
raise ValueError("We cannot emit a copy of length 0")
361
if length != 0x10000:
362
# A copy of length exactly 64*1024 == 0x10000 is sent as a length of 0,
363
# since that saves bytes for large chained copies
364
for copy_bit in (0x10, 0x20):
365
base_byte = length & 0xff
367
copy_command |= copy_bit
368
copy_bytes.append(chr(base_byte))
370
copy_bytes[0] = chr(copy_command)
371
return ''.join(copy_bytes)
374
def decode_copy_instruction(bytes, cmd, pos):
375
"""Decode a copy instruction from the next few bytes.
377
A copy instruction is a variable number of bytes, so we will parse the
378
bytes we care about, and return the new position, as well as the offset and
379
length referred to in the bytes.
381
:param bytes: A string of bytes
382
:param cmd: The command code
383
:param pos: The position in bytes right after the copy command
384
:return: (offset, length, newpos)
385
The offset of the copy start, the number of bytes to copy, and the
386
position after the last byte of the copy
388
if cmd & 0x80 != 0x80:
389
raise ValueError('copy instructions must have bit 0x80 set')
393
offset = ord(bytes[pos])
396
offset = offset | (ord(bytes[pos]) << 8)
399
offset = offset | (ord(bytes[pos]) << 16)
402
offset = offset | (ord(bytes[pos]) << 24)
405
length = ord(bytes[pos])
408
length = length | (ord(bytes[pos]) << 8)
411
length = length | (ord(bytes[pos]) << 16)
415
return (offset, length, pos)
418
def make_delta(source_bytes, target_bytes):
419
"""Create a delta from source to target."""
420
if not isinstance(source_bytes, str):
421
raise TypeError('source is not a str')
422
if not isinstance(target_bytes, str):
423
raise TypeError('target is not a str')
424
line_locations = LinesDeltaIndex(osutils.split_lines(source_bytes))
425
delta, _ = line_locations.make_delta(osutils.split_lines(target_bytes),
426
bytes_length=len(target_bytes))
427
return ''.join(delta)
430
def apply_delta(basis, delta):
431
"""Apply delta to this object to become new_version_id."""
432
if not isinstance(basis, str):
433
raise TypeError('basis is not a str')
434
if not isinstance(delta, str):
435
raise TypeError('delta is not a str')
436
target_length, pos = decode_base128_int(delta)
438
len_delta = len(delta)
439
while pos < len_delta:
440
cmd = ord(delta[pos])
443
offset, length, pos = decode_copy_instruction(delta, cmd, pos)
444
last = offset + length
445
if last > len(basis):
446
raise ValueError('data would copy bytes past the'
448
lines.append(basis[offset:last])
449
else: # Insert of 'cmd' bytes
451
raise ValueError('Command == 0 not supported yet')
452
lines.append(delta[pos:pos+cmd])
454
bytes = ''.join(lines)
455
if len(bytes) != target_length:
456
raise ValueError('Delta claimed to be %d long, but ended up'
457
' %d long' % (target_length, len(bytes)))
461
def apply_delta_to_source(source, delta_start, delta_end):
462
"""Extract a delta from source bytes, and apply it."""
463
source_size = len(source)
464
if delta_start >= source_size:
465
raise ValueError('delta starts after source')
466
if delta_end > source_size:
467
raise ValueError('delta ends after source')
468
if delta_start >= delta_end:
469
raise ValueError('delta starts after it ends')
470
delta_bytes = source[delta_start:delta_end]
471
return apply_delta(source, delta_bytes)