125
137
self.apply_delta_to_source = self._gc_module.apply_delta_to_source
127
139
def test_make_delta_is_typesafe(self):
    # A bytes/bytes call must go through without raising.
    self.make_delta(b'a string', b'another string')

    def _check_make_delta(string1, string2):
        # Any non-bytes argument, in either position, must be rejected.
        self.assertRaises(TypeError, self.make_delta, string1, string2)

    _check_make_delta(b'a string', object())
    _check_make_delta(b'a string', u'not a string')
    _check_make_delta(object(), b'a string')
    _check_make_delta(u'not a string', b'a string')
138
150
def test_make_noop_delta(self):
    # Diffing each sample text against itself must give these exact bytes.
    ident_delta = self.make_delta(_text1, _text1)
    self.assertEqual(b'M\x90M', ident_delta)
    ident_delta = self.make_delta(_text2, _text2)
    self.assertEqual(b'N\x90N', ident_delta)
    ident_delta = self.make_delta(_text3, _text3)
    self.assertEqual(b'\x87\x01\x90\x87', ident_delta)
146
158
def assertDeltaIn(self, delta1, delta2, delta):
147
159
"""Make sure that the delta bytes match one of the expectations."""
148
160
# In general, the python delta matcher gives different results than the
149
161
# pyrex delta matcher. Both should be valid deltas, though.
150
162
if delta not in (delta1, delta2):
151
self.fail("Delta bytes:\n"
163
self.fail(b"Delta bytes:\n"
155
167
% (delta, delta1, delta2))
157
169
def test_make_delta(self):
    # Each assertDeltaIn call lists the two acceptable encodings followed
    # by the delta actually produced.
    delta = self.make_delta(_text1, _text2)
    self.assertDeltaIn(
        b'N\x90/\x1fdiffer from\nagainst other text\n',
        b'N\x90\x1d\x1ewhich is meant to differ from\n\x91:\x13',
        delta)
    delta = self.make_delta(_text2, _text1)
    self.assertDeltaIn(
        b'M\x90/\x1ebe matched\nagainst other text\n',
        b'M\x90\x1d\x1dwhich is meant to be matched\n\x91;\x13',
        delta)
    delta = self.make_delta(_text3, _text1)
    self.assertEqual(b'M\x90M', delta)
    delta = self.make_delta(_text3, _text2)
    self.assertDeltaIn(
        b'N\x90/\x1fdiffer from\nagainst other text\n',
        b'N\x90\x1d\x1ewhich is meant to differ from\n\x91:\x13',
        delta)
176
188
def test_make_delta_with_large_copies(self):
    # _text3 repeated 1220 times, diffed against itself.
    big_text = _text3 * 1220
    delta = self.make_delta(big_text, big_text)
    self.assertDeltaIn(
        b'\xdc\x86\x0a'  # Encoding the length of the uncompressed text
        b'\x80'  # Copy 64kB, starting at byte 0
        b'\x84\x01'  # and another 64kB starting at 64kB
        b'\xb4\x02\x5c\x83',  # And the bit of tail.
        None,  # Both implementations should be identical
        delta)
189
201
def test_apply_delta_is_typesafe(self):
    # bytes source + bytes delta is the only accepted combination.
    self.apply_delta(_text1, b'M\x90M')
    self.assertRaises(TypeError, self.apply_delta, object(), b'M\x90M')
    # A decoded (text) copy of the source must be rejected as well.
    self.assertRaises(TypeError, self.apply_delta,
                      _text1.decode('latin1'), b'M\x90M')
    self.assertRaises(TypeError, self.apply_delta, _text1, u'M\x90M')
    self.assertRaises(TypeError, self.apply_delta, _text1, object())
197
209
def test_apply_delta(self):
    # Applying the known delta to _text1 must yield _text2 ...
    target = self.apply_delta(
        _text1,
        b'N\x90/\x1fdiffer from\nagainst other text\n')
    self.assertEqual(_text2, target)
    # ... and the reverse delta must turn _text2 back into _text1.
    target = self.apply_delta(
        _text2,
        b'M\x90/\x1ebe matched\nagainst other text\n')
    self.assertEqual(_text1, target)
205
217
def test_apply_delta_to_source_is_safe(self):
    # Non-bytes sources are rejected with TypeError.
    self.assertRaises(TypeError,
                      self.apply_delta_to_source, object(), 0, 1)
    self.assertRaises(TypeError,
                      self.apply_delta_to_source, u'unicode str', 0, 1)
    # The remaining cases pass offsets that do not fit the 3-byte buffer
    # and must raise ValueError.
    # NOTE(review): presumably the two ints are the delta's start and end
    # offsets inside the buffer -- confirm against the implementation.
    # Second offset (4) is past the end of the buffer.
    self.assertRaises(ValueError,
                      self.apply_delta_to_source, b'foo', 1, 4)
    # First offset (5) is past the end of the buffer.
    self.assertRaises(ValueError,
                      self.apply_delta_to_source, b'foo', 5, 3)
    # First offset (3) is greater than the second (2).
    self.assertRaises(ValueError,
                      self.apply_delta_to_source, b'foo', 3, 2)
220
232
def test_apply_delta_to_source(self):
    # The delta is appended directly after the source text in one buffer;
    # the two offsets passed below delimit the appended delta.
    source_and_delta = (_text1
                        + b'N\x90/\x1fdiffer from\nagainst other text\n')
    self.assertEqual(
        _text2,
        self.apply_delta_to_source(source_and_delta,
                                   len(_text1), len(source_and_delta)))
227
239
class TestMakeAndApplyCompatible(tests.TestCase):
229
make_delta = None # Set by load_tests
230
apply_delta = None # Set by load_tests
241
scenarios = two_way_scenarios()
243
make_delta = None # Set by load_tests
244
apply_delta = None # Set by load_tests
232
246
def assertMakeAndApply(self, source, target):
233
247
"""Assert that generating a delta and applying gives success."""
255
269
self._gc_module = compiled_groupcompress_feature.module
257
271
def test_repr(self):
    # The index must be built from bytes; repr() itself is a native str.
    di = self._gc_module.DeltaIndex(b'test text\n')
    self.assertEqual('DeltaIndex(1, 10)', repr(di))
275
def test_sizeof(self):
    di = self._gc_module.DeltaIndex()
    # Exact value will depend on platform but should include sources
    # source_info is a pointer and two longs so at least 12 bytes
    lower_bound = di._max_num_sources * 12
    self.assertGreater(sys.getsizeof(di), lower_bound)
282
def test__dump_no_index(self):
    # A freshly created DeltaIndex has nothing to dump.
    di = self._gc_module.DeltaIndex()
    self.assertEqual(None, di._dump_index())
286
def test__dump_index_simple(self):
    di = self._gc_module.DeltaIndex()
    di.add_source(_text1, 0)
    # Adding the first source alone does not build the index ...
    self.assertFalse(di._has_index())
    self.assertEqual(None, di._dump_index())
    # ... it only exists once a delta has been requested.
    _ = di.make_delta(_text1)
    self.assertTrue(di._has_index())
    hash_list, entry_list = di._dump_index()
    self.assertEqual(16, len(hash_list))
    self.assertEqual(68, len(entry_list))
    # Keep only the populated slots (anything that is not an all-zero pair).
    just_entries = [(idx, text_offset, hash_val)
                    for idx, (text_offset, hash_val)
                    in enumerate(entry_list)
                    if text_offset != 0 or hash_val != 0]
    rabin_hash = self._gc_module._rabin_hash
    self.assertEqual([(8, 16, rabin_hash(_text1[1:17])),
                      (25, 48, rabin_hash(_text1[33:49])),
                      (34, 32, rabin_hash(_text1[17:33])),
                      (47, 64, rabin_hash(_text1[49:65])),
                      ], just_entries)
    # This ensures that the hash map points to the location we expect it to
    for entry_idx, text_offset, hash_val in just_entries:
        self.assertEqual(entry_idx, hash_list[hash_val & 0xf])
310
def test__dump_index_two_sources(self):
    di = self._gc_module.DeltaIndex()
    di.add_source(_text1, 0)
    di.add_source(_text2, 2)
    # Offsets reported for the second source are shifted by the length of
    # the first source plus the 2 passed to add_source above.
    start2 = len(_text1) + 2
    self.assertTrue(di._has_index())
    hash_list, entry_list = di._dump_index()
    self.assertEqual(16, len(hash_list))
    self.assertEqual(68, len(entry_list))
    # Keep only the populated slots (anything that is not an all-zero pair).
    just_entries = [(idx, text_offset, hash_val)
                    for idx, (text_offset, hash_val)
                    in enumerate(entry_list)
                    if text_offset != 0 or hash_val != 0]
    rabin_hash = self._gc_module._rabin_hash
    self.assertEqual([(8, 16, rabin_hash(_text1[1:17])),
                      (9, start2 + 16, rabin_hash(_text2[1:17])),
                      (25, 48, rabin_hash(_text1[33:49])),
                      (30, start2 + 64, rabin_hash(_text2[49:65])),
                      (34, 32, rabin_hash(_text1[17:33])),
                      (35, start2 + 32, rabin_hash(_text2[17:33])),
                      (43, start2 + 48, rabin_hash(_text2[33:49])),
                      (47, 64, rabin_hash(_text1[49:65])),
                      ], just_entries)
    # Each entry should be in the appropriate hash bucket.
    for entry_idx, text_offset, hash_val in just_entries:
        hash_idx = hash_val & 0xf
        self.assertTrue(
            hash_list[hash_idx] <= entry_idx < hash_list[hash_idx + 1])
261
339
def test_first_add_source_doesnt_index_until_make_delta(self):
262
340
di = self._gc_module.DeltaIndex()
263
341
self.assertFalse(di._has_index())
339
438
third_delta = di.make_delta(_third_text)
340
439
result = self._gc_module.apply_delta(source, third_delta)
341
440
self.assertEqualDiff(_third_text, result)
342
self.assertEqual('\x85\x01\x90\x14\x91\x7e\x1c'
343
'\x91S&\x03and\x91\x18,', third_delta)
441
self.assertEqual(b'\x85\x01\x90\x14\x91\x7e\x1c'
442
b'\x91S&\x03and\x91\x18,', third_delta)
344
443
# Now create a delta, which we know won't be able to be 'fit' into the
346
445
fourth_delta = di.make_delta(_fourth_text)
347
446
self.assertEqual(_fourth_text,
348
447
self._gc_module.apply_delta(source, fourth_delta))
349
self.assertEqual('\x80\x01'
350
'\x7f123456789012345\nsame rabin hash\n'
351
'123456789012345\nsame rabin hash\n'
352
'123456789012345\nsame rabin hash\n'
353
'123456789012345\nsame rabin hash'
354
'\x01\n', fourth_delta)
448
self.assertEqual(b'\x80\x01'
449
b'\x7f123456789012345\nsame rabin hash\n'
450
b'123456789012345\nsame rabin hash\n'
451
b'123456789012345\nsame rabin hash\n'
452
b'123456789012345\nsame rabin hash'
453
b'\x01\n', fourth_delta)
355
454
di.add_delta_source(fourth_delta, 0)
356
455
source += fourth_delta
357
456
# With the next delta, everything should be found
358
457
fifth_delta = di.make_delta(_fourth_text)
359
458
self.assertEqual(_fourth_text,
360
459
self._gc_module.apply_delta(source, fifth_delta))
361
self.assertEqual('\x80\x01\x91\xa7\x7f\x01\n', fifth_delta)
460
self.assertEqual(b'\x80\x01\x91\xa7\x7f\x01\n', fifth_delta)
364
463
class TestCopyInstruction(tests.TestCase):
366
465
def assertEncode(self, expected, offset, length):
    """Check that encoding a copy instruction gives exactly ``expected``.

    :param expected: the byte string the encoder should return.
    :param offset: copy offset passed to encode_copy_instruction.
    :param length: copy length passed to encode_copy_instruction.
    """
    # Compare the byte strings directly: per-element ord() fails on
    # Python 3, where indexing or iterating bytes already yields ints.
    data = _groupcompress_py.encode_copy_instruction(offset, length)
    self.assertEqual(expected, data)
372
def assertDecode(self, exp_offset, exp_length, exp_newpos, bytes, pos):
373
cmd = ord(bytes[pos])
469
def assertDecode(self, exp_offset, exp_length, exp_newpos, data, pos):
375
out = _groupcompress_py.decode_copy_instruction(bytes, cmd, pos)
472
out = _groupcompress_py.decode_copy_instruction(data, cmd, pos)
376
473
self.assertEqual((exp_offset, exp_length, exp_newpos), out)
378
475
def test_encode_no_length(self):
    # Every case copies exactly 64KiB; only the offset varies.
    self.assertEncode(b'\x80', 0, 64 * 1024)
    self.assertEncode(b'\x81\x01', 1, 64 * 1024)
    self.assertEncode(b'\x81\x0a', 10, 64 * 1024)
    self.assertEncode(b'\x81\xff', 255, 64 * 1024)
    self.assertEncode(b'\x82\x01', 256, 64 * 1024)
    self.assertEncode(b'\x83\x01\x01', 257, 64 * 1024)
    self.assertEncode(b'\x8F\xff\xff\xff\xff', 0xFFFFFFFF, 64 * 1024)
    self.assertEncode(b'\x8E\xff\xff\xff', 0xFFFFFF00, 64 * 1024)
    self.assertEncode(b'\x8D\xff\xff\xff', 0xFFFF00FF, 64 * 1024)
    self.assertEncode(b'\x8B\xff\xff\xff', 0xFF00FFFF, 64 * 1024)
    self.assertEncode(b'\x87\xff\xff\xff', 0x00FFFFFF, 64 * 1024)
    self.assertEncode(b'\x8F\x04\x03\x02\x01', 0x01020304, 64 * 1024)
392
489
def test_encode_no_offset(self):
    # Every case copies from offset 0; only the length varies.
    self.assertEncode(b'\x90\x01', 0, 1)
    self.assertEncode(b'\x90\x0a', 0, 10)
    self.assertEncode(b'\x90\xff', 0, 255)
    self.assertEncode(b'\xA0\x01', 0, 256)
    self.assertEncode(b'\xB0\x01\x01', 0, 257)
    self.assertEncode(b'\xB0\xff\xff', 0, 0xFFFF)
    # Special case, if copy == 64KiB, then we store exactly 0
    # Note that this puns with a copy of exactly 0 bytes, but we don't care
    # about that, as we would never actually copy 0 bytes
    self.assertEncode(b'\x80', 0, 64 * 1024)
404
501
def test_encode(self):
    # Mixed cases where offset and length are both supplied.
    self.assertEncode(b'\x91\x01\x01', 1, 1)
    self.assertEncode(b'\x91\x09\x0a', 9, 10)
    self.assertEncode(b'\x91\xfe\xff', 254, 255)
    self.assertEncode(b'\xA2\x02\x01', 512, 256)
    self.assertEncode(b'\xB3\x02\x01\x01\x01', 258, 257)
    self.assertEncode(b'\xB0\x01\x01', 0, 257)
    # Special case, if copy == 64KiB, then we store exactly 0
    # Note that this puns with a copy of exactly 0 bytes, but we don't care
    # about that, as we would never actually copy 0 bytes
    self.assertEncode(b'\x81\x0a', 10, 64 * 1024)
416
513
def test_decode_no_length(self):
    # If length is 0, it is interpreted as 64KiB
    # The shortest possible instruction is a copy of 64KiB from offset 0
    self.assertDecode(0, 65536, 1, b'\x80', 0)
    self.assertDecode(1, 65536, 2, b'\x81\x01', 0)
    self.assertDecode(10, 65536, 2, b'\x81\x0a', 0)
    self.assertDecode(255, 65536, 2, b'\x81\xff', 0)
    self.assertDecode(256, 65536, 2, b'\x82\x01', 0)
    self.assertDecode(257, 65536, 3, b'\x83\x01\x01', 0)
    self.assertDecode(0xFFFFFFFF, 65536, 5, b'\x8F\xff\xff\xff\xff', 0)
    self.assertDecode(0xFFFFFF00, 65536, 4, b'\x8E\xff\xff\xff', 0)
    self.assertDecode(0xFFFF00FF, 65536, 4, b'\x8D\xff\xff\xff', 0)
    self.assertDecode(0xFF00FFFF, 65536, 4, b'\x8B\xff\xff\xff', 0)
    self.assertDecode(0x00FFFFFF, 65536, 4, b'\x87\xff\xff\xff', 0)
    self.assertDecode(0x01020304, 65536, 5, b'\x8F\x04\x03\x02\x01', 0)
432
529
def test_decode_no_offset(self):
    # Every instruction here decodes to offset 0; only the length varies.
    self.assertDecode(0, 1, 2, b'\x90\x01', 0)
    self.assertDecode(0, 10, 2, b'\x90\x0a', 0)
    self.assertDecode(0, 255, 2, b'\x90\xff', 0)
    self.assertDecode(0, 256, 2, b'\xA0\x01', 0)
    self.assertDecode(0, 257, 3, b'\xB0\x01\x01', 0)
    self.assertDecode(0, 65535, 3, b'\xB0\xff\xff', 0)
    # Special case, if copy == 64KiB, then we store exactly 0
    # Note that this puns with a copy of exactly 0 bytes, but we don't care
    # about that, as we would never actually copy 0 bytes
    self.assertDecode(0, 65536, 1, b'\x80', 0)
444
541
def test_decode(self):
    # Instructions carrying both offset and length bytes.
    self.assertDecode(1, 1, 3, b'\x91\x01\x01', 0)
    self.assertDecode(9, 10, 3, b'\x91\x09\x0a', 0)
    self.assertDecode(254, 255, 3, b'\x91\xfe\xff', 0)
    self.assertDecode(512, 256, 3, b'\xA2\x02\x01', 0)
    self.assertDecode(258, 257, 5, b'\xB3\x02\x01\x01\x01', 0)
    self.assertDecode(0, 257, 3, b'\xB0\x01\x01', 0)
452
549
def test_decode_not_start(self):
    # The instruction sits in the middle of the buffer; the last argument
    # is the position at which decoding starts.
    self.assertDecode(1, 1, 6, b'abc\x91\x01\x01def', 3)
    self.assertDecode(9, 10, 5, b'ab\x91\x09\x0ade', 2)
    self.assertDecode(254, 255, 6, b'not\x91\xfe\xffcopy', 3)
458
555
class TestBase128Int(tests.TestCase):
460
_gc_module = None # Set by load_tests
557
scenarios = module_scenarios()
559
_gc_module = None # Set by load_tests
462
561
def assertEqualEncode(self, bytes, val):
463
562
self.assertEqual(bytes, self._gc_module.encode_base128_int(val))