# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Tests for group compression."""

import zlib

from .. import (
    btree_index,
    config,
    groupcompress,
    index as _mod_index,
    knit,
    osutils,
    tests,
    trace,
    versionedfile,
    )
from ..osutils import sha_string
from .test__groupcompress import compiled_groupcompress_feature
from .scenarios import load_tests_apply_scenarios


def group_compress_implementation_scenarios():
    scenarios = [
        ('python', {'compressor': groupcompress.PythonGroupCompressor}),
        ]
    if compiled_groupcompress_feature.available():
        scenarios.append(('C',
                          {'compressor': groupcompress.PyrexGroupCompressor}))
    return scenarios


load_tests = load_tests_apply_scenarios


class TestGroupCompressor(tests.TestCase):

    def _chunks_to_repr_lines(self, chunks):
        return '\n'.join(map(repr, b''.join(chunks).split(b'\n')))

    def assertEqualDiffEncoded(self, expected, actual):
        """Compare the actual content to the expected content.

        :param expected: A group of chunks that we expect to see
        :param actual: The measured 'chunks'

        We will transform the chunks back into lines, and then run 'repr()'
        over them to handle non-ascii characters.
        """
        self.assertEqualDiff(self._chunks_to_repr_lines(expected),
                             self._chunks_to_repr_lines(actual))


class TestAllGroupCompressors(TestGroupCompressor):
    """Tests for GroupCompressor"""

    scenarios = group_compress_implementation_scenarios()
    compressor = None  # Set by scenario

    def test_empty_delta(self):
        compressor = self.compressor()
        self.assertEqual([], compressor.chunks)

    def test_one_nosha_delta(self):
        compressor = self.compressor()
        text = b'strange\ncommon\n'
        sha1, start_point, end_point, _ = compressor.compress(
            ('label',), [text], len(text), None)
        self.assertEqual(sha_string(b'strange\ncommon\n'), sha1)
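        # A record with no delta base is stored as a fulltext: a 'f' marker
        # followed by the content length as a variable-width integer (here a
        # single byte, 0x0f == 15, the length of the text).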
        expected_lines = b'f\x0fstrange\ncommon\n'
        self.assertEqual(expected_lines, b''.join(compressor.chunks))
        self.assertEqual(0, start_point)
        self.assertEqual(len(expected_lines), end_point)

    def test_empty_content(self):
        compressor = self.compressor()
        # Adding empty bytes should return the 'null' record
        sha1, start_point, end_point, kind = compressor.compress(
            ('empty',), [], 0, None)
        self.assertEqual(0, start_point)
        self.assertEqual(0, end_point)
        self.assertEqual('fulltext', kind)
        self.assertEqual(groupcompress._null_sha1, sha1)
        self.assertEqual(0, compressor.endpoint)
        self.assertEqual([], compressor.chunks)
        # Even after adding some content
        text = b'some\nbytes\n'
        compressor.compress(('content',), [text], len(text), None)
        self.assertTrue(compressor.endpoint > 0)
        sha1, start_point, end_point, kind = compressor.compress(
            ('empty2',), [], 0, None)
        self.assertEqual(0, start_point)
        self.assertEqual(0, end_point)
        self.assertEqual('fulltext', kind)
        self.assertEqual(groupcompress._null_sha1, sha1)

    def test_extract_from_compressor(self):
        # Knit fetching will try to reconstruct texts locally which results in
        # reading something that is in the compressor stream already.
        compressor = self.compressor()
        text = b'strange\ncommon long line\nthat needs a 16 byte match\n'
        sha1_1, _, _, _ = compressor.compress(
            ('label',), [text], len(text), None)
        expected_lines = list(compressor.chunks)
        text = b'common long line\nthat needs a 16 byte match\ndifferent\n'
        sha1_2, _, end_point, _ = compressor.compress(
            ('newlabel',), [text], len(text), None)
        # get the first out
        self.assertEqual(([b'strange\ncommon long line\n'
                           b'that needs a 16 byte match\n'], sha1_1),
                         compressor.extract(('label',)))
        # and the second
        self.assertEqual(([b'common long line\nthat needs a 16 byte match\n'
                           b'different\n'], sha1_2),
                         compressor.extract(('newlabel',)))

    def test_pop_last(self):
        compressor = self.compressor()
        text = b'some text\nfor the first entry\n'
        _, _, _, _ = compressor.compress(
            ('key1',), [text], len(text), None)
        expected_lines = list(compressor.chunks)
        text = b'some text\nfor the second entry\n'
        _, _, _, _ = compressor.compress(
            ('key2',), [text], len(text), None)
        compressor.pop_last()
        self.assertEqual(expected_lines, compressor.chunks)


class TestPyrexGroupCompressor(TestGroupCompressor):

    _test_needs_features = [compiled_groupcompress_feature]
    compressor = groupcompress.PyrexGroupCompressor

    def test_stats(self):
        compressor = self.compressor()
        chunks = [b'strange\n',
                  b'common very very long line\n',
                  b'plus more text\n']
        compressor.compress(
            ('label',), chunks, sum(map(len, chunks)), None)
        chunks = [b'common very very long line\n',
                  b'plus more text\n',
                  b'different\n',
                  b'moredifferent\n']
        compressor.compress(
            ('newlabel',),
            chunks, sum(map(len, chunks)), None)
        chunks = [b'new\n',
                  b'common very very long line\n',
                  b'plus more text\n',
                  b'different\n',
                  b'moredifferent\n']
        compressor.compress(
            ('label3',), chunks, sum(map(len, chunks)), None)
        self.assertAlmostEqual(1.9, compressor.ratio(), 1)

    def test_two_nosha_delta(self):
        compressor = self.compressor()
        text = b'strange\ncommon long line\nthat needs a 16 byte match\n'
        sha1_1, _, _, _ = compressor.compress(
            ('label',), [text], len(text), None)
        expected_lines = list(compressor.chunks)
        text = b'common long line\nthat needs a 16 byte match\ndifferent\n'
        sha1_2, start_point, end_point, _ = compressor.compress(
            ('newlabel',), [text], len(text), None)
        self.assertEqual(sha_string(text), sha1_2)
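        # In the delta below, b'\x91' is a copy instruction: the high bit
        # marks a copy, and the low bits say that one offset byte and one
        # length byte follow (offset 0x0a, length 0x2c here).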
        expected_lines.extend([
            # 'delta', delta length
            b'd\x0f',
            # source and target length
            b'\x36',
            # copy the line common
            b'\x91\x0a\x2c',  # copy, offset 0x0a, len 0x2c
            # add the line different, and the trailing newline
            b'\x0adifferent\n',  # insert 10 bytes
            ])
        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
        self.assertEqual(sum(map(len, expected_lines)), end_point)

    def test_three_nosha_delta(self):
        # The first interesting test: make a change that should use lines from
        # both parents.
        compressor = self.compressor()
        text = b'strange\ncommon very very long line\nwith some extra text\n'
        sha1_1, _, _, _ = compressor.compress(
            ('label',), [text], len(text), None)
        text = b'different\nmoredifferent\nand then some more\n'
        sha1_2, _, _, _ = compressor.compress(
            ('newlabel',), [text], len(text), None)
        expected_lines = list(compressor.chunks)
        text = (b'new\ncommon very very long line\nwith some extra text\n'
                b'different\nmoredifferent\nand then some more\n')
        sha1_3, start_point, end_point, _ = compressor.compress(
            ('label3',), [text], len(text), None)
        self.assertEqual(sha_string(text), sha1_3)
        expected_lines.extend([
            # 'delta', delta length
            b'd\x0b',
            # source and target length
            b'\x5f',
            # insert new
            b'\x03new',
            # Copy of first parent 'common' range
            b'\x91\x09\x31',  # copy, offset 0x09, 0x31 bytes
            # Copy of second parent 'different' range
            b'\x91\x3c\x2b',  # copy, offset 0x3c, 0x2b bytes
            ])
        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
        self.assertEqual(sum(map(len, expected_lines)), end_point)


class TestPythonGroupCompressor(TestGroupCompressor):

    compressor = groupcompress.PythonGroupCompressor

    def test_stats(self):
        compressor = self.compressor()
        chunks = [b'strange\n',
                  b'common very very long line\n',
                  b'plus more text\n']
        compressor.compress(
            ('label',), chunks, sum(map(len, chunks)), None)
        chunks = [b'common very very long line\n',
                  b'plus more text\n',
                  b'different\n',
                  b'moredifferent\n']
        compressor.compress(
            ('newlabel',), chunks, sum(map(len, chunks)), None)
        chunks = [b'new\n',
                  b'common very very long line\n',
                  b'plus more text\n',
                  b'different\n',
                  b'moredifferent\n']
        compressor.compress(
            ('label3',),
            chunks, sum(map(len, chunks)), None)
        self.assertAlmostEqual(1.9, compressor.ratio(), 1)

    def test_two_nosha_delta(self):
        compressor = self.compressor()
        text = b'strange\ncommon long line\nthat needs a 16 byte match\n'
        sha1_1, _, _, _ = compressor.compress(
            ('label',), [text], len(text), None)
        expected_lines = list(compressor.chunks)
        text = b'common long line\nthat needs a 16 byte match\ndifferent\n'
        sha1_2, start_point, end_point, _ = compressor.compress(
            ('newlabel',), [text], len(text), None)
        self.assertEqual(sha_string(text), sha1_2)
        expected_lines.extend([
            # 'delta', delta length
            b'd\x0f',
            # source and target length
            b'\x36',
            # copy the line common
            b'\x91\x0a\x2c',  # copy, offset 0x0a, len 0x2c
            # add the line different, and the trailing newline
            b'\x0adifferent\n',  # insert 10 bytes
            ])
        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
        self.assertEqual(sum(map(len, expected_lines)), end_point)

    def test_three_nosha_delta(self):
        # The first interesting test: make a change that should use lines from
        # both parents.
        compressor = self.compressor()
        text = b'strange\ncommon very very long line\nwith some extra text\n'
        sha1_1, _, _, _ = compressor.compress(
            ('label',), [text], len(text), None)
        text = b'different\nmoredifferent\nand then some more\n'
        sha1_2, _, _, _ = compressor.compress(
            ('newlabel',), [text], len(text), None)
        expected_lines = list(compressor.chunks)
        text = (b'new\ncommon very very long line\nwith some extra text\n'
                b'different\nmoredifferent\nand then some more\n')
        sha1_3, start_point, end_point, _ = compressor.compress(
            ('label3',), [text], len(text), None)
        self.assertEqual(sha_string(text), sha1_3)
        expected_lines.extend([
            # 'delta', delta length
            b'd\x0c',
            # source and target length
            b'\x5f',
            # insert new
            b'\x04new\n',
            # Copy of first parent 'common' range
            b'\x91\x0a\x30',  # copy, offset 0x0a, 0x30 bytes
            # Copy of second parent 'different' range
            b'\x91\x3c\x2b',  # copy, offset 0x3c, 0x2b bytes
            ])
        self.assertEqualDiffEncoded(expected_lines, compressor.chunks)
        self.assertEqual(sum(map(len, expected_lines)), end_point)


class TestGroupCompressBlock(tests.TestCase):

    def make_block(self, key_to_text):
        """Create a GroupCompressBlock, filling it with the given texts."""
        compressor = groupcompress.GroupCompressor()
        for key in sorted(key_to_text):
            compressor.compress(
                key, [key_to_text[key]], len(key_to_text[key]), None)
        locs = dict((key, (start, end)) for key, (start, _, end, _)
                    in compressor.labels_deltas.items())
        block = compressor.flush()
        raw_bytes = block.to_bytes()
        # Go through from_bytes(to_bytes()) so that we start with a compressed
        # content object
        return locs, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)

    def test_from_empty_bytes(self):
        self.assertRaises(ValueError,
                          groupcompress.GroupCompressBlock.from_bytes, b'')

    def test_from_minimal_bytes(self):
        block = groupcompress.GroupCompressBlock.from_bytes(
            b'gcb1z\n0\n0\n')
        self.assertIsInstance(block, groupcompress.GroupCompressBlock)
        self.assertIs(None, block._content)
        self.assertEqual(b'', block._z_content)
        block._ensure_content()
        self.assertEqual(b'', block._content)
        self.assertEqual(b'', block._z_content)
        block._ensure_content()  # Ensure content is safe to call 2x

    def test_from_invalid(self):
        self.assertRaises(ValueError,
                          groupcompress.GroupCompressBlock.from_bytes,
                          b'this is not a valid header')

    def test_from_bytes(self):
        content = b'a tiny bit of content\n'
        z_content = zlib.compress(content)
        total_bytes = (
            b'gcb1z\n'  # group compress block v1 zlib
            b'%d\n'  # Length of compressed content
            b'%d\n'  # Length of uncompressed content
            b'%s'  # Compressed content
            ) % (len(z_content), len(content), z_content)
        block = groupcompress.GroupCompressBlock.from_bytes(
            total_bytes)
        self.assertEqual(z_content, block._z_content)
        self.assertIs(None, block._content)
        self.assertEqual(len(z_content), block._z_content_length)
        self.assertEqual(len(content), block._content_length)
        block._ensure_content()
        self.assertEqual(z_content, block._z_content)
        self.assertEqual(content, block._content)

    def test_to_chunks(self):
        content_chunks = [b'this is some content\n',
                          b'this content will be compressed\n']
        content_len = sum(map(len, content_chunks))
        content = b''.join(content_chunks)
        gcb = groupcompress.GroupCompressBlock()
        gcb.set_chunked_content(content_chunks, content_len)
        total_len, block_chunks = gcb.to_chunks()
        block_bytes = b''.join(block_chunks)
        self.assertEqual(gcb._z_content_length, len(gcb._z_content))
        self.assertEqual(total_len, len(block_bytes))
        self.assertEqual(gcb._content_length, content_len)
        expected_header = (b'gcb1z\n'  # group compress block v1 zlib
                           b'%d\n'  # Length of compressed content
                           b'%d\n'  # Length of uncompressed content
                           ) % (gcb._z_content_length, gcb._content_length)
        # The first chunk should be the header chunk. It is small, fixed size,
        # and there is no compelling reason to split it up
        self.assertEqual(expected_header, block_chunks[0])
        self.assertStartsWith(block_bytes, expected_header)
        remaining_bytes = block_bytes[len(expected_header):]
        raw_bytes = zlib.decompress(remaining_bytes)
        self.assertEqual(content, raw_bytes)

    def test_to_bytes(self):
        content = (b'this is some content\n'
                   b'this content will be compressed\n')
        gcb = groupcompress.GroupCompressBlock()
        gcb.set_content(content)
        data = gcb.to_bytes()
        self.assertEqual(gcb._z_content_length, len(gcb._z_content))
        self.assertEqual(gcb._content_length, len(content))
        expected_header = (b'gcb1z\n'  # group compress block v1 zlib
                           b'%d\n'  # Length of compressed content
                           b'%d\n'  # Length of uncompressed content
                           ) % (gcb._z_content_length, gcb._content_length)
        self.assertStartsWith(data, expected_header)
        remaining_bytes = data[len(expected_header):]
        raw_bytes = zlib.decompress(remaining_bytes)
        self.assertEqual(content, raw_bytes)

        # we should get the same results if using the chunked version
        gcb = groupcompress.GroupCompressBlock()
        gcb.set_chunked_content([b'this is some content\n'
                                 b'this content will be compressed\n'],
                                len(content))
        old_data = data
        data = gcb.to_bytes()
        self.assertEqual(old_data, data)

    def test_partial_decomp(self):
        content_chunks = []
        # We need a sufficient amount of data so that zlib.decompress has
        # partial decompression to work with. Most auto-generated data
        # compresses a bit too well, we want a combination, so we combine a sha
        # hash with compressible data.
        for i in range(2048):
            next_content = b'%d\nThis is a bit of duplicate text\n' % (i,)
            content_chunks.append(next_content)
            next_sha1 = osutils.sha_string(next_content)
            content_chunks.append(next_sha1 + b'\n')
        content = b''.join(content_chunks)
        self.assertEqual(158634, len(content))
        z_content = zlib.compress(content)
        self.assertEqual(57182, len(z_content))
        block = groupcompress.GroupCompressBlock()
        block._z_content_chunks = (z_content,)
        block._z_content_length = len(z_content)
        block._compressor_name = 'zlib'
        block._content_length = 158634
        self.assertIs(None, block._content)
        block._ensure_content(100)
        self.assertIsNot(None, block._content)
        # We have decompressed at least 100 bytes
        self.assertTrue(len(block._content) >= 100)
        # We have not decompressed the whole content
        self.assertTrue(len(block._content) < 158634)
        self.assertEqualDiff(content[:len(block._content)], block._content)
        # ensuring content that we already have shouldn't cause any more data
        # to be decompressed
        cur_len = len(block._content)
        block._ensure_content(cur_len - 10)
        self.assertEqual(cur_len, len(block._content))
        # Now we want a bit more content
        cur_len += 10
        block._ensure_content(cur_len)
        self.assertTrue(len(block._content) >= cur_len)
        self.assertTrue(len(block._content) < 158634)
        self.assertEqualDiff(content[:len(block._content)], block._content)
        # And now lets finish
        block._ensure_content(158634)
        self.assertEqualDiff(content, block._content)
        # And the decompressor is finalized
        self.assertIs(None, block._z_content_decompressor)

    def test__ensure_all_content(self):
        content_chunks = []
        # We need a sufficient amount of data so that zlib.decompress has
        # partial decompression to work with. Most auto-generated data
        # compresses a bit too well, we want a combination, so we combine a sha
        # hash with compressible data.
        for i in range(2048):
            next_content = b'%d\nThis is a bit of duplicate text\n' % (i,)
            content_chunks.append(next_content)
            next_sha1 = osutils.sha_string(next_content)
            content_chunks.append(next_sha1 + b'\n')
        content = b''.join(content_chunks)
        self.assertEqual(158634, len(content))
        z_content = zlib.compress(content)
        self.assertEqual(57182, len(z_content))
        block = groupcompress.GroupCompressBlock()
        block._z_content_chunks = (z_content,)
        block._z_content_length = len(z_content)
        block._compressor_name = 'zlib'
        block._content_length = 158634
        self.assertIs(None, block._content)
        # The first _ensure_content got all of the required data
        block._ensure_content(158634)
        self.assertEqualDiff(content, block._content)
        # And we should have released the _z_content_decompressor since it was
        # fully consumed
        self.assertIs(None, block._z_content_decompressor)

    def test__dump(self):
        dup_content = b'some duplicate content\nwhich is sufficiently long\n'
        key_to_text = {(b'1',): dup_content + b'1 unique\n',
                       (b'2',): dup_content + b'2 extra special\n'}
        locs, block = self.make_block(key_to_text)
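        # _dump describes each record in the block: 'f' is a fulltext of the
        # given length; 'd' is a delta, whose instructions are 'c' (copy
        # offset, length) and 'i' (insert length, bytes).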
        self.assertEqual([(b'f', len(key_to_text[(b'1',)])),
                          (b'd', 21, len(key_to_text[(b'2',)]),
                           [(b'c', 2, len(dup_content)),
                            (b'i', len(b'2 extra special\n'), b'')
                            ]),
                          ], block._dump())


class TestCaseWithGroupCompressVersionedFiles(
        tests.TestCaseWithMemoryTransport):

    def make_test_vf(self, create_graph, keylength=1, do_cleanup=True,
                     dir='.', inconsistency_fatal=True):
        t = self.get_transport(dir)
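        # make_pack_factory returns a factory callable; invoking it with a
        # transport builds the pack-backed versioned files store.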
        t.ensure_base()
        vf = groupcompress.make_pack_factory(graph=create_graph,
                                             delta=False, keylength=keylength,
                                             inconsistency_fatal=inconsistency_fatal)(t)
        if do_cleanup:
            self.addCleanup(groupcompress.cleanup_pack_group, vf)
        return vf


class TestGroupCompressVersionedFiles(TestCaseWithGroupCompressVersionedFiles):

    def make_g_index(self, name, ref_lists=0, nodes=[]):
        builder = btree_index.BTreeBuilder(ref_lists)
        for node, references, value in nodes:
            builder.add_node(node, references, value)
        stream = builder.finish()
        trans = self.get_transport()
        size = trans.put_file(name, stream)
        return btree_index.BTreeGraphIndex(trans, name, size)

    def make_g_index_missing_parent(self):
        graph_index = self.make_g_index('missing_parent', 1,
                                        [((b'parent', ), b'2 78 2 10', ([],)),
                                         ((b'tip', ), b'2 78 2 10',
                                          ([(b'parent', ), (b'missing-parent', )],)),
                                         ])
        return graph_index

    def test_get_record_stream_as_requested(self):
        # Consider promoting 'as-requested' to general availability, and
        # make this a VF interface test
        vf = self.make_test_vf(False, dir='source')
        vf.add_lines((b'a',), (), [b'lines\n'])
        vf.add_lines((b'b',), (), [b'lines\n'])
        vf.add_lines((b'c',), (), [b'lines\n'])
        vf.add_lines((b'd',), (), [b'lines\n'])
        vf.writer.end()
        keys = [record.key for record in vf.get_record_stream(
            [(b'a',), (b'b',), (b'c',), (b'd',)],
            'as-requested', False)]
        self.assertEqual([(b'a',), (b'b',), (b'c',), (b'd',)], keys)
        keys = [record.key for record in vf.get_record_stream(
            [(b'b',), (b'a',), (b'd',), (b'c',)],
            'as-requested', False)]
        self.assertEqual([(b'b',), (b'a',), (b'd',), (b'c',)], keys)

        # It should work even after being repacked into another VF
        vf2 = self.make_test_vf(False, dir='target')
        vf2.insert_record_stream(vf.get_record_stream(
            [(b'b',), (b'a',), (b'd',), (b'c',)], 'as-requested', False))
        vf2.writer.end()

        keys = [record.key for record in vf2.get_record_stream(
            [(b'a',), (b'b',), (b'c',), (b'd',)],
            'as-requested', False)]
        self.assertEqual([(b'a',), (b'b',), (b'c',), (b'd',)], keys)
        keys = [record.key for record in vf2.get_record_stream(
            [(b'b',), (b'a',), (b'd',), (b'c',)],
            'as-requested', False)]
        self.assertEqual([(b'b',), (b'a',), (b'd',), (b'c',)], keys)

    def test_get_record_stream_max_bytes_to_index_default(self):
        vf = self.make_test_vf(True, dir='source')
        vf.add_lines((b'a',), (), [b'lines\n'])
        vf.writer.end()
        record = next(vf.get_record_stream([(b'a',)], 'unordered', True))
        self.assertEqual(vf._DEFAULT_COMPRESSOR_SETTINGS,
                         record._manager._get_compressor_settings())

    def test_get_record_stream_accesses_compressor_settings(self):
        vf = self.make_test_vf(True, dir='source')
        vf.add_lines((b'a',), (), [b'lines\n'])
        vf.writer.end()
        vf._max_bytes_to_index = 1234
        record = next(vf.get_record_stream([(b'a',)], 'unordered', True))
        self.assertEqual(dict(max_bytes_to_index=1234),
                         record._manager._get_compressor_settings())

    def grouped_stream(self, revision_ids, first_parents=()):
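        # Yield fulltext records for the given revision ids, chaining each
        # record to the previous one via its parents.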
        parents = first_parents
        for revision_id in revision_ids:
            key = (revision_id,)
            record = versionedfile.FulltextContentFactory(
                key, parents, None,
                b'some content that is\n'
                b'identical except for\n'
                b'revision_id:%s\n' % (revision_id,))
            yield record
            parents = (key,)

    def test_insert_record_stream_reuses_blocks(self):
        vf = self.make_test_vf(True, dir='source')
        # One group, a-d
        vf.insert_record_stream(self.grouped_stream([b'a', b'b', b'c', b'd']))
        # Second group, e-h
        vf.insert_record_stream(self.grouped_stream(
            [b'e', b'f', b'g', b'h'], first_parents=((b'd',),)))
        block_bytes = {}
        stream = vf.get_record_stream(
            [(r.encode(),) for r in 'abcdefgh'], 'unordered', False)
        num_records = 0
        for record in stream:
            if record.key in [(b'a',), (b'e',)]:
                self.assertEqual('groupcompress-block', record.storage_kind)
            else:
                self.assertEqual('groupcompress-block-ref',
                                 record.storage_kind)
            block_bytes[record.key] = record._manager._block._z_content
            num_records += 1
        self.assertEqual(8, num_records)
        for r in 'abcd':
            key = (r.encode(),)
            self.assertIs(block_bytes[key], block_bytes[(b'a',)])
            self.assertNotEqual(block_bytes[key], block_bytes[(b'e',)])
        for r in 'efgh':
            key = (r.encode(),)
            self.assertIs(block_bytes[key], block_bytes[(b'e',)])
            self.assertNotEqual(block_bytes[key], block_bytes[(b'a',)])
        # Now copy the blocks into another vf, and ensure that the blocks are
        # preserved without creating new entries
        vf2 = self.make_test_vf(True, dir='target')
        keys = [(r.encode(),) for r in 'abcdefgh']
        # ordering in 'groupcompress' order, should actually swap the groups in
        # the target vf, but the groups themselves should not be disturbed.
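        # small_size_stream lowers _full_enough_block_size to the block's own
        # content length, so each incoming block already looks 'full enough'
        # and insert_record_stream keeps it instead of rebuilding it.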
        def small_size_stream():
            for record in vf.get_record_stream(keys, 'groupcompress', False):
                record._manager._full_enough_block_size = \
                    record._manager._block._content_length
                yield record

        vf2.insert_record_stream(small_size_stream())
        stream = vf2.get_record_stream(keys, 'groupcompress', False)
        vf2.writer.end()
        num_records = 0
        for record in stream:
            num_records += 1
            self.assertEqual(block_bytes[record.key],
                             record._manager._block._z_content)
        self.assertEqual(8, num_records)

    def test_insert_record_stream_packs_on_the_fly(self):
        vf = self.make_test_vf(True, dir='source')
        # One group, a-d
        vf.insert_record_stream(self.grouped_stream([b'a', b'b', b'c', b'd']))
        # Second group, e-h
        vf.insert_record_stream(self.grouped_stream(
            [b'e', b'f', b'g', b'h'], first_parents=((b'd',),)))
        # Now copy the blocks into another vf, and see that the
        # insert_record_stream rebuilt a new block on-the-fly because of
        # insufficient content
        vf2 = self.make_test_vf(True, dir='target')
        keys = [(r.encode(),) for r in 'abcdefgh']
        vf2.insert_record_stream(vf.get_record_stream(
            keys, 'groupcompress', False))
        stream = vf2.get_record_stream(keys, 'groupcompress', False)
        vf2.writer.end()
        num_records = 0
        # All of the records should be recombined into a single block
        block = None
        for record in stream:
            num_records += 1
            if block is None:
                block = record._manager._block
            else:
                self.assertIs(block, record._manager._block)
        self.assertEqual(8, num_records)

    def test__insert_record_stream_no_reuse_block(self):
        vf = self.make_test_vf(True, dir='source')
        # One group, a-d
        vf.insert_record_stream(self.grouped_stream([b'a', b'b', b'c', b'd']))
        # Second group, e-h
        vf.insert_record_stream(self.grouped_stream(
            [b'e', b'f', b'g', b'h'], first_parents=((b'd',),)))
        vf.writer.end()
        keys = [(r.encode(),) for r in 'abcdefgh']
        self.assertEqual(8, len(list(
            vf.get_record_stream(keys, 'unordered', False))))
        # Now copy the blocks into another vf, and ensure that the blocks are
        # preserved without creating new entries
        vf2 = self.make_test_vf(True, dir='target')
        # ordering in 'groupcompress' order, should actually swap the groups in
        # the target vf, but the groups themselves should not be disturbed.
        list(vf2._insert_record_stream(vf.get_record_stream(
            keys, 'groupcompress', False),
            reuse_blocks=False))
        vf2.writer.end()
        # After inserting with reuse_blocks=False, we should have everything in
        # a single new block.
        stream = vf2.get_record_stream(keys, 'groupcompress', False)
        block = None
        for record in stream:
            if block is None:
                block = record._manager._block
            else:
                self.assertIs(block, record._manager._block)

    def test_add_missing_noncompression_parent_unvalidated_index(self):
        unvalidated = self.make_g_index_missing_parent()
        combined = _mod_index.CombinedGraphIndex([unvalidated])
        index = groupcompress._GCGraphIndex(combined,
                                            is_locked=lambda: True, parents=True,
                                            track_external_parent_refs=True)
        index.scan_unvalidated_index(unvalidated)
        self.assertEqual(
            frozenset([(b'missing-parent',)]), index.get_missing_parents())

    def test_track_external_parent_refs(self):
        g_index = self.make_g_index('empty', 1, [])
        mod_index = btree_index.BTreeBuilder(1, 1)
        combined = _mod_index.CombinedGraphIndex([g_index, mod_index])
        index = groupcompress._GCGraphIndex(combined,
                                            is_locked=lambda: True, parents=True,
                                            add_callback=mod_index.add_nodes,
                                            track_external_parent_refs=True)
        index.add_records([
            ((b'new-key',), b'2 10 2 10', [((b'parent-1',), (b'parent-2',))])])
        self.assertEqual(
            frozenset([(b'parent-1',), (b'parent-2',)]),
            index.get_missing_parents())

    def make_source_with_b(self, a_parent, path):
        source = self.make_test_vf(True, dir=path)
        source.add_lines((b'a',), (), [b'lines\n'])
        if a_parent:
            b_parents = ((b'a',),)
        else:
            b_parents = ()
        source.add_lines((b'b',), b_parents, [b'lines\n'])
        source.writer.end()
        return source

    def do_inconsistent_inserts(self, inconsistency_fatal):
        target = self.make_test_vf(True, dir='target',
                                   inconsistency_fatal=inconsistency_fatal)
        for x in range(2):
            source = self.make_source_with_b(x == 1, 'source%s' % x)
            target.insert_record_stream(source.get_record_stream(
                [(b'b',)], 'unordered', False))

    def test_inconsistent_redundant_inserts_warn(self):
        """Should not insert a record that is already present."""
        warnings = []

        def warning(template, args):
            warnings.append(template % args)
        _trace_warning = trace.warning
        trace.warning = warning
        try:
            self.do_inconsistent_inserts(inconsistency_fatal=False)
        finally:
            trace.warning = _trace_warning
        self.assertContainsRe(
            warnings[0],
            r"^inconsistent details in skipped record: \(b?'b',\)"
            r" \(b?'42 32 0 8', \(\(\),\)\)"
            r" \(b?'74 32 0 8', \(\(\(b?'a',\),\),\)\)$")

    def test_inconsistent_redundant_inserts_raises(self):
        e = self.assertRaises(knit.KnitCorrupt, self.do_inconsistent_inserts,
                              inconsistency_fatal=True)
        self.assertContainsRe(str(e), r"Knit.* corrupt: inconsistent details"
                              r" in add_records:"
                              r" \(b?'b',\) \(b?'42 32 0 8', \(\(\),\)\)"
                              r" \(b?'74 32 0 8', \(\(\(b?'a',\),\),\)\)")

    def test_clear_cache(self):
        vf = self.make_source_with_b(True, 'source')
        vf.writer.end()
        for record in vf.get_record_stream([(b'a',), (b'b',)], 'unordered',
                                           True):
            pass
        self.assertTrue(len(vf._group_cache) > 0)
        vf.clear_cache()
        self.assertEqual(0, len(vf._group_cache))


class TestGroupCompressConfig(tests.TestCaseWithTransport):

    def make_test_vf(self):
        t = self.get_transport('.')
        t.ensure_base()
        factory = groupcompress.make_pack_factory(graph=True,
            delta=False, keylength=1, inconsistency_fatal=True)
        vf = factory(t)
        self.addCleanup(groupcompress.cleanup_pack_group, vf)
        return vf

    def test_max_bytes_to_index_default(self):
        vf = self.make_test_vf()
        gc = vf._make_group_compressor()
        self.assertEqual(vf._DEFAULT_MAX_BYTES_TO_INDEX,
                         vf._max_bytes_to_index)
        if isinstance(gc, groupcompress.PyrexGroupCompressor):
            self.assertEqual(vf._DEFAULT_MAX_BYTES_TO_INDEX,
                             gc._delta_index._max_bytes_to_index)

    def test_max_bytes_to_index_in_config(self):
        c = config.GlobalConfig()
        c.set_user_option('bzr.groupcompress.max_bytes_to_index', '10000')
        vf = self.make_test_vf()
        gc = vf._make_group_compressor()
        self.assertEqual(10000, vf._max_bytes_to_index)
        if isinstance(gc, groupcompress.PyrexGroupCompressor):
            self.assertEqual(10000, gc._delta_index._max_bytes_to_index)

    def test_max_bytes_to_index_bad_config(self):
        c = config.GlobalConfig()
        c.set_user_option('bzr.groupcompress.max_bytes_to_index', 'boogah')
        vf = self.make_test_vf()
        # TODO: This is triggering a warning, we might want to trap and make
        #       sure it is readable.
        gc = vf._make_group_compressor()
        self.assertEqual(vf._DEFAULT_MAX_BYTES_TO_INDEX,
                         vf._max_bytes_to_index)
        if isinstance(gc, groupcompress.PyrexGroupCompressor):
            self.assertEqual(vf._DEFAULT_MAX_BYTES_TO_INDEX,
                             gc._delta_index._max_bytes_to_index)


class StubGCVF(object):
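    """A minimal stand-in for GroupCompressVersionedFiles, exposing only the
    _group_cache and _get_blocks attributes that _BatchingBlockFetcher uses.
    """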
    def __init__(self, canned_get_blocks=None):
        self._group_cache = {}
        self._canned_get_blocks = canned_get_blocks or []

    def _get_blocks(self, read_memos):
        return iter(self._canned_get_blocks)


class Test_BatchingBlockFetcher(TestCaseWithGroupCompressVersionedFiles):
    """Simple whitebox unit tests for _BatchingBlockFetcher."""

    def test_add_key_new_read_memo(self):
        """Adding a key with an uncached read_memo new to this batch adds that
        read_memo to the list of memos to fetch.
        """
        # locations are: index_memo, ignored, parents, ignored
        # where index_memo is: (idx, offset, len, factory_start, factory_end)
        # and (idx, offset, size) is known as the 'read_memo', identifying the
        # raw bytes needed.
        read_memo = ('fake index', 100, 50)
        locations = {
            ('key',): (read_memo + (None, None), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(StubGCVF(), locations)
        total_size = batcher.add_key(('key',))
        self.assertEqual(50, total_size)
        self.assertEqual([('key',)], batcher.keys)
        self.assertEqual([read_memo], batcher.memos_to_get)

    def test_add_key_duplicate_read_memo(self):
        """read_memos that occur multiple times in a batch will only be fetched
        once.
        """
        read_memo = ('fake index', 100, 50)
        # Two keys, both sharing the same read memo (but different overall
        # index_memos).
        locations = {
            ('key1',): (read_memo + (0, 1), None, None, None),
            ('key2',): (read_memo + (1, 2), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(StubGCVF(), locations)
        total_size = batcher.add_key(('key1',))
        total_size = batcher.add_key(('key2',))
        self.assertEqual(50, total_size)
        self.assertEqual([('key1',), ('key2',)], batcher.keys)
        self.assertEqual([read_memo], batcher.memos_to_get)

    def test_add_key_cached_read_memo(self):
        """Adding a key with a cached read_memo will not cause that read_memo
        to be added to the list to fetch.
        """
        read_memo = ('fake index', 100, 50)
        gcvf = StubGCVF()
        gcvf._group_cache[read_memo] = 'fake block'
        locations = {
            ('key',): (read_memo + (None, None), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(gcvf, locations)
        total_size = batcher.add_key(('key',))
        self.assertEqual(0, total_size)
        self.assertEqual([('key',)], batcher.keys)
        self.assertEqual([], batcher.memos_to_get)

    def test_yield_factories_empty(self):
        """An empty batch yields no factories."""
        batcher = groupcompress._BatchingBlockFetcher(StubGCVF(), {})
        self.assertEqual([], list(batcher.yield_factories()))

    def test_yield_factories_calls_get_blocks(self):
        """Uncached memos are retrieved via get_blocks."""
        read_memo1 = ('fake index', 100, 50)
        read_memo2 = ('fake index', 150, 40)
        gcvf = StubGCVF(
            canned_get_blocks=[
                (read_memo1, groupcompress.GroupCompressBlock()),
                (read_memo2, groupcompress.GroupCompressBlock())])
        locations = {
            ('key1',): (read_memo1 + (0, 0), None, None, None),
            ('key2',): (read_memo2 + (0, 0), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(gcvf, locations)
        batcher.add_key(('key1',))
        batcher.add_key(('key2',))
        factories = list(batcher.yield_factories(full_flush=True))
        self.assertLength(2, factories)
        keys = [f.key for f in factories]
        kinds = [f.storage_kind for f in factories]
        self.assertEqual([('key1',), ('key2',)], keys)
        self.assertEqual(['groupcompress-block', 'groupcompress-block'], kinds)

    def test_yield_factories_flushing(self):
        """yield_factories holds back on yielding results from the final block
        unless passed full_flush=True.
        """
        fake_block = groupcompress.GroupCompressBlock()
        read_memo = ('fake index', 100, 50)
        gcvf = StubGCVF()
        gcvf._group_cache[read_memo] = fake_block
        locations = {
            ('key',): (read_memo + (0, 0), None, None, None)}
        batcher = groupcompress._BatchingBlockFetcher(gcvf, locations)
        batcher.add_key(('key',))
        self.assertEqual([], list(batcher.yield_factories()))
        factories = list(batcher.yield_factories(full_flush=True))
        self.assertLength(1, factories)
        self.assertEqual(('key',), factories[0].key)
        self.assertEqual('groupcompress-block', factories[0].storage_kind)


class TestLazyGroupCompress(tests.TestCaseWithTransport):

    _texts = {
        (b'key1',): b"this is a text\n"
                    b"with a reasonable amount of compressible bytes\n"
                    b"which can be shared between various other texts\n",
        (b'key2',): b"another text\n"
                    b"with a reasonable amount of compressible bytes\n"
                    b"which can be shared between various other texts\n",
        (b'key3',): b"yet another text which won't be extracted\n"
                    b"with a reasonable amount of compressible bytes\n"
                    b"which can be shared between various other texts\n",
        (b'key4',): b"this will be extracted\n"
                    b"but references most of its bytes from\n"
                    b"yet another text which won't be extracted\n"
                    b"with a reasonable amount of compressible bytes\n"
                    b"which can be shared between various other texts\n",
        }

    def make_block(self, key_to_text):
        """Create a GroupCompressBlock, filling it with the given texts."""
        compressor = groupcompress.GroupCompressor()
        for key in sorted(key_to_text):
            compressor.compress(
                key, [key_to_text[key]], len(key_to_text[key]), None)
        locs = dict((key, (start, end)) for key, (start, _, end, _)
                    in compressor.labels_deltas.items())
        block = compressor.flush()
        raw_bytes = block.to_bytes()
        return locs, groupcompress.GroupCompressBlock.from_bytes(raw_bytes)

    def add_key_to_manager(self, key, locations, block, manager):
        start, end = locations[key]
        manager.add_factory(key, (), start, end)

    def make_block_and_full_manager(self, texts):
        locations, block = self.make_block(texts)
        manager = groupcompress._LazyGroupContentManager(block)
        for key in sorted(texts):
            self.add_key_to_manager(key, locations, block, manager)
        return block, manager

    def test_get_fulltexts(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        self.add_key_to_manager((b'key1',), locations, block, manager)
        self.add_key_to_manager((b'key2',), locations, block, manager)
        result_order = []
        for record in manager.get_record_stream():
            result_order.append(record.key)
            text = self._texts[record.key]
            self.assertEqual(text, record.get_bytes_as('fulltext'))
        self.assertEqual([(b'key1',), (b'key2',)], result_order)

        # If we build the manager in the opposite order, we should get them
        # back in the opposite order
        manager = groupcompress._LazyGroupContentManager(block)
        self.add_key_to_manager((b'key2',), locations, block, manager)
        self.add_key_to_manager((b'key1',), locations, block, manager)
        result_order = []
        for record in manager.get_record_stream():
            result_order.append(record.key)
            text = self._texts[record.key]
            self.assertEqual(text, record.get_bytes_as('fulltext'))
        self.assertEqual([(b'key2',), (b'key1',)], result_order)

    def test__wire_bytes_no_keys(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        wire_bytes = manager._wire_bytes()
        block_length = len(block.to_bytes())
        # We should have triggered a strip, since we aren't using any content
        stripped_block = manager._block.to_bytes()
        self.assertTrue(block_length > len(stripped_block))
        empty_z_header = zlib.compress(b'')
        self.assertEqual(b'groupcompress-block\n'
                         b'8\n'  # len(compress(''))
                         b'0\n'  # len of uncompressed header
                         b'%d\n'  # compressed block len
                         b'%s'  # zlib header
                         b'%s'  # block
                         % (len(stripped_block), empty_z_header,
                            stripped_block),
                         wire_bytes)

    def test__wire_bytes(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        self.add_key_to_manager((b'key1',), locations, block, manager)
        self.add_key_to_manager((b'key4',), locations, block, manager)
        block_bytes = block.to_bytes()
        wire_bytes = manager._wire_bytes()
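        # The wire form is: storage kind, compressed header length,
        # uncompressed header length, block length, then the zlib-compressed
        # key header followed by the raw block bytes.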
        (storage_kind, z_header_len, header_len,
         block_len, rest) = wire_bytes.split(b'\n', 4)
        z_header_len = int(z_header_len)
        header_len = int(header_len)
        block_len = int(block_len)
        self.assertEqual(b'groupcompress-block', storage_kind)
        self.assertEqual(34, z_header_len)
        self.assertEqual(26, header_len)
        self.assertEqual(len(block_bytes), block_len)
        z_header = rest[:z_header_len]
        header = zlib.decompress(z_header)
        self.assertEqual(header_len, len(header))
        entry1 = locations[(b'key1',)]
        entry4 = locations[(b'key4',)]
        self.assertEqualDiff(b'key1\n'
                             b'\n'  # no parents
                             b'%d\n'  # start offset
                             b'%d\n'  # end offset
                             b'key4\n'
                             b'\n'
                             b'%d\n'
                             b'%d\n'
                             % (entry1[0], entry1[1],
                                entry4[0], entry4[1]),
                             header)
        z_block = rest[z_header_len:]
        self.assertEqual(block_bytes, z_block)

    def test_from_bytes(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        self.add_key_to_manager((b'key1',), locations, block, manager)
        self.add_key_to_manager((b'key4',), locations, block, manager)
        wire_bytes = manager._wire_bytes()
        self.assertStartsWith(wire_bytes, b'groupcompress-block\n')
        manager = groupcompress._LazyGroupContentManager.from_bytes(wire_bytes)
        self.assertIsInstance(manager, groupcompress._LazyGroupContentManager)
        self.assertEqual(2, len(manager._factories))
        self.assertEqual(block._z_content, manager._block._z_content)
        result_order = []
        for record in manager.get_record_stream():
            result_order.append(record.key)
            text = self._texts[record.key]
            self.assertEqual(text, record.get_bytes_as('fulltext'))
        self.assertEqual([(b'key1',), (b'key4',)], result_order)

    def test__check_rebuild_no_changes(self):
        block, manager = self.make_block_and_full_manager(self._texts)
        manager._check_rebuild_block()
        self.assertIs(block, manager._block)

    def test__check_rebuild_only_one(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        # Request just the first key, which should trigger a 'strip' action
        self.add_key_to_manager((b'key1',), locations, block, manager)
        manager._check_rebuild_block()
        self.assertIsNot(block, manager._block)
        self.assertTrue(block._content_length > manager._block._content_length)
        # We should be able to still get the content out of this block, though
        # it should only have 1 entry
        for record in manager.get_record_stream():
            self.assertEqual((b'key1',), record.key)
            self.assertEqual(self._texts[record.key],
                             record.get_bytes_as('fulltext'))

    def test__check_rebuild_middle(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        # Requesting a small key in the middle should trigger a 'rebuild'
        self.add_key_to_manager((b'key4',), locations, block, manager)
        manager._check_rebuild_block()
        self.assertIsNot(block, manager._block)
        self.assertTrue(block._content_length > manager._block._content_length)
        for record in manager.get_record_stream():
            self.assertEqual((b'key4',), record.key)
            self.assertEqual(self._texts[record.key],
                             record.get_bytes_as('fulltext'))

    def test_manager_default_compressor_settings(self):
        locations, old_block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(old_block)
        gcvf = groupcompress.GroupCompressVersionedFiles
        # It doesn't greedily evaluate _max_bytes_to_index
        self.assertIs(None, manager._compressor_settings)
        self.assertEqual(gcvf._DEFAULT_COMPRESSOR_SETTINGS,
                         manager._get_compressor_settings())

    def test_manager_custom_compressor_settings(self):
        locations, old_block = self.make_block(self._texts)
        called = []

        def compressor_settings():
            called.append('called')
            return (10,)
        manager = groupcompress._LazyGroupContentManager(old_block,
            get_compressor_settings=compressor_settings)
        gcvf = groupcompress.GroupCompressVersionedFiles
        # It doesn't greedily evaluate compressor_settings
        self.assertIs(None, manager._compressor_settings)
        self.assertEqual((10,), manager._get_compressor_settings())
        self.assertEqual((10,), manager._get_compressor_settings())
        self.assertEqual((10,), manager._compressor_settings)
        # Only called 1 time
        self.assertEqual(['called'], called)

    def test__rebuild_handles_compressor_settings(self):
        if not issubclass(groupcompress.GroupCompressor,
                          groupcompress.PyrexGroupCompressor):
            raise tests.TestNotApplicable('pure-python compressor'
                                          ' does not handle compressor_settings')
        locations, old_block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(old_block,
            get_compressor_settings=lambda: dict(max_bytes_to_index=32))
        gc = manager._make_group_compressor()
        self.assertEqual(32, gc._delta_index._max_bytes_to_index)
        self.add_key_to_manager((b'key3',), locations, old_block, manager)
        self.add_key_to_manager((b'key4',), locations, old_block, manager)
        action, last_byte, total_bytes = manager._check_rebuild_action()
        self.assertEqual('rebuild', action)
        manager._rebuild_block()
        new_block = manager._block
        self.assertIsNot(old_block, new_block)
        # Because of the new max_bytes_to_index, we do a poor job of
        # rebuilding. This is a side-effect of the change, but at least it does
        # show the setting had an effect.
        self.assertTrue(old_block._content_length < new_block._content_length)

    def test_check_is_well_utilized_all_keys(self):
        block, manager = self.make_block_and_full_manager(self._texts)
        self.assertFalse(manager.check_is_well_utilized())
        # Though we can fake it by changing the recommended minimum size
        manager._full_enough_block_size = block._content_length
        self.assertTrue(manager.check_is_well_utilized())
        # Setting it just above causes it to fail
        manager._full_enough_block_size = block._content_length + 1
        self.assertFalse(manager.check_is_well_utilized())
        # Setting the mixed-block size doesn't do anything, because the content
        # is considered to not be 'mixed'
        manager._full_enough_mixed_block_size = block._content_length
        self.assertFalse(manager.check_is_well_utilized())

    def test_check_is_well_utilized_mixed_keys(self):
        texts = {}
        f1k1 = (b'f1', b'k1')
        f1k2 = (b'f1', b'k2')
        f2k1 = (b'f2', b'k1')
        f2k2 = (b'f2', b'k2')
        texts[f1k1] = self._texts[(b'key1',)]
        texts[f1k2] = self._texts[(b'key2',)]
        texts[f2k1] = self._texts[(b'key3',)]
        texts[f2k2] = self._texts[(b'key4',)]
        block, manager = self.make_block_and_full_manager(texts)
        self.assertFalse(manager.check_is_well_utilized())
        manager._full_enough_block_size = block._content_length
        self.assertTrue(manager.check_is_well_utilized())
        manager._full_enough_block_size = block._content_length + 1
        self.assertFalse(manager.check_is_well_utilized())
        manager._full_enough_mixed_block_size = block._content_length
        self.assertTrue(manager.check_is_well_utilized())

    def test_check_is_well_utilized_partial_use(self):
        locations, block = self.make_block(self._texts)
        manager = groupcompress._LazyGroupContentManager(block)
        manager._full_enough_block_size = block._content_length
        self.add_key_to_manager((b'key1',), locations, block, manager)
        self.add_key_to_manager((b'key2',), locations, block, manager)
        # Just using the content from key1 and 2 is not enough to be considered
        # well utilized
        self.assertFalse(manager.check_is_well_utilized())
        # However if we add key4, then we have enough, as we only require 75%
        # consumption
        self.add_key_to_manager((b'key4',), locations, block, manager)
        self.assertTrue(manager.check_is_well_utilized())


class Test_GCBuildDetails(tests.TestCase):

    def test_acts_like_tuple(self):
        # _GCBuildDetails inlines some of the data that used to be spread out
        # across a bunch of tuples
        bd = groupcompress._GCBuildDetails((('parent1',), ('parent2',)),
                                           ('INDEX', 10, 20, 0, 5))
        self.assertEqual(4, len(bd))
        self.assertEqual(('INDEX', 10, 20, 0, 5), bd[0])
        self.assertEqual(None, bd[1])  # Compression Parent is always None
        self.assertEqual((('parent1',), ('parent2',)), bd[2])
        self.assertEqual(('group', None), bd[3])  # Record details

    def test__repr__(self):
        bd = groupcompress._GCBuildDetails((('parent1',), ('parent2',)),
                                           ('INDEX', 10, 20, 0, 5))
        self.assertEqual("_GCBuildDetails(('INDEX', 10, 20, 0, 5),"
                         " (('parent1',), ('parent2',)))",
                         repr(bd))