from ..lazy_import import lazy_import
lazy_import(globals(), """
from breezy.bzr import (
from breezy.i18n import gettext
from .btree_index import BTreeBuilder
from ..lru_cache import LRUSizeCache
from .versionedfile import (
    AbsentContentFactory,
    ChunkedContentFactory,
    FulltextContentFactory,
    VersionedFilesWithFallbacks,
# Minimum number of uncompressed bytes to try fetch at once when retrieving
# groupcompress blocks.

# osutils.sha_string(b'')
_null_sha1 = b'da39a3ee5e6b4b0d3255bfef95601890afd80709'
def sort_gc_optimal(parent_map):
    """Sort and group the keys in parent_map into groupcompress order.

    for prefix in sorted(per_prefix_map):
        present_keys.extend(reversed(tsort.topo_sort(per_prefix_map[prefix])))
    return present_keys
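
# A minimal usage sketch of sort_gc_optimal (the keys and revisions below are
# hypothetical): keys are grouped by prefix and reverse-topo-sorted within
# each prefix, so newer texts come before the texts they derive from.
def _example_sort_gc_optimal():
    parent_map = {
        (b'file-a', b'rev-1'): (),
        (b'file-a', b'rev-2'): ((b'file-a', b'rev-1'),),
        (b'file-b', b'rev-1'): (),
    }
    ordering = sort_gc_optimal(parent_map)
    # e.g. [(b'file-a', b'rev-2'), (b'file-a', b'rev-1'), (b'file-b', b'rev-1')]
    return ordering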
class DecompressCorruption(errors.BzrError):

    _fmt = "Corruption while decompressing repository file%(orig_error)s"

    def __init__(self, orig_error=None):
        if orig_error is not None:
            self.orig_error = ", %s" % (orig_error,)
        else:
            self.orig_error = ''
        errors.BzrError.__init__(self)
# The max zlib window size is 32kB, so if we set 'max_size' output of the
# decompressor to the requested bytes + 32kB, then we should guarantee
# num_bytes coming out.
_ZLIB_DECOMP_WINDOW = 32 * 1024
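
# A minimal sketch of the partial-decompression idea described above; the
# helper name and sample data are hypothetical, while zlib.decompressobj() is
# the standard-library API that GroupCompressBlock uses below.
def _example_partial_decompress(z_bytes, num_bytes):
    decompressor = zlib.decompressobj()
    # Requesting num_bytes plus one full window guarantees at least num_bytes
    # of output, as long as that much uncompressed content exists.
    data = decompressor.decompress(z_bytes, num_bytes + _ZLIB_DECOMP_WINDOW)
    return data[:num_bytes]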
class GroupCompressBlock(object):
    """An object which maintains the internal structure of the compressed data.

        # Expand the content if required
        if self._content is None:
            if self._content_chunks is not None:
                self._content = b''.join(self._content_chunks)
                self._content_chunks = None
        if self._content is None:
            # We join self._z_content_chunks here, because if we are
            # decompressing, then it is *very* likely that we have a single
            if self._z_content_chunks is None:
                raise AssertionError('No content to decompress')
            z_content = b''.join(self._z_content_chunks)
            if z_content == b'':
                self._content = b''
            elif self._compressor_name == 'lzma':
                # We don't do partial lzma decomp yet
                self._content = pylzma.decompress(z_content)
            elif self._compressor_name == 'zlib':
                # Start a zlib decompressor
                if num_bytes * 4 > self._content_length * 3:
                    # If we are requesting more than 3/4ths of the content,
                    # just extract the whole thing in a single pass
                    num_bytes = self._content_length
                    self._content = zlib.decompress(z_content)
                else:
                    self._z_content_decompressor = zlib.decompressobj()
                    # Seed the decompressor with the uncompressed bytes, so
                    # that the rest of the code is simplified
                    self._content = self._z_content_decompressor.decompress(
                        z_content, num_bytes + _ZLIB_DECOMP_WINDOW)
                    if not self._z_content_decompressor.unconsumed_tail:
                        self._z_content_decompressor = None
        # At present, we have 2 integers for the compressed and uncompressed
        # content. In base10 (ascii) 14 bytes can represent > 1TB, so to avoid
        # checking too far, cap the search to 14 bytes.
        pos2 = data.index(b'\n', pos, pos + 14)
        self._z_content_length = int(data[pos:pos2])
        pos = pos2 + 1
        pos2 = data.index(b'\n', pos, pos + 14)
        self._content_length = int(data[pos:pos2])
        pos = pos2 + 1
        if len(data) != (pos + self._z_content_length):
            # XXX: Define some GCCorrupt error ?
            raise AssertionError('Invalid bytes: (%d) != %d + %d' %
                                 (len(data), pos, self._z_content_length))
        self._z_content_chunks = (data[pos:],)
    def _z_content(self):
        """Return z_content_chunks as a simple string.

        Meant only to be used by the test suite.
        """
        if self._z_content_chunks is not None:
            return b''.join(self._z_content_chunks)
    def from_bytes(cls, bytes):
        out = cls()
        header = bytes[:6]
        if header not in cls.GCB_KNOWN_HEADERS:
            raise ValueError('bytes did not start with any of %r'
                             % (cls.GCB_KNOWN_HEADERS,))
        if header == cls.GCB_HEADER:
            out._compressor_name = 'zlib'
        elif header == cls.GCB_LZ_HEADER:
            out._compressor_name = 'lzma'
        else:
            raise ValueError('unknown compressor: %r' % (header,))
        out._parse_bytes(bytes, 6)
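
    # A minimal round-trip sketch of the serialisation handled by
    # from_bytes/_parse_bytes above (the sample text is hypothetical): a
    # 6-byte header naming the compressor, two ASCII lengths, then the
    # compressed body.
    #
    #   block = GroupCompressBlock()
    #   block.set_content(b'some text\n')
    #   raw = block.to_bytes()
    #   parsed = GroupCompressBlock.from_bytes(raw)
    #   parsed._content_length  # == 10, same uncompressed length as the input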
        :return: The bytes for the content
        """
        if start == end == 0:
            return []
        self._ensure_content(end)
        # The bytes are 'f' or 'd' for the type, then a variable-length
        # base128 integer for the content size, then the actual content
        # We know that the variable-length integer won't be longer than 5
        # bytes (it takes 5 bytes to encode 2^32)
        c = self._content[start:start + 1]
        if c == b'f':
            type = 'fulltext'
        else:
            if c != b'd':
                raise ValueError('Unknown content control code: %s'
                                 % (c,))
            type = 'delta'
        content_len, len_len = decode_base128_int(
            self._content[start + 1:start + 6])
        content_start = start + 1 + len_len
        if end != content_start + content_len:
            raise ValueError('end != len according to field header'
                             ' %s != %s' % (end, content_start + content_len))
        if c == b'f':
            return [self._content[content_start:end]]
        # Must be type delta as checked above
        return [apply_delta_to_source(self._content, content_start, end)]
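
    # A sketch of the base128 ("varint") length encoding referred to above:
    # seven bits of payload per byte, low-order groups first, with the high
    # bit set on every byte except the last. encode_base128_int and
    # decode_base128_int are the module's real helpers; the snippet below is
    # only an illustration of the format.
    #
    #   def _example_encode_base128_int(val):
    #       data = bytearray()
    #       while val >= 0x80:
    #           data.append((val & 0x7F) | 0x80)
    #           val >>= 7
    #       data.append(val)
    #       return bytes(data)
    #
    #   _example_encode_base128_int(2 ** 32 - 1) == b'\xff\xff\xff\xff\x0f'
    #   (five bytes, which is why the slice above only looks at 5 bytes)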
    def set_chunked_content(self, content_chunks, length):
        """Set the content of this block to the given chunks."""

        self._content_length = length
        self._content_chunks = content_chunks
        self._content = None
        self._z_content_chunks = None
    def set_content(self, content):
        """Set the content of this block."""
        self._content_length = len(content)
        self._content = content
        self._z_content_chunks = None

    def _create_z_content_from_chunks(self, chunks):
        compressor = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION)
        # Peak in this point is 1 fulltext, 1 compressed text, + zlib overhead
        # (measured peak is maybe 30MB over the above...)
        compressed_chunks = list(map(compressor.compress, chunks))
        compressed_chunks.append(compressor.flush())
        # Ignore empty chunks
        self._z_content_chunks = [c for c in compressed_chunks if c]
        self._z_content_length = sum(map(len, self._z_content_chunks))
    def _create_z_content(self):
        if self._z_content_chunks is not None:
            return
        if self._content_chunks is not None:
            chunks = self._content_chunks
        else:
            chunks = (self._content,)
        self._create_z_content_from_chunks(chunks)
    def to_chunks(self):
        """Create the byte stream as a series of 'chunks'"""
        self._create_z_content()
        header = self.GCB_HEADER
        chunks = [b'%s%d\n%d\n'
                  % (header, self._z_content_length, self._content_length),
                  ]
        chunks.extend(self._z_content_chunks)
        total_len = sum(map(len, chunks))
        return total_len, chunks
    def to_bytes(self):
        """Encode the information into a byte stream."""
        total_len, chunks = self.to_chunks()
        return b''.join(chunks)
    def _dump(self, include_text=False):
        """Take this block, and spit out a human-readable structure.

        while pos < self._content_length:
            kind = self._content[pos:pos + 1]
            if kind not in (b'f', b'd'):
                raise ValueError('invalid kind character: %r' % (kind,))
            content_len, len_len = decode_base128_int(
                self._content[pos:pos + 5])
            if content_len + pos > self._content_length:
                raise ValueError('invalid content_len %d for record @ pos %d'
                                 % (content_len, pos - len_len - 1))
            if kind == b'f':  # Fulltext
                if include_text:
                    text = self._content[pos:pos + content_len]
                    result.append((b'f', content_len, text))
                else:
                    result.append((b'f', content_len))
            elif kind == b'd':  # Delta
                delta_content = self._content[pos:pos + content_len]
                # The first entry in a delta is the decompressed length
                decomp_len, delta_pos = decode_base128_int(delta_content)
                result.append((b'd', content_len, decomp_len, delta_info))
                while delta_pos < content_len:
                    c = delta_content[delta_pos]

                        delta_pos) = decode_copy_instruction(delta_content, c,
                        if include_text:
                            text = self._content[offset:offset + length]
                            delta_info.append((b'c', offset, length, text))
                        else:
                            delta_info.append((b'c', offset, length))
                        measured_len += length

                        txt = delta_content[delta_pos:delta_pos + c]
                        delta_info.append((b'i', c, txt))
                        measured_len += c
                if delta_pos != content_len:
    def __repr__(self):
        return '%s(%s, first=%s)' % (self.__class__.__name__,
                                     self.key, self._first)
    def _extract_bytes(self):
        # Grab and cache the raw bytes for this entry
        # and break the ref-cycle with _manager since we don't need it
        try:
            self._manager._prepare_for_extract()
        except zlib.error as value:
            raise DecompressCorruption("zlib: " + str(value))
        block = self._manager._block
        self._chunks = block.extract(self.key, self._start, self._end)
        # There are code paths that first extract as fulltext, and then
        # extract as storage_kind (smart fetch). So we don't break the
        # refcycle here, but instead in manager.get_record_stream()
    def get_bytes_as(self, storage_kind):
        if storage_kind == self.storage_kind:
            # wire bytes, something...
            return self._manager._wire_bytes()
        if storage_kind in ('fulltext', 'chunked', 'lines'):
            if self._chunks is None:
                self._extract_bytes()
            if storage_kind == 'fulltext':
                return b''.join(self._chunks)
            elif storage_kind == 'chunked':
                return self._chunks
            else:
                return osutils.chunks_to_lines(self._chunks)
        raise errors.UnavailableRepresentation(self.key, storage_kind,
                                               self.storage_kind)

    def iter_bytes_as(self, storage_kind):
        if self._chunks is None:
            self._extract_bytes()
        if storage_kind == 'chunked':
            return iter(self._chunks)
        elif storage_kind == 'lines':
            return iter(osutils.chunks_to_lines(self._chunks))
        raise errors.UnavailableRepresentation(self.key, storage_kind,
                                               self.storage_kind)
class _LazyGroupContentManager(object):
    """This manages a group of _LazyGroupCompressFactory objects."""

    _max_cut_fraction = 0.75  # We allow a block to be trimmed to 75% of
                              # current size, and still be considered
    _full_block_size = 4 * 1024 * 1024
    _full_mixed_block_size = 2 * 1024 * 1024
    _full_enough_block_size = 3 * 1024 * 1024  # size at which we won't repack
    _full_enough_mixed_block_size = 2 * 768 * 1024  # 1.5MB

    def __init__(self, block, get_compressor_settings=None):
        self._block = block
        # We need to preserve the ordering
        self._factories = []
        self._last_byte = 0
        self._get_settings = get_compressor_settings
        self._compressor_settings = None

    def _get_compressor_settings(self):
        if self._compressor_settings is not None:
            return self._compressor_settings
        settings = None
        if self._get_settings is not None:
            settings = self._get_settings()
        if settings is None:
            vf = GroupCompressVersionedFiles
            settings = vf._DEFAULT_COMPRESSOR_SETTINGS
        self._compressor_settings = settings
        return self._compressor_settings
    def add_factory(self, key, parents, start, end):
        if not self._factories:

        new_block.set_content(self._block._content[:last_byte])
        self._block = new_block

    def _make_group_compressor(self):
        return GroupCompressor(self._get_compressor_settings())
    def _rebuild_block(self):
        """Create a new GroupCompressBlock with only the referenced texts."""
        compressor = self._make_group_compressor()
        tstart = time.time()
        old_length = self._block._content_length

        for factory in self._factories:
            chunks = factory.get_bytes_as('chunked')
            chunks_len = factory.size
            if chunks_len is None:
                chunks_len = sum(map(len, chunks))
            (found_sha1, start_point, end_point,
             type) = compressor.compress(
                factory.key, chunks, chunks_len, factory.sha1)
            # Now update this factory with the new offsets, etc
            factory.sha1 = found_sha1
            factory._start = start_point
        # 1 line for end byte
        header_lines = []
        for factory in self._factories:
            key_bytes = b'\x00'.join(factory.key)
            parents = factory.parents
            if parents is None:
                parent_bytes = b'None:'
            else:
                parent_bytes = b'\t'.join(b'\x00'.join(key) for key in parents)
            record_header = b'%s\n%s\n%d\n%d\n' % (
                key_bytes, parent_bytes, factory._start, factory._end)
            header_lines.append(record_header)
            # TODO: Can we break the refcycle at this point and set
            #       factory._manager = None?
        header_bytes = b''.join(header_lines)
        header_bytes_len = len(header_bytes)
        z_header_bytes = zlib.compress(header_bytes)
        z_header_bytes_len = len(z_header_bytes)
        block_bytes_len, block_chunks = self._block.to_chunks()
        lines.append(b'%d\n%d\n%d\n' % (
            z_header_bytes_len, header_bytes_len, block_bytes_len))
        lines.append(z_header_bytes)
        lines.extend(block_chunks)
        del z_header_bytes, block_chunks
        # TODO: This is a point where we will double the memory consumption. To
        #       avoid this, we probably have to switch to a 'chunked' api
        return b''.join(lines)
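
    # Sketch of the wire layout that _wire_bytes builds and from_bytes below
    # parses (field contents are illustrative, not literal sizes):
    #
    #   b'groupcompress-block\n'
    #   b'<z_header_len>\n<header_len>\n<block_len>\n'
    #   <zlib-compressed header: one 'key\nparents\nstart\nend\n' entry
    #    per factory>
    #   <the serialised GroupCompressBlock chunks>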
    def from_bytes(cls, bytes):
        # TODO: This does extra string copying, probably better to do it a
        #       different way. At a minimum this creates 2 copies of the
        (storage_kind, z_header_len, header_len,
         block_len, rest) = bytes.split(b'\n', 4)
        if storage_kind != b'groupcompress-block':
            raise ValueError('Unknown storage kind: %s' % (storage_kind,))
        z_header_len = int(z_header_len)
        if len(rest) < z_header_len:

        block = GroupCompressBlock.from_bytes(block_bytes)
        result = cls(block)
        for start in range(0, len(header_lines), 4):
            key = tuple(header_lines[start].split(b'\x00'))
            parents_line = header_lines[start + 1]
            if parents_line == b'None:':
                parents = None
            else:
                parents = tuple([tuple(segment.split(b'\x00'))
                                 for segment in parents_line.split(b'\t')
                                 if segment])
            start_offset = int(header_lines[start + 2])
            end_offset = int(header_lines[start + 3])
            result.add_factory(key, parents, start_offset, end_offset)
class _CommonGroupCompressor(object):

    def __init__(self, settings=None):
        """Create a GroupCompressor."""
        self._last = None
        self.endpoint = 0
        self.input_bytes = 0
        self.labels_deltas = {}
        self._delta_index = None  # Set by the children
        self._block = GroupCompressBlock()
        self._settings = settings
    def compress(self, key, chunks, length, expected_sha, nostore_sha=None,
                 soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output
            for identification of the text during decompression. If the last
            element is b'None' it is replaced with the sha1 of the text -
            e.g. sha1:xxxxxxx.
        :param chunks: Chunks of bytes to be compressed
        :param length: Length of chunks
        :param expected_sha: If non-None, the sha the lines are believed to
            have. During compression the sha is calculated; a mismatch will

        if expected_sha is not None:
            sha1 = expected_sha
        else:
            sha1 = osutils.sha_strings(chunks)
        if nostore_sha is not None:
            if sha1 == nostore_sha:
                raise errors.ExistingContent()
        if key[-1] is None:
            key = key[:-1] + (b'sha1:' + sha1,)

        start, end, type = self._compress(key, chunks, length, length / 2, soft)
        return sha1, start, end, type
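
    # A minimal usage sketch of the compressor API above (the key and text are
    # hypothetical; GroupCompressor is whichever concrete subclass the module
    # selects at import time):
    #
    #   compressor = GroupCompressor()
    #   text = [b'one line\n', b'another line\n']
    #   sha1, start, end, kind = compressor.compress(
    #       (b'file-id', b'rev-1'), text, sum(map(len, text)), None)
    #   # 'kind' is 'fulltext' for the first text in a group; later, similar
    #   # texts usually come back as 'delta'.
    #   chunks, chunks_sha1 = compressor.extract((b'file-id', b'rev-1'))
    #   assert b''.join(chunks) == b''.join(text)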
    def _compress(self, key, chunks, input_len, max_delta_size, soft=False):
        """Compress lines with label key.

        :param key: A key tuple. It is stored in the output for identification
            of the text during decompression.

        :param chunks: The chunks of bytes to be compressed

        :param input_len: The length of the chunks

        :param max_delta_size: The size above which we issue a fulltext instead
    def extract(self, key):
        """Extract a key previously added to the compressor.

        :param key: The key to extract.
        :return: An iterable over chunks and the sha1.
        """
        (start_byte, start_chunk, end_byte,
         end_chunk) = self.labels_deltas[key]
        delta_chunks = self.chunks[start_chunk:end_chunk]
        stored_bytes = b''.join(delta_chunks)
        kind = stored_bytes[:1]
        if kind == b'f':
            fulltext_len, offset = decode_base128_int(stored_bytes[1:10])
            data_len = fulltext_len + 1 + offset
            if data_len != len(stored_bytes):
                raise ValueError('Index claimed fulltext len, but stored bytes'
                                 ' claim %s != %s'
                                 % (len(stored_bytes), data_len))
            data = [stored_bytes[offset + 1:]]
        else:
            if kind != b'd':
                raise ValueError('Unknown content kind, bytes claim %s' % kind)
            # XXX: This is inefficient at best
            source = b''.join(self.chunks[:start_chunk])
            delta_len, offset = decode_base128_int(stored_bytes[1:10])
            data_len = delta_len + 1 + offset
            if data_len != len(stored_bytes):
                raise ValueError('Index claimed delta len, but stored bytes'
                                 ' claim %s != %s'
                                 % (len(stored_bytes), data_len))
            data = [apply_delta(source, stored_bytes[offset + 1:])]
        data_sha1 = osutils.sha_strings(data)
        return data, data_sha1
        """Finish this group, creating a formatted stream.

        After calling this, the compressor should no longer be used
        """
        self._block.set_chunked_content(self.chunks, self.endpoint)
        self.chunks = None
        self._delta_index = None
class PythonGroupCompressor(_CommonGroupCompressor):

    def __init__(self, settings=None):
        """Create a GroupCompressor.

        Used only if the pyrex version is not available.
        """
        super(PythonGroupCompressor, self).__init__(settings)
        self._delta_index = LinesDeltaIndex([])
        # The actual content is managed by LinesDeltaIndex
        self.chunks = self._delta_index.lines

    def _compress(self, key, chunks, input_len, max_delta_size, soft=False):
        """see _CommonGroupCompressor._compress"""
        new_lines = osutils.chunks_to_lines(chunks)
        out_lines, index_lines = self._delta_index.make_delta(
            new_lines, bytes_length=input_len, soft=soft)
        delta_length = sum(map(len, out_lines))
        if delta_length > max_delta_size:
            # The delta is longer than the fulltext, insert a fulltext
            type = 'fulltext'
            out_lines = [b'f', encode_base128_int(input_len)]
            out_lines.extend(new_lines)
            index_lines = [False, False]
            index_lines.extend([True] * len(new_lines))
        else:
            # this is a worthy delta, output it

            # Update the delta_length to include those two encoded integers
            out_lines[1] = encode_base128_int(delta_length)
        # Before insertion
    It contains code very similar to SequenceMatcher because of having a similar
    task. However some key differences apply:

    * there is no junk, we want a minimal edit not a human readable diff.
    * we don't filter very common lines (because we don't know where a good
      range will start, and after the first text we want to be emitting minimal
      edits only.
    * we chain the left side, not the right side
    * we incrementally update the adjacency matrix as new lines are provided.
    * we look for matches in all of the left side, so the routine which does
      the analogous task of find_longest_match does not need to filter on the
      left side.
    """

    def __init__(self, settings=None):
        super(PyrexGroupCompressor, self).__init__(settings)
        max_bytes_to_index = self._settings.get('max_bytes_to_index', 0)
        self._delta_index = DeltaIndex(max_bytes_to_index=max_bytes_to_index)
    def _compress(self, key, chunks, input_len, max_delta_size, soft=False):
        """see _CommonGroupCompressor._compress"""
        # By having action/label/sha1/len, we can parse the group if the index
        # was ever destroyed, we have the key in 'label', we know the final
        # bytes are valid from sha1, and we know where to find the end of this

        # new_chunks = ['label:%s\nsha1:%s\n' % (label, sha1)]
        if self._delta_index._source_offset != self.endpoint:
            raise AssertionError('_source_offset != endpoint'
                                 ' somehow the DeltaIndex got out of sync with'
                                 ' the output lines')
        bytes = b''.join(chunks)
        delta = self._delta_index.make_delta(bytes, max_delta_size)
        if delta is None:
            type = 'fulltext'
            enc_length = encode_base128_int(input_len)
            len_mini_header = 1 + len(enc_length)
            self._delta_index.add_source(bytes, len_mini_header)
            new_chunks = [b'f', enc_length] + chunks
        else:
            type = 'delta'
            enc_length = encode_base128_int(len(delta))
            len_mini_header = 1 + len(enc_length)
            new_chunks = [b'd', enc_length, delta]
            self._delta_index.add_delta_source(delta, len_mini_header)
        # Before insertion
        start = self.endpoint
        graph_index = BTreeBuilder(reference_lists=ref_length,
                                   key_elements=keylength)
        stream = transport.open_write_stream('newpack')
        writer = pack.ContainerWriter(stream.write)
        index = _GCGraphIndex(graph_index, lambda: True, parents=parents,
                              add_callback=graph_index.add_nodes,
                              inconsistency_fatal=inconsistency_fatal)
        access = pack_repo._DirectPackAccess({})
        access.set_writer(writer, graph_index, (transport, 'newpack'))
        result = GroupCompressVersionedFiles(index, access, delta)
        result.stream = stream
        self.total_bytes = 0


class GroupCompressVersionedFiles(VersionedFilesWithFallbacks):
    """A group-compress based VersionedFiles implementation."""

    # This controls how the GroupCompress DeltaIndex works. Basically, we
    # compute hash pointers into the source blocks (so hash(text) => text).
    # However each of these references costs some memory in trade against a
    # more accurate match result. For very large files, they either are
    # pre-compressed and change in bulk whenever they change, or change in just
    # local blocks. Either way, 'improved resolution' is not very helpful,
    # versus running out of memory trying to track everything. The default max
    # gives 100% sampling of a 1MB file.
    _DEFAULT_MAX_BYTES_TO_INDEX = 1024 * 1024
    _DEFAULT_COMPRESSOR_SETTINGS = {'max_bytes_to_index':
                                    _DEFAULT_MAX_BYTES_TO_INDEX}

    def __init__(self, index, access, delta=True, _unadded_refs=None,
                 _group_cache=None):
        """Create a GroupCompressVersionedFiles object.

        :param index: The index object storing access and graph data.
        :param access: The access object storing raw data.
        :param delta: Whether to delta compress or just entropy compress.
        :param _unadded_refs: private parameter, don't use.
        :param _group_cache: private parameter, don't use.
        """
        self._index = index
        self._access = access
        self._delta = delta
        if _unadded_refs is None:
            _unadded_refs = {}
        self._unadded_refs = _unadded_refs
        if _group_cache is None:
            _group_cache = LRUSizeCache(max_size=50 * 1024 * 1024)
        self._group_cache = _group_cache
        self._immediate_fallback_vfs = []
        self._max_bytes_to_index = None

    def without_fallbacks(self):
        """Return a clone of this object without any fallbacks configured."""
        return GroupCompressVersionedFiles(self._index, self._access,
                                           self._delta, _unadded_refs=dict(
                                               self._unadded_refs),
                                           _group_cache=self._group_cache)
    def add_lines(self, key, parents, lines, parent_texts=None,
                  left_matching_blocks=None, nostore_sha=None, random_id=False,
                  check_content=True):
        """Add a text to the store.

        :param key: The key tuple of the text to add.
        :param parents: The parents key tuples of the text to add.
        :param lines: A list of lines. Each line must be a bytestring. And all
            of them except the last must be terminated with \\n and contain no
            other \\n's. The last line may either contain no \\n's or a single
            terminating \\n. If the lines list does meet this constraint the
            add routine may error or may succeed - but you will be unable to
            read the data back accurately. (Checking the lines have been split
            correctly is expensive and extremely unlikely to catch bugs so it
            is not done at runtime unless check_content is True.)
        :param parent_texts: An optional dictionary containing the opaque

            back to future add_lines calls in the parent_texts dictionary.

        self._index._check_write_ok()
        self._check_lines_not_unicode(lines)
        self._check_lines_are_lines(lines)
        return self.add_content(
            ChunkedContentFactory(
                key, parents, osutils.sha_strings(lines), lines,
                chunks_are_lines=True),
            parent_texts, left_matching_blocks, nostore_sha, random_id)
    def add_content(self, factory, parent_texts=None,
                    left_matching_blocks=None, nostore_sha=None,
                    random_id=False):
        """Add a text to the store.

        :param factory: A ContentFactory that can be used to retrieve the key,
            parents and contents.
        :param parent_texts: An optional dictionary containing the opaque
            representations of some or all of the parents of version_id to
            allow delta optimisations. VERY IMPORTANT: the texts must be those
            returned by add_lines or data corruption can be caused.
        :param left_matching_blocks: a hint about which areas are common
            between the text and its left-hand-parent. The format is
            the SequenceMatcher.get_matching_blocks format.
        :param nostore_sha: Raise ExistingContent and do not add the lines to
            the versioned file if the digest of the lines matches this.
        :param random_id: If True a random id has been selected rather than
            an id determined by some deterministic process such as a converter
            from a foreign VCS. When True the backend may choose not to check
            for uniqueness of the resulting key within the versioned file, so
            this should only be done when the result is expected to be unique

        :return: The text sha1, the number of bytes in the text, and an opaque
            representation of the inserted version which can be provided
            back to future add_lines calls in the parent_texts dictionary.
        """
        self._index._check_write_ok()
        parents = factory.parents
        self._check_add(factory.key, random_id)
        if parents is None:
            # The caller might pass None if there is no graph data, but kndx
            # indexes can't directly store that, so we give them
            # an empty tuple instead.
            parents = ()

        # double handling for now. Make it work until then.
        sha1, length = list(self._insert_record_stream(
            [factory], random_id=random_id, nostore_sha=nostore_sha))[0]
        return sha1, length, None

    def add_fallback_versioned_files(self, a_versioned_files):

        # probably check that the existing content is identical to what is
        # being inserted, and otherwise raise an exception. This would make
        # the bundle code simpler.

    def get_parent_map(self, keys):
        """Get a map of the graph parents of keys.
                                                              key_to_source_map)
        elif ordering == 'as-requested':
            source_keys = self._get_as_requested_source_keys(orig_keys,
                locations, unadded_keys, key_to_source_map)
        else:
            # We want to yield the keys in a semi-optimal (read-wise) ordering.
            # Otherwise we thrash the _group_cache and destroy performance
            source_keys = self._get_io_ordered_source_keys(locations,
                unadded_keys, source_result)
        for key in missing:
            yield AbsentContentFactory(key)
        # Batch up as many keys as we can until either:
        #  - we encounter an unadded ref, or
        #  - we run out of keys, or
        #  - the total bytes to retrieve for this batch > BATCH_SIZE
        batcher = _BatchingBlockFetcher(self, locations,
            get_compressor_settings=self._get_compressor_settings)
        for source, keys in source_keys:
            if source is self:
                for key in keys:
        # test_insert_record_stream_existing_keys fail for groupcompress and
        # groupcompress-nograph, this needs to be revisited while addressing
        # 'bzr branch' performance issues.
        for _, _ in self._insert_record_stream(stream, random_id=False):
            pass

    def _get_compressor_settings(self):
        if self._max_bytes_to_index is None:
            # TODO: VersionedFiles don't know about their containing
            #       repository, so they don't have much of an idea about their
            #       location. So for now, this is only a global option.
            c = config.GlobalConfig()
            val = c.get_user_option('bzr.groupcompress.max_bytes_to_index')
            if val is not None:
                try:
                    val = int(val)
                except ValueError as e:
                    trace.warning('Value for '
                                  '"bzr.groupcompress.max_bytes_to_index"'
                                  ' %r is not an integer'
                                  % (val,))
                    val = None
            if val is None:
                val = self._DEFAULT_MAX_BYTES_TO_INDEX
            self._max_bytes_to_index = val
        return {'max_bytes_to_index': self._max_bytes_to_index}

    def _make_group_compressor(self):
        return GroupCompressor(self._get_compressor_settings())
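
    # A minimal sketch of setting the option read above; the value is only an
    # example, and the exact command/config file depends on the installed
    # version:
    #
    #   $ bzr config bzr.groupcompress.max_bytes_to_index=4194304
    #
    # or, equivalently, a line in the global configuration read by
    # config.GlobalConfig():
    #
    #   bzr.groupcompress.max_bytes_to_index = 4194304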
    def _insert_record_stream(self, stream, random_id=False, nostore_sha=None,
                              reuse_blocks=True):
        """Internal core to insert a record stream into this container.

        # This will go up to fulltexts for gc to gc fetching, which isn't
        self._compressor = self._make_group_compressor()
        self._unadded_refs = {}
        keys_to_add = []

        def flush():
            bytes_len, chunks = self._compressor.flush().to_chunks()
            self._compressor = self._make_group_compressor()
            # Note: At this point we still have 1 copy of the fulltext (in
            #       record and the var 'bytes'), and this generates 2 copies of
            #       the compressed text (one for bytes, one in chunks)
            # TODO: Figure out how to indicate that we would be happy to free
            #       the fulltext content at this point. Note that sometimes we
            #       will want it later (streaming CHK pages), but most of the
            #       time we won't (everything else)
            index, start, length = self._access.add_raw_record(
                None, bytes_len, chunks)
            nodes = []
            for key, reads, refs in keys_to_add:
                nodes.append((key, b"%d %d %s" % (start, length, reads), refs))
            self._index.add_records(nodes, random_id=random_id)
            self._unadded_refs = {}
            del keys_to_add[:]
                    raise AssertionError('No insert_manager set')
                if insert_manager is not record._manager:
                    raise AssertionError('insert_manager does not match'
                                         ' the current record, we cannot be positive'
                                         ' that the appropriate content was inserted.'
                                         )
                value = b"%d %d %d %d" % (block_start, block_length,
                                          record._start, record._end)
                nodes = [(record.key, value, (record.parents,))]
                # TODO: Consider buffering up many nodes to be added, not
                #       sure how much overhead this has, but we're seeing
                self._index.add_records(nodes, random_id=random_id)
                continue
            try:
                chunks = record.get_bytes_as('chunked')
            except errors.UnavailableRepresentation:
                adapter_key = record.storage_kind, 'chunked'
                adapter = get_adapter(adapter_key)
                chunks = adapter.get_bytes(record, 'chunked')
            chunks_len = record.size
            if chunks_len is None:
                chunks_len = sum(map(len, chunks))
            if len(record.key) > 1:
                prefix = record.key[0]
                soft = (prefix == last_prefix)
            else:
                prefix = None
                soft = False
            if max_fulltext_len < chunks_len:
                max_fulltext_len = chunks_len
                max_fulltext_prefix = prefix
            (found_sha1, start_point, end_point,
             type) = self._compressor.compress(
                record.key, chunks, chunks_len, record.sha1, soft=soft,
                nostore_sha=nostore_sha)
            # delta_ratio = float(chunks_len) / (end_point - start_point)
            # Check if we want to continue to include that text
            if (prefix == max_fulltext_prefix
                    and end_point < 2 * max_fulltext_len):
                # As long as we are on the same file_id, we will fill at least
                # 2 * max_fulltext_len
                start_new_block = False
            elif end_point > 4 * 1024 * 1024:
                start_new_block = True
            elif (prefix is not None and prefix != last_prefix
                    and end_point > 2 * 1024 * 1024):
                start_new_block = True
            else:
                start_new_block = False
            if start_new_block:
                self._compressor.pop_last()
                flush()
                max_fulltext_len = chunks_len
                (found_sha1, start_point, end_point,
                 type) = self._compressor.compress(
                    record.key, chunks, chunks_len, record.sha1)
            if record.key[-1] is None:
                key = record.key[:-1] + (b'sha1:' + found_sha1,)
            else:
                key = record.key
            self._unadded_refs[key] = record.parents
            yield found_sha1, chunks_len
            as_st = static_tuple.StaticTuple.from_sequence
            if record.parents is not None:
                parents = as_st([as_st(p) for p in record.parents])
            else:
                parents = None
            refs = static_tuple.StaticTuple(parents)
            keys_to_add.append(
                (key, b'%d %d' % (start_point, end_point), refs))
        if len(keys_to_add):
            flush()
        self._compressor = None
        # but we need to setup a list of records to visit.
        # we need key, position, length
        for key_idx, record in enumerate(self.get_record_stream(keys,
                                                                'unordered', True)):
            # XXX: todo - optimise to use less than full texts.
            key = record.key
            if pb is not None:
                pb.update('Walking content', key_idx, total)
            if record.storage_kind == 'absent':
                raise errors.RevisionNotPresent(key, self)
            for line in record.iter_bytes_as('lines'):
                yield line, key
        if pb is not None:
            pb.update('Walking content', total, total)

    def keys(self):
        """See VersionedFiles.keys."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "keys scales with size of history")
        sources = [self._index] + self._immediate_fallback_vfs
        result = set()
        for source in sources:
            result.update(source.keys())
        return result
class _GCBuildDetails(object):
    """A blob of data about the build details.

    This stores the minimal data, which then allows compatibility with the old
    api, without taking as much memory.
    """

    __slots__ = ('_index', '_group_start', '_group_end', '_basis_end',
                 '_delta_end', '_parents')

    method = 'group'
    compression_parent = None

    def __init__(self, parents, position_info):
        self._parents = parents
        (self._index, self._group_start, self._group_end, self._basis_end,
         self._delta_end) = position_info

    def __repr__(self):
        return '%s(%s, %s)' % (self.__class__.__name__,
                               self.index_memo, self._parents)

    @property
    def index_memo(self):
        return (self._index, self._group_start, self._group_end,
                self._basis_end, self._delta_end)

    @property
    def record_details(self):
        return static_tuple.StaticTuple(self.method, None)

    def __getitem__(self, offset):
        """Compatibility thunk to act like a tuple."""
        if offset == 0:
            return self.index_memo
        elif offset == 1:
            return self.compression_parent  # Always None
        elif offset == 2:
            return self._parents
        elif offset == 3:
            return self.record_details
        else:
            raise IndexError('offset out of range')
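
# Illustrative sketch of the tuple compatibility provided by __getitem__ above
# (the parents and offsets are hypothetical):
#
#   details = _GCBuildDetails(parents=((b'rev-1',),),
#                             position_info=(index, 0, 100, 0, 100))
#   details[0] == details.index_memo
#   details[2] == ((b'rev-1',),)
#   details[3] == details.record_details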
class _GCGraphIndex(object):
    """Mapper from GroupCompressVersionedFiles needs into GraphIndex storage."""

    def __init__(self, graph_index, is_locked, parents=True,
                 add_callback=None, track_external_parent_refs=False,
                 inconsistency_fatal=True, track_new_keys=False):
        """Construct a _GCGraphIndex on a graph_index.

        :param graph_index: An implementation of breezy.index.GraphIndex.
        :param is_locked: A callback, returns True if the index is locked and
        :param parents: If True, record knits parents, if not do not record

        :param keys: An iterable of keys.
        :return: A dict of key:
            (index_memo, compression_parent, parents, record_details).

            * index_memo: opaque structure to pass to read_records to extract
              the raw data
            * compression_parent: Content that this record is built upon, may
              be None
            * parents: Logical parents of this node
            * record_details: extra information about the content which needs
              to be passed to Factory.parse_record
        """
        self._check_read()