/brz/remove-bazaar : revision 6685

To get this branch, use:

bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to breezy/groupcompress.py

Committer: Breezy landing bot
Author(s): Martin
Date: 2017-06-11 01:56:34 UTC
mfrom: (6684.1.5 py3_bootstrap2)
Revision ID: breezy.the.bot@gmail.com-20170611015634-9eeh86thh073hcko

More progress towards Python 3 support

Merged from https://code.launchpad.net/~gz/brz/py3_bootstrap2/+merge/325452

files modified:
breezy/_chk_map_py.py

breezy/_chunks_to_lines_py.py

breezy/_dirstate_helpers_py.py

breezy/_groupcompress_py.py

breezy/bzrworkingtree.py

breezy/chk_map.py

breezy/commit.py

breezy/config.py

breezy/dirstate.py

breezy/groupcompress.py

breezy/index.py

breezy/inventory.py

breezy/inventory_delta.py

breezy/osutils.py

breezy/pack.py

breezy/repofmt/groupcompress_repo.py

breezy/repofmt/pack_repo.py

breezy/repository.py

breezy/revision.py

breezy/sixish.py

breezy/tests/test__chk_map.py

breezy/tests/test__chunks_to_lines.py

breezy/tests/test_inv.py

breezy/tests/test_inventory_delta.py

breezy/transport/memory.py

breezy/versionedfile.py

breezy/xml_serializer.py

Show diffs side-by-side

added added

removed removed

breezy/groupcompress.py

BATCH_SIZE = 2**16

# osutils.sha_string('')

_null_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'

_null_sha1 = b'da39a3ee5e6b4b0d3255bfef95601890afd80709'

def sort_gc_optimal(parent_map):

"""Sort and group the keys in parent_map into groupcompress order.

# properly grouped by file-id.

per_prefix_map = {}

for key, value in viewitems(parent_map):

if isinstance(key, str) or len(key) == 1:

prefix = ''

if isinstance(key, bytes) or len(key) == 1:

prefix = b''

else:

prefix = key[0]

try:

102

"""

103

104

# Group Compress Block v1 Zlib

105

GCB_HEADER = 'gcb1z\n'

105

GCB_HEADER = b'gcb1z\n'

106

# Group Compress Block v1 Lzma

107

GCB_LZ_HEADER = 'gcb1l\n'

107

GCB_LZ_HEADER = b'gcb1l\n'

108

GCB_KNOWN_HEADERS = (GCB_HEADER, GCB_LZ_HEADER)

109

110

def __init__(self):

141

# Expand the content if required

142

if self._content is None:

143

if self._content_chunks is not None:

144

self._content = ''.join(self._content_chunks)

144

self._content = b''.join(self._content_chunks)

145

self._content_chunks = None

146

if self._content is None:

147

# We join self._z_content_chunks here, because if we are

149

# chunk

150

if self._z_content_chunks is None:

151

raise AssertionError('No content to decompress')

152

z_content = ''.join(self._z_content_chunks)

153

if z_content == '':

154

self._content = ''

152

z_content = b''.join(self._z_content_chunks)

153

if z_content == b'':

154

self._content = b''

155

elif self._compressor_name == 'lzma':

156

# We don't do partial lzma decomp yet

157

import pylzma

201

# The stream is finished

202

self._z_content_decompressor = None

203

204

def _parse_bytes(self, bytes, pos):

204

def _parse_bytes(self, data, pos):

205

"""Read the various lengths from the header.

206

207

This also populates the various 'compressed' buffers.

211

# At present, we have 2 integers for the compressed and uncompressed

212

# content. In base10 (ascii) 14 bytes can represent > 1TB, so to avoid

213

# checking too far, cap the search to 14 bytes.

214

pos2 = bytes.index('\n', pos, pos + 14)

215

self._z_content_length = int(bytes[pos:pos2])

216

pos = pos2 + 1

217

pos2 = bytes.index('\n', pos, pos + 14)

218

self._content_length = int(bytes[pos:pos2])

219

pos = pos2 + 1

220

if len(bytes) != (pos + self._z_content_length):

214

pos2 = data.index(b'\n', pos, pos + 14)

215

self._z_content_length = int(data[pos:pos2])

216

pos = pos2 + 1

217

pos2 = data.index(b'\n', pos, pos + 14)

218

self._content_length = int(data[pos:pos2])

219

pos = pos2 + 1

220

if len(data) != (pos + self._z_content_length):

221

# XXX: Define some GCCorrupt error ?

222

raise AssertionError('Invalid bytes: (%d) != %d + %d' %

223

(len(bytes), pos, self._z_content_length))

224

self._z_content_chunks = (bytes[pos:],)

223

(len(data), pos, self._z_content_length))

224

self._z_content_chunks = (data[pos:],)

225

226

@property

227

def _z_content(self):

230

Meant only to be used by the test suite.

231

"""

232

if self._z_content_chunks is not None:

233

return ''.join(self._z_content_chunks)

233

return b''.join(self._z_content_chunks)

234

return None

235

236

@classmethod

257

:return: The bytes for the content

258

"""

259

if start == end == 0:

260

return ''

260

return b''

261

self._ensure_content(end)

262

# The bytes are 'f' or 'd' for the type, then a variable-length

263

# base128 integer for the content size, then the actual content

264

# We know that the variable-length integer won't be longer than 5

265

# bytes (it takes 5 bytes to encode 2^32)

266

c = self._content[start]

267

if c == 'f':

267

if c == b'f':

268

type = 'fulltext'

269

else:

270

if c != 'd':

270

if c != b'd':

271

raise ValueError('Unknown content control code: %s'

272

% (c,))

273

type = 'delta'

277

if end != content_start + content_len:

278

raise ValueError('end != len according to field header'

279

' %s != %s' % (end, content_start + content_len))

280

if c == 'f':

281

bytes = self._content[content_start:end]

282

elif c == 'd':

283

bytes = apply_delta_to_source(self._content, content_start, end)

284

return bytes

280

if c == b'f':

281

return self._content[content_start:end]

282

# Must be type delta as checked above

283

return apply_delta_to_source(self._content, content_start, end)

285

284

286

285

def set_chunked_content(self, content_chunks, length):

287

286

"""Set the content of this block to the given chunks."""

324

323

"""Create the byte stream as a series of 'chunks'"""

325

324

self._create_z_content()

326

325

header = self.GCB_HEADER

327

chunks = ['%s%d\n%d\n'

326

chunks = [b'%s%d\n%d\n'

328

327

% (header, self._z_content_length, self._content_length),

329

328

]

330

329

chunks.extend(self._z_content_chunks)

334

333

def to_bytes(self):

335

334

"""Encode the information into a byte stream."""

336

335

total_len, chunks = self.to_chunks()

337

return ''.join(chunks)

336

return b''.join(chunks)

338

337

339

338

def _dump(self, include_text=False):

340

339

"""Take this block, and spit out a human-readable structure.

352

351

while pos < self._content_length:

353

352

kind = self._content[pos]

354

353

pos += 1

355

if kind not in ('f', 'd'):

354

if kind not in (b'f', b'd'):

356

355

raise ValueError('invalid kind character: %r' % (kind,))

357

356

content_len, len_len = decode_base128_int(

358

357

self._content[pos:pos + 5])

360

359

if content_len + pos > self._content_length:

361

360

raise ValueError('invalid content_len %d for record @ pos %d'

362

361

% (content_len, pos - len_len - 1))

363

if kind == 'f': # Fulltext

362

if kind == b'f': # Fulltext

364

363

if include_text:

365

364

text = self._content[pos:pos+content_len]

366

result.append(('f', content_len, text))

365

result.append((b'f', content_len, text))

367

366

else:

368

result.append(('f', content_len))

369

elif kind == 'd': # Delta

367

result.append((b'f', content_len))

368

elif kind == b'd': # Delta

370

369

delta_content = self._content[pos:pos+content_len]

371

370

delta_info = []

372

371

# The first entry in a delta is the decompressed length

373

372

decomp_len, delta_pos = decode_base128_int(delta_content)

374

result.append(('d', content_len, decomp_len, delta_info))

373

result.append((b'd', content_len, decomp_len, delta_info))

375

374

measured_len = 0

376

375

while delta_pos < content_len:

377

376

c = ord(delta_content[delta_pos])

382

381

delta_pos)

383

382

if include_text:

384

383

text = self._content[offset:offset+length]

385

delta_info.append(('c', offset, length, text))

384

delta_info.append((b'c', offset, length, text))

386

385

else:

387

delta_info.append(('c', offset, length))

386

delta_info.append((b'c', offset, length))

388

387

measured_len += length

389

388

else: # Insert

390

389

if include_text:

391

390

txt = delta_content[delta_pos:delta_pos+c]

392

391

else:

393

392

txt = ''

394

delta_info.append(('i', c, txt))

393

delta_info.append((b'i', c, txt))

395

394

measured_len += c

396

395

delta_pos += c

397

396

if delta_pos != content_len:

447

446

# wire bytes, something...

448

447

return self._manager._wire_bytes()

449

448

else:

450

return ''

449

return b''

451

450

if storage_kind in ('fulltext', 'chunked'):

452

451

if self._bytes is None:

453

452

# Grab and cache the raw bytes for this entry

842

841

if sha1 == nostore_sha:

843

842

raise errors.ExistingContent()

844

843

if key[-1] is None:

845

key = key[:-1] + ('sha1:' + sha1,)

844

# GZ 2017-06-10: Seems perverse to have to encode here.

845

sha1 = sha1.encode('ascii')

846

key = key[:-1] + (b'sha1:' + sha1,)

846

847

848

start, end, type = self._compress(key, bytes, len(bytes) / 2, soft)

848

849

return sha1, start, end, type

875

876

(start_byte, start_chunk, end_byte, end_chunk) = self.labels_deltas[key]

876

877

delta_chunks = self.chunks[start_chunk:end_chunk]

877

878

stored_bytes = ''.join(delta_chunks)

878

if stored_bytes[0] == 'f':

879

if stored_bytes[0] == b'f':

879

880

fulltext_len, offset = decode_base128_int(stored_bytes[1:10])

880

881

data_len = fulltext_len + 1 + offset

881

882

if data_len != len(stored_bytes):

947

948

if delta_length > max_delta_size:

948

949

# The delta is longer than the fulltext, insert a fulltext

949

950

type = 'fulltext'

950

out_lines = ['f', encode_base128_int(input_len)]

951

out_lines = [b'f', encode_base128_int(input_len)]

951

952

out_lines.extend(new_lines)

952

953

index_lines = [False, False]

953

954

index_lines.extend([True] * len(new_lines))

954

955

else:

955

956

# this is a worthy delta, output it

956

957

type = 'delta'

957

out_lines[0] = 'd'

958

out_lines[0] = b'd'

958

959

# Update the delta_length to include those two encoded integers

959

960

out_lines[1] = encode_base128_int(delta_length)

960

961

# Before insertion

1014

1015

enc_length = encode_base128_int(len(bytes))

1015

1016

len_mini_header = 1 + len(enc_length)

1016

1017

self._delta_index.add_source(bytes, len_mini_header)

1017

new_chunks = ['f', enc_length, bytes]

1018

new_chunks = [b'f', enc_length, bytes]

1018

1019

else:

1019

1020

type = 'delta'

1020

1021

enc_length = encode_base128_int(len(delta))

1021

1022

len_mini_header = 1 + len(enc_length)

1022

new_chunks = ['d', enc_length, delta]

1023

new_chunks = [b'd', enc_length, delta]

1023

1024

self._delta_index.add_delta_source(delta, len_mini_header)

1024

1025

# Before insertion

1025

1026

start = self.endpoint

1715

1716

# the fulltext content at this point. Note that sometimes we

1716

1717

# will want it later (streaming CHK pages), but most of the

1717

1718

# time we won't (everything else)

1718

bytes = ''.join(chunks)

1719

data = b''.join(chunks)

1719

1720

del chunks

1720

1721

index, start, length = self._access.add_raw_records(

1721

[(None, len(bytes))], bytes)[0]

1722

[(None, len(data))], data)[0]

1722

1723

nodes = []

1723

1724

for key, reads, refs in keys_to_add:

1724

nodes.append((key, "%d %d %s" % (start, length, reads), refs))

1725

nodes.append((key, b"%d %d %s" % (start, length, reads), refs))

1725

1726

self._index.add_records(nodes, random_id=random_id)

1726

1727

self._unadded_refs = {}

1727

1728

del keys_to_add[:]

1777

1778

' the current record, we cannot be positive'

1778

1779

' that the appropriate content was inserted.'

1779

1780

)

1780

value = "%d %d %d %d" % (block_start, block_length,

1781

value = b"%d %d %d %d" % (block_start, block_length,

1781

1782

record._start, record._end)

1782

1783

nodes = [(record.key, value, (record.parents,))]

1783

1784

# TODO: Consider buffering up many nodes to be added, not

1827

1828

type) = self._compressor.compress(record.key, bytes,

1828

1829

record.sha1)

1829

1830

if record.key[-1] is None:

1830

key = record.key[:-1] + ('sha1:' + found_sha1,)

1831

key = record.key[:-1] + (b'sha1:' + found_sha1,)

1831

1832

else:

1832

1833

key = record.key

1833

1834

self._unadded_refs[key] = record.parents

1838

1839

else:

1839

1840

parents = None

1840

1841

refs = static_tuple.StaticTuple(parents)

1841

keys_to_add.append((key, '%d %d' % (start_point, end_point), refs))

1842

keys_to_add.append(

1843

(key, b'%d %d' % (start_point, end_point), refs))

1842

1844

if len(keys_to_add):

1843

1845

flush()

1844

1846

self._compressor = None

Older »