1
# pack.py -- For dealing wih packed git objects.
2
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
3
# Copryight (C) 2008 Jelmer Vernooij <jelmer@samba.org>
4
# The code is loosely based on that in the sha1_file.c file from git itself,
5
# which is Copyright (C) Linus Torvalds, 2005 and distributed under the
8
# This program is free software; you can redistribute it and/or
9
# modify it under the terms of the GNU General Public License
10
# as published by the Free Software Foundation; version 2
13
# This program is distributed in the hope that it will be useful,
14
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
# GNU General Public License for more details.
18
# You should have received a copy of the GNU General Public License
19
# along with this program; if not, write to the Free Software
20
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23
"""Classes for dealing with packed git objects.
25
A pack is a compact representation of a bunch of objects, stored
26
using deltas where possible.
28
They have two parts, the pack file, which stores the data, and an index
29
that tells you where the data is.
31
To find an object you look in all of the index files 'til you find a
32
match for the object name. You then use the pointer got from this as
33
a pointer in to the corresponding packfile.
36
from collections import defaultdict
38
from itertools import imap, izip
49
from errors import ApplyDeltaError
51
supports_mmap_offset = (sys.version_info[0] >= 3 or
52
(sys.version_info[0] == 2 and sys.version_info[1] >= 6))
55
def take_msb_bytes(map, offset):
57
while len(ret) == 0 or ret[-1] & 0x80:
58
ret.append(ord(map[offset]))
63
def read_zlib(data, offset, dec_size):
64
obj = zlib.decompressobj()
67
while obj.unused_data == "":
69
add = data[base:base+1024]
71
x += obj.decompress(add)
72
assert len(x) == dec_size
73
comp_len = fed-len(obj.unused_data)
81
return sha.hexdigest()
85
"""Convert a hex string to a binary sha string."""
87
for i in range(0, len(hex), 2):
88
ret += chr(int(hex[i:i+2], 16))
93
"""Convert a binary sha string to a hex sha string."""
96
ret += "%02x" % ord(i)
100
MAX_MMAP_SIZE = 256 * 1024 * 1024
102
def simple_mmap(f, offset, size, access=mmap.ACCESS_READ):
103
"""Simple wrapper for mmap() which always supports the offset parameter.
105
:param f: File object.
106
:param offset: Offset in the file, from the beginning of the file.
107
:param size: Size of the mmap'ed area
108
:param access: Access mechanism.
109
:return: MMAP'd area.
111
if offset+size > MAX_MMAP_SIZE and not supports_mmap_offset:
112
raise AssertionError("%s is larger than 256 meg, and this version "
113
"of Python does not support the offset argument to mmap().")
114
if supports_mmap_offset:
115
return mmap.mmap(f.fileno(), size, access=access, offset=offset)
117
class ArraySkipper(object):
119
def __init__(self, array, offset):
123
def __getslice__(self, i, j):
124
return self.array[i+self.offset:j+self.offset]
126
def __getitem__(self, i):
127
return self.array[i+self.offset]
130
return len(self.array) - self.offset
133
return str(self.array[self.offset:])
135
mem = mmap.mmap(f.fileno(), size+offset, access=access)
138
return ArraySkipper(mem, offset)
141
def resolve_object(offset, type, obj, get_ref, get_offset):
142
"""Resolve an object, possibly resolving deltas when necessary."""
143
if not type in (6, 7): # Not a delta
146
if type == 6: # offset delta
147
(delta_offset, delta) = obj
148
assert isinstance(delta_offset, int)
149
assert isinstance(delta, str)
150
offset = offset-delta_offset
151
type, base_obj = get_offset(offset)
152
assert isinstance(type, int)
153
elif type == 7: # ref delta
154
(basename, delta) = obj
155
assert isinstance(basename, str) and len(basename) == 20
156
assert isinstance(delta, str)
157
type, base_obj = get_ref(basename)
158
assert isinstance(type, int)
159
type, base_text = resolve_object(offset, type, base_obj, get_ref, get_offset)
160
return type, apply_delta(base_text, delta)
163
class PackIndex(object):
164
"""An index in to a packfile.
166
Given a sha id of an object a pack index can tell you the location in the
167
packfile of that object if it has it.
169
To do the loop it opens the file, and indexes first 256 4 byte groups
170
with the first byte of the sha id. The value in the four byte group indexed
171
is the end of the group that shares the same starting byte. Subtract one
172
from the starting byte and index again to find the start of the group.
173
The values are sorted by sha id within the group, so do the math to find
174
the start and end offset and then bisect in to find if the value is present.
177
def __init__(self, filename):
178
"""Create a pack index object.
180
Provide it with the name of the index file to consider, and it will map
181
it whenever required.
183
self._filename = filename
184
# Take the size now, so it can be checked each time we map the file to
185
# ensure that it hasn't changed.
186
self._size = os.path.getsize(filename)
187
self._file = open(filename, 'r')
188
self._contents = simple_mmap(self._file, 0, self._size)
189
if self._contents[:4] != '\377tOc':
191
self._fan_out_table = self._read_fan_out_table(0)
193
(self.version, ) = struct.unpack_from(">L", self._contents, 4)
194
assert self.version in (2,), "Version was %d" % self.version
195
self._fan_out_table = self._read_fan_out_table(8)
196
self._name_table_offset = 8 + 0x100 * 4
197
self._crc32_table_offset = self._name_table_offset + 20 * len(self)
198
self._pack_offset_table_offset = self._crc32_table_offset + 4 * len(self)
200
def __eq__(self, other):
201
if type(self) != type(other):
204
if self._fan_out_table != other._fan_out_table:
207
for (name1, _, _), (name2, _, _) in izip(self.iterentries(), other.iterentries()):
216
"""Return the number of entries in this pack index."""
217
return self._fan_out_table[-1]
219
def _unpack_entry(self, i):
220
"""Unpack the i-th entry in the index file.
222
:return: Tuple with object name (SHA), offset in pack file and
223
CRC32 checksum (if known)."""
224
if self.version == 1:
225
(offset, name) = struct.unpack_from(">L20s", self._contents,
226
(0x100 * 4) + (i * 24))
227
return (name, offset, None)
229
return (self._unpack_name(i), self._unpack_offset(i),
230
self._unpack_crc32_checksum(i))
232
def _unpack_name(self, i):
233
if self.version == 1:
234
return self._unpack_entry(i)[0]
236
return struct.unpack_from("20s", self._contents,
237
self._name_table_offset + i * 20)[0]
239
def _unpack_offset(self, i):
240
if self.version == 1:
241
return self._unpack_entry(i)[1]
243
return struct.unpack_from(">L", self._contents,
244
self._pack_offset_table_offset + i * 4)[0]
246
def _unpack_crc32_checksum(self, i):
247
if self.version == 1:
250
return struct.unpack_from(">L", self._contents,
251
self._crc32_table_offset + i * 4)[0]
254
return imap(sha_to_hex, self._itersha())
257
for i in range(len(self)):
258
yield self._unpack_name(i)
260
def objects_sha1(self):
261
return iter_sha1(self._itersha())
263
def iterentries(self):
264
"""Iterate over the entries in this pack index.
266
Will yield tuples with object name, offset in packfile and crc32 checksum.
268
for i in range(len(self)):
269
yield self._unpack_entry(i)
271
def _read_fan_out_table(self, start_offset):
273
for i in range(0x100):
274
ret.append(struct.unpack(">L", self._contents[start_offset+i*4:start_offset+(i+1)*4])[0])
278
"""Check that the stored checksum matches the actual checksum."""
279
return self.calculate_checksum() == self.get_stored_checksums()[1]
281
def calculate_checksum(self):
282
f = open(self._filename, 'r')
284
return hashlib.sha1(self._contents[:-20]).digest()
288
def get_stored_checksums(self):
289
"""Return the SHA1 checksums stored for the corresponding packfile and
290
this header file itself."""
291
return str(self._contents[-40:-20]), str(self._contents[-20:])
293
def object_index(self, sha):
294
"""Return the index in to the corresponding packfile for the object.
296
Given the name of an object it will return the offset that object lives
297
at within the corresponding pack file. If the pack file doesn't have the
298
object then None will be returned.
300
size = os.path.getsize(self._filename)
301
assert size == self._size, "Pack index %s has changed size, I don't " \
302
"like that" % self._filename
304
sha = hex_to_sha(sha)
305
return self._object_index(sha)
307
def _object_index(self, sha):
308
"""See object_index"""
313
start = self._fan_out_table[idx-1]
314
end = self._fan_out_table[idx]
318
file_sha = self._unpack_name(i)
324
return self._unpack_offset(i)
328
def read_pack_header(f):
330
assert header[:4] == "PACK"
331
(version,) = struct.unpack_from(">L", header, 4)
332
assert version in (2, 3), "Version was %d" % version
333
(num_objects,) = struct.unpack_from(">L", header, 8)
334
return (version, num_objects)
337
def read_pack_tail(f):
341
def unpack_object(map):
342
bytes = take_msb_bytes(map, 0)
343
type = (bytes[0] >> 4) & 0x07
344
size = bytes[0] & 0x0f
345
for i, byte in enumerate(bytes[1:]):
346
size += (byte & 0x7f) << ((i * 7) + 4)
347
raw_base = len(bytes)
348
if type == 6: # offset delta
349
bytes = take_msb_bytes(map, raw_base)
350
assert not (bytes[-1] & 0x80)
351
delta_base_offset = bytes[0] & 0x7f
352
for byte in bytes[1:]:
353
delta_base_offset += 1
354
delta_base_offset <<= 7
355
delta_base_offset += (byte & 0x7f)
357
uncomp, comp_len = read_zlib(map, raw_base, size)
358
assert size == len(uncomp)
359
return type, (delta_base_offset, uncomp), comp_len+raw_base
360
elif type == 7: # ref delta
361
basename = map[raw_base:raw_base+20]
362
uncomp, comp_len = read_zlib(map, raw_base+20, size)
363
assert size == len(uncomp)
364
return type, (basename, uncomp), comp_len+raw_base+20
366
uncomp, comp_len = read_zlib(map, raw_base, size)
367
assert len(uncomp) == size
368
return type, uncomp, comp_len+raw_base
371
class PackData(object):
372
"""The data contained in a packfile.
374
Pack files can be accessed both sequentially for exploding a pack, and
375
directly with the help of an index to retrieve a specific object.
377
The objects within are either complete or a delta aginst another.
379
The header is variable length. If the MSB of each byte is set then it
380
indicates that the subsequent byte is still part of the header.
381
For the first byte the next MS bits are the type, which tells you the type
382
of object, and whether it is a delta. The LS byte is the lowest bits of the
383
size. For each subsequent byte the LS 7 bits are the next MS bits of the
384
size, i.e. the last byte of the header contains the MS bits of the size.
386
For the complete objects the data is stored as zlib deflated data.
387
The size in the header is the uncompressed object size, so to uncompress
388
you need to just keep feeding data to zlib until you get an object back,
389
or it errors on bad data. This is done here by just giving the complete
390
buffer from the start of the deflated object on. This is bad, but until I
391
get mmap sorted out it will have to do.
393
Currently there are no integrity checks done. Also no attempt is made to try
394
and detect the delta case, or a request for an object at the wrong position.
395
It will all just throw a zlib or KeyError.
398
def __init__(self, filename):
399
"""Create a PackData object that represents the pack in the given filename.
401
The file must exist and stay readable until the object is disposed of. It
402
must also stay the same size. It will be mapped whenever needed.
404
Currently there is a restriction on the size of the pack as the python
405
mmap implementation is flawed.
407
self._filename = filename
408
assert os.path.exists(filename), "%s is not a packfile" % filename
409
self._size = os.path.getsize(filename)
410
self._header_size = 12
411
assert self._size >= self._header_size, "%s is too small for a packfile" % filename
414
def _read_header(self):
415
f = open(self._filename, 'rb')
417
(version, self._num_objects) = \
419
f.seek(self._size-20)
420
(self._stored_checksum,) = read_pack_tail(f)
425
"""Returns the number of objects in this pack."""
426
return self._num_objects
428
def calculate_checksum(self):
429
f = open(self._filename, 'rb')
431
map = simple_mmap(f, 0, self._size)
432
return hashlib.sha1(map[:-20]).digest()
436
def iterobjects(self):
437
offset = self._header_size
438
f = open(self._filename, 'rb')
439
for i in range(len(self)):
440
map = simple_mmap(f, offset, self._size-offset)
441
(type, obj, total_size) = unpack_object(map)
442
yield offset, type, obj
446
def iterentries(self, ext_resolve_ref=None):
449
postponed = defaultdict(list)
450
class Postpone(Exception):
451
"""Raised to postpone delta resolving."""
453
def get_ref_text(sha):
458
return ext_resolve_ref(sha)
461
raise Postpone, (sha, )
462
todo = list(self.iterobjects())
464
(offset, type, obj) = todo.pop(0)
465
at[offset] = (type, obj)
466
assert isinstance(offset, int)
467
assert isinstance(type, int)
468
assert isinstance(obj, tuple) or isinstance(obj, str)
470
type, obj = resolve_object(offset, type, obj, get_ref_text,
472
except Postpone, (sha, ):
473
postponed[sha].append((offset, type, obj))
475
shafile = ShaFile.from_raw_string(type, obj)
476
sha = shafile.sha().digest()
477
found[sha] = (type, obj)
478
yield sha, offset, shafile.crc32()
479
todo += postponed.get(sha, [])
481
raise KeyError([sha_to_hex(h) for h in postponed.keys()])
483
def sorted_entries(self, resolve_ext_ref=None):
484
ret = list(self.iterentries(resolve_ext_ref))
488
def create_index_v1(self, filename):
489
entries = self.sorted_entries()
490
write_pack_index_v1(filename, entries, self.calculate_checksum())
492
def create_index_v2(self, filename):
493
entries = self.sorted_entries()
494
write_pack_index_v2(filename, entries, self.calculate_checksum())
496
def get_stored_checksum(self):
497
return self._stored_checksum
500
return (self.calculate_checksum() == self.get_stored_checksum())
502
def get_object_at(self, offset):
503
"""Given an offset in to the packfile return the object that is there.
505
Using the associated index the location of an object can be looked up, and
506
then the packfile can be asked directly for that object using this
509
assert isinstance(offset, long) or isinstance(offset, int),\
510
"offset was %r" % offset
511
assert offset >= self._header_size
512
size = os.path.getsize(self._filename)
513
assert size == self._size, "Pack data %s has changed size, I don't " \
514
"like that" % self._filename
515
f = open(self._filename, 'rb')
517
map = simple_mmap(f, offset, size-offset)
518
return unpack_object(map)[:2]
523
class SHA1Writer(object):
525
def __init__(self, f):
527
self.sha1 = hashlib.sha1("")
529
def write(self, data):
530
self.sha1.update(data)
534
sha = self.sha1.digest()
535
assert len(sha) == 20
540
sha = self.write_sha()
548
def write_pack_object(f, type, object):
549
"""Write pack object to a file.
551
:param f: File to write to
552
:param o: Object to write
555
if type == 6: # ref delta
556
(delta_base_offset, object) = object
557
elif type == 7: # offset delta
558
(basename, object) = object
560
c = (type << 4) | (size & 15)
563
f.write(chr(c | 0x80))
567
if type == 6: # offset delta
568
ret = [delta_base_offset & 0x7f]
569
delta_base_offset >>= 7
570
while delta_base_offset:
571
delta_base_offset -= 1
572
ret.insert(0, 0x80 | (delta_base_offset & 0x7f))
573
delta_base_offset >>= 7
574
f.write("".join([chr(x) for x in ret]))
575
elif type == 7: # ref delta
576
assert len(basename) == 20
578
f.write(zlib.compress(object))
582
def write_pack(filename, objects, num_objects):
583
f = open(filename + ".pack", 'w')
585
entries, data_sum = write_pack_data(f, objects, num_objects)
589
write_pack_index_v2(filename + ".idx", entries, data_sum)
592
def write_pack_data(f, objects, num_objects):
593
"""Write a new pack file.
595
:param filename: The filename of the new pack file.
596
:param objects: List of objects to write.
597
:return: List with (name, offset, crc32 checksum) entries, pack checksum
601
f.write("PACK") # Pack header
602
f.write(struct.pack(">L", 2)) # Pack version
603
f.write(struct.pack(">L", num_objects)) # Number of objects in pack
605
sha1 = o.sha().digest()
608
t, o = o.as_raw_string()
609
offset = write_pack_object(f, t, o)
610
entries.append((sha1, offset, crc32))
611
return entries, f.write_sha()
614
def write_pack_index_v1(filename, entries, pack_checksum):
615
"""Write a new pack index file.
617
:param filename: The filename of the new pack index file.
618
:param entries: List of tuples with object name (sha), offset_in_pack, and
620
:param pack_checksum: Checksum of the pack file.
622
f = open(filename, 'w')
624
fan_out_table = defaultdict(lambda: 0)
625
for (name, offset, entry_checksum) in entries:
626
fan_out_table[ord(name[0])] += 1
628
for i in range(0x100):
629
f.write(struct.pack(">L", fan_out_table[i]))
630
fan_out_table[i+1] += fan_out_table[i]
631
for (name, offset, entry_checksum) in entries:
632
f.write(struct.pack(">L20s", offset, name))
633
assert len(pack_checksum) == 20
634
f.write(pack_checksum)
638
def apply_delta(src_buf, delta):
639
"""Based on the similar function in git's patch-delta.c."""
640
assert isinstance(src_buf, str), "was %r" % (src_buf,)
641
assert isinstance(delta, str)
646
return ord(ret), delta
647
def get_delta_header_size(delta):
651
cmd, delta = pop(delta)
652
size |= (cmd & ~0x80) << i
657
src_size, delta = get_delta_header_size(delta)
658
dest_size, delta = get_delta_header_size(delta)
659
assert src_size == len(src_buf)
661
cmd, delta = pop(delta)
666
x, delta = pop(delta)
667
cp_off |= x << (i * 8)
670
if cmd & (1 << (4+i)):
671
x, delta = pop(delta)
672
cp_size |= x << (i * 8)
675
if (cp_off + cp_size < cp_size or
676
cp_off + cp_size > src_size or
677
cp_size > dest_size):
679
out += src_buf[cp_off:cp_off+cp_size]
684
raise ApplyDeltaError("Invalid opcode 0")
687
raise ApplyDeltaError("delta not empty: %r" % delta)
689
if dest_size != len(out):
690
raise ApplyDeltaError("dest size incorrect")
695
def write_pack_index_v2(filename, entries, pack_checksum):
696
"""Write a new pack index file.
698
:param filename: The filename of the new pack index file.
699
:param entries: List of tuples with object name (sha), offset_in_pack, and
701
:param pack_checksum: Checksum of the pack file.
703
f = open(filename, 'w')
705
f.write('\377tOc') # Magic!
706
f.write(struct.pack(">L", 2))
707
fan_out_table = defaultdict(lambda: 0)
708
for (name, offset, entry_checksum) in entries:
709
fan_out_table[ord(name[0])] += 1
711
for i in range(0x100):
712
f.write(struct.pack(">L", fan_out_table[i]))
713
fan_out_table[i+1] += fan_out_table[i]
714
for (name, offset, entry_checksum) in entries:
716
for (name, offset, entry_checksum) in entries:
717
f.write(struct.pack(">l", entry_checksum))
718
for (name, offset, entry_checksum) in entries:
719
# FIXME: handle if MSBit is set in offset
720
f.write(struct.pack(">L", offset))
721
# FIXME: handle table for pack files > 8 Gb
722
assert len(pack_checksum) == 20
723
f.write(pack_checksum)
729
def __init__(self, basename):
730
self._basename = basename
731
self._data_path = self._basename + ".pack"
732
self._idx_path = self._basename + ".idx"
737
return self.idx.objects_sha1()
741
if self._data is None:
742
self._data = PackData(self._data_path)
743
assert len(self.idx) == len(self._data)
744
assert self.idx.get_stored_checksums()[0] == self._data.get_stored_checksum()
749
if self._idx is None:
750
self._idx = PackIndex(self._idx_path)
754
if self._data is not None:
758
def __eq__(self, other):
759
return type(self) == type(other) and self.idx == other.idx
762
"""Number of entries in this pack."""
766
return "Pack(%r)" % self._basename
769
"""Iterate over all the sha1s of the objects in this pack."""
770
return iter(self.idx)
773
return self.idx.check() and self.data.check()
775
def get_stored_checksum(self):
776
return self.data.get_stored_checksum()
778
def __contains__(self, sha1):
779
"""Check whether this pack contains a particular SHA1."""
780
return (self.idx.object_index(sha1) is not None)
782
def get_raw(self, sha1, resolve_ref=None):
783
if resolve_ref is None:
784
resolve_ref = self.get_raw
785
offset = self.idx.object_index(sha1)
789
type, obj = self.data.get_object_at(offset)
790
assert isinstance(offset, int)
791
return resolve_object(offset, type, obj, resolve_ref,
792
self.data.get_object_at)
794
def __getitem__(self, sha1):
795
"""Retrieve the specified SHA1."""
796
type, uncomp = self.get_raw(sha1)
797
return ShaFile.from_raw_string(type, uncomp)
799
def iterobjects(self):
800
for offset, type, obj in self.data.iterobjects():
801
assert isinstance(offset, int)
802
yield ShaFile.from_raw_string(
803
*resolve_object(offset, type, obj, self.get_raw,
804
self.data.get_object_at))
807
def load_packs(path):
808
if not os.path.exists(path):
810
for name in os.listdir(path):
811
if name.startswith("pack-") and name.endswith(".pack"):
812
yield Pack(os.path.join(path, name[:-len(".pack")]))