bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
0.211.9
by James Westby
Add some basic pack handling code. |
1 |
# pack.py -- For dealing with packed git objects.
|
2 |
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
|
|
3 |
# The code is loosely based on that in the sha1_file.c file from git itself,
|
|
4 |
# which is Copyright (C) Linus Torvalds, 2005 and distributed under the
|
|
5 |
# GPL version 2.
|
|
6 |
#
|
|
7 |
# This program is free software; you can redistribute it and/or
|
|
8 |
# modify it under the terms of the GNU General Public License
|
|
9 |
# as published by the Free Software Foundation; version 2
|
|
10 |
# of the License.
|
|
11 |
#
|
|
12 |
# This program is distributed in the hope that it will be useful,
|
|
13 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 |
# GNU General Public License for more details.
|
|
16 |
#
|
|
17 |
# You should have received a copy of the GNU General Public License
|
|
18 |
# along with this program; if not, write to the Free Software
|
|
19 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
20 |
# MA 02110-1301, USA.
|
|
21 |
||
22 |
"""Classes for dealing with packed git objects.
|
|
23 |
||
24 |
A pack is a compact representation of a bunch of objects, stored
|
|
25 |
using deltas where possible.
|
|
26 |
||
27 |
They have two parts, the pack file, which stores the data, and an index
|
|
28 |
that tells you where the data is.
|
|
29 |
||
30 |
To find an object you look in each of the index files until you find a
match for the object name. You then use the offset obtained from the
index as a pointer into the corresponding packfile.
|
|
33 |
"""
|
|
34 |
||
35 |
import mmap |
|
36 |
import os |
|
37 |
||
38 |
from objects import (ShaFile, |
|
39 |
_decompress, |
|
40 |
)
|
|
41 |
||
42 |
def hex_to_sha(hex):
    """Convert a hexadecimal string to the integer it represents.

    :param hex: a string of hex digits (e.g. a sha1 in its 40-character
        hex form, or any prefix of one).
    :return: the integer value of the digits; 0 for the empty string.
    """
    # int() with base 16 replaces the old hand-rolled digit-by-digit
    # accumulator; it also accepts upper-case digits, which the old
    # mapping-table version rejected with a KeyError.
    if not hex:
        # Preserve the old behaviour: an empty loop accumulated 0.
        return 0
    return int(hex, 16)
|
51 |
||
52 |
def multi_ord(map, start, count):
    """Read *count* consecutive characters of *map* beginning at *start*
    and combine them into a single big-endian unsigned integer.
    """
    result = 0
    for pos in range(start, start + count):
        result = (result << 8) | ord(map[pos])
    return result
|
57 |
||
58 |
# Refuse to mmap any pack or index file larger than this (256 MiB); this
# module maps whole files at once, so very large files are rejected by
# the size assertions that consult this constant.
max_size = 256 * 1024 * 1024
|
59 |
||
60 |
class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.

    To do the lookup it opens the file, and indexes first 256 4 byte groups
    with the first byte of the sha id. The value in the four byte group indexed
    is the end of the group that shares the same starting byte. Subtract one
    from the starting byte and index again to find the start of the group.
    The values are sorted by sha id within the group, so do the math to find
    the start and end offset and then bisect in to find if the value is
    present.
    """

    # Layout constants for the version-1 pack index format: a 256-entry
    # fan-out table of 4-byte counts, followed by records of
    # (4-byte pack offset, 20-byte sha) sorted by sha.
    header_record_size = 4
    header_size = 256 * header_record_size
    index_size = 4
    sha_bytes = 20
    record_size = sha_bytes + index_size

    def __init__(self, filename):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will
        map it whenever required.
        """
        self._filename = filename
        assert os.path.exists(filename), "%s is not a pack index" % filename
        # Take the size now, so it can be checked each time we map the file to
        # ensure that it hasn't changed.
        self._size = os.path.getsize(filename)
        assert self._size > self.header_size, \
            "%s is too small to be a packfile" % filename
        assert self._size < max_size, "%s is larger than 256 meg, and it " \
            "might not be a good idea to mmap it. If you want to go ahead " \
            "delete this check, or get python to support mmap offsets so that " \
            "I can map the files sensibly"

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object it will return the offset that object
        lives at within the corresponding pack file. If the pack file doesn't
        have the object then None will be returned.
        """
        size = os.path.getsize(self._filename)
        assert size == self._size, "Pack index %s has changed size, I don't " \
            "like that" % self._filename
        f = open(self._filename, 'rb')
        try:
            map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
            return self._object_index(map, sha)
        finally:
            f.close()

    def _object_index(self, map, hexsha):
        """See object_index"""
        first_byte = hex_to_sha(hexsha[:2])
        header_offset = self.header_record_size * first_byte
        # fan-out[b] counts the records whose sha starts with a byte <= b,
        # so the group for first_byte occupies record indices
        # [fan-out[b-1], fan-out[b]). For byte 0 there is no previous
        # entry and the group starts at 0 -- indexing at -4 would wrongly
        # read the last four bytes of the file.
        if first_byte == 0:
            start = 0
        else:
            start = multi_ord(map, header_offset - self.header_record_size,
                              self.header_record_size)
        end = multi_ord(map, header_offset, self.header_record_size)
        sha = hex_to_sha(hexsha)
        # Binary search the sorted records in [start, end) for the sha.
        while start < end:
            # Floor division: start/end are record indices, and / would
            # produce a float on Python 3.
            i = (start + end) // 2
            offset = self.header_size + (i * self.record_size)
            file_sha = multi_ord(map, offset + self.index_size, self.sha_bytes)
            if file_sha == sha:
                return multi_ord(map, offset, self.index_size)
            elif file_sha < sha:
                # Narrow by record index, not byte offset; the old code
                # assigned the byte offset here, breaking the search for
                # any group with more than one record.
                start = i + 1
            else:
                end = i
        return None
|
133 |
||
134 |
||
135 |
class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS byte is the lowest bits of the
    size. For each subsequent byte the LS 7 bits are the next MS bits of the
    size, i.e. the last byte of the header contains the MS bits of the size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to try
    and detect the delta case, or a request for an object at the wrong position.
    It will all just throw a zlib or KeyError.
    """

    def __init__(self, filename):
        """Create a PackData object that represents the pack in the given filename.

        The file must exist and stay readable until the object is disposed of. It
        must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.
        """
        self._filename = filename
        assert os.path.exists(filename), "%s is not a packfile" % filename
        # Record the size now so each later map can verify the file has not
        # changed underneath us.
        self._size = os.path.getsize(filename)
        assert self._size < max_size, "%s is larger than 256 meg, and it " \
            "might not be a good idea to mmap it. If you want to go ahead " \
            "delete this check, or get python to support mmap offsets so that " \
            "I can map the files sensibly"

    def get_object_at(self, offset):
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up, and
        then the packfile can be asked directly for that object using this
        function.

        Currently only non-delta objects are supported.
        """
        size = os.path.getsize(self._filename)
        assert size == self._size, "Pack data %s has changed size, I don't " \
            "like that" % self._filename
        # Map the whole file read-only for the duration of the lookup.
        f = open(self._filename, 'rb')
        try:
            map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
            return self._get_object_at(map, offset)
        finally:
            f.close()

    def _get_object_at(self, map, offset):
        # Parse the variable-length object header described in the class
        # docstring: MSB of each byte flags a continuation, the first byte
        # carries the 3-bit object type and the low 4 bits of the size,
        # and each continuation byte contributes 7 more (higher) size bits.
        first_byte = ord(map[offset])
        sign_extend = first_byte & 0x80
        type = (first_byte >> 4) & 0x07
        size = first_byte & 0x0f
        cur_offset = 0
        while sign_extend > 0:
            byte = ord(map[offset+cur_offset+1])
            sign_extend = byte & 0x80
            size_part = byte & 0x7f
            size += size_part << ((cur_offset * 7) + 4)
            cur_offset += 1
        # First byte of the deflated payload, just past the header.
        raw_base = offset+cur_offset+1
        # The size is the inflated size, so we have no idea what the deflated size
        # is, so for now give it as much as we have. It should really iterate
        # feeding it more data if it doesn't decompress, but as we have the whole
        # thing then just use it.
        raw = map[raw_base:]
        uncomp = _decompress(raw)
        obj = ShaFile.from_raw_string(type, uncomp)
        return obj
|
219 |