/brz/remove-bazaar : revision 0.211.9

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to git/pack.py

Committer: James Westby
Date: 2007-03-30 16:20:00 UTC
mto: (0.215.1 trunk)
mto: This revision was merged to the branch mainline in revision 6960.
Revision ID: jw+debian@jameswestby.net-20070330162000-0muski7om3axcszs

Add some basic pack handling code.

It has classes for the index and data parts. It supports lookup of an object
name in the index, and then access to the object in the data part using the
offset returned from the index lookup.

There are many problems with it so far.

  * The mmap in python has no offset, so the whole files are mapped.
  * There is no support for delta objects.
  * There is no consistency checking.
  * The code is not hooked up to provide a simple API.
  * The code is not hooked in to the repo, so that objects are still not
    retrieved from packs.

files added:
git/pack.py

git/tests/data/packs

git/tests/data/packs/pack-bc63ddad95e7321ee734ea11a7a62d314e0d7481.idx

git/tests/data/packs/pack-bc63ddad95e7321ee734ea11a7a62d314e0d7481.pack

git/tests/test_pack.py

files modified:
git/objects.py

git/tests/__init__.py

Show diffs side-by-side

added added

removed removed

git/pack.py

# pack.py -- For dealing with packed git objects.

# The code is loosely based on that in the sha1_file.c file from git itself,

# which is Copyright (C) Linus Torvalds, 2005 and distributed under the

# GPL version 2.

# This program is free software; you can redistribute it and/or

# modify it under the terms of the GNU General Public License

# as published by the Free Software Foundation; version 2

# of the License.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,

# MA 02110-1301, USA.

"""Classes for dealing with packed git objects.

A pack is a compact representation of a bunch of objects, stored

using deltas where possible.

They have two parts, the pack file, which stores the data, and an index

that tells you where the data is.

To find an object you look in all of the index files 'til you find a

match for the object name. You then use the pointer got from this as

a pointer in to the corresponding packfile.

"""

import mmap

import os

from objects import (ShaFile,

_decompress,

)

def hex_to_sha(hex):
    """Return the integer spelled by the lower-case hex string ``hex``.

    Digits are read most significant first. An empty string gives 0, and
    any character outside ``0-9a-f`` raises KeyError.
    """
    # Lookup table from each hex digit to its numeric value.
    digit_values = dict((digit, number)
                        for number, digit in enumerate('0123456789abcdef'))
    total = 0
    for digit in hex:
        total = total * 16 + digit_values[digit]
    return total

def multi_ord(map, start, count):
    """Read ``count`` items of ``map`` from ``start`` as one big-endian number.

    Each item is passed through ord(), and earlier items are more
    significant. A ``count`` of zero gives 0.
    """
    total = 0
    # Index item by item rather than slicing, so that an out of range
    # position raises instead of silently shortening the read.
    for position in range(start, start + count):
        total = (total << 8) + ord(map[position])
    return total

# Upper bound, in bytes (256 MiB), on the size of a file this module is
# willing to mmap in one go.
max_size = 256 * 1024 ** 2

class PackIndex(object):
    """An index in to a packfile.

    Given a sha id of an object a pack index can tell you the location in the
    packfile of that object if it has it.

    To do the lookup it opens the file, and indexes the first 256 4 byte
    groups with the first byte of the sha id. The value in the four byte group
    indexed is the end of the group that shares the same starting byte.
    Subtract one from the starting byte and index again to find the start of
    the group; a starting byte of zero has no predecessor, so its group starts
    at record zero.

    The values are sorted by sha id within the group, so do the math to find
    the start and end record and then bisect in to find if the value is
    present.
    """

    # Size in bytes of one entry of the 256 entry header table.
    header_record_size = 4
    header_size = 256 * header_record_size
    # Size in bytes of the packfile offset stored in front of each sha.
    index_size = 4
    sha_bytes = 20
    record_size = sha_bytes + index_size

    def __init__(self, filename):
        """Create a pack index object.

        Provide it with the name of the index file to consider, and it will
        map it whenever required.

        Raises AssertionError if the file does not exist, is no bigger than
        the header table, or is max_size bytes or larger.
        """
        self._filename = filename
        assert os.path.exists(filename), "%s is not a pack index" % filename
        # Take the size now, so it can be checked each time we map the file
        # to ensure that it hasn't changed.
        self._size = os.path.getsize(filename)
        assert self._size > self.header_size, \
            "%s is too small to be a pack index" % filename
        # The message is parenthesised so that the filename is actually
        # substituted in to it.
        assert self._size < max_size, (
            "%s is larger than 256 meg, and it "
            "might not be a good idea to mmap it. If you want to go ahead "
            "delete this check, or get python to support mmap offsets so that "
            "I can map the files sensibly" % filename)

    def object_index(self, sha):
        """Return the index in to the corresponding packfile for the object.

        Given the name of an object (as a hex string) it will return the
        offset that object lives at within the corresponding pack file. If
        the pack file doesn't have the object then None will be returned.

        Raises AssertionError if the index file has changed size since this
        object was created.
        """
        size = os.path.getsize(self._filename)
        assert size == self._size, "Pack index %s has changed size, I don't " \
            "like that" % self._filename
        f = open(self._filename, 'rb')
        try:
            contents = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
            try:
                return self._object_index(contents, sha)
            finally:
                # Release the mapping as well as the file; the result is a
                # plain number so nothing refers to the map afterwards.
                contents.close()
        finally:
            f.close()

    def _object_index(self, map, hexsha):
        """See object_index"""
        first_byte = hex_to_sha(hexsha[:2])
        header_offset = self.header_record_size * first_byte
        if first_byte == 0:
            # There is no previous header entry to read; reading at a
            # negative offset would pick up bytes from the end of the file.
            start = 0
        else:
            start = multi_ord(map, header_offset - self.header_record_size,
                              self.header_record_size)
        end = multi_ord(map, header_offset, self.header_record_size)
        sha = hex_to_sha(hexsha)
        # Bisect over record numbers in the half open range [start, end).
        while start < end:
            i = (start + end) // 2
            offset = self.header_size + (i * self.record_size)
            file_sha = multi_ord(map, offset + self.index_size, self.sha_bytes)
            if file_sha == sha:
                return multi_ord(map, offset, self.index_size)
            elif file_sha < sha:
                # Narrow by record number, not by the byte offset of the
                # record, as start and end count records.
                start = i + 1
            else:
                end = i
        return None

133

134

135

class PackData(object):
    """The data contained in a packfile.

    Pack files can be accessed both sequentially for exploding a pack, and
    directly with the help of an index to retrieve a specific object.

    The objects within are either complete or a delta against another.

    The header is variable length. If the MSB of each byte is set then it
    indicates that the subsequent byte is still part of the header.
    For the first byte the next MS bits are the type, which tells you the type
    of object, and whether it is a delta. The LS 4 bits are the lowest bits of
    the size. For each subsequent byte the LS 7 bits are the next MS bits of
    the size, i.e. the last byte of the header contains the MS bits of the
    size.

    For the complete objects the data is stored as zlib deflated data.
    The size in the header is the uncompressed object size, so to uncompress
    you need to just keep feeding data to zlib until you get an object back,
    or it errors on bad data. This is done here by just giving the complete
    buffer from the start of the deflated object on. This is bad, but until I
    get mmap sorted out it will have to do.

    Currently there are no integrity checks done. Also no attempt is made to
    try and detect the delta case, or a request for an object at the wrong
    position. It will all just throw a zlib or KeyError.
    """

    def __init__(self, filename):
        """Create a PackData object that represents the pack in the given filename.

        The file must exist and stay readable until the object is disposed of.
        It must also stay the same size. It will be mapped whenever needed.

        Currently there is a restriction on the size of the pack as the python
        mmap implementation is flawed.

        Raises AssertionError if the file does not exist or is max_size bytes
        or larger.
        """
        self._filename = filename
        assert os.path.exists(filename), "%s is not a packfile" % filename
        # Remember the size so that each later mapping can check that the
        # file has not changed underneath us.
        self._size = os.path.getsize(filename)
        # The message is parenthesised so that the filename is actually
        # substituted in to it.
        assert self._size < max_size, (
            "%s is larger than 256 meg, and it "
            "might not be a good idea to mmap it. If you want to go ahead "
            "delete this check, or get python to support mmap offsets so that "
            "I can map the files sensibly" % filename)

    def get_object_at(self, offset):
        """Given an offset in to the packfile return the object that is there.

        Using the associated index the location of an object can be looked up,
        and then the packfile can be asked directly for that object using this
        function.

        Currently only non-delta objects are supported.

        Raises AssertionError if the pack file has changed size since this
        object was created.
        """
        size = os.path.getsize(self._filename)
        assert size == self._size, "Pack data %s has changed size, I don't " \
            "like that" % self._filename
        f = open(self._filename, 'rb')
        try:
            contents = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
            try:
                return self._get_object_at(contents, offset)
            finally:
                # Release the mapping as well as the file. The data handed to
                # the decompressor is sliced out of the map first, and a
                # slice of an mmap is a copy.
                # NOTE(review): assumes the object built by ShaFile keeps no
                # reference to the map itself -- confirm against objects.py.
                contents.close()
        finally:
            f.close()

    def _get_object_at(self, map, offset):
        """See get_object_at"""
        first_byte = ord(map[offset])
        # Top bit set means another header byte follows.
        more_header = first_byte & 0x80
        obj_type = (first_byte >> 4) & 0x07
        # The size is decoded as the header is walked, but it is not yet
        # checked against the inflated data.
        size = first_byte & 0x0f
        extra_bytes = 0
        while more_header:
            byte = ord(map[offset + extra_bytes + 1])
            more_header = byte & 0x80
            size += (byte & 0x7f) << ((extra_bytes * 7) + 4)
            extra_bytes += 1
        raw_base = offset + extra_bytes + 1
        # The size is the inflated size, so we have no idea what the deflated
        # size is, so for now give it as much as we have. It should really
        # iterate feeding it more data if it doesn't decompress, but as we
        # have the whole thing then just use it.
        raw = map[raw_base:]
        uncomp = _decompress(raw)
        return ShaFile.from_raw_string(obj_type, uncomp)

219

Older »