bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
1 |
#! /usr/bin/env python
|
2 |
||
|
200
by mbp at sourcefrog
revfile: fix up __getitem__ to allow simple iteration |
3 |
# (C) 2005 Canonical Ltd
|
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
4 |
|
|
200
by mbp at sourcefrog
revfile: fix up __getitem__ to allow simple iteration |
5 |
# based on an idea by Matt Mackall
|
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
6 |
# modified to squish into bzr by Martin Pool
|
7 |
||
8 |
# This program is free software; you can redistribute it and/or modify
|
|
9 |
# it under the terms of the GNU General Public License as published by
|
|
10 |
# the Free Software Foundation; either version 2 of the License, or
|
|
11 |
# (at your option) any later version.
|
|
12 |
||
13 |
# This program is distributed in the hope that it will be useful,
|
|
14 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16 |
# GNU General Public License for more details.
|
|
17 |
||
18 |
# You should have received a copy of the GNU General Public License
|
|
19 |
# along with this program; if not, write to the Free Software
|
|
20 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
21 |
||
22 |
||
23 |
"""Packed file revision storage.
|
|
24 |
||
25 |
A Revfile holds the text history of a particular source file, such
|
|
26 |
as Makefile. It can represent a tree of text versions for that
|
|
27 |
file, allowing for microbranches within a single repository.
|
|
28 |
||
29 |
This is stored on disk as two files: an index file, and a data file.
|
|
30 |
The index file is short and always read completely into memory; the
|
|
31 |
data file is much longer and only the relevant bits of it,
|
|
32 |
identified by the index file, need to be read.
|
|
33 |
||
34 |
Each text version is identified by the SHA-1 of the full text of
|
|
35 |
that version. It also has a sequence number within the file.
|
|
36 |
||
37 |
The index file has a short header and then a sequence of fixed-length
|
|
38 |
records:
|
|
39 |
||
40 |
* byte[20] SHA-1 of text (as binary, not hex)
|
|
41 |
* uint32 sequence number this is based on, or 0xFFFFFFFF (-1 if read as signed) for full text
|
|
42 |
* uint32 flags: 1=zlib compressed
|
|
43 |
* uint32 offset in text file of start
|
|
44 |
* uint32 length of compressed delta in text file
|
|
45 |
* uint32[3] reserved
|
|
46 |
||
47 |
total 48 bytes.
|
|
48 |
||
|
199
by mbp at sourcefrog
- use -1 for no_base in revfile |
49 |
The header is also 48 bytes for tidiness and easy calculation.
|
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
50 |
|
51 |
Both the index and the text are only ever appended to; a consequence
|
|
52 |
is that sequence numbers are stable references. But not every
|
|
53 |
repository in the world will assign the same sequence numbers,
|
|
54 |
therefore the SHA-1 is the only universally unique reference.
|
|
55 |
||
56 |
This is meant to scale to hold 100,000 revisions of a single file, by
|
|
57 |
which time the index file will be ~4.8MB and a bit big to read
|
|
58 |
sequentially.
|
|
59 |
||
60 |
Some of the reserved fields could be used to implement a (semi?)
|
|
61 |
balanced tree indexed by SHA1 so we can much more efficiently find the
|
|
62 |
index associated with a particular hash. For 100,000 revs we would be
|
|
63 |
able to find it in about 17 random reads, which is not too bad.
|
|
64 |
"""
|
|
65 |
||
66 |
||
|
201
by mbp at sourcefrog
Revfile: - get full text from a record- fix creation of files if they don't exist- protect against half-assed storage |
67 |
# TODO: Something like pread() would make this slightly simpler and
|
68 |
# perhaps more efficient.
|
|
69 |
||
70 |
# TODO: Could also try to mmap things...
|
|
71 |
||
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
72 |
|
73 |
import sys, zlib, struct, mdiff, stat, os, sha |
|
|
204
by mbp at sourcefrog
Revfile:- new find-sha command and implementation- new _check_index helper |
74 |
from binascii import hexlify, unhexlify |
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
75 |
|
76 |
# addrevision() stores a full (zlib-compressed) text whenever the new
# revision number is a multiple of this value, and a delta otherwise.
factor = 10

# Size in bytes of one index record; the index header is padded to the
# same size so record N starts at (N + 1) * _RECORDSIZE.
_RECORDSIZE = 48

# Magic string at the start of the index file, padded with 0xff bytes up
# to exactly one record.
_HEADER = "bzr revfile v1\n"
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

# Value of the "base" field for a record that is not based on another
# record; find_sha() also returns it to mean "not found".  Written without
# the Python-2-only "L" suffix: the literal has the same value either way.
_NO_RECORD = 0xFFFFFFFF

# fields in the index record (positions in the tuple from Revfile[idx])
I_SHA = 0
I_BASE = 1
I_FLAGS = 2
I_OFFSET = 3
I_LEN = 4
|
90 |
||
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
91 |
class RevfileError(Exception):
    """Raised when a revfile is incomplete, malformed or corrupt."""
|
|
93 |
||
|
201
by mbp at sourcefrog
Revfile: - get full text from a record- fix creation of files if they don't exist- protect against half-assed storage |
94 |
|
95 |
||
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
96 |
class Revfile:
    """Append-only store for the text history of a single file.

    A revfile called ``basename`` lives on disk as two files:
    ``basename.irev``, an index made of a header followed by fixed-size
    records, and ``basename.drev``, which holds the stored texts.

    Records are addressed by zero-based sequence number.  ``len(revfile)``
    is the number of records and ``revfile[idx]`` returns the unpacked index
    record ``(sha1, base, flags, offset, length)``; use the ``I_*`` constants
    to pick fields out of it.  Iterating over a Revfile yields the index
    records in order.
    """

    def __init__(self, basename):
        """Open the revfile ``basename``, creating both files if absent.

        Raises RevfileError if only one of the two files exists, or if an
        existing index file does not begin with the expected header.
        """
        # TODO: Option to open readonly

        # TODO: Lock file while open

        # TODO: advise of random access

        self.basename = basename

        idxname = basename + '.irev'
        dataname = basename + '.drev'

        idx_exists = os.path.exists(idxname)
        data_exists = os.path.exists(dataname)

        # Index without data (or the reverse) cannot be used safely.
        if idx_exists != data_exists:
            raise RevfileError("half-assed revfile")

        if not idx_exists:
            self.idxfile = open(idxname, 'w+b')
            self.datafile = open(dataname, 'w+b')

            sys.stdout.write('init empty file\n')
            self.idxfile.write(_HEADER)
            self.idxfile.flush()
        else:
            self.idxfile = open(idxname, 'r+b')
            self.datafile = open(dataname, 'r+b')

            h = self.idxfile.read(_RECORDSIZE)
            if h != _HEADER:
                raise RevfileError("bad header %r in index of %r"
                                   % (h, self.basename))

    def revision(self, rev):
        """Rebuild text ``rev`` from a compressed base plus later deltas.

        NOTE(review): this method and addrevision() rely on ``self.index``,
        ``self.tip()`` and ``self.indexfile()``, none of which this class
        defines, and they call ``self.datafile``, which __init__ sets to an
        open file object.  They do not match the index/data layout used by
        the rest of the class.  The logic is left unchanged.
        """
        base = self.index[rev][0]
        start = self.index[base][1]
        end = self.index[rev][1] + self.index[rev][2]
        f = open(self.datafile())

        f.seek(start)
        data = f.read(end - start)

        # The first chunk is the compressed base text ...
        last = self.index[base][2]
        text = zlib.decompress(data[:last])

        # ... followed by one compressed delta per later revision.
        for r in range(base + 1, rev + 1):
            s = self.index[r][2]
            b = zlib.decompress(data[last:last + s])
            text = mdiff.bpatch(text, b)
            last = last + s

        return text

    def _add_full_text(self, t):
        """Add a full text to the file.

        This is not compressed against any reference version.

        Returns the index for that text."""
        idx = len(self)
        self.datafile.seek(0, 2)        # to end
        self.idxfile.seek(0, 2)
        # The new record must land directly after the existing ones.
        assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
        data_offset = self.datafile.tell()

        assert isinstance(t, str)       # not unicode or anything weird

        self.datafile.write(t)
        self.datafile.flush()

        # Record layout: 20-byte SHA-1, then base, flags, offset and length
        # as big-endian uint32, then 12 reserved pad bytes.
        entry = sha.new(t).digest()
        entry += struct.pack(">IIII12x", _NO_RECORD, 0, data_offset, len(t))
        assert len(entry) == _RECORDSIZE

        self.idxfile.write(entry)
        self.idxfile.flush()

        return idx

    def _check_index(self, idx):
        """Raise RevfileError unless ``idx`` names an existing record.

        Valid indexes run from 0 to len(self) - 1 inclusive.
        """
        if idx < 0 or idx >= len(self):
            raise RevfileError("invalid index %r" % idx)

    def find_sha(self, s):
        """Return the index of the record whose SHA-1 is ``s``.

        ``s`` is the 20-byte binary digest.  Does a linear scan of the
        index and returns _NO_RECORD if no record matches.
        """
        assert isinstance(s, str)
        assert len(s) == 20

        for idx, idxrec in enumerate(self):
            if idxrec[I_SHA] == s:
                return idx
        return _NO_RECORD

    def _add_diff(self, text, base):
        """Add a text stored relative to a previous text."""
        # NOTE(review): unfinished -- validates ``base`` and hashes the text
        # but stores nothing yet.
        self._check_index(base)
        text_sha = sha.new(text).digest()

    def addrevision(self, text, changeset):
        """Append ``text`` as a new revision, as full text or as a delta.

        NOTE(review): like revision(), this relies on ``self.index``,
        ``self.tip()`` and ``self.indexfile()`` (not defined by this class)
        and calls ``self.datafile``.  The logic is left unchanged.
        """
        t = self.tip()
        n = t + 1

        if not n % factor:
            # Every factor-th revision is stored whole.
            data = zlib.compress(text)
            base = n
        else:
            prev = self.revision(t)
            data = zlib.compress(mdiff.bdiff(prev, text))
            base = self.index[t][0]

        offset = 0
        if t >= 0:
            offset = self.index[t][1] + self.index[t][2]

        self.index.append((base, offset, len(data), changeset))
        entry = struct.pack(">llll", base, offset, len(data), changeset)

        open(self.indexfile(), "a").write(entry)
        open(self.datafile(), "a").write(data)

    def _get_full_text(self, idx):
        """Return the text of record ``idx``, which must be a full text.

        Raises RevfileError if fewer bytes than recorded can be read or if
        the text does not match the SHA-1 stored in the index.
        """
        idxrec = self[idx]
        assert idxrec[I_FLAGS] == 0
        assert idxrec[I_BASE] == _NO_RECORD

        length = idxrec[I_LEN]
        if length == 0:
            return ''

        self.datafile.seek(idxrec[I_OFFSET])

        text = self.datafile.read(length)
        if len(text) != length:
            raise RevfileError("short read %d of %d "
                               "getting text for record %d in %r"
                               % (len(text), length, idx, self.basename))

        if sha.new(text).digest() != idxrec[I_SHA]:
            raise RevfileError("corrupt SHA-1 digest on record %d"
                               % idx)

        return text

    def __len__(self):
        """Return number of revisions.

        Derived from the size of the index file, not counting the header.
        Raises RevfileError if that size is not a whole number of records or
        is too small to contain the header.
        """
        size = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
        if size % _RECORDSIZE:
            raise RevfileError("bad length %d on index of %r" % (size, self.basename))
        if size < _RECORDSIZE:
            raise RevfileError("no header present in index of %r" % (self.basename))
        # Floor division keeps this an exact integer count.
        return size // _RECORDSIZE - 1

    def __getitem__(self, idx):
        """Index by sequence id returns the index field

        Raises IndexError for a negative index or one past the last record;
        the latter is what ends ``for rec in revfile`` iteration.
        """
        ## TODO: Can avoid seek if we just moved there...
        if idx < 0:
            # Would otherwise seek into, and decode, the file header.
            raise IndexError("negative index %r" % idx)
        self._seek_index(idx)
        return self._read_next_index()

    def _seek_index(self, idx):
        """Position the index file at the start of record ``idx``."""
        # + 1 skips the header, which occupies one record-sized slot.
        self.idxfile.seek((idx + 1) * _RECORDSIZE)

    def _read_next_index(self):
        """Read and unpack the index record at the current file position.

        Returns ``(sha1, base, flags, offset, length)``.  Raises IndexError
        at end of file and RevfileError if only part of a record is present.
        """
        start = self.idxfile.tell()
        rec = self.idxfile.read(_RECORDSIZE)
        if not rec:
            raise IndexError("end of index file")
        elif len(rec) != _RECORDSIZE:
            # Recover the record number from where the read began.
            idx = start // _RECORDSIZE - 1
            raise RevfileError("short read of %d bytes getting index %d from %r"
                               % (len(rec), idx, self.basename))

        return struct.unpack(">20sIIII12x", rec)

    def dump(self, f=sys.stdout):
        """Write a table of all index records to the file-like ``f``."""
        f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'
                % tuple('idx sha1 base flags offset len'.split()))
        f.write('-------- ---------------------------------------- ')
        f.write('-------- -------- -------- --------\n')

        for i, rec in enumerate(self):
            f.write("#%-7d %40s " % (i, hexlify(rec[I_SHA])))
            if rec[I_BASE] == _NO_RECORD:
                # Pad to the same 8-character column as a numeric base.
                f.write("%-8s " % "(none)")
            else:
                f.write("#%-7d " % rec[I_BASE])

            f.write("%8x %8d %8d\n" % (rec[I_FLAGS], rec[I_OFFSET], rec[I_LEN]))
|
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
294 |
|
295 |
||
296 |
||
297 |
def main(argv):
    """Command-line driver operating on the revfile named "testrev".

    ``argv`` is the full argument vector (program name first).  Commands:

      dump          print the index table
      add           store standard input as a new full text
      get IDX       write the full text of record IDX to standard output
      find-sha HEX  print the index of the record with that SHA-1

    Returns 1 after reporting an error on stderr, 0 on success.  Arguments
    are validated before the revfile is opened, so a usage error never
    creates the "testrev" files.
    """
    try:
        cmd = argv[1]
    except IndexError:
        sys.stderr.write("usage: revfile dump\n"
                         " revfile add\n"
                         " revfile get IDX\n"
                         " revfile find-sha HEX\n")
        return 1

    if cmd not in ('add', 'dump', 'get', 'find-sha'):
        sys.stderr.write("unknown command %r\n" % cmd)
        return 1

    if cmd == 'get':
        try:
            idx = int(argv[2])
        except (IndexError, ValueError):
            # Missing or non-numeric IDX.
            sys.stderr.write("usage: revfile get IDX\n")
            return 1
    elif cmd == 'find-sha':
        try:
            s = unhexlify(argv[2])
        except (IndexError, TypeError, ValueError):
            # Missing argument, or not valid hexadecimal.
            sys.stderr.write("usage: revfile find-sha HEX\n")
            return 1
        if len(s) != 20:
            # A SHA-1 digest is 20 bytes (40 hex digits).
            sys.stderr.write("usage: revfile find-sha HEX\n")
            return 1

    r = Revfile("testrev")

    if cmd == 'add':
        new_idx = r._add_full_text(sys.stdin.read())
        sys.stdout.write('added idx %d\n' % new_idx)
    elif cmd == 'dump':
        r.dump()
    elif cmd == 'get':
        if idx < 0 or idx >= len(r):
            sys.stderr.write("invalid index %r\n" % idx)
            return 1

        sys.stdout.write(r._get_full_text(idx))
    else:
        # cmd == 'find-sha'
        idx = r.find_sha(s)
        if idx == _NO_RECORD:
            sys.stderr.write("no such record\n")
            return 1
        sys.stdout.write('%s\n' % idx)

    return 0
|
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
344 |
|
345 |
||
346 |
if __name__ == '__main__':
    # Script entry point: hand the raw argument vector to main() and use
    # its result as the process exit status, treating a falsy result as 0.
    import sys
    status = main(sys.argv)
    sys.exit(status or 0)