3
# (C) 2005 Canonical Ltd
5
# based on an idea by Matt Mackall
6
# modified to squish into bzr by Martin Pool
8
# This program is free software; you can redistribute it and/or modify
9
# it under the terms of the GNU General Public License as published by
10
# the Free Software Foundation; either version 2 of the License, or
11
# (at your option) any later version.
13
# This program is distributed in the hope that it will be useful,
14
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
# GNU General Public License for more details.
18
# You should have received a copy of the GNU General Public License
19
# along with this program; if not, write to the Free Software
20
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
"""Packed file revision storage.
25
A Revfile holds the text history of a particular source file, such
26
as Makefile. It can represent a tree of text versions for that
27
file, allowing for microbranches within a single repository.
29
This is stored on disk as two files: an index file, and a data file.
30
The index file is short and always read completely into memory; the
31
data file is much longer and only the relevant bits of it,
32
identified by the index file, need to be read.
34
Each text version is identified by the SHA-1 of the full text of
35
that version. It also has a sequence number within the file.
37
The index file has a short header and then a sequence of fixed-length
40
* byte[20] SHA-1 of text (as binary, not hex)
41
* uint32 sequence number this is based on, or -1 for full text
42
* uint32 flags: 1=zlib compressed
43
* uint32 offset in text file of start
44
* uint32 length of compressed delta in text file
49
The header is also 48 bytes for tidyness and easy calculation.
51
Both the index and the text are only ever appended to; a consequence
52
is that sequence numbers are stable references. But not every
53
repository in the world will assign the same sequence numbers,
54
therefore the SHA-1 is the only universally unique reference.
56
This is meant to scale to hold 100,000 revisions of a single file, by
57
which time the index file will be ~4.8MB and a bit big to read
60
Some of the reserved fields could be used to implement a (semi?)
61
balanced tree indexed by SHA1 so we can much more efficiently find the
62
index associated with a particular hash. For 100,000 revs we would be
63
able to find it in about 17 random reads, which is not too bad.
67
# TODO: Something like pread() would make this slightly simpler and
68
# perhaps more efficient.
70
# TODO: Could also try to mmap things...
73
import sys, zlib, struct, mdiff, stat, os, sha
74
from binascii import hexlify, unhexlify
80
_HEADER = "bzr revfile v1\n"
81
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))
82
_NO_RECORD = 0xFFFFFFFFL
84
# fields in the index record
92
class RevfileError(Exception):
98
def __init__(self, basename):
99
# TODO: Option to open readonly
101
# TODO: Lock file while open
103
# TODO: advise of random access
105
self.basename = basename
107
idxname = basename + '.irev'
108
dataname = basename + '.drev'
110
idx_exists = os.path.exists(idxname)
111
data_exists = os.path.exists(dataname)
113
if idx_exists != data_exists:
114
raise RevfileError("half-assed revfile")
117
self.idxfile = open(idxname, 'w+b')
118
self.datafile = open(dataname, 'w+b')
120
print 'init empty file'
121
self.idxfile.write(_HEADER)
124
self.idxfile = open(idxname, 'r+b')
125
self.datafile = open(dataname, 'r+b')
127
h = self.idxfile.read(_RECORDSIZE)
129
raise RevfileError("bad header %r in index of %r"
130
% (h, self.basename))
133
def revision(self, rev):
134
base = self.index[rev][0]
135
start = self.index[base][1]
136
end = self.index[rev][1] + self.index[rev][2]
137
f = open(self.datafile())
140
data = f.read(end - start)
142
last = self.index[base][2]
143
text = zlib.decompress(data[:last])
145
for r in range(base + 1, rev + 1):
147
b = zlib.decompress(data[last:last + s])
148
text = mdiff.bpatch(text, b)
154
def _check_index(self, idx):
155
if idx < 0 or idx > len(self):
156
raise RevfileError("invalid index %r" % idx)
159
def find_sha(self, s):
160
assert isinstance(s, str)
163
for idx, idxrec in enumerate(self):
164
if idxrec[I_SHA] == s:
170
def _add_common(self, text_sha, data, flags, base):
171
"""Add pre-processed data, can be either full text or delta."""
173
self.datafile.seek(0, 2) # to end
174
self.idxfile.seek(0, 2)
175
assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
176
data_offset = self.datafile.tell()
178
assert isinstance(data, str) # not unicode or anything wierd
180
self.datafile.write(data)
181
self.datafile.flush()
183
assert isinstance(text_sha, str)
185
entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))
186
assert len(entry) == _RECORDSIZE
188
self.idxfile.write(entry)
195
def _add_full_text(self, text):
196
"""Add a full text to the file.
198
This is not compressed against any reference version.
200
Returns the index for that text."""
201
return self._add_common(sha.new(text).digest(), text, 0, _NO_RECORD)
204
def _add_delta(self, text, base):
205
"""Add a text stored relative to a previous text."""
206
self._check_index(base)
207
text_sha = sha.new(text).digest()
208
base_text = self.get(base)
209
data = mdiff.bdiff(base_text, text)
210
return self._add_common(text_sha, data, 0, base)
213
def add(self, text, base=None):
214
# TODO: check it's not already present?
218
def addrevision(self, text, changeset):
223
data = zlib.compress(text)
226
prev = self.revision(t)
227
data = zlib.compress(mdiff.bdiff(prev, text))
228
base = self.index[t][0]
232
offset = self.index[t][1] + self.index[t][2]
234
self.index.append((base, offset, len(data), changeset))
235
entry = struct.pack(">llll", base, offset, len(data), changeset)
237
open(self.indexfile(), "a").write(entry)
238
open(self.datafile(), "a").write(data)
243
base = idxrec[I_BASE]
244
if base == _NO_RECORD:
245
text = self._get_full_text(idx, idxrec)
247
text = self._get_patched(idx, idxrec)
249
if sha.new(text).digest() != idxrec[I_SHA]:
250
raise RevfileError("corrupt SHA-1 digest on record %d"
257
def _get_raw(self, idx, idxrec):
262
self.datafile.seek(idxrec[I_OFFSET])
264
data = self.datafile.read(l)
266
raise RevfileError("short read %d of %d "
267
"getting text for record %d in %r"
268
% (len(data), l, idx, self.basename))
273
def _get_full_text(self, idx, idxrec):
274
assert idxrec[I_FLAGS] == 0
275
assert idxrec[I_BASE] == _NO_RECORD
277
text = self._get_raw(idx, idxrec)
282
def _get_patched(self, idx, idxrec):
283
assert idxrec[I_FLAGS] == 0
284
base = idxrec[I_BASE]
286
assert base < idx # no loops!
288
base_text = self.get(base)
289
patch = self._get_raw(idx, idxrec)
291
text = mdiff.bpatch(base_text, patch)
298
"""Return number of revisions."""
299
l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
301
raise RevfileError("bad length %d on index of %r" % (l, self.basename))
303
raise RevfileError("no header present in index of %r" % (self.basename))
304
return int(l / _RECORDSIZE) - 1
307
def __getitem__(self, idx):
308
"""Index by sequence id returns the index field"""
309
## TODO: Can avoid seek if we just moved there...
310
self._seek_index(idx)
311
return self._read_next_index()
314
def _seek_index(self, idx):
316
raise RevfileError("invalid index %r" % idx)
317
self.idxfile.seek((idx + 1) * _RECORDSIZE)
320
def _read_next_index(self):
321
rec = self.idxfile.read(_RECORDSIZE)
323
raise IndexError("end of index file")
324
elif len(rec) != _RECORDSIZE:
325
raise RevfileError("short read of %d bytes getting index %d from %r"
326
% (len(rec), idx, self.basename))
328
return struct.unpack(">20sIIII12x", rec)
331
def dump(self, f=sys.stdout):
332
f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'
333
% tuple('idx sha1 base flags offset len'.split()))
334
f.write('-------- ---------------------------------------- ')
335
f.write('-------- -------- -------- --------\n')
337
for i, rec in enumerate(self):
338
f.write("#%-7d %40s " % (i, hexlify(rec[0])))
339
if rec[1] == _NO_RECORD:
342
f.write("#%-7d " % rec[1])
344
f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))
349
r = Revfile("testrev")
354
sys.stderr.write("usage: revfile dump\n"
356
" revfile add-delta BASE\n"
358
" revfile find-sha HEX\n")
363
new_idx = r._add_full_text(sys.stdin.read())
364
print 'added idx %d' % new_idx
365
elif cmd == 'add-delta':
366
new_idx = r._add_delta(sys.stdin.read(), int(argv[2]))
367
print 'added idx %d' % new_idx
374
sys.stderr.write("usage: revfile get IDX\n")
377
if idx < 0 or idx >= len(r):
378
sys.stderr.write("invalid index %r\n" % idx)
381
sys.stdout.write(r.get(idx))
382
elif cmd == 'find-sha':
384
s = unhexlify(argv[2])
386
sys.stderr.write("usage: revfile find-sha HEX\n")
390
if idx == _NO_RECORD:
391
sys.stderr.write("no such record\n")
397
sys.stderr.write("unknown command %r\n" % cmd)
401
if __name__ == '__main__':
403
sys.exit(main(sys.argv) or 0)