/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
198 by mbp at sourcefrog
- experimental compressed Revfile support
1
#! /usr/bin/env python
2
3
# (C) 2005 Matt Mackall
4
5
# modified to squish into bzr by Martin Pool
6
7
# This program is free software; you can redistribute it and/or modify
8
# it under the terms of the GNU General Public License as published by
9
# the Free Software Foundation; either version 2 of the License, or
10
# (at your option) any later version.
11
12
# This program is distributed in the hope that it will be useful,
13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
# GNU General Public License for more details.
16
17
# You should have received a copy of the GNU General Public License
18
# along with this program; if not, write to the Free Software
19
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20
21
22
"""Packed file revision storage.
23
24
A Revfile holds the text history of a particular source file, such
25
as Makefile.  It can represent a tree of text versions for that
26
file, allowing for microbranches within a single repository.
27
28
This is stored on disk as two files: an index file, and a data file.
29
The index file is short and always read completely into memory; the
30
data file is much longer and only the relevant bits of it,
31
identified by the index file, need to be read.
32
33
Each text version is identified by the SHA-1 of the full text of
34
that version.  It also has a sequence number within the file.
35
36
The index file has a short header and then a sequence of fixed-length
37
records:
38
39
* byte[20]    SHA-1 of text (as binary, not hex)
40
* uint32      sequence number this is based on, or -1 for full text
41
* uint32      flags: 1=zlib compressed
42
* uint32      offset in text file of start
43
* uint32      length of compressed delta in text file
44
* uint32[3]   reserved
45
46
total 48 bytes.
47
48
The header is also 48 bytes for tidyness.
49
50
Both the index and the text are only ever appended to; a consequence
51
is that sequence numbers are stable references.  But not every
52
repository in the world will assign the same sequence numbers,
53
therefore the SHA-1 is the only universally unique reference.
54
55
This is meant to scale to hold 100,000 revisions of a single file, by
56
which time the index file will be ~4.8MB and a bit big to read
57
sequentially.
58
59
Some of the reserved fields could be used to implement a (semi?)
60
balanced tree indexed by SHA1 so we can much more efficiently find the
61
index associated with a particular hash.  For 100,000 revs we would be
62
able to find it in about 17 random reads, which is not too bad.
63
"""
64
 
65
66
67
import sys, zlib, struct, mdiff, stat, os, sha
68
from binascii import hexlify
69
70
factor = 10
71
72
_RECORDSIZE = 48
73
74
_HEADER = "bzr revfile v1\n"
75
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))
76
77
class RevfileError(Exception):
78
    pass
79
80
class Revfile:
81
    def __init__(self, basename):
82
        self.basename = basename
83
        self.idxfile = open(basename + '.irev', 'r+b')
84
        self.datafile = open(basename + '.drev', 'r+b')
85
        if self.last_idx() == -1:
86
            print 'init empty file'
87
            self.idxfile.write(_HEADER)
88
            self.idxfile.flush()
89
        else:
90
            h = self.idxfile.read(_RECORDSIZE)
91
            if h != _HEADER:
92
                raise RevfileError("bad header %r in index of %r"
93
                                   % (h, self.basename))
94
        
95
    def last_idx(self):
96
        """Return last index already present, or -1 if none."""
97
        l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
98
        if l == 0:
99
            return -1
100
        if l % _RECORDSIZE:
101
            raise RevfileError("bad length %d on index of %r" % (l, self.basename))
102
        return (l / _RECORDSIZE) - 1
103
104
105
    def revision(self, rev):
106
        base = self.index[rev][0]
107
        start = self.index[base][1]
108
        end = self.index[rev][1] + self.index[rev][2]
109
        f = open(self.datafile())
110
111
        f.seek(start)
112
        data = f.read(end - start)
113
114
        last = self.index[base][2]
115
        text = zlib.decompress(data[:last])
116
117
        for r in range(base + 1, rev + 1):
118
            s = self.index[r][2]
119
            b = zlib.decompress(data[last:last + s])
120
            text = mdiff.bpatch(text, b)
121
            last = last + s
122
123
        return text    
124
125
126
    def add_full_text(self, t):
127
        """Add a full text to the file.
128
129
        This is not compressed against any reference version.
130
131
        Returns the index for that text."""
132
        idx = self.last_idx() + 1
133
        self.datafile.seek(0, 2)        # to end
134
        self.idxfile.seek(0, 2)
135
        assert self.idxfile.tell() == _RECORDSIZE * idx
136
        data_offset = self.datafile.tell()
137
138
        assert isinstance(t, str) # not unicode or anything wierd
139
140
        self.datafile.write(t)
141
        self.datafile.flush()
142
143
        entry = sha.new(t).digest()
144
        entry += struct.pack(">llll12x", 0, 0, data_offset, len(t))
145
        assert len(entry) == _RECORDSIZE
146
147
        self.idxfile.write(entry)
148
        self.idxfile.flush()
149
150
        return idx
151
152
153
    def __len__(self):
154
        return int(self.last_idx())
155
156
    def __getitem__(self, idx):
157
        self.idxfile.seek((idx + 1) * _RECORDSIZE)
158
        rec = self.idxfile.read(_RECORDSIZE)
159
        if len(rec) != _RECORDSIZE:
160
            raise RevfileError("short read of %d bytes getting index %d from %r"
161
                               % (len(rec), idx, self.basename))
162
        return struct.unpack(">20sllll12x", rec)
163
164
        
165
        
166
    def addrevision(self, text, changeset):
167
        t = self.tip()
168
        n = t + 1
169
170
        if not n % factor:
171
            data = zlib.compress(text)
172
            base = n
173
        else:
174
            prev = self.revision(t)
175
            data = zlib.compress(mdiff.bdiff(prev, text))
176
            base = self.index[t][0]
177
178
        offset = 0
179
        if t >= 0:
180
            offset = self.index[t][1] + self.index[t][2]
181
182
        self.index.append((base, offset, len(data), changeset))
183
        entry = struct.pack(">llll", base, offset, len(data), changeset)
184
185
        open(self.indexfile(), "a").write(entry)
186
        open(self.datafile(), "a").write(data)
187
188
    def dump(self):
189
        print '%-8s %-40s %-8s %-8s %-8s %-8s' \
190
              % tuple('idx sha1 base flags offset len'.split())
191
        print '-'*8, '-'*40, ('-'*8 + ' ')*4
192
        for i in range(len(self)):
193
            rec = self[i]
194
            print "#%-7d %40s #%-7d %08x %8d %8d " \
195
                  % (i, hexlify(rec[0]), rec[1], rec[2], rec[3], rec[4])
196
        
197
198
199
def main(argv):
200
    r = Revfile("testrev")
201
    if len(argv) < 2:
202
        sys.stderr.write("usage: revfile dump\n"
203
                         "       revfile add\n")
204
        sys.exit(1)
205
        
206
    if argv[1] == 'add':
207
        new_idx = r.add_full_text(sys.stdin.read())
208
        print 'added idx %d' % new_idx
209
    elif argv[1] == 'dump':
210
        r.dump()
211
    else:
212
        sys.stderr.write("unknown command %r\n" % argv[1])
213
        sys.exit(1)
214
    
215
216
if __name__ == '__main__':
217
    import sys
218
    main(sys.argv)