/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
198 by mbp at sourcefrog
- experimental compressed Revfile support
1
#! /usr/bin/env python
2
200 by mbp at sourcefrog
revfile: fix up __getitem__ to allow simple iteration
3
# (C) 2005 Canonical Ltd
198 by mbp at sourcefrog
- experimental compressed Revfile support
4
200 by mbp at sourcefrog
revfile: fix up __getitem__ to allow simple iteration
5
# based on an idea by Matt Mackall
198 by mbp at sourcefrog
- experimental compressed Revfile support
6
# modified to squish into bzr by Martin Pool
7
8
# This program is free software; you can redistribute it and/or modify
9
# it under the terms of the GNU General Public License as published by
10
# the Free Software Foundation; either version 2 of the License, or
11
# (at your option) any later version.
12
13
# This program is distributed in the hope that it will be useful,
14
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
# GNU General Public License for more details.
17
18
# You should have received a copy of the GNU General Public License
19
# along with this program; if not, write to the Free Software
20
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21
22
23
"""Packed file revision storage.
24
25
A Revfile holds the text history of a particular source file, such
26
as Makefile.  It can represent a tree of text versions for that
27
file, allowing for microbranches within a single repository.
28
29
This is stored on disk as two files: an index file, and a data file.
30
The index file is short and always read completely into memory; the
31
data file is much longer and only the relevant bits of it,
32
identified by the index file, need to be read.
33
34
Each text version is identified by the SHA-1 of the full text of
35
that version.  It also has a sequence number within the file.
36
37
The index file has a short header and then a sequence of fixed-length
38
records:
39
40
* byte[20]    SHA-1 of text (as binary, not hex)
41
* uint32      sequence number this is based on, or -1 for full text
42
* uint32      flags: 1=zlib compressed
43
* uint32      offset in text file of start
44
* uint32      length of compressed delta in text file
45
* uint32[3]   reserved
46
47
total 48 bytes.
48
199 by mbp at sourcefrog
- use -1 for no_base in revfile
49
The header is also 48 bytes for tidyness and easy calculation.
198 by mbp at sourcefrog
- experimental compressed Revfile support
50
51
Both the index and the text are only ever appended to; a consequence
52
is that sequence numbers are stable references.  But not every
53
repository in the world will assign the same sequence numbers,
54
therefore the SHA-1 is the only universally unique reference.
55
56
This is meant to scale to hold 100,000 revisions of a single file, by
57
which time the index file will be ~4.8MB and a bit big to read
58
sequentially.
59
60
Some of the reserved fields could be used to implement a (semi?)
61
balanced tree indexed by SHA1 so we can much more efficiently find the
62
index associated with a particular hash.  For 100,000 revs we would be
63
able to find it in about 17 random reads, which is not too bad.
64
"""
65
 
66
67
68
import sys, zlib, struct, mdiff, stat, os, sha
69
from binascii import hexlify
70
71
factor = 10
72
73
_RECORDSIZE = 48
74
75
_HEADER = "bzr revfile v1\n"
76
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))
199 by mbp at sourcefrog
- use -1 for no_base in revfile
77
_NO_BASE = 0xFFFFFFFFL
198 by mbp at sourcefrog
- experimental compressed Revfile support
78
79
class RevfileError(Exception):
80
    pass
81
82
class Revfile:
83
    def __init__(self, basename):
84
        self.basename = basename
85
        self.idxfile = open(basename + '.irev', 'r+b')
86
        self.datafile = open(basename + '.drev', 'r+b')
87
        if self.last_idx() == -1:
88
            print 'init empty file'
89
            self.idxfile.write(_HEADER)
90
            self.idxfile.flush()
91
        else:
92
            h = self.idxfile.read(_RECORDSIZE)
93
            if h != _HEADER:
94
                raise RevfileError("bad header %r in index of %r"
95
                                   % (h, self.basename))
96
        
97
    def last_idx(self):
98
        """Return last index already present, or -1 if none."""
99
        l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
100
        if l == 0:
101
            return -1
102
        if l % _RECORDSIZE:
103
            raise RevfileError("bad length %d on index of %r" % (l, self.basename))
104
        return (l / _RECORDSIZE) - 1
105
106
107
    def revision(self, rev):
108
        base = self.index[rev][0]
109
        start = self.index[base][1]
110
        end = self.index[rev][1] + self.index[rev][2]
111
        f = open(self.datafile())
112
113
        f.seek(start)
114
        data = f.read(end - start)
115
116
        last = self.index[base][2]
117
        text = zlib.decompress(data[:last])
118
119
        for r in range(base + 1, rev + 1):
120
            s = self.index[r][2]
121
            b = zlib.decompress(data[last:last + s])
122
            text = mdiff.bpatch(text, b)
123
            last = last + s
124
125
        return text    
126
127
128
    def add_full_text(self, t):
129
        """Add a full text to the file.
130
131
        This is not compressed against any reference version.
132
133
        Returns the index for that text."""
134
        idx = self.last_idx() + 1
135
        self.datafile.seek(0, 2)        # to end
136
        self.idxfile.seek(0, 2)
137
        assert self.idxfile.tell() == _RECORDSIZE * idx
138
        data_offset = self.datafile.tell()
139
140
        assert isinstance(t, str) # not unicode or anything wierd
141
142
        self.datafile.write(t)
143
        self.datafile.flush()
144
145
        entry = sha.new(t).digest()
199 by mbp at sourcefrog
- use -1 for no_base in revfile
146
        entry += struct.pack(">IIII12x", 0xFFFFFFFFL, 0, data_offset, len(t))
198 by mbp at sourcefrog
- experimental compressed Revfile support
147
        assert len(entry) == _RECORDSIZE
148
149
        self.idxfile.write(entry)
150
        self.idxfile.flush()
151
152
        return idx
153
154
155
    def __len__(self):
156
        return int(self.last_idx())
157
200 by mbp at sourcefrog
revfile: fix up __getitem__ to allow simple iteration
158
198 by mbp at sourcefrog
- experimental compressed Revfile support
159
    def __getitem__(self, idx):
160
        self.idxfile.seek((idx + 1) * _RECORDSIZE)
161
        rec = self.idxfile.read(_RECORDSIZE)
200 by mbp at sourcefrog
revfile: fix up __getitem__ to allow simple iteration
162
        if not rec:
163
            raise IndexError("no record %d in index for %r" % (idx, self.basename))
164
        elif len(rec) != _RECORDSIZE:
198 by mbp at sourcefrog
- experimental compressed Revfile support
165
            raise RevfileError("short read of %d bytes getting index %d from %r"
166
                               % (len(rec), idx, self.basename))
199 by mbp at sourcefrog
- use -1 for no_base in revfile
167
        return struct.unpack(">20sIIII12x", rec)
198 by mbp at sourcefrog
- experimental compressed Revfile support
168
169
        
170
        
171
    def addrevision(self, text, changeset):
172
        t = self.tip()
173
        n = t + 1
174
175
        if not n % factor:
176
            data = zlib.compress(text)
177
            base = n
178
        else:
179
            prev = self.revision(t)
180
            data = zlib.compress(mdiff.bdiff(prev, text))
181
            base = self.index[t][0]
182
183
        offset = 0
184
        if t >= 0:
185
            offset = self.index[t][1] + self.index[t][2]
186
187
        self.index.append((base, offset, len(data), changeset))
188
        entry = struct.pack(">llll", base, offset, len(data), changeset)
189
190
        open(self.indexfile(), "a").write(entry)
191
        open(self.datafile(), "a").write(data)
192
199 by mbp at sourcefrog
- use -1 for no_base in revfile
193
    def dump(self, f=sys.stdout):
194
        f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n' 
195
                % tuple('idx sha1 base flags offset len'.split()))
196
        f.write('-------- ---------------------------------------- ')
197
        f.write('-------- -------- -------- --------\n')
198
200 by mbp at sourcefrog
revfile: fix up __getitem__ to allow simple iteration
199
        for i, rec in enumerate(self):
199 by mbp at sourcefrog
- use -1 for no_base in revfile
200
            f.write("#%-7d %40s " % (i, hexlify(rec[0])))
201
            if rec[1] == _NO_BASE:
202
                f.write("(none)   ")
203
            else:
204
                f.write("#%-7d " % rec[1])
205
                
206
            f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))
198 by mbp at sourcefrog
- experimental compressed Revfile support
207
        
208
209
210
def main(argv):
211
    r = Revfile("testrev")
212
    if len(argv) < 2:
213
        sys.stderr.write("usage: revfile dump\n"
214
                         "       revfile add\n")
215
        sys.exit(1)
216
        
217
    if argv[1] == 'add':
218
        new_idx = r.add_full_text(sys.stdin.read())
219
        print 'added idx %d' % new_idx
220
    elif argv[1] == 'dump':
221
        r.dump()
222
    else:
223
        sys.stderr.write("unknown command %r\n" % argv[1])
224
        sys.exit(1)
225
    
226
227
if __name__ == '__main__':
228
    import sys
229
    main(sys.argv)