bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
| 
362
by Martin Pool
 - Import stat-cache code  | 
1  | 
# (C) 2005 Canonical Ltd
 | 
2  | 
||
3  | 
# This program is free software; you can redistribute it and/or modify
 | 
|
4  | 
# it under the terms of the GNU General Public License as published by
 | 
|
5  | 
# the Free Software Foundation; either version 2 of the License, or
 | 
|
6  | 
# (at your option) any later version.
 | 
|
7  | 
||
8  | 
# This program is distributed in the hope that it will be useful,
 | 
|
9  | 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
|
10  | 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
|
11  | 
# GNU General Public License for more details.
 | 
|
12  | 
||
13  | 
# You should have received a copy of the GNU General Public License
 | 
|
14  | 
# along with this program; if not, write to the Free Software
 | 
|
15  | 
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | 
|
16  | 
||
17  | 
import stat, os, sha, time  | 
|
18  | 
||
19  | 
from trace import mutter  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
20  | 
from errors import BzrError, BzrCheckError  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
21  | 
|
22  | 
||
| 
427
by Martin Pool
 - statcache docs  | 
23  | 
"""File stat cache to speed up tree comparisons.
 | 
24  | 
||
25  | 
This module basically gives a quick way to find the SHA-1 and related
 | 
|
26  | 
information of a file in the working directory, without actually
 | 
|
27  | 
reading and hashing the whole file.
 | 
|
28  | 
||
| 
509
by Martin Pool
 clean up stat cache code:  | 
29  | 
|
30  | 
||
31  | 
Implementation
 | 
|
32  | 
==============
 | 
|
33  | 
||
34  | 
Users of this module should not need to know about how this is
 | 
|
35  | 
implemented, and in particular should not depend on the particular
 | 
|
36  | 
data which is stored or its format.
 | 
|
37  | 
||
| 
427
by Martin Pool
 - statcache docs  | 
38  | 
This is done by maintaining a cache indexed by a file fingerprint of
 | 
39  | 
(path, size, mtime, ctime, ino, dev) pointing to the SHA-1.  If the
 | 
|
40  | 
fingerprint has changed, we assume the file content has not changed
 | 
|
41  | 
either and the SHA-1 is therefore the same.
 | 
|
42  | 
||
43  | 
If any of the fingerprint fields have changed then the file content
 | 
|
44  | 
*may* have changed, or it may not have.  We need to reread the file
 | 
|
45  | 
contents to make sure, but this is not visible to the user or
 | 
|
46  | 
higher-level code (except as a delay of course).
 | 
|
47  | 
||
48  | 
The mtime and ctime are stored with nanosecond fields, but not all
 | 
|
49  | 
filesystems give this level of precision.  There is therefore a
 | 
|
50  | 
possible race: the file might be modified twice within a second
 | 
|
51  | 
without changing the size or mtime, and a SHA-1 cached from the first
 | 
|
52  | 
version would be wrong.  We handle this by not recording a cached hash
 | 
|
53  | 
for any files which were modified in the current second and that
 | 
|
54  | 
therefore have the chance to change again before the second is up.
 | 
|
55  | 
||
56  | 
The only known hole in this design is if the system clock jumps
 | 
|
57  | 
backwards crossing invocations of bzr.  Please don't do that; use ntp
 | 
|
58  | 
to gradually adjust your clock or don't use bzr over the step.
 | 
|
59  | 
||
60  | 
At the moment this is stored in a simple textfile; it might be nice
 | 
|
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
61  | 
to use a tdb instead to allow faster lookup by file-id.
 | 
| 
434
by Martin Pool
 doc  | 
62  | 
|
63  | 
The cache is represented as a map from file_id to a tuple of (file_id,
 | 
|
64  | 
sha1, path, size, mtime, ctime, ino, dev).
 | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
65  | 
|
66  | 
The SHA-1 is stored in memory as a hexdigest.
 | 
|
67  | 
||
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
68  | 
This version of the file on disk has one line per record, and fields
 | 
69  | 
separated by \0 records.
 | 
|
| 
427
by Martin Pool
 - statcache docs  | 
70  | 
"""
 | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
71  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
72  | 
# order of fields returned by fingerprint()
 | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
73  | 
FP_SIZE = 0  | 
74  | 
FP_MTIME = 1  | 
|
75  | 
FP_CTIME = 2  | 
|
76  | 
FP_INO = 3  | 
|
77  | 
FP_DEV = 4  | 
|
78  | 
||
| 
509
by Martin Pool
 clean up stat cache code:  | 
79  | 
# order of fields in the statcache file and in the in-memory map
 | 
| 
437
by Martin Pool
 - new command 'bzr modified' to exercise the statcache  | 
80  | 
SC_FILE_ID = 0  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
81  | 
SC_SHA1 = 1  | 
82  | 
SC_PATH = 2  | 
|
83  | 
SC_SIZE = 3  | 
|
84  | 
SC_MTIME = 4  | 
|
85  | 
SC_CTIME = 5  | 
|
86  | 
SC_INO = 6  | 
|
87  | 
SC_DEV = 7  | 
|
| 
437
by Martin Pool
 - new command 'bzr modified' to exercise the statcache  | 
88  | 
|
89  | 
||
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
90  | 
|
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
91  | 
CACHE_HEADER = "### bzr statcache v4"  | 
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
92  | 
|
93  | 
||
| 
458
by Martin Pool
 - fix statcache update from subdirectories  | 
94  | 
def fingerprint(abspath):  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
95  | 
try:  | 
96  | 
fs = os.lstat(abspath)  | 
|
97  | 
except OSError:  | 
|
98  | 
        # might be missing, etc
 | 
|
99  | 
return None  | 
|
100  | 
||
101  | 
if stat.S_ISDIR(fs.st_mode):  | 
|
102  | 
return None  | 
|
103  | 
||
104  | 
return (fs.st_size, fs.st_mtime,  | 
|
105  | 
fs.st_ctime, fs.st_ino, fs.st_dev)  | 
|
106  | 
||
107  | 
||
| 
548
by Martin Pool
 - Write statcache using \u style encoding to avoid  | 
108  | 
|
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
109  | 
def _write_cache(basedir, entries):  | 
| 
428
by Martin Pool
 - Use AtomicFile to update statcache.  | 
110  | 
from atomicfile import AtomicFile  | 
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
111  | 
|
112  | 
cachefn = os.path.join(basedir, '.bzr', 'stat-cache')  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
113  | 
outf = AtomicFile(cachefn, 'wb')  | 
| 
428
by Martin Pool
 - Use AtomicFile to update statcache.  | 
114  | 
try:  | 
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
115  | 
outf.write(CACHE_HEADER + '\n')  | 
116  | 
||
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
117  | 
for entry in entries:  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
118  | 
if len(entry) != 8:  | 
119  | 
raise ValueError("invalid statcache entry tuple %r" % entry)  | 
|
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
120  | 
outf.write(entry[0].encode('utf-8')) # file id  | 
121  | 
outf.write('\0')  | 
|
| 
548
by Martin Pool
 - Write statcache using \u style encoding to avoid  | 
122  | 
outf.write(entry[1]) # hex sha1  | 
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
123  | 
outf.write('\0')  | 
124  | 
outf.write(entry[2].encode('utf-8')) # name  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
125  | 
for nf in entry[3:]:  | 
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
126  | 
outf.write('\0%d' % nf)  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
127  | 
outf.write('\n')  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
128  | 
|
| 
428
by Martin Pool
 - Use AtomicFile to update statcache.  | 
129  | 
outf.commit()  | 
130  | 
finally:  | 
|
131  | 
if not outf.closed:  | 
|
132  | 
outf.abort()  | 
|
| 
551
by Martin Pool
 - Don't fail if unable to update the statcache  | 
133  | 
|
134  | 
||
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
135  | 
def _try_write_cache(basedir, entries):  | 
| 
551
by Martin Pool
 - Don't fail if unable to update the statcache  | 
136  | 
try:  | 
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
137  | 
return _write_cache(basedir, entries)  | 
| 
551
by Martin Pool
 - Don't fail if unable to update the statcache  | 
138  | 
except IOError, e:  | 
139  | 
mutter("cannot update statcache in %s: %s" % (basedir, e))  | 
|
140  | 
except OSError, e:  | 
|
141  | 
mutter("cannot update statcache in %s: %s" % (basedir, e))  | 
|
142  | 
||
| 
362
by Martin Pool
 - Import stat-cache code  | 
143  | 
|
144  | 
||
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
145  | 
def load_cache(basedir):  | 
| 
537
by Martin Pool
 - file-ids are stored as quoted-printable in the stat cache,  | 
146  | 
import re  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
147  | 
cache = {}  | 
| 
540
by Martin Pool
 - use builtin set object in python2.4  | 
148  | 
seen_paths = {}  | 
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
149  | 
from bzrlib.trace import warning  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
150  | 
|
| 
637
by Martin Pool
 - add assertion  | 
151  | 
assert isinstance(basedir, basestring)  | 
152  | 
||
| 
537
by Martin Pool
 - file-ids are stored as quoted-printable in the stat cache,  | 
153  | 
sha_re = re.compile(r'[a-f0-9]{40}')  | 
154  | 
||
| 
362
by Martin Pool
 - Import stat-cache code  | 
155  | 
try:  | 
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
156  | 
cachefn = os.path.join(basedir, '.bzr', 'stat-cache')  | 
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
157  | 
cachefile = open(cachefn, 'rb')  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
158  | 
except IOError:  | 
159  | 
return cache  | 
|
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
160  | 
|
161  | 
line1 = cachefile.readline().rstrip('\r\n')  | 
|
162  | 
if line1 != CACHE_HEADER:  | 
|
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
163  | 
mutter('cache header marker not found at top of %s; discarding cache'  | 
164  | 
% cachefn)  | 
|
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
165  | 
return cache  | 
166  | 
||
| 
362
by Martin Pool
 - Import stat-cache code  | 
167  | 
for l in cachefile:  | 
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
168  | 
f = l.split('\0')  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
169  | 
|
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
170  | 
file_id = f[0].decode('utf-8')  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
171  | 
if file_id in cache:  | 
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
172  | 
warning("duplicated file_id in cache: {%s}" % file_id)  | 
| 
537
by Martin Pool
 - file-ids are stored as quoted-printable in the stat cache,  | 
173  | 
|
174  | 
text_sha = f[1]  | 
|
175  | 
if len(text_sha) != 40 or not sha_re.match(text_sha):  | 
|
176  | 
raise BzrCheckError("invalid file SHA-1 in cache: %r" % text_sha)  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
177  | 
|
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
178  | 
path = f[2].decode('utf-8')  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
179  | 
if path in seen_paths:  | 
| 
602
by Martin Pool
 - new statcache format: use nul field separators rather than  | 
180  | 
warning("duplicated path in cache: %r" % path)  | 
| 
540
by Martin Pool
 - use builtin set object in python2.4  | 
181  | 
seen_paths[path] = True  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
182  | 
|
| 
537
by Martin Pool
 - file-ids are stored as quoted-printable in the stat cache,  | 
183  | 
entry = (file_id, text_sha, path) + tuple([long(x) for x in f[3:]])  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
184  | 
if len(entry) != 8:  | 
185  | 
raise ValueError("invalid statcache entry tuple %r" % entry)  | 
|
186  | 
||
187  | 
cache[file_id] = entry  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
188  | 
return cache  | 
189  | 
||
190  | 
||
191  | 
||
192  | 
def _files_from_inventory(inv):  | 
|
193  | 
for path, ie in inv.iter_entries():  | 
|
194  | 
if ie.kind != 'file':  | 
|
195  | 
            continue
 | 
|
196  | 
yield ie.file_id, path  | 
|
197  | 
||
198  | 
||
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
199  | 
|
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
200  | 
def update_cache(basedir, inv, flush=False):  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
201  | 
"""Update and return the cache for the branch.  | 
202  | 
||
203  | 
    The returned cache may contain entries that have not been written
 | 
|
204  | 
    to disk for files recently touched.
 | 
|
205  | 
||
206  | 
    flush -- discard any previous cache and recalculate from scratch.
 | 
|
207  | 
    """
 | 
|
208  | 
||
| 
554
by Martin Pool
 - clean up statcache code  | 
209  | 
    # load the existing cache; use information there to find a list of
 | 
210  | 
    # files ordered by inode, which is alleged to be the fastest order
 | 
|
211  | 
    # to stat the files.
 | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
212  | 
|
| 
554
by Martin Pool
 - clean up statcache code  | 
213  | 
to_update = _files_from_inventory(inv)  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
214  | 
|
| 
438
by Martin Pool
 - Avoid calling Inventory.iter_entries() when finding modified  | 
215  | 
assert isinstance(flush, bool)  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
216  | 
if flush:  | 
217  | 
cache = {}  | 
|
218  | 
else:  | 
|
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
219  | 
cache = load_cache(basedir)  | 
| 
554
by Martin Pool
 - clean up statcache code  | 
220  | 
|
221  | 
by_inode = []  | 
|
222  | 
without_inode = []  | 
|
223  | 
for file_id, path in to_update:  | 
|
224  | 
if file_id in cache:  | 
|
225  | 
by_inode.append((cache[file_id][SC_INO], file_id, path))  | 
|
226  | 
else:  | 
|
227  | 
without_inode.append((file_id, path))  | 
|
228  | 
by_inode.sort()  | 
|
229  | 
||
230  | 
to_update = [a[1:] for a in by_inode] + without_inode  | 
|
231  | 
||
232  | 
stat_cnt = missing_cnt = new_cnt = hardcheck = change_cnt = 0  | 
|
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
233  | 
|
| 
549
by Martin Pool
 doc  | 
234  | 
    # dangerfiles have been recently touched and can't be committed to
 | 
235  | 
    # a persistent cache yet, but they are returned to the caller.
 | 
|
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
236  | 
dangerfiles = []  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
237  | 
|
238  | 
now = int(time.time())  | 
|
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
239  | 
|
| 
458
by Martin Pool
 - fix statcache update from subdirectories  | 
240  | 
    ## mutter('update statcache under %r' % basedir)
 | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
241  | 
for file_id, path in to_update:  | 
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
242  | 
abspath = os.path.join(basedir, path)  | 
| 
458
by Martin Pool
 - fix statcache update from subdirectories  | 
243  | 
fp = fingerprint(abspath)  | 
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
244  | 
stat_cnt += 1  | 
245  | 
||
| 
362
by Martin Pool
 - Import stat-cache code  | 
246  | 
cacheentry = cache.get(file_id)  | 
247  | 
||
248  | 
if fp == None: # not here  | 
|
249  | 
if cacheentry:  | 
|
250  | 
del cache[file_id]  | 
|
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
251  | 
change_cnt += 1  | 
252  | 
missing_cnt += 1  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
253  | 
            continue
 | 
| 
554
by Martin Pool
 - clean up statcache code  | 
254  | 
elif not cacheentry:  | 
255  | 
new_cnt += 1  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
256  | 
|
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
257  | 
if (fp[FP_MTIME] >= now) or (fp[FP_CTIME] >= now):  | 
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
258  | 
dangerfiles.append(file_id)  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
259  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
260  | 
if cacheentry and (cacheentry[3:] == fp):  | 
261  | 
continue # all stat fields unchanged  | 
|
262  | 
||
263  | 
hardcheck += 1  | 
|
264  | 
||
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
265  | 
dig = sha.new(file(abspath, 'rb').read()).hexdigest()  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
266  | 
|
| 
486
by Martin Pool
 - Write out statcache if any files are updated, even  | 
267  | 
        # We update the cache even if the digest has not changed from
 | 
268  | 
        # last time we looked, so that the fingerprint fields will
 | 
|
269  | 
        # match in future.
 | 
|
270  | 
cacheentry = (file_id, dig, path) + fp  | 
|
271  | 
cache[file_id] = cacheentry  | 
|
272  | 
change_cnt += 1  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
273  | 
|
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
274  | 
mutter('statcache: statted %d files, read %d files, %d changed, %d dangerous, '  | 
| 
554
by Martin Pool
 - clean up statcache code  | 
275  | 
'%d deleted, %d new, '  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
276  | 
'%d in cache'  | 
| 
554
by Martin Pool
 - clean up statcache code  | 
277  | 
% (stat_cnt, hardcheck, change_cnt, len(dangerfiles),  | 
278  | 
missing_cnt, new_cnt, len(cache)))  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
279  | 
|
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
280  | 
if change_cnt:  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
281  | 
mutter('updating on-disk statcache')  | 
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
282  | 
|
283  | 
if dangerfiles:  | 
|
284  | 
safe_cache = cache.copy()  | 
|
285  | 
for file_id in dangerfiles:  | 
|
286  | 
del safe_cache[file_id]  | 
|
287  | 
else:  | 
|
288  | 
safe_cache = cache  | 
|
289  | 
||
290  | 
_try_write_cache(basedir, safe_cache.itervalues())  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
291  | 
|
292  | 
return cache  |