1
# (C) 2005 Canonical Ltd
1
# Copyright (C) 2005, 2006 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
5
5
# the Free Software Foundation; either version 2 of the License, or
6
6
# (at your option) any later version.
8
8
# This program is distributed in the hope that it will be useful,
9
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
# GNU General Public License for more details.
13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
23
# TODO: Perhaps return more details on the file to avoid statting it
24
24
# again: nonexistent, file type, size, etc
26
# TODO: Perhaps use a Python pickle instead of a text file; might be faster.
28
30
CACHE_HEADER = "### bzr hashcache v5\n"
30
32
import os, stat, time
32
from bzrlib.osutils import sha_file
35
from bzrlib.filters import sha_file_by_name
36
from bzrlib.osutils import pathjoin, safe_unicode
33
37
from bzrlib.trace import mutter, warning
37
def _fingerprint(abspath):
39
fs = os.lstat(abspath)
41
# might be missing, etc
44
if stat.S_ISDIR(fs.st_mode):
47
# we discard any high precision because it's not reliable; perhaps we
48
# could do better on some systems?
49
return (fs.st_size, long(fs.st_mtime),
50
long(fs.st_ctime), fs.st_ino, fs.st_dev)
38
from bzrlib.atomicfile import AtomicFile
39
from bzrlib.errors import BzrError
53
48
class HashCache(object):
111
106
self.needs_write = True
116
110
"""Scan all files and remove entries where the cache entry is obsolete.
118
112
Obsolete entries are those where the file has been modified or deleted
119
113
since the entry was inserted.
115
# FIXME optimisation opportunity, on Linux [and check other OSes]:
116
# rather than iteritems order, stat in inode order.
121
117
prep = [(ce[1][3], path, ce) for (path, ce) in self._cache.iteritems()]
124
120
for inum, path, cache_entry in prep:
125
abspath = os.sep.join([self.basedir, path])
126
fp = _fingerprint(abspath)
121
abspath = pathjoin(self.root, path)
122
fp = self._fingerprint(abspath)
127
123
self.stat_count += 1
129
125
cache_fp = cache_entry[1]
134
130
self.needs_write = True
135
131
del self._cache[path]
139
def get_sha1(self, path):
133
def get_sha1(self, path, stat_value=None):
140
134
"""Return the sha1 of a file.
142
abspath = os.sep.join([self.basedir, path])
136
if path.__class__ is str:
137
abspath = pathjoin(self.root_utf8, path)
139
abspath = pathjoin(self.root, path)
143
140
self.stat_count += 1
144
file_fp = _fingerprint(abspath)
141
file_fp = self._fingerprint(abspath, stat_value)
147
144
# not a regular file or not existing
157
154
cache_sha1, cache_fp = None, None
159
156
if cache_fp == file_fp:
157
## mutter("hashcache hit for %s %r -> %s", path, file_fp, cache_sha1)
158
## mutter("now = %s", time.time())
160
159
self.hit_count += 1
161
160
return cache_sha1
163
162
self.miss_count += 1
164
digest = sha_file(file(abspath, 'rb', buffering=65000))
166
now = int(time.time())
167
if file_fp[1] >= now or file_fp[2] >= now:
164
mode = file_fp[FP_MODE_COLUMN]
165
if stat.S_ISREG(mode):
166
digest = self._really_sha1_file(abspath)
167
elif stat.S_ISLNK(mode):
168
digest = sha.new(os.readlink(abspath)).hexdigest()
170
raise BzrError("file %r: unknown file stat mode: %o"%(abspath,mode))
172
# window of 3 seconds to allow for 2s resolution on Windows,
173
# unsynchronized file servers, etc.
174
cutoff = self._cutoff_time()
175
if file_fp[FP_MTIME_COLUMN] >= cutoff \
176
or file_fp[FP_CTIME_COLUMN] >= cutoff:
168
177
# changed too recently; can't be cached. we can
169
178
# return the result and it could possibly be cached
171
self.danger_count += 1
181
# the point is that we only want to cache when we are sure that any
182
# subsequent modifications of the file can be detected. If a
183
# modification neither changes the inode, the device, the size, nor
184
# the mode, then we can only distinguish it by time; therefore we
185
# need to let sufficient time elapse before we may cache this entry
186
# again. If we didn't do this, then, for example, a very quick 1
187
# byte replacement in the file might go undetected.
188
## mutter('%r modified too recently; not caching', path)
189
self.danger_count += 1
173
191
self.removed_count += 1
174
192
self.needs_write = True
175
193
del self._cache[path]
195
## mutter('%r added to cache: now=%f, mtime=%d, ctime=%d',
196
## path, time.time(), file_fp[FP_MTIME_COLUMN],
197
## file_fp[FP_CTIME_COLUMN])
177
198
self.update_count += 1
178
199
self.needs_write = True
179
200
self._cache[path] = (digest, file_fp)
203
def _really_sha1_file(self, abspath):
    """Compute the SHA1 of the file at abspath from its complete contents.

    The hashing itself is delegated to sha_file_by_name; whatever that
    helper returns is handed back unchanged.
    """
    digest = sha_file_by_name(abspath)
    return digest
187
208
"""Write contents of cache to file."""
188
from atomicfile import AtomicFile
190
outf = AtomicFile(self.cache_file_name(), 'wb')
209
outf = AtomicFile(self.cache_file_name(), 'wb', new_mode=self._mode)
192
print >>outf, CACHE_HEADER,
211
outf.write(CACHE_HEADER)
194
213
for path, c in self._cache.iteritems():
195
214
assert '//' not in path, path
196
outf.write(path.encode('utf-8'))
198
print >>outf, c[0], # hex sha1
200
print >>outf, "%d" % fld,
215
line_info = [path.encode('utf-8'), '// ', c[0], ' ']
216
line_info.append(' '.join([str(fld) for fld in c[1]]))
217
line_info.append('\n')
218
outf.write(''.join(line_info))
204
220
self.needs_write = False
221
## mutter("write hash cache: %s hits=%d misses=%d stat=%d recent=%d updates=%d",
222
## self.cache_file_name(), self.hit_count, self.miss_count,
224
## self.danger_count, self.update_count)
212
229
"""Reinstate cache from file.
254
273
self._cache[path] = (sha1, fp)
256
275
self.needs_write = False
277
def _cutoff_time(self):
    """Return cutoff time.

    Files modified more recently than this time are at risk of being
    undetectably modified and so can't be cached.

    The value is the current time truncated to whole seconds, minus a
    3 second safety window.
    """
    # window of 3 seconds to allow for 2s timestamp resolution on Windows,
    # unsynchronized file servers, etc.
    return int(time.time()) - 3
285
def _fingerprint(self, abspath, stat_value=None):
286
if stat_value is None:
288
stat_value = os.lstat(abspath)
290
# might be missing, etc
292
if stat.S_ISDIR(stat_value.st_mode):
294
# we discard any high precision because it's not reliable; perhaps we
295
# could do better on some systems?
296
return (stat_value.st_size, long(stat_value.st_mtime),
297
long(stat_value.st_ctime), stat_value.st_ino,
298
stat_value.st_dev, stat_value.st_mode)