/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
1
# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""A manager of caches."""
18
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
19
import atexit
20
import os
21
import shutil
22
import tempfile
23
import weakref
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
24
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
25
from bzrlib import lru_cache, trace
0.123.6 by Jelmer Vernooij
Split out reftracker.
26
from bzrlib.plugins.fastimport import (
27
    branch_mapper,
28
    )
0.123.3 by Jelmer Vernooij
Fix some imports.
29
from fastimport.helpers import (
30
    single_plural,
31
    )
0.64.279 by Jelmer Vernooij
Merge split of python-fastimport into a separate package.
32
from fastimport.reftracker import (
0.123.6 by Jelmer Vernooij
Split out reftracker.
33
    RefTracker,
34
    )
35
36
37
class _Cleanup(object):
    """This class makes sure we clean up when CacheManager goes away.

    We use a helper class to ensure that we are never in a refcycle.

    :ivar disk_blobs: mapping whose values are tuples ending in a file name,
        or in None when the entry has no file of its own; reset to None once
        the files have been removed
    :ivar tempdir: directory to remove recursively, or None
    :ivar small_blobs: open file object to close, or None
    """

    def __init__(self, disk_blobs):
        """Remember the mapping of on-disk blobs that must be removed.

        :param disk_blobs: mapping as described for the ``disk_blobs``
            attribute; it is kept by reference, not copied
        """
        self.disk_blobs = disk_blobs
        self.tempdir = None
        self.small_blobs = None

    def __del__(self):
        self.finalize()

    def finalize(self):
        """Delete the blob files, close the shared file, remove the tempdir.

        Calling this more than once is harmless: every resource is forgotten
        as soon as it has been released, and files or directories that have
        already disappeared are skipped instead of raising.

        :raises OSError: if a blob file still exists but cannot be removed
        """
        if self.disk_blobs is not None:
            # values() rather than itervalues() so this runs on Python 2 and 3
            for info in self.disk_blobs.values():
                fname = info[-1]
                if fname is None:
                    continue
                try:
                    os.unlink(fname)
                except OSError:
                    # Something else may have removed the file (or the whole
                    # directory) before us; only a file that is still there
                    # is a real failure.
                    if os.path.exists(fname):
                        raise
            self.disk_blobs = None
        if self.small_blobs is not None:
            self.small_blobs.close()
            self.small_blobs = None
        if self.tempdir is not None:
            # Forget the directory before removing it so that a later call
            # (e.g. from __del__ after an explicit finalize) is a no-op.
            tempdir, self.tempdir = self.tempdir, None
            if os.path.isdir(tempdir):
                shutil.rmtree(tempdir)
89
0.115.8 by John Arbash Meinel
Dump sticky blobs to disk when memory pressure gets high.
90
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
91
class CacheManager(object):
    """Hold the blob, inventory and revision-id caches in one place.

    Blobs are kept in memory until the sticky ones grow past
    ``_sticky_cache_size`` bytes; the largest are then written to a
    temporary directory until at most ``_sticky_flushed_size`` bytes remain
    in memory.
    """

    # Flushed blobs smaller than this many bytes are appended to one shared
    # temporary file; larger ones each get a file of their own.
    _small_blob_threshold = 25*1024
    # Flushing to disk starts once the sticky blobs held in memory exceed
    # this many bytes ...
    _sticky_cache_size = 300*1024*1024
    # ... and stops once they are down to this many bytes or fewer.
    _sticky_flushed_size = 100*1024*1024

    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Create a manager of caches.

        :param info: a ConfigObj holding the output from
            the --info processor, or None if no hints are available
        :param verbose: stored on the instance as ``self.verbose``
        :param inventory_cache_size: size passed to the LRU cache that
            holds inventories
        """
        self.verbose = verbose

        # dataref -> data. dataref is either :mark or the sha-1.
        # Sticky blobs are referenced more than once, and are saved until
        # their refcount goes to 0
        self._blobs = {}
        self._sticky_blobs = {}
        self._sticky_memory_bytes = 0
        # if we overflow our memory cache, then we will dump large blobs to
        # disk in this directory
        self._tempdir = None
        # id => (offset, n_bytes, fname)
        #   if fname is None, then the content is stored in the small file
        self._disk_blobs = {}
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory cache
        # these are large and we probably don't need too many as
        # most parents are recent in history
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # import commit-ids -> revision-id lookup table
        # we need to keep all of these but they are small
        self.revision_ids = {}

        # blob id -> number of references still expected. An empty mapping
        # means no counts are known, in which case every blob is sticky.
        self._blob_ref_counts = {}
        if info is not None:
            try:
                blobs_by_counts = info['Blob reference counts']
            except KeyError:
                # info not in file - possible when no blobs used
                pass
            else:
                # The parser hands values back as lists, already parsed
                for count, blob_list in blobs_by_counts.items():
                    n = int(count)
                    for b in blob_list:
                        self._blob_ref_counts[b] = n

        # BranchMapper has no state (for now?), but we keep it around rather
        # than reinstantiate on every usage
        self.branch_mapper = branch_mapper.BranchMapper()

        self.reftracker = RefTracker()

    def dump_stats(self, note=trace.note):
        """Dump some statistics about what we cached.

        :param note: callable taking a single message string; every line of
            output is passed to it
        """
        # TODO: add in inventory statistics
        note("Cache statistics:")
        self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
        self._show_stats_for(self.revision_ids, "revision-ids", note=note)
        # The non-sticky blobs (self._blobs) and the reftracker aren't
        # interesting, so they are omitted from the output, at least for now.

    def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
        """Dump statistics about a given dictionary.

        Both the keys and the values need to support len().

        :param dict: the mapping to report on
        :param label: name shown at the start of the output line
        :param note: callable taking a single message string
        :param tuple_key: if True, each key is a tuple of strings and its
            size is the length of the parts joined together
        """
        count = len(dict)
        if tuple_key:
            size = sum(map(len, (''.join(k) for k in dict.keys())))
        else:
            size = sum(map(len, dict.keys()))
        size += sum(map(len, dict.values()))
        # Report in K, switching to M and then G as the number grows
        size = size * 1.0 / 1024
        unit = 'K'
        if size > 1024:
            size = size / 1024
            unit = 'M'
            if size > 1024:
                size = size / 1024
                unit = 'G'
        note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
            single_plural(count, "item", "items")))

    def clear_all(self):
        """Free up any memory used by the caches."""
        self._blobs.clear()
        self._sticky_blobs.clear()
        # No sticky blob is held in memory any more; without this reset the
        # stale total would make a later store_blob() flush a cache that is
        # in fact empty.
        self._sticky_memory_bytes = 0
        self.revision_ids.clear()
        self.reftracker.clear()
        self.inventories.clear()

    def _flush_blobs_to_disk(self):
        """Move the largest sticky blobs from memory to temporary files.

        Blobs are written, largest first, until no more than
        ``_sticky_flushed_size`` bytes of sticky blobs remain in memory.
        The temporary directory and the shared small-blob file are created
        on first use and registered for removal at interpreter exit.
        """
        sticky_blobs = self._sticky_blobs
        total_blobs = len(sticky_blobs)
        # Smallest first, so that pop() hands back the largest one left.
        # sorted() instead of keys().sort() keeps this working on Python 3.
        blobs = sorted(sticky_blobs, key=lambda k: len(sticky_blobs[k]))
        if self._tempdir is None:
            tempdir = tempfile.mkdtemp(prefix='fastimport_blobs-')
            self._tempdir = tempdir
            self._cleanup.tempdir = self._tempdir
            self._cleanup.small_blobs = tempfile.TemporaryFile(
                prefix='small-blobs-', dir=self._tempdir)
            small_blob_ref = weakref.ref(self._cleanup.small_blobs)
            # Even though we add it to _Cleanup it seems that the object can be
            # destroyed 'too late' for cleanup to actually occur. Probably a
            # combination of bzr's "die directly, don't clean up" and how
            # exceptions close the running stack.
            def exit_cleanup():
                small_blob = small_blob_ref()
                if small_blob is not None:
                    small_blob.close()
                shutil.rmtree(tempdir, ignore_errors=True)
            atexit.register(exit_cleanup)
        count = 0
        flushed_bytes = 0
        n_small_bytes = 0
        # 'blobs and' guards against popping from an empty list should the
        # byte total ever be larger than what is really held in memory.
        while blobs and self._sticky_memory_bytes > self._sticky_flushed_size:
            blob_id = blobs.pop()
            blob = sticky_blobs.pop(blob_id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                f = self._cleanup.small_blobs
                # fetch_blob() may have moved the file position; always
                # append at the end and remember where the blob starts.
                f.seek(0, os.SEEK_END)
                self._disk_blobs[blob_id] = (f.tell(), n_bytes, None)
                f.write(blob)
                n_small_bytes += n_bytes
            else:
                fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
                # Wrap the descriptor in a file object: it is then closed
                # even if the write fails, and the whole blob is written
                # (a bare os.write() may write fewer bytes than asked).
                blob_file = os.fdopen(fd, 'wb')
                try:
                    blob_file.write(blob)
                finally:
                    blob_file.close()
                self._disk_blobs[blob_id] = (0, n_bytes, name)
            flushed_bytes += n_bytes
            del blob
            count += 1
        if not sticky_blobs:
            # Everything went to disk, so nothing is left in memory.
            self._sticky_memory_bytes = 0
        trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, total_blobs, flushed_bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))

    def store_blob(self, id, data):
        """Store a blob of data.

        :param id: the dataref the blob will later be fetched by
        :param data: the blob content, as a string
        """
        # Note: If we're not reference counting, everything has to be sticky
        if not self._blob_ref_counts or id in self._blob_ref_counts:
            previous = self._sticky_blobs.get(id)
            if previous is not None:
                # Replacing an in-memory blob: stop counting the old content
                self._sticky_memory_bytes -= len(previous)
            self._sticky_blobs[id] = data
            self._sticky_memory_bytes += len(data)
            if self._sticky_memory_bytes > self._sticky_cache_size:
                self._flush_blobs_to_disk()
        elif data == '' or data == b'':
            # Empty data is always sticky
            self._sticky_blobs[id] = data
        else:
            self._blobs[id] = data

    def _decref(self, id, cache, fn):
        """Drop one reference to a blob, discarding it after the last one.

        :param id: the dataref of the blob
        :param cache: the mapping currently holding the blob's entry
        :param fn: name of the file backing the blob, removed together with
            the entry, or None if there is no such file
        :return: True if the entry was removed from ``cache``, False if it
            was kept (including when no reference counts are known)
        """
        if not self._blob_ref_counts:
            return False
        count = self._blob_ref_counts.get(id, None)
        if count is not None:
            count -= 1
            if count <= 0:
                del cache[id]
                if fn is not None:
                    os.unlink(fn)
                del self._blob_ref_counts[id]
                return True
            else:
                self._blob_ref_counts[id] = count
        return False

    def fetch_blob(self, id):
        """Fetch a blob of data.

        Non-sticky blobs are forgotten as soon as they are fetched. Blobs
        on disk or in the sticky cache are kept until their reference count,
        if one is known, drops to zero.

        :param id: the dataref the blob was stored under
        :return: the blob content
        :raises KeyError: if no blob is stored under ``id``
        """
        if id in self._blobs:
            return self._blobs.pop(id)
        if id in self._disk_blobs:
            (offset, n_bytes, fn) = self._disk_blobs[id]
            if fn is None:
                # Small blob: a slice of the shared small-blob file
                f = self._cleanup.small_blobs
                f.seek(offset)
                content = f.read(n_bytes)
            else:
                fp = open(fn, 'rb')
                try:
                    content = fp.read()
                finally:
                    fp.close()
            self._decref(id, self._disk_blobs, fn)
            return content
        content = self._sticky_blobs[id]
        if self._decref(id, self._sticky_blobs, None):
            self._sticky_memory_bytes -= len(content)
        return content
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
291
0.123.6 by Jelmer Vernooij
Split out reftracker.
292