/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""A manager of caches."""
18
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
19
import atexit
20
import os
21
import shutil
22
import tempfile
23
import time
24
import weakref
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
25
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
26
from bzrlib import lru_cache, trace
0.112.4 by Max Bowsher
Store the BranchMapper in the CacheManager so it can be got from other places.
27
from bzrlib.plugins.fastimport import branch_mapper, helpers
0.64.118 by Ian Clatworthy
fix lru_cache import
28
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
29
30
class _Cleanup(object):
31
    """This class makes sure we clean up when CacheManager goes away.
32
33
    We use a helper class to ensure that we are never in a refcycle.
34
    """
35
36
    def __init__(self, disk_blobs):
37
        self.disk_blobs = disk_blobs
38
        self.tempdir = None
39
        self.small_blobs = None
40
41
    def __del__(self):
42
        self.finalize()
43
44
    def finalize(self):
45
        if self.disk_blobs is not None:
46
            for info in self.disk_blobs.itervalues():
47
                if info[-1] is not None:
48
                    os.unlink(info[-1])
49
            self.disk_blobs = None
50
        if self.small_blobs is not None:
51
            self.small_blobs.close()
52
            self.small_blobs = None
53
        if self.tempdir is not None:
54
            shutils.rmtree(self.tempdir)
55
        
56
0.115.8 by John Arbash Meinel
Dump sticky blobs to disk when memory pressure gets high.
57
58
class _Cleanup(object):
59
    """This class makes sure we clean up when CacheManager goes away.
60
61
    We use a helper class to ensure that we are never in a refcycle.
62
    """
63
64
    def __init__(self, disk_blobs):
65
        self.disk_blobs = disk_blobs
66
        self.tempdir = None
67
        self.small_blobs = None
68
69
    def __del__(self):
70
        self.finalize()
71
72
    def finalize(self):
73
        if self.disk_blobs is not None:
74
            for info in self.disk_blobs.itervalues():
0.115.9 by John Arbash Meinel
Switch to closing the large-content blobs that we store to disk.
75
                if info[-1] is not None:
76
                    os.unlink(info[-1])
0.115.8 by John Arbash Meinel
Dump sticky blobs to disk when memory pressure gets high.
77
            self.disk_blobs = None
78
        if self.small_blobs is not None:
79
            self.small_blobs.close()
80
            self.small_blobs = None
81
        if self.tempdir is not None:
82
            shutils.rmtree(self.tempdir)
83
        
84
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
85
class CacheManager(object):
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
86
    
87
    # Blobs shorter than this many bytes are appended to one shared temporary
    # file when flushed to disk; larger ones each get a file of their own.
    _small_blob_threshold = 25 * 1024
    # Once the sticky blobs held in memory exceed this many bytes, storing a
    # blob triggers a flush to disk.
    _sticky_cache_size = 300 * 1024 * 1024
    # A flush keeps moving blobs to disk until the in-memory total is no
    # larger than this many bytes.
    _sticky_flushed_size = 100 * 1024 * 1024
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
90
0.83.1 by Ian Clatworthy
head tracking tests and fix
91
    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Create a manager of caches, with every cache starting out empty.

        :param info: a ConfigObj holding the output from
            the --info processor, or None if no hints are available
        :param verbose: stored on the instance as ``self.verbose``
        :param inventory_cache_size: argument handed to the LRU cache that
            holds inventories
        """
        self.verbose = verbose

        # Blob storage: dataref -> data, where a dataref is either a :mark
        # or the sha-1.  Sticky blobs are referenced more than once and are
        # kept until their reference count goes to 0.
        self._blobs = {}
        self._sticky_blobs = {}
        self._sticky_memory_bytes = 0
        # Directory that blobs are dumped into if the in-memory cache
        # overflows; it is only created when a flush first needs it.
        self._tempdir = None
        # id => (offset, n_bytes, fname); a fname of None means the content
        # lives in the shared small-blob file.
        self._disk_blobs = {}
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory.  These are large and we probably don't
        # need too many as most parents are recent in history.
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # Import commit-id -> revision-id lookup table.  All of these need
        # to be kept, but they are small.
        self.revision_ids = {}

        # NOTE(review): a "(path, branch_ref) -> file-ids" cache was
        # documented at this point, but nothing is created for it here.

        # Head tracking: last ref, last id per ref and map of commit ids to
        # ref*s*.
        self.last_ref = None
        self.last_ids = {}
        self.heads = {}

        # dataref -> number of references.  Left empty when no counts are
        # available, in which case store_blob treats every blob as sticky.
        self._blob_ref_counts = {}
        if info is not None:
            try:
                counts_section = info['Blob reference counts']
                # The parser hands the values back as lists, already parsed.
                for count_text, datarefs in counts_section.items():
                    self._blob_ref_counts.update(
                        dict.fromkeys(datarefs, int(count_text)))
            except KeyError:
                # Section not in the file - possible when no blobs are used.
                pass

        # BranchMapper has no state (for now?), but it is kept around rather
        # than being reinstantiated on every use.
        self.branch_mapper = branch_mapper.BranchMapper()
147
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
148
    def dump_stats(self, note=trace.note):
        """Report statistics about what has been cached.

        :param note: callable that receives each line of output; defaults
            to trace.note
        """
        # TODO: add in inventory statistics
        note("Cache statistics:")
        reported = (
            (self._sticky_blobs, "sticky blobs"),
            (self.revision_ids, "revision-ids"),
            # These aren't interesting so they are left out of the output,
            # at least for now:
            #   (self._blobs, "other blobs"),
            #   (self.last_ids, "last-ids"),
            #   (self.heads, "heads"),
        )
        for cache, label in reported:
            self._show_stats_for(cache, label, note=note)
158
0.64.159 by Ian Clatworthy
make the file-id cache optional and branch-ref aware
159
    def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
        """Report the item count and approximate size of one dictionary.

        Both the keys and the values need to support len().

        :param dict: the dictionary to measure
        :param label: name shown at the start of the output line
        :param note: callable that receives the formatted line
        :param tuple_key: if True, each key is a sequence of strings whose
            parts are joined together before being measured
        """
        count = len(dict)
        if tuple_key:
            key_size = sum(len(''.join(k)) for k in dict.keys())
        else:
            key_size = sum(len(k) for k in dict.keys())
        value_size = sum(len(v) for v in dict.values())

        # Start in kilobytes and step up a unit (at most twice) while the
        # figure is still above 1024.
        size = (key_size + value_size) * 1.0 / 1024
        unit = 'K'
        for bigger_unit in ('M', 'G'):
            if size <= 1024:
                break
            size = size / 1024
            unit = bigger_unit

        item_word = helpers.single_plural(count, "item", "items")
        note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
            item_word))
180
181
    def clear_all(self):
182
        """Free up any memory used by the caches."""
183
        self._blobs.clear()
184
        self._sticky_blobs.clear()
185
        self.revision_ids.clear()
186
        self.last_ids.clear()
187
        self.heads.clear()
188
        self.inventories.clear()
189
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
190
    def _flush_blobs_to_disk(self):
        """Move sticky blobs from memory to temporary files, largest first.

        Blobs are moved until the in-memory total is no larger than
        ``_sticky_flushed_size`` or no in-memory sticky blob is left.  Blobs
        shorter than ``_small_blob_threshold`` are appended to one shared
        temporary file; every other blob gets a file of its own.  Where each
        blob went is recorded in ``_disk_blobs`` as (offset, n_bytes, fname),
        with fname None for the shared file.  A summary is reported through
        trace.note.
        """
        sticky_blobs = self._sticky_blobs
        total_blobs = len(sticky_blobs)
        # Ascending by size, so that pop() below always takes the largest
        # blob still in memory.
        blobs = sorted(sticky_blobs, key=lambda k: len(sticky_blobs[k]))
        if self._tempdir is None:
            tempdir = tempfile.mkdtemp(prefix='bzr_fastimport_blobs-')
            self._tempdir = tempdir
            self._cleanup.tempdir = self._tempdir
            self._cleanup.small_blobs = tempfile.TemporaryFile(
                prefix='small-blobs-', dir=self._tempdir)
            small_blob_ref = weakref.ref(self._cleanup.small_blobs)
            # Even though we add it to _Cleanup it seems that the object can be
            # destroyed 'too late' for cleanup to actually occur. Probably a
            # combination of bzr's "die directly, don't clean up" and how
            # exceptions close the running stack.
            def exit_cleanup():
                small_blob = small_blob_ref()
                if small_blob is not None:
                    small_blob.close()
                shutil.rmtree(tempdir, ignore_errors=True)
            atexit.register(exit_cleanup)
        count = 0
        n_flushed_bytes = 0
        n_small_bytes = 0
        # Also stop when nothing is left to move: if the running total has
        # drifted from the real contents, popping from an empty list would
        # raise IndexError.
        while blobs and self._sticky_memory_bytes > self._sticky_flushed_size:
            blob_id = blobs.pop()
            blob = sticky_blobs.pop(blob_id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                f = self._cleanup.small_blobs
                f.seek(0, os.SEEK_END)
                self._disk_blobs[blob_id] = (f.tell(), n_bytes, None)
                f.write(blob)
                n_small_bytes += n_bytes
            else:
                fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
                # Write through a file object: a bare os.write() may write
                # only part of a large blob, and the descriptor has to be
                # closed even if the write fails.
                blob_file = os.fdopen(fd, 'wb')
                try:
                    blob_file.write(blob)
                finally:
                    blob_file.close()
                self._disk_blobs[blob_id] = (0, n_bytes, name)
            n_flushed_bytes += n_bytes
            del blob
            count += 1
        if not sticky_blobs:
            # Nothing is held in memory any more, so any remaining total is
            # accounting drift; clear it so later stores do not keep
            # triggering flushes.
            self._sticky_memory_bytes = 0
        trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, total_blobs, n_flushed_bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))
237
        
238
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
239
    def store_blob(self, id, data):
240
        """Store a blob of data."""
0.64.169 by Ian Clatworthy
fix blob tracking when -v not given
241
        # Note: If we're not reference counting, everything has to be sticky
242
        if not self._blob_ref_counts or id in self._blob_ref_counts:
243
            self._sticky_blobs[id] = data
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
244
            self._sticky_memory_bytes += len(data)
245
            if self._sticky_memory_bytes > self._sticky_cache_size:
246
                self._flush_blobs_to_disk()
0.64.169 by Ian Clatworthy
fix blob tracking when -v not given
247
        elif data == '':
248
            # Empty data is always sticky
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
249
            self._sticky_blobs[id] = data
250
        else:
251
            self._blobs[id] = data
252
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
253
    def _decref(self, id, cache, fn):
254
        if not self._blob_ref_counts:
255
            return False
256
        count = self._blob_ref_counts.get(id, None)
257
        if count is not None:
258
            count -= 1
259
            if count <= 0:
260
                del cache[id]
261
                if fn is not None:
262
                    os.unlink(fn)
263
                del self._blob_ref_counts[id]
264
                return True
265
            else:
266
                self._blob_ref_counts[id] = count
267
        return False
268
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
269
    def fetch_blob(self, id):
270
        """Fetch a blob of data."""
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
271
        if id in self._blobs:
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
272
            return self._blobs.pop(id)
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
273
        if id in self._disk_blobs:
274
            (offset, n_bytes, fn) = self._disk_blobs[id]
275
            if fn is None:
276
                f = self._cleanup.small_blobs
277
                f.seek(offset)
278
                content = f.read(n_bytes)
279
            else:
280
                fp = open(fn, 'rb')
281
                try:
282
                    content = fp.read()
283
                finally:
284
                    fp.close()
285
            self._decref(id, self._disk_blobs, fn)
286
            return content
287
        content = self._sticky_blobs[id]
288
        if self._decref(id, self._sticky_blobs, None):
289
            self._sticky_memory_bytes -= len(content)
290
        return content
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
291
0.81.1 by Ian Clatworthy
move GenericCommitHandler into its own module in prep for a delta-based one
292
    def track_heads(self, cmd):
293
        """Track the repository heads given a CommitCommand.
294
        
295
        :param cmd: the CommitCommand
296
        :return: the list of parents in terms of commit-ids
297
        """
298
        # Get the true set of parents
299
        if cmd.from_ is not None:
300
            parents = [cmd.from_]
301
        else:
302
            last_id = self.last_ids.get(cmd.ref)
303
            if last_id is not None:
304
                parents = [last_id]
305
            else:
306
                parents = []
307
        parents.extend(cmd.merges)
308
309
        # Track the heads
310
        self.track_heads_for_ref(cmd.ref, cmd.id, parents)
311
        return parents
312
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
313
    def track_heads_for_ref(self, cmd_ref, cmd_id, parents=None):
314
        if parents is not None:
315
            for parent in parents:
0.83.1 by Ian Clatworthy
head tracking tests and fix
316
                if parent in self.heads:
317
                    del self.heads[parent]
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
318
        self.heads.setdefault(cmd_id, set()).add(cmd_ref)
319
        self.last_ids[cmd_ref] = cmd_id
320
        self.last_ref = cmd_ref