/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""A manager of caches."""
18
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
19
import atexit
20
import os
21
import shutil
22
import tempfile
23
import time
24
import weakref
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
25
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
26
from bzrlib import lru_cache, trace
0.112.4 by Max Bowsher
Store the BranchMapper in the CacheManager so it can be got from other places.
27
from bzrlib.plugins.fastimport import branch_mapper, helpers
0.64.118 by Ian Clatworthy
fix lru_cache import
28
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
29
30
class _Cleanup(object):
    """Release on-disk blob storage when its CacheManager goes away.

    The state to clean up lives in this helper, rather than on the
    CacheManager itself, so that the object defining __del__ is never
    part of a reference cycle.
    """

    def __init__(self, disk_blobs):
        """Remember the resources that will need releasing.

        :param disk_blobs: a dict whose values are tuples ending in the
            name of a file to delete, or in None when there is no
            separate file for that entry
        """
        self.disk_blobs = disk_blobs
        # Directory to remove on cleanup; assigned by the owner once it
        # has actually been created.
        self.tempdir = None
        # File object to close on cleanup; assigned by the owner.
        self.small_blobs = None

    def __del__(self):
        self.finalize()

    def finalize(self):
        """Delete the blob files, close the shared file, remove the dir.

        Each resource is forgotten once it has been released, so calling
        this again (for example explicitly and then via __del__) is a
        no-op.
        """
        if self.disk_blobs is not None:
            for info in self.disk_blobs.values():
                if info[-1] is not None:
                    os.unlink(info[-1])
            self.disk_blobs = None
        if self.small_blobs is not None:
            self.small_blobs.close()
            self.small_blobs = None
        if self.tempdir is not None:
            shutil.rmtree(self.tempdir)
            # Forget the directory so a second call does not try to
            # remove it again.
            self.tempdir = None
55
        
56
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
57
class CacheManager(object):
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
58
    
59
    _small_blob_threshold = 25*1024
60
    _sticky_cache_size = 300*1024*1024
61
    _sticky_flushed_size = 100*1024*1024
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
62
0.83.1 by Ian Clatworthy
head tracking tests and fix
63
    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Create a manager of caches, all initially empty.

        :param info: a ConfigObj holding the output from the --info
            processor, or None if no hints are available
        :param verbose: stored as self.verbose
        :param inventory_cache_size: size handed to the LRU cache that
            holds inventories
        """
        self.verbose = verbose

        # Blob storage: dataref -> data, where a dataref is either a :mark
        # or the sha-1. Sticky blobs are referenced more than once and are
        # kept until their reference count drops to zero.
        self._blobs = {}
        self._sticky_blobs = {}
        self._sticky_memory_bytes = 0

        # If the in-memory sticky cache overflows, large blobs are dumped
        # to disk inside this directory.
        self._tempdir = None
        # id => (offset, n_bytes, fname); when fname is None the content
        # is stored in the shared small-blob file.
        self._disk_blobs = {}
        # The cleanup helper is handed the very same dict object.
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory. These are large, and we probably don't
        # need too many because most parents are recent in history.
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # import commit-id -> revision-id lookup table. All of these must
        # be kept, but they are small.
        self.revision_ids = {}

        # Head tracking: the last ref seen, the last commit id per ref and
        # a map of commit ids to the ref*s* they are heads of.
        self.last_ref = None
        self.last_ids = {}
        self.heads = {}

        # Per-blob reference counts. The dict stays empty when no hints
        # are supplied, in which case store_blob() treats every blob as
        # sticky.
        ref_counts = self._blob_ref_counts = {}
        if info is not None:
            try:
                # The parser hands values back as lists, already parsed.
                for times, names in info['Blob reference counts'].items():
                    refs = int(times)
                    ref_counts.update((name, refs) for name in names)
            except KeyError:
                # Section not in the file - possible when no blobs are used.
                pass

        # BranchMapper has no state (for now?), but one instance is kept
        # around rather than creating a new one on every use.
        self.branch_mapper = branch_mapper.BranchMapper()
119
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
120
    def dump_stats(self, note=trace.note):
        """Report how much data the interesting caches currently hold.

        :param note: callable taking one message string; every line of
            output goes through it
        """
        # TODO: add in inventory statistics
        note("Cache statistics:")
        # The plain blobs, last-ids and heads caches aren't interesting,
        # so they are left out of the report, at least for now.
        reported = (
            (self._sticky_blobs, "sticky blobs"),
            (self.revision_ids, "revision-ids"),
        )
        for cache, label in reported:
            self._show_stats_for(cache, label, note=note)
130
0.64.159 by Ian Clatworthy
make the file-id cache optional and branch-ref aware
131
    def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
        """Report the item count and approximate size of one dictionary.

        Both the keys and the values need to support len().

        :param dict: the mapping to measure
        :param label: name shown at the start of the output line
        :param note: callable that receives the formatted message
        :param tuple_key: if True, each key is a sequence of strings and
            is measured as the length of those strings joined together
        """
        n_items = len(dict)
        if tuple_key:
            key_size = sum(len(''.join(key)) for key in dict.keys())
        else:
            key_size = sum(len(key) for key in dict.keys())
        value_size = sum(len(value) for value in dict.values())

        # Start in kilobytes and step up a unit while the figure is still
        # above 1024.
        scaled = (key_size + value_size) * 1.0 / 1024
        unit = 'K'
        for bigger_unit in ('M', 'G'):
            if scaled <= 1024:
                break
            scaled = scaled / 1024
            unit = bigger_unit

        noun = helpers.single_plural(n_items, "item", "items")
        note("    %-12s: %8.1f %s (%d %s)"
             % (label, scaled, unit, n_items, noun))
152
153
    def clear_all(self):
        """Free up any memory used by the caches.

        Only the in-memory caches are emptied; entries recorded in
        _disk_blobs and the blob reference counts are not touched.
        """
        self._blobs.clear()
        self._sticky_blobs.clear()
        # Keep the byte counter in step with the now-empty sticky cache;
        # a stale total would make a later store_blob() think the cache
        # is still full and try to flush blobs that no longer exist.
        self._sticky_memory_bytes = 0
        self.revision_ids.clear()
        self.last_ids.clear()
        self.heads.clear()
        self.inventories.clear()
161
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
162
    def _flush_blobs_to_disk(self):
        """Move sticky blobs from memory to disk, largest first.

        Blobs are written out until the tracked in-memory size is no
        greater than _sticky_flushed_size, or until no sticky blobs are
        left. Blobs shorter than _small_blob_threshold are appended to one
        shared temporary file; every other blob gets a file of its own.
        Where each blob went is recorded in _disk_blobs as
        (offset, n_bytes, fname), with fname None for the shared file.

        The temporary directory and the shared file are created on the
        first call and registered for removal at interpreter exit.
        """
        sticky_blobs = self._sticky_blobs
        total_blobs = len(sticky_blobs)
        # Ascending by size, so that pop() hands back the largest blob.
        pending = sorted(sticky_blobs, key=lambda k: len(sticky_blobs[k]))
        if self._tempdir is None:
            tempdir = tempfile.mkdtemp(prefix='bzr_fastimport_blobs-')
            self._tempdir = tempdir
            self._cleanup.tempdir = self._tempdir
            self._cleanup.small_blobs = tempfile.TemporaryFile(
                prefix='small-blobs-', dir=self._tempdir)
            small_blob_ref = weakref.ref(self._cleanup.small_blobs)
            # Even though we add it to _Cleanup it seems that the object can
            # be destroyed 'too late' for cleanup to actually occur. Probably
            # a combination of bzr's "die directly, don't clean up" and how
            # exceptions close the running stack.
            def exit_cleanup():
                small_blob = small_blob_ref()
                if small_blob is not None:
                    small_blob.close()
                shutil.rmtree(tempdir, ignore_errors=True)
            atexit.register(exit_cleanup)
        count = 0
        flushed_bytes = 0
        n_small_bytes = 0
        # Also stop when nothing is left to flush: the byte counter can be
        # larger than what is really held, and popping from an empty list
        # would raise IndexError.
        while (pending
               and self._sticky_memory_bytes > self._sticky_flushed_size):
            blob_id = pending.pop()
            blob = sticky_blobs.pop(blob_id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                f = self._cleanup.small_blobs
                f.seek(0, os.SEEK_END)
                self._disk_blobs[blob_id] = (f.tell(), n_bytes, None)
                f.write(blob)
                n_small_bytes += n_bytes
            else:
                fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
                # A file object writes the whole blob (a bare os.write may
                # stop short) and the descriptor is closed even on error.
                out = os.fdopen(fd, 'wb')
                try:
                    out.write(blob)
                finally:
                    out.close()
                self._disk_blobs[blob_id] = (0, n_bytes, name)
            flushed_bytes += n_bytes
            del blob
            count += 1
        if not sticky_blobs:
            # Nothing is held in memory any more, so whatever the counter
            # still claims is stale.
            self._sticky_memory_bytes = 0
        trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, total_blobs, flushed_bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))
209
        
210
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
211
    def store_blob(self, id, data):
        """Store a blob of data.

        A blob is kept in the sticky cache when no reference counts are
        being tracked at all, when it has a recorded reference count, or
        when it is empty. Any other blob goes into the plain cache. Adding
        to the sticky cache may trigger a flush to disk once the tracked
        size exceeds _sticky_cache_size.

        :param id: the key to store the blob under
        :param data: the blob content
        """
        # Note: If we're not reference counting, everything has to be sticky
        if not self._blob_ref_counts or id in self._blob_ref_counts:
            previous = self._sticky_blobs.get(id)
            if previous is not None:
                # The old content is being replaced, so stop counting its
                # size; otherwise the total drifts upwards on every
                # re-store of the same id.
                self._sticky_memory_bytes -= len(previous)
            self._sticky_blobs[id] = data
            self._sticky_memory_bytes += len(data)
            if self._sticky_memory_bytes > self._sticky_cache_size:
                self._flush_blobs_to_disk()
        elif data == '':
            # Empty data is always sticky
            self._sticky_blobs[id] = data
        else:
            self._blobs[id] = data
224
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
225
    def _decref(self, id, cache, fn):
        """Drop one reference to a blob, discarding it after the last one.

        :param id: the blob's key in both `cache` and the reference counts
        :param cache: the dict currently holding the blob's entry
        :param fn: name of a file to delete along with the entry, or None
        :return: True if the entry was removed from `cache`, otherwise
            False. Always False when no reference counts are tracked at
            all or when `id` has no recorded count.
        """
        ref_counts = self._blob_ref_counts
        if not ref_counts:
            return False
        remaining = ref_counts.get(id)
        if remaining is None:
            return False
        remaining -= 1
        if remaining > 0:
            # Still referenced: just remember the lower count.
            ref_counts[id] = remaining
            return False
        # That was the last reference, so forget the blob entirely.
        del cache[id]
        if fn is not None:
            os.unlink(fn)
        del ref_counts[id]
        return True
240
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
241
    def fetch_blob(self, id):
        """Fetch a blob of data.

        The plain cache is tried first (its entry is removed as it is
        returned), then the blobs flushed to disk, and finally the sticky
        in-memory cache. Disk and sticky lookups give up one reference
        through _decref().

        :param id: the key the blob was stored under
        :return: the blob content
        :raises KeyError: if the id is not in any of the caches
        """
        try:
            return self._blobs.pop(id)
        except KeyError:
            pass

        if id not in self._disk_blobs:
            content = self._sticky_blobs[id]
            if self._decref(id, self._sticky_blobs, None):
                # The blob left the sticky cache; stop counting its size.
                self._sticky_memory_bytes -= len(content)
            return content

        offset, n_bytes, fn = self._disk_blobs[id]
        if fn is not None:
            # The blob has a file of its own: read all of it.
            fp = open(fn, 'rb')
            try:
                content = fp.read()
            finally:
                fp.close()
        else:
            # The blob is a slice of the shared small-blob file.
            small_file = self._cleanup.small_blobs
            small_file.seek(offset)
            content = small_file.read(n_bytes)
        self._decref(id, self._disk_blobs, fn)
        return content
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
263
0.81.1 by Ian Clatworthy
move GenericCommitHandler into its own module in prep for a delta-based one
264
    def track_heads(self, cmd):
        """Track the repository heads given a CommitCommand.

        The first parent is the command's explicit `from_` if it has one,
        otherwise the last commit id recorded for the command's ref (if
        any). The command's merges follow.

        :param cmd: the CommitCommand
        :return: the list of parents in terms of commit-ids
        """
        # Work out the true first parent, if there is one.
        first_parent = cmd.from_
        if first_parent is None:
            first_parent = self.last_ids.get(cmd.ref)

        parents = []
        if first_parent is not None:
            parents.append(first_parent)
        parents.extend(cmd.merges)

        # Track the heads
        self.track_heads_for_ref(cmd.ref, cmd.id, parents)
        return parents
284
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
285
    def track_heads_for_ref(self, cmd_ref, cmd_id, parents=None):
        """Record a commit as the latest one seen on a ref.

        Every given parent stops being a head, the commit becomes a head
        of the ref, and the commit and ref are remembered as the most
        recent ones.

        :param cmd_ref: the ref the commit was made on
        :param cmd_id: the id of the commit
        :param parents: the commit's parents in terms of commit-ids, or
            None if there are none to retire
        """
        heads = self.heads
        if parents is None:
            parents = ()
        for parent in parents:
            # A commit that has a child is no longer a head.
            heads.pop(parent, None)
        try:
            refs = heads[cmd_id]
        except KeyError:
            refs = heads[cmd_id] = set()
        refs.add(cmd_ref)
        self.last_ids[cmd_ref] = cmd_id
        self.last_ref = cmd_ref