/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""A manager of caches."""
18
0.115.8 by John Arbash Meinel
Dump sticky blobs to disk when memory pressure gets high.
19
import os
20
import shutil
21
import tempfile
22
import time
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
23
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
24
from bzrlib import lru_cache, trace
25
from bzrlib.plugins.fastimport import helpers
0.64.118 by Ian Clatworthy
fix lru_cache import
26
0.115.8 by John Arbash Meinel
Dump sticky blobs to disk when memory pressure gets high.
27
28
class _Cleanup(object):
    """Release the on-disk resources of a CacheManager when it goes away.

    A separate helper object is used (rather than a ``__del__`` on the
    manager itself) so that the object carrying the finalizer is never part
    of a reference cycle.

    Attributes:
        disk_blobs: dict whose values are tuples with an open file object as
            their last element, or None once finalized.
        tempdir: path of a directory to remove on cleanup, or None.
        small_blobs: open file object to close on cleanup, or None.
    """

    def __init__(self, disk_blobs):
        self.disk_blobs = disk_blobs
        self.tempdir = None
        self.small_blobs = None

    def __del__(self):
        self.finalize()

    def finalize(self):
        """Close every tracked file and remove the temporary directory.

        Safe to call more than once: each resource is forgotten as soon as
        it has been released, so a later call (for example from ``__del__``
        after an explicit call) is a no-op.
        """
        if self.disk_blobs is not None:
            # The file object is the last element of each info tuple.
            # .values() rather than .itervalues() so this also runs on
            # Python 3.
            for info in self.disk_blobs.values():
                info[-1].close()
            self.disk_blobs = None
        if self.small_blobs is not None:
            self.small_blobs.close()
            self.small_blobs = None
        if self.tempdir is not None:
            # Files are closed above before the directory is removed.
            shutil.rmtree(self.tempdir)
            # Forget the directory so a repeated finalize() does not try to
            # remove it a second time.
            self.tempdir = None
52
        
53
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
54
class CacheManager(object):
    """Caches used while importing: blobs, inventories, ids and heads.

    Blobs that are referenced more than once ("sticky" blobs) are kept in
    memory until their reference count drops to zero.  When the in-memory
    sticky blobs exceed ``_sticky_cache_size`` bytes, the largest ones are
    written to temporary files until at most ``_sticky_flushed_size`` bytes
    remain in memory.
    """

    # Blobs smaller than this share a single temporary file when flushed.
    _small_blob_threshold = 100*1024
    # Flush sticky blobs to disk once they use more than this many bytes...
    _sticky_cache_size = 200*1024*1024
    # ...and keep flushing until they use no more than this many bytes.
    _sticky_flushed_size = 100*1024*1024

    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Create a manager of caches.

        :param info: a ConfigObj holding the output from
            the --info processor, or None if no hints are available
        :param verbose: stored on the instance as ``self.verbose``
        :param inventory_cache_size: size passed to the LRU cache of
            inventories
        """
        self.verbose = verbose

        # dataref -> data. dataref is either :mark or the sha-1.
        # Sticky blobs are referenced more than once, and are saved until their
        # refcount goes to 0
        self._blobs = {}
        self._sticky_blobs = {}
        # Total length of the values currently held in self._sticky_blobs
        self._sticky_memory_bytes = 0
        # if we overflow our memory cache, then we will dump large blobs to
        # disk in this directory
        self._tempdir = None
        # id => (is_small, offset, n_bytes, file object)
        self._disk_blobs = {}
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory cache
        # these are large and we probably don't need too many as
        # most parents are recent in history
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # import commit-ids -> revision-id lookup table
        # we need to keep all of these but they are small
        self.revision_ids = {}

        # (path, branch_ref) -> file-ids - as generated.
        # (Use store_file_id/fetch_fileid methods rather than direct access.)
        # NOTE(review): no file-id cache attribute is created here in the
        # code as shown - confirm whether this comment is still relevant.

        # Head tracking: last ref, last id per ref & map of commit ids to ref*s*
        self.last_ref = None
        self.last_ids = {}
        self.heads = {}

        # Work out the blobs to make sticky - None means all
        self._blob_ref_counts = {}
        if info is not None:
            try:
                blobs_by_counts = info['Blob reference counts']
            except KeyError:
                # info not in file - possible when no blobs used
                blobs_by_counts = {}
            # The parser hands values back as lists, already parsed
            for count, blob_list in blobs_by_counts.items():
                n = int(count)
                for b in blob_list:
                    self._blob_ref_counts[b] = n

    def dump_stats(self, note=trace.note):
        """Dump some statistics about what we cached.

        :param note: callable taking one message string; used for output
        """
        # TODO: add in inventory statistics
        note("Cache statistics:")
        self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
        self._show_stats_for(self.revision_ids, "revision-ids", note=note)
        # These aren't interesting so omit from the output, at least for now
        #self._show_stats_for(self._blobs, "other blobs", note=note)
        #self._show_stats_for(self.last_ids, "last-ids", note=note)
        #self._show_stats_for(self.heads, "heads", note=note)

    def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
        """Dump statistics about a given dictionary.

        Both the keys and the values need to support len().

        :param dict: the dictionary to report on
        :param label: name shown at the start of the output line
        :param note: callable taking one message string; used for output
        :param tuple_key: if True, keys are sequences of strings and are
            joined before being measured
        """
        count = len(dict)
        if tuple_key:
            size = sum(map(len, (''.join(k) for k in dict.keys())))
        else:
            size = sum(map(len, dict.keys()))
        size += sum(map(len, dict.values()))
        # Report in the largest unit (K, M or G) that keeps the number small
        size = size * 1.0 / 1024
        unit = 'K'
        if size > 1024:
            size = size / 1024
            unit = 'M'
            if size > 1024:
                size = size / 1024
                unit = 'G'
        note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
            helpers.single_plural(count, "item", "items")))

    def clear_all(self):
        """Free up any memory used by the caches."""
        self._blobs.clear()
        self._sticky_blobs.clear()
        # No sticky blobs remain in memory, so the byte counter must follow;
        # otherwise a later store_blob() could trigger a flush based on
        # bytes that are no longer held.
        self._sticky_memory_bytes = 0
        self.revision_ids.clear()
        self.last_ids.clear()
        self.heads.clear()
        self.inventories.clear()

    def _flush_blobs_to_disk(self):
        """Move the largest in-memory sticky blobs into temporary files.

        Blobs are flushed largest-first until the in-memory total is no more
        than ``_sticky_flushed_size``.  Blobs below ``_small_blob_threshold``
        are appended to one shared file; larger ones each get their own file
        inside the temporary directory.
        """
        sticky_blobs = self._sticky_blobs
        # Ascending by size, so pop() below yields the largest remaining blob.
        blobs = sorted(sticky_blobs, key=lambda k: len(sticky_blobs[k]))
        if self._tempdir is None:
            self._tempdir = tempfile.mkdtemp(prefix='bzr_fastimport_blobs-')
            self._cleanup.tempdir = self._tempdir
            self._cleanup.small_blobs = tempfile.TemporaryFile(
                prefix='small-blobs-')
        count = 0
        n_flushed_bytes = 0
        n_small_bytes = 0
        # "blobs and ..." guards against popping from an empty list should
        # the byte counter ever disagree with what is actually held.
        while blobs and self._sticky_memory_bytes > self._sticky_flushed_size:
            id = blobs.pop()
            blob = sticky_blobs.pop(id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                f = self._cleanup.small_blobs
                # Append, remembering where this blob starts in the file
                f.seek(0, os.SEEK_END)
                self._disk_blobs[id] = (True, f.tell(), n_bytes, f)
                n_small_bytes += n_bytes
            else:
                f = tempfile.TemporaryFile(prefix='blob-', dir=self._tempdir)
                self._disk_blobs[id] = (False, 0, n_bytes, f)
            f.write(blob)
            n_flushed_bytes += n_bytes
            del blob
            count += 1
        if not sticky_blobs:
            # Nothing is left in memory; resynchronise the counter.
            self._sticky_memory_bytes = 0
        trace.note('flushed %d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, n_flushed_bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))

    def store_blob(self, id, data):
        """Store a blob of data.

        :param id: the dataref the blob will later be fetched by
        :param data: the blob content
        """
        # Note: If we're not reference counting, everything has to be sticky
        if not self._blob_ref_counts or id in self._blob_ref_counts:
            self._sticky_blobs[id] = data
            self._sticky_memory_bytes += len(data)
            if self._sticky_memory_bytes > self._sticky_cache_size:
                self._flush_blobs_to_disk()
        elif not data:
            # Empty data is always sticky (tested by truthiness so that an
            # empty bytes object is recognised as well as '')
            self._sticky_blobs[id] = data
        else:
            self._blobs[id] = data

    def _decref(self, id, cache, f):
        """Drop one reference to a blob, evicting it when none remain.

        :param id: the dataref of the blob
        :param cache: the dictionary the blob is currently stored in
        :param f: a file object to close if the blob is evicted, or None
        :return: True if the blob was removed from cache, False otherwise
        """
        if not self._blob_ref_counts:
            return False
        count = self._blob_ref_counts.get(id, None)
        if count is None:
            return False
        count -= 1
        if count <= 0:
            del cache[id]
            if f is not None:
                f.close()
            del self._blob_ref_counts[id]
            return True
        self._blob_ref_counts[id] = count
        return False

    def fetch_blob(self, id):
        """Fetch a blob of data.

        :param id: the dataref the blob was stored under
        :return: the blob content
        :raises KeyError: if no blob is stored under id
        """
        if id in self._blobs:
            return self._blobs.pop(id)
        if id in self._disk_blobs:
            (is_small, offset, n_bytes, f) = self._disk_blobs[id]
            f.seek(offset)
            content = f.read(n_bytes)
            # Small blobs all live in one shared file, which must stay open
            # for the others; only a blob's private file may be closed.
            if is_small:
                f = None
            self._decref(id, self._disk_blobs, f)
            return content
        content = self._sticky_blobs[id]
        # Only release the bytes if the blob actually left the memory cache.
        if self._decref(id, self._sticky_blobs, None):
            self._sticky_memory_bytes -= len(content)
        return content

    def track_heads(self, cmd):
        """Track the repository heads given a CommitCommand.

        :param cmd: the CommitCommand
        :return: the list of parents in terms of commit-ids
        """
        # Get the true set of parents
        if cmd.from_ is not None:
            parents = [cmd.from_]
        else:
            last_id = self.last_ids.get(cmd.ref)
            if last_id is not None:
                parents = [last_id]
            else:
                parents = []
        parents.extend(cmd.merges)

        # Track the heads
        self.track_heads_for_ref(cmd.ref, cmd.id, parents)
        return parents

    def track_heads_for_ref(self, cmd_ref, cmd_id, parents=None):
        """Record cmd_id as a head of cmd_ref.

        :param cmd_ref: the ref the commit was made on
        :param cmd_id: the id of the commit
        :param parents: ids of the commit's parents, which stop being
            heads; None if there are none to remove
        """
        if parents is not None:
            for parent in parents:
                # A commit that has a child is no longer a head
                self.heads.pop(parent, None)
        self.heads.setdefault(cmd_id, set()).add(cmd_ref)
        self.last_ids[cmd_ref] = cmd_id
        self.last_ref = cmd_ref