/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""Import processor that dumps stats about the input (and doesn't import)."""
18
19
20
from bzrlib.trace import (
21
    note,
22
    warning,
23
    )
24
from bzrlib.plugins.fastimport import (
0.83.2 by Ian Clatworthy
update fast-import-info to use same head-tracking code as fast-import
25
    cache_manager,
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
26
    commands,
0.64.30 by Ian Clatworthy
add heads analysis to info processor
27
    helpers,
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
28
    processor,
29
    )
30
31
32
class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.

    Statistics are accumulated by the ``*_handler`` methods into attributes
    initialised by pre_process() and are written to ``self.outf`` by
    post_process().
    """

    def __init__(self, target=None, params=None, verbose=0, outf=None):
        """Create the processor.

        All arguments are passed straight through to
        processor.ImportProcessor.__init__; every one of them is optional
        so that the processor can be created without a target.
        """
        # Allow creation without a target
        processor.ImportProcessor.__init__(self, target, params, verbose,
            outf=outf)

    def pre_process(self):
        """Reset every statistic gathered by the handlers."""
        self.note("Collecting statistics ...")
        # Per-command and per-file-command counters, keyed by command name
        self.cmd_counts = {}
        for cmd in commands.COMMAND_NAMES:
            self.cmd_counts[cmd] = 0
        self.file_cmd_counts = {}
        for fc in commands.FILE_COMMAND_NAMES:
            self.file_cmd_counts[fc] = 0
        # Map from number-of-parents to the number of commits having that
        # many parents, plus the largest number of parents seen so far
        self.parent_counts = {}
        self.max_parent_count = 0
        # NOTE: committers are collected by commit_handler() but are not
        # currently reported by post_process().
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking: each usage category maps to a set of blob ids.
        # Blobs referenced two or more times are moved out of these sets and
        # counted in blob_ref_counts instead (see _track_blob).
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        self.blob_ref_counts = {}
        # Head tracking - delegate to the cache manager
        self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0)
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        """Write the collected statistics to the output stream.

        Only groups that are relevant to the commands actually seen are
        written. The collected statistics themselves are left untouched.
        """
        # Dump statistics
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group("Command counts",
            [(c, self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group("File command counts",
            [(c, self.file_cmd_counts[c]) for c in fc_names], str)

        # Commit stats
        if self.cmd_counts['commit']:
            # List the parent counts in ascending order, skipping counts
            # that never occurred.
            p_items = [("parents-%d" % i, self.parent_counts[i])
                for i in range(self.max_parent_count + 1)
                if i in self.parent_counts]
            p_items.append(('total revisions merged', len(self.merges)))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group("Commit analysis", flags.items(), _found)
            heads = helpers.invert_dictset(self.cache_mgr.heads)
            self._dump_stats_group("Head analysis", heads.items(), None,
                _iterable_as_config_list)
            self._dump_stats_group("Merges", self.merges.items(), None)
            # We only show the rename old path and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.items(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.items(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts['blob']:
            if self.verbose:
                # In verbose mode, don't list every blob used. Filter the
                # category out rather than deleting it from self.blobs so
                # that the tracking state stays intact.
                usage_items = [(usage, marks)
                    for usage, marks in self.blobs.items()
                    if usage != 'used']
            else:
                usage_items = self.blobs.items()
            self._dump_stats_group("Blob usage tracking",
                usage_items, len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = helpers.invert_dict(self.blob_ref_counts)
            # Report in ascending order of reference count
            count_items = sorted(blobs_by_count.items())
            self._dump_stats_group("Blob reference counts",
                count_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts['reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.items())

    def _dump_stats_group(self, title, items, normal_formatter=None,
        verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so
        that other processors can load the information if they want to.

        :param title: the heading of the group
        :param items: an iterable of (name, value) pairs; it is iterated
          exactly once
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                # Names may be non-strings (e.g. integer counts); only
                # string names have their spaces replaced by dashes.
                if isinstance(name, str):
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: we definitely do NOT want to remove it from
            # blob_ref_counts if it's already recorded there.
            self.blobs['used'].discard(cmd.id)

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.file_iter():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.is_executable:
                    self.executables_found = True
                if fc.kind == commands.SYMLINK_KIND:
                    self.symlinks_found = True
                if fc.dataref is not None:
                    # A dataref starting with ':' is tracked as a blob
                    # reference; any other dataref is counted as a
                    # reference by SHA.
                    if fc.dataref[0] == ':':
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)

        # Track the heads
        parents = self.cache_mgr.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        self.parent_counts[parent_count] = (
            self.parent_counts.get(parent_count, 0) + 1)
        if parent_count > self.max_parent_count:
            self.max_parent_count = parent_count

        # Remember the merges
        if cmd.merges:
            for merge in cmd.merges:
                self.merges[merge] = self.merges.get(merge, 0) + 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.ref.startswith('refs/tags/'):
            self.lightweight_tags += 1
        else:
            if cmd.from_ is not None:
                self.cache_mgr.track_heads_for_ref(cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
        """Record one more reference to the blob identified by mark.

        A blob moves from 'new' to 'used' on its first reference and from
        'used' into blob_ref_counts (with a count of 2) on its second;
        after that only the count is incremented. A mark that is in none
        of those places is recorded as 'unknown'.
        """
        if mark in self.blob_ref_counts:
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
268
269
def _found(b):
    """Format a found boolean as a string.

    :param b: the flag to format; any truthy value counts as found
    :return: 'found' if b is true, otherwise 'no'
    """
    # Test truthiness rather than indexing a two-item list with b, so that
    # values other than real booleans (or 0/1) are handled as well.
    if b:
        return 'found'
    return 'no'
0.64.24 by Ian Clatworthy
smart blob caching using analysis done by --info
272
0.64.30 by Ian Clatworthy
add heads analysis to info processor
273
def _iterable_as_config_list(s):
    """Format an iterable as a sequence of comma-separated strings.

    To match what ConfigObj expects, a single item list has a trailing comma.

    :param s: an iterable of sortable items
    :return: the sorted items, each formatted with %s, joined by ", "; an
      empty iterable gives an empty string
    """
    items = sorted(s)
    if len(items) == 1:
        return "%s," % (items[0],)
    # Format every item the same way as the single-item case does so that
    # non-string items (e.g. integers) don't make join() fail.
    return ", ".join(["%s" % (item,) for item in items])