1
# Copyright (C) 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Import processor that dumps stats about the input (and doesn't import)."""
20
from bzrlib.trace import (
24
from bzrlib.plugins.fastimport import (
31
class InfoProcessor(processor.ImportProcessor):
32
"""An import processor that dumps statistics about the input.
34
No changes to the current repository are made.
36
As well as providing useful information about an import
37
stream before importing it, this processor is useful for
38
benchmarking the speed at which data can be extracted from
42
def __init__(self, target=None, params=None, verbose=False):
    """Create the processor.

    :param target: forwarded to the base class; defaults to None so
      that the processor can be created without a target
    :param params: forwarded to the base class unchanged
    :param verbose: forwarded to the base class unchanged
    """
    # Allow creation without a target
    processor.ImportProcessor.__init__(self, target, params, verbose)
46
def pre_process(self):
    """Initialise every statistic that the handlers update."""
    # One zeroed counter per known command / file command name.
    # NOTE(review): the line creating cmd_counts is missing from this
    # listing; a dict is required by the item assignments that follow it.
    self.cmd_counts = dict.fromkeys(commands.COMMAND_NAMES, 0)
    self.file_cmd_counts = dict.fromkeys(commands.FILE_COMMAND_NAMES, 0)

    # Commit statistics
    self.parent_counts = {}
    self.max_parent_count = 0
    self.committers = set()
    self.separate_authors_found = False
    self.symlinks_found = False
    self.executables_found = False
    self.sha_blob_references = False

    # Reset statistics
    self.lightweight_tags = 0
    self.named_branches = []

    # Blob usage tracking: one set of blob ids per usage category.
    # NOTE(review): the line creating blobs is missing from this listing;
    # a dict keyed by usage is what the other methods index into.
    self.blobs = dict(
        (usage, set())
        for usage in ['new', 'used', 'multi', 'unknown', 'unmarked'])

    # Head tracking: map of commit mark to ref
    # NOTE(review): both initialisers below are missing from this listing;
    # commit_handler uses heads and last_ids as dicts.
    self.heads = {}
    # Map of ref to the id of the last commit seen on it.
    self.last_ids = {}
71
def post_process(self):
    """Dump the statistics collected by the handlers.

    Command and file command counts are always dumped. The commit, blob
    and reset groups are only dumped when at least one command of that
    kind was counted.
    """
    # Dump statistics
    cmd_names = commands.COMMAND_NAMES
    fc_names = commands.FILE_COMMAND_NAMES
    cmd_values = [self.cmd_counts[c] for c in cmd_names]
    fc_values = [self.file_cmd_counts[c] for c in fc_names]
    self._dump_stats_group("Command counts", cmd_names, cmd_values, str)
    self._dump_stats_group("File command counts", fc_names, fc_values, str)

    # Commit stats
    if self.cmd_counts['commit']:
        # commit_handler keeps every key of parent_counts within
        # 0..max_parent_count, so walking the sorted keys lists the same
        # entries, in the same order, as scanning that whole range.
        p_names = []
        p_values = []
        for i in sorted(self.parent_counts):
            p_names.append("parents-%d" % i)
            p_values.append(self.parent_counts[i])
        flags = {
            'separate authors found': self.separate_authors_found,
            'executables': self.executables_found,
            'symlinks': self.symlinks_found,
            'blobs referenced by SHA': self.sha_blob_references,
            }
        self._dump_stats_group("Parent counts", p_names, p_values, str)
        self._dump_stats_group("Commit analysis", flags.keys(),
            flags.values(), _found)
        heads = helpers.invert_dict(self.heads)
        self._dump_stats_group("Head analysis", heads.keys(),
            heads.values(), None, _iterable_as_config_list)
        # note("\t%d\t%s" % (len(self.committers), 'unique committers'))

    # Blob stats
    if self.cmd_counts['blob']:
        # In verbose mode, don't list every blob used
        # NOTE(review): the condition line is missing from this listing;
        # self.verbose is assumed to be set by the base class - confirm.
        if self.verbose:
            # pop() rather than del so that a repeated call does not raise
            self.blobs.pop('used', None)
        self._dump_stats_group("Blob usage tracking", self.blobs.keys(),
            self.blobs.values(), len, _iterable_as_config_list)

    # Reset stats
    if self.cmd_counts['reset']:
        reset_stats = {
            'lightweight tags': self.lightweight_tags,
            'other resets': self.named_branches,
            }
        self._dump_stats_group("Reset analysis", reset_stats.keys(),
            reset_stats.values())
120
def _dump_stats_group(self, title, names, values, normal_formatter=None,
121
verbose_formatter=None):
122
"""Dump a statistics group.
124
In verbose mode, do so as a config file so
125
that other processors can load the information if they want to.
126
:param normal_formatter: the callable to apply to the value
127
before displaying it in normal mode
128
:param verbose_formatter: the callable to apply to the value
129
before displaying it in verbose mode
132
print "[%s]" % (title,)
133
for name, value in zip(names, values):
134
if verbose_formatter is not None:
135
value = verbose_formatter(value)
136
print "%s = %s" % (name.replace(' ', '-'),value)
139
print "%s:" % (title,)
140
for name, value in zip(names, values):
141
if normal_formatter is not None:
142
value = normal_formatter(value)
143
print "\t%s\t%s" % (value,name)
145
def progress_handler(self, cmd):
    """Process a ProgressCommand.

    :param cmd: the command; only its name is used, to bump its counter
    """
    self.cmd_counts[cmd.name] += 1
149
def blob_handler(self, cmd):
    """Process a BlobCommand.

    Counts the command and files the blob id under either the
    'unmarked' or the 'new' usage category.

    :param cmd: the command; its name, mark and id are read
    """
    self.cmd_counts[cmd.name] += 1
    # NOTE(review): the condition line is missing from this listing;
    # testing cmd.mark is assumed from the 'unmarked' category - confirm.
    if cmd.mark is None:
        self.blobs['unmarked'].add(cmd.id)
    else:
        self.blobs['new'].add(cmd.id)
        # Marks can be re-used so remove it from used if already there.
        # Note: we definitely do NOT want to remove it from multi if
        # it's already in that set.
        # discard() is a no-op when the id is not in the set.
        self.blobs['used'].discard(cmd.id)
164
def checkpoint_handler(self, cmd):
    """Process a CheckpointCommand.

    :param cmd: the command; only its name is used, to bump its counter
    """
    self.cmd_counts[cmd.name] += 1
168
def commit_handler(self, cmd):
    """Process a CommitCommand.

    Updates the command and file command counters, the commit flags,
    blob usage, the head map and the parent count histogram.

    :param cmd: the command; its name, committer, author, file_iter(),
      from_, ref, merges and id are read
    """
    self.cmd_counts[cmd.name] += 1
    self.committers.add(cmd.committer)
    if cmd.author is not None:
        self.separate_authors_found = True
    for fc in cmd.file_iter():
        self.file_cmd_counts[fc.name] += 1
        if isinstance(fc, commands.FileModifyCommand):
            # NOTE(review): the condition line is missing from this
            # listing; the attribute name is_executable is assumed -
            # confirm against commands.FileModifyCommand.
            if fc.is_executable:
                self.executables_found = True
            if fc.kind == commands.SYMLINK_KIND:
                self.symlinks_found = True
            if fc.dataref is not None:
                # A dataref starting with ':' is tracked as a blob mark;
                # any other dataref is counted as a reference by SHA.
                if fc.dataref[0] == ':':
                    self._track_blob(fc.dataref)
                else:
                    self.sha_blob_references = True

    # Work out the parents: an explicit from_ wins, otherwise fall back
    # to the last commit seen on the same ref (if any).
    if cmd.from_ is not None:
        parents = [cmd.from_]
    else:
        last_id = self.last_ids.get(cmd.ref)
        if last_id is not None:
            parents = [last_id]
        else:
            parents = []
    parents.extend(cmd.merges or [])

    # Track the heads: parents stop being heads, this commit becomes one.
    for parent in parents:
        # it's ok if the parent isn't there - another
        # commit may have already removed it
        self.heads.pop(parent, None)
    self.heads[cmd.id] = cmd.ref
    self.last_ids[cmd.ref] = cmd.id

    # Update the parent count histogram and its maximum.
    parent_count = len(parents)
    self.parent_counts[parent_count] = (
        self.parent_counts.get(parent_count, 0) + 1)
    if parent_count > self.max_parent_count:
        self.max_parent_count = parent_count
213
def reset_handler(self, cmd):
    """Process a ResetCommand.

    A reset of a ref under refs/tags/ is counted as a lightweight tag;
    the ref of any other reset is recorded in named_branches.

    :param cmd: the command; its name and ref are read
    """
    self.cmd_counts[cmd.name] += 1
    if cmd.ref.startswith('refs/tags/'):
        self.lightweight_tags += 1
    else:
        self.named_branches.append(cmd.ref)
221
def tag_handler(self, cmd):
    """Process a TagCommand.

    :param cmd: the command; only its name is used, to bump its counter
    """
    self.cmd_counts[cmd.name] += 1
225
def _track_blob(self, mark):
226
if mark in self.blobs['multi']:
228
elif mark in self.blobs['used']:
229
self.blobs['multi'].add(mark)
230
self.blobs['used'].remove(mark)
231
elif mark in self.blobs['new']:
232
self.blobs['used'].add(mark)
233
self.blobs['new'].remove(mark)
235
self.blobs['unknown'].add(mark)
238
"""Format a found boolean as a string."""
239
return ['no', 'found'][b]
241
def _iterable_as_config_list(s):
242
"""Format an iterable as a sequence of comma-separated strings.
244
To match what ConfigObj expects, a single item list has a trailing comma.
248
return "%s," % (items[0],)
250
return ", ".join(items)