/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3193.8.13 by Aaron Bentley
Update texts
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
3193.8.32 by Aaron Bentley
Update GPL preamble
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3193.8.13 by Aaron Bentley
Update texts
16
17
3193.8.4 by Aaron Bentley
Get rename detection working for files.
18
from cStringIO import StringIO
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
19
20
from bzrlib import (
21
    osutils,
22
    progress,
23
)
3193.8.16 by Aaron Bentley
Get a dict of required parents.
24
from bzrlib.ui import ui_factory
3193.8.4 by Aaron Bentley
Get rename detection working for files.
25
26
27
class RenameMap(object):
3193.8.13 by Aaron Bentley
Update texts
28
    """Determine a mapping of renames."""
3193.8.4 by Aaron Bentley
Get rename detection working for files.
29
3193.8.24 by Aaron Bentley
Use tree member instead of passing it in
30
    def __init__(self, tree):
31
        self.tree = tree
3193.8.4 by Aaron Bentley
Get rename detection working for files.
32
        self.edge_hashes = {}
33
3193.8.11 by Aaron Bentley
Make hash iterator static
34
    @staticmethod
35
    def iter_edge_hashes(lines):
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
36
        """Iterate through the hashes of line pairs (which make up an edge).
37
38
        The hash is truncated using a modulus to avoid excessive memory
39
        consumption by the hitscount dict.  A modulus of 10Mi means that the
40
        maximum number of keys is 10Mi.  (Keys are normally 32 bits, e.g.
41
        4 Gi)
42
        """
3193.8.10 by Aaron Bentley
Update to weight hits and use 10M of keyspace
43
        modulus = 1024 * 1024 * 10
3193.8.4 by Aaron Bentley
Get rename detection working for files.
44
        for n in range(len(lines)):
3193.8.10 by Aaron Bentley
Update to weight hits and use 10M of keyspace
45
            yield hash(tuple(lines[n:n+2])) % modulus
3193.8.4 by Aaron Bentley
Get rename detection working for files.
46
47
    def add_edge_hashes(self, lines, tag):
3193.8.13 by Aaron Bentley
Update texts
48
        """Update edge_hashes to include the given lines.
49
50
        :param lines: The lines to update the hashes for.
51
        :param tag: A tag uniquely associated with these lines (i.e. file-id)
52
        """
3193.8.4 by Aaron Bentley
Get rename detection working for files.
53
        for my_hash in self.iter_edge_hashes(lines):
54
            self.edge_hashes.setdefault(my_hash, set()).add(tag)
55
56
    def add_file_edge_hashes(self, tree, file_ids):
3193.8.13 by Aaron Bentley
Update texts
57
        """Update to reflect the hashes for files in the tree.
58
59
        :param tree: The tree containing the files.
60
        :param file_ids: A list of file_ids to perform the updates for.
61
        """
3193.8.4 by Aaron Bentley
Get rename detection working for files.
62
        desired_files = [(f, f) for f in file_ids]
3193.8.14 by Aaron Bentley
Add progress reporting to guess-renames
63
        task = ui_factory.nested_progress_bar()
64
        try:
65
            for num, (file_id, contents) in enumerate(
66
                tree.iter_files_bytes(desired_files)):
67
                task.update('Calculating hashes', num, len(file_ids))
68
                s = StringIO()
69
                s.writelines(contents)
70
                s.seek(0)
71
                self.add_edge_hashes(s.readlines(), file_id)
72
        finally:
73
            task.finished()
3193.8.4 by Aaron Bentley
Get rename detection working for files.
74
75
    def hitcounts(self, lines):
3193.8.13 by Aaron Bentley
Update texts
76
        """Count the number of hash hits for each tag, for the given lines.
77
78
        Hits are weighted according to the number of tags the hash is
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
79
        associated with; more tags means that the hash is less rare and should
80
        tend to be ignored.
3193.8.13 by Aaron Bentley
Update texts
81
        :param lines: The lines to calculate hashes of.
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
82
        :return: a dict of {tag: hitcount}
3193.8.13 by Aaron Bentley
Update texts
83
        """
3193.8.4 by Aaron Bentley
Get rename detection working for files.
84
        hits = {}
85
        for my_hash in self.iter_edge_hashes(lines):
86
            tags = self.edge_hashes.get(my_hash)
87
            if tags is None:
88
                continue
3193.8.12 by Aaron Bentley
Reorganize slightly for the benefit of kcachegrind
89
            taglen = len(tags)
3193.8.4 by Aaron Bentley
Get rename detection working for files.
90
            for tag in tags:
91
                if tag not in hits:
92
                    hits[tag] = 0
3193.8.12 by Aaron Bentley
Reorganize slightly for the benefit of kcachegrind
93
                hits[tag] += 1.0 / taglen
3193.8.4 by Aaron Bentley
Get rename detection working for files.
94
        return hits
95
3193.8.24 by Aaron Bentley
Use tree member instead of passing it in
96
    def get_all_hits(self, paths):
97
        """Find all the hit counts for the listed paths in the tree.
3193.8.13 by Aaron Bentley
Update texts
98
99
        :return: A list of tuples of count, path, file_id.
100
        """
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
101
        all_hits = []
3193.8.14 by Aaron Bentley
Add progress reporting to guess-renames
102
        task = ui_factory.nested_progress_bar()
103
        try:
104
            for num, path in enumerate(paths):
105
                task.update('Determining hash hits', num, len(paths))
3193.8.26 by Aaron Bentley
Updates from review.
106
                hits = self.hitcounts(self.tree.get_file_lines(None,
107
                                                               path=path))
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
108
                all_hits.extend((v, path, k) for k, v in hits.items())
3193.8.14 by Aaron Bentley
Add progress reporting to guess-renames
109
        finally:
110
            task.finished()
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
111
        return all_hits
3193.8.12 by Aaron Bentley
Reorganize slightly for the benefit of kcachegrind
112
3193.8.24 by Aaron Bentley
Use tree member instead of passing it in
113
    def file_match(self, paths):
3193.8.13 by Aaron Bentley
Update texts
114
        """Return a mapping from file_ids to the supplied paths."""
3193.8.24 by Aaron Bentley
Use tree member instead of passing it in
115
        return self._match_hits(self.get_all_hits(paths))
3193.8.17 by Aaron Bentley
Get directory rename handling working.
116
117
    @staticmethod
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
118
    def _match_hits(hit_list):
3193.8.26 by Aaron Bentley
Updates from review.
119
        """Using a hit list, determine a path-to-fileid map.
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
120
121
        The hit list is a list of (count, path, file_id), where count is a
122
        (possibly float) number, with higher numbers indicating stronger
123
        matches.
124
        """
3193.8.12 by Aaron Bentley
Reorganize slightly for the benefit of kcachegrind
125
        seen_file_ids = set()
126
        path_map = {}
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
127
        for count, path, file_id in sorted(hit_list, reverse=True):
3193.8.26 by Aaron Bentley
Updates from review.
128
            if path in path_map or file_id in seen_file_ids:
3193.8.7 by Aaron Bentley
Saner algorithm for picking optimal file.
129
                continue
130
            path_map[path] = file_id
131
            seen_file_ids.add(file_id)
3193.8.4 by Aaron Bentley
Get rename detection working for files.
132
        return path_map
3193.8.16 by Aaron Bentley
Get a dict of required parents.
133
3193.8.24 by Aaron Bentley
Use tree member instead of passing it in
134
    def get_required_parents(self, matches):
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
135
        """Return a dict of all file parents that must be versioned.
136
137
        The keys are the required parents and the values are sets of their
138
        children.
139
        """
3193.8.16 by Aaron Bentley
Get a dict of required parents.
140
        required_parents = {}
141
        for path in matches:
142
            while True:
143
                child = path
144
                path = osutils.dirname(path)
3193.8.24 by Aaron Bentley
Use tree member instead of passing it in
145
                if self.tree.path2id(path) is not None:
3193.8.16 by Aaron Bentley
Get a dict of required parents.
146
                    break
147
                required_parents.setdefault(path, []).append(child)
3193.8.17 by Aaron Bentley
Get directory rename handling working.
148
        require_ids = {}
149
        for parent, children in required_parents.iteritems():
150
            child_file_ids = set()
151
            for child in children:
152
                file_id = matches.get(child)
153
                if file_id is not None:
154
                    child_file_ids.add(file_id)
155
            require_ids[parent] = child_file_ids
156
        return require_ids
157
158
    def match_parents(self, required_parents, missing_parents):
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
159
        """Map parent directories to file-ids.
160
161
        This is done by finding similarity between the file-ids of children of
162
        required parent directories and the file-ids of children of missing
163
        parent directories.
164
        """
165
        all_hits = []
3193.8.17 by Aaron Bentley
Get directory rename handling working.
166
        for file_id, file_id_children in missing_parents.iteritems():
167
            for path, path_children in required_parents.iteritems():
168
                hits = len(path_children.intersection(file_id_children))
169
                if hits > 0:
3193.8.20 by Aaron Bentley
Cleanup and enhance tests.
170
                    all_hits.append((hits, path, file_id))
171
        return self._match_hits(all_hits)
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
172
3193.8.24 by Aaron Bentley
Use tree member instead of passing it in
173
    def _find_missing_files(self, basis):
3193.8.22 by Aaron Bentley
Reduce unnecessary locking.
174
        missing_files = set()
175
        missing_parents = {}
176
        candidate_files = set()
3193.8.25 by Aaron Bentley
Improve progress reporting.
177
        task = ui_factory.nested_progress_bar()
178
        iterator = self.tree.iter_changes(basis, want_unversioned=True,
179
                                          pb=task)
180
        try:
181
            for (file_id, paths, changed_content, versioned, parent, name,
182
                 kind, executable) in iterator:
183
                if kind[1] is None and versioned[1]:
184
                    missing_parents.setdefault(parent[0], set()).add(file_id)
185
                    if kind[0] == 'file':
186
                        missing_files.add(file_id)
187
                    else:
188
                        #other kinds are not handled
189
                        pass
190
                if versioned == (False, False):
191
                    if self.tree.is_ignored(paths[1]):
192
                        continue
193
                    if kind[1] == 'file':
194
                        candidate_files.add(paths[1])
195
                    if kind[1] == 'directory':
196
                        for _dir, children in self.tree.walkdirs(paths[1]):
197
                            for child in children:
198
                                if child[2] == 'file':
199
                                    candidate_files.add(child[0])
200
        finally:
201
            task.finished()
3193.8.22 by Aaron Bentley
Reduce unnecessary locking.
202
        return missing_files, missing_parents, candidate_files
203
204
    @classmethod
205
    def guess_renames(klass, tree):
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
206
        """Guess which files to rename, and perform the rename.
207
208
        We assume that unversioned files and missing files indicate that
209
        versioned files have been renamed outside of Bazaar.
3193.8.26 by Aaron Bentley
Updates from review.
210
211
        :param tree: A write-locked working tree.
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
212
        """
3193.8.22 by Aaron Bentley
Reduce unnecessary locking.
213
        required_parents = {}
3193.8.25 by Aaron Bentley
Improve progress reporting.
214
        task = ui_factory.nested_progress_bar()
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
215
        try:
3193.8.25 by Aaron Bentley
Improve progress reporting.
216
            pp = progress.ProgressPhase('Guessing renames', 4, task)
217
            basis = tree.basis_tree()
218
            basis.lock_read()
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
219
            try:
3193.8.25 by Aaron Bentley
Improve progress reporting.
220
                rn = klass(tree)
221
                pp.next_phase()
222
                missing_files, missing_parents, candidate_files = (
223
                    rn._find_missing_files(basis))
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
224
                pp.next_phase()
225
                rn.add_file_edge_hashes(basis, missing_files)
226
            finally:
3193.8.25 by Aaron Bentley
Improve progress reporting.
227
                basis.unlock()
228
            pp.next_phase()
229
            matches = rn.file_match(candidate_files)
230
            parents_matches = matches
231
            while len(parents_matches) > 0:
232
                required_parents = rn.get_required_parents(
233
                    parents_matches)
234
                parents_matches = rn.match_parents(required_parents,
235
                                                   missing_parents)
236
                matches.update(parents_matches)
237
            pp.next_phase()
238
            rn._update_tree(required_parents, matches)
3193.8.18 by Aaron Bentley
Move all rename-guessing into RenameMap
239
        finally:
3193.8.25 by Aaron Bentley
Improve progress reporting.
240
            task.finished()
3193.8.23 by Aaron Bentley
Split up guess_renames further.
241
3193.8.24 by Aaron Bentley
Use tree member instead of passing it in
242
    def _update_tree(self, required_parents, matches):
243
        self.tree.add(required_parents)
3193.8.27 by Aaron Bentley
Use apply_inventory_delta to rename files.
244
        delta = []
245
        file_id_matches = dict((f, p) for p, f in matches.items())
246
        for old_path, entry in self.tree.iter_entries_by_dir(matches.values()):
247
            new_path = file_id_matches[entry.file_id]
3193.8.29 by Aaron Bentley
Use split instead of basename/dirname
248
            parent_path, new_name = osutils.split(new_path)
3193.8.27 by Aaron Bentley
Use apply_inventory_delta to rename files.
249
            parent_id = matches.get(parent_path)
250
            if parent_id is None:
251
                parent_id = self.tree.path2id(parent_path)
252
            new_entry = entry.copy()
253
            new_entry.parent_id = parent_id
3193.8.29 by Aaron Bentley
Use split instead of basename/dirname
254
            new_entry.name = new_name
3193.8.27 by Aaron Bentley
Use apply_inventory_delta to rename files.
255
            delta.append((old_path, new_path, new_entry.file_id, new_entry))
256
        self.tree.apply_inventory_delta(delta)