1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
from __future__ import absolute_import
25
from .i18n import gettext
31
from .ui import ui_factory
34
class RenameMap(object):
35
"""Determine a mapping of renames."""
37
def __init__(self, tree):
42
def iter_edge_hashes(lines):
43
"""Iterate through the hashes of line pairs (which make up an edge).
45
The hash is truncated using a modulus to avoid excessive memory
46
consumption by the hitscount dict. A modulus of 10Mi means that the
47
maximum number of keys is 10Mi. (Keys are normally 32 bits, e.g.
50
modulus = 1024 * 1024 * 10
51
for n in range(len(lines)):
52
yield hash(tuple(lines[n:n + 2])) % modulus
54
def add_edge_hashes(self, lines, tag):
55
"""Update edge_hashes to include the given lines.
57
:param lines: The lines to update the hashes for.
58
:param tag: A tag uniquely associated with these lines (i.e. file-id)
60
for my_hash in self.iter_edge_hashes(lines):
61
self.edge_hashes.setdefault(my_hash, set()).add(tag)
63
def add_file_edge_hashes(self, tree, file_ids):
64
"""Update to reflect the hashes for files in the tree.
66
:param tree: The tree containing the files.
67
:param file_ids: A list of file_ids to perform the updates for.
69
desired_files = [(tree.id2path(f), f) for f in file_ids]
70
with ui_factory.nested_progress_bar() as task:
71
for num, (file_id, contents) in enumerate(
72
tree.iter_files_bytes(desired_files)):
73
task.update(gettext('Calculating hashes'), num, len(file_ids))
75
s.writelines(contents)
77
self.add_edge_hashes(s.readlines(), file_id)
79
def hitcounts(self, lines):
80
"""Count the number of hash hits for each tag, for the given lines.
82
Hits are weighted according to the number of tags the hash is
83
associated with; more tags means that the hash is less rare and should
85
:param lines: The lines to calculate hashes of.
86
:return: a dict of {tag: hitcount}
89
for my_hash in self.iter_edge_hashes(lines):
90
tags = self.edge_hashes.get(my_hash)
97
hits[tag] += 1.0 / taglen
100
def get_all_hits(self, paths):
101
"""Find all the hit counts for the listed paths in the tree.
103
:return: A list of tuples of count, path, file_id.
106
with ui_factory.nested_progress_bar() as task:
107
for num, path in enumerate(paths):
108
task.update(gettext('Determining hash hits'), num, len(paths))
109
hits = self.hitcounts(self.tree.get_file_lines(path))
110
all_hits.extend((v, path, k) for k, v in viewitems(hits))
113
def file_match(self, paths):
114
"""Return a mapping from file_ids to the supplied paths."""
115
return self._match_hits(self.get_all_hits(paths))
118
def _match_hits(hit_list):
119
"""Using a hit list, determine a path-to-fileid map.
121
The hit list is a list of (count, path, file_id), where count is a
122
(possibly float) number, with higher numbers indicating stronger
125
seen_file_ids = set()
127
for count, path, file_id in sorted(hit_list, reverse=True):
128
if path in path_map or file_id in seen_file_ids:
130
path_map[path] = file_id
131
seen_file_ids.add(file_id)
134
def get_required_parents(self, matches):
135
"""Return a dict of all file parents that must be versioned.
137
The keys are the required parents and the values are sets of their
140
required_parents = {}
144
path = osutils.dirname(path)
145
if self.tree.is_versioned(path):
147
required_parents.setdefault(path, []).append(child)
149
for parent, children in viewitems(required_parents):
150
child_file_ids = set()
151
for child in children:
152
file_id = matches.get(child)
153
if file_id is not None:
154
child_file_ids.add(file_id)
155
require_ids[parent] = child_file_ids
158
def match_parents(self, required_parents, missing_parents):
159
"""Map parent directories to file-ids.
161
This is done by finding similarity between the file-ids of children of
162
required parent directories and the file-ids of children of missing
166
for file_id, file_id_children in viewitems(missing_parents):
167
for path, path_children in viewitems(required_parents):
168
hits = len(path_children.intersection(file_id_children))
170
all_hits.append((hits, path, file_id))
171
return self._match_hits(all_hits)
173
def _find_missing_files(self, basis):
174
missing_files = set()
176
candidate_files = set()
177
with ui_factory.nested_progress_bar() as task:
178
iterator = self.tree.iter_changes(basis, want_unversioned=True,
180
for change in iterator:
181
if change.kind[1] is None and change.versioned[1]:
182
if not self.tree.has_filename(
183
self.tree.id2path(change.parent_id[0])):
184
missing_parents.setdefault(
185
change.parent_id[0], set()).add(change.file_id)
186
if change.kind[0] == 'file':
187
missing_files.add(change.file_id)
189
# other kinds are not handled
191
if change.versioned == (False, False):
192
if self.tree.is_ignored(change.path[1]):
194
if change.kind[1] == 'file':
195
candidate_files.add(change.path[1])
196
if change.kind[1] == 'directory':
197
for _dir, children in self.tree.walkdirs(change.path[1]):
198
for child in children:
199
if child[2] == 'file':
200
candidate_files.add(child[0])
201
return missing_files, missing_parents, candidate_files
204
def guess_renames(klass, from_tree, to_tree, dry_run=False):
205
"""Guess which files to rename, and perform the rename.
207
We assume that unversioned files and missing files indicate that
208
versioned files have been renamed outside of Bazaar.
210
:param from_tree: A tree to compare from
211
:param to_tree: A write-locked working tree.
213
required_parents = {}
214
with ui_factory.nested_progress_bar() as task:
215
pp = progress.ProgressPhase('Guessing renames', 4, task)
216
with from_tree.lock_read():
219
missing_files, missing_parents, candidate_files = (
220
rn._find_missing_files(from_tree))
222
rn.add_file_edge_hashes(from_tree, missing_files)
224
matches = rn.file_match(candidate_files)
225
parents_matches = matches
226
while len(parents_matches) > 0:
227
required_parents = rn.get_required_parents(
229
parents_matches = rn.match_parents(required_parents,
231
matches.update(parents_matches)
233
delta = rn._make_inventory_delta(matches)
234
for old, new, file_id, entry in delta:
235
trace.note(gettext("{0} => {1}").format(old, new))
237
to_tree.add(required_parents)
238
to_tree.apply_inventory_delta(delta)
240
def _make_inventory_delta(self, matches):
242
file_id_matches = dict((f, p) for p, f in viewitems(matches))
244
for f in viewvalues(matches):
246
file_id_query.append(self.tree.id2path(f))
247
except errors.NoSuchId:
249
for old_path, entry in self.tree.iter_entries_by_dir(
250
specific_files=file_id_query):
251
new_path = file_id_matches[entry.file_id]
252
parent_path, new_name = osutils.split(new_path)
253
parent_id = matches.get(parent_path)
254
if parent_id is None:
255
parent_id = self.tree.path2id(parent_path)
256
if parent_id is None:
257
added, ignored = self.tree.smart_add(
258
[parent_path], recurse=False)
259
if len(ignored) > 0 and ignored[0] == parent_path:
262
parent_id = self.tree.path2id(parent_path)
263
if entry.name == new_name and entry.parent_id == parent_id:
265
new_entry = entry.copy()
266
new_entry.parent_id = parent_id
267
new_entry.name = new_name
268
delta.append((old_path, new_path, new_entry.file_id, new_entry))