# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

from io import BytesIO

from . import (
    errors,
    osutils,
    progress,
    trace,
    )
from .i18n import gettext
from .ui import ui_factory


class RenameMap(object):
    """Determine a mapping of renames."""

    def __init__(self, tree):
        self.tree = tree
        self.edge_hashes = {}

    @staticmethod
    def iter_edge_hashes(lines):
        """Iterate through the hashes of line pairs (which make up an edge).

        The hash is truncated using a modulus to avoid excessive memory
        consumption by the hitcounts dict.  A modulus of 10Mi means that the
        maximum number of keys is 10Mi.  (Keys are normally 32 bits, e.g.
        4 Gi.)
        """
        modulus = 1024 * 1024 * 10
        for n in range(len(lines)):
            yield hash(tuple(lines[n:n + 2])) % modulus
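
    # Illustration (added here, not part of the original module): for a file
    # whose lines are [b'a\n', b'b\n', b'c\n'] the slices hashed are
    # (b'a\n', b'b\n'), (b'b\n', b'c\n') and the trailing single-element
    # tuple (b'c\n',), so a file of N lines contributes N edge hashes.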

    def add_edge_hashes(self, lines, tag):
        """Update edge_hashes to include the given lines.

        :param lines: The lines to update the hashes for.
        :param tag: A tag uniquely associated with these lines (i.e. file-id)
        """
        for my_hash in self.iter_edge_hashes(lines):
            self.edge_hashes.setdefault(my_hash, set()).add(tag)

    def add_file_edge_hashes(self, tree, file_ids):
        """Update to reflect the hashes for files in the tree.

        :param tree: The tree containing the files.
        :param file_ids: A list of file_ids to perform the updates for.
        """
        desired_files = [(tree.id2path(f), f) for f in file_ids]
        with ui_factory.nested_progress_bar() as task:
            for num, (file_id, contents) in enumerate(
                    tree.iter_files_bytes(desired_files)):
                task.update(gettext('Calculating hashes'), num, len(file_ids))
                s = BytesIO()
                s.writelines(contents)
                s.seek(0)
                self.add_edge_hashes(s.readlines(), file_id)

    def hitcounts(self, lines):
        """Count the number of hash hits for each tag, for the given lines.

        Hits are weighted according to the number of tags the hash is
        associated with; more tags means that the hash is less rare and should
        tend to be ignored.

        :param lines: The lines to calculate hashes of.
        :return: a dict of {tag: hitcount}
        """
        hits = {}
        for my_hash in self.iter_edge_hashes(lines):
            tags = self.edge_hashes.get(my_hash)
            if tags is None:
                continue
            taglen = len(tags)
            for tag in tags:
                if tag not in hits:
                    hits[tag] = 0
                hits[tag] += 1.0 / taglen
        return hits
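
    # Worked example (added, not in the original): if a given edge hash is
    # associated with the tags {'id-a', 'id-b'}, each of those tags receives
    # a weight of 1.0 / 2 = 0.5 for that hit, so hashes shared by many files
    # contribute correspondingly less to any single file's score.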

    def get_all_hits(self, paths):
        """Find all the hit counts for the listed paths in the tree.

        :return: A list of tuples of count, path, file_id.
        """
        all_hits = []
        with ui_factory.nested_progress_bar() as task:
            for num, path in enumerate(paths):
                task.update(gettext('Determining hash hits'), num, len(paths))
                hits = self.hitcounts(self.tree.get_file_lines(path))
                all_hits.extend((v, path, k) for k, v in hits.items())
        return all_hits

    def file_match(self, paths):
        """Return a mapping from file_ids to the supplied paths."""
        return self._match_hits(self.get_all_hits(paths))

    @staticmethod
    def _match_hits(hit_list):
        """Using a hit list, determine a path-to-fileid map.

        The hit list is a list of (count, path, file_id), where count is a
        (possibly float) number, with higher numbers indicating stronger
        matches.
        """
        seen_file_ids = set()
        path_map = {}
        for count, path, file_id in sorted(hit_list, reverse=True):
            if path in path_map or file_id in seen_file_ids:
                continue
            path_map[path] = file_id
            seen_file_ids.add(file_id)
        return path_map
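
    # Illustration (added, not in the original): given the hit list
    # [(3.0, 'new.txt', 'id-1'), (2.0, 'new.txt', 'id-2'),
    #  (1.0, 'other.txt', 'id-1')], the strongest hit is taken first, so the
    # result is {'new.txt': 'id-1'}; the weaker entries are skipped because
    # their path or file-id has already been claimed.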

    def get_required_parents(self, matches):
        """Return a dict of all file parents that must be versioned.

        The keys are the required parents and the values are sets of their
        children.
        """
        required_parents = {}
        for path in matches:
            while True:
                child = path
                path = osutils.dirname(path)
                if self.tree.is_versioned(path):
                    break
                required_parents.setdefault(path, []).append(child)
        require_ids = {}
        for parent, children in required_parents.items():
            child_file_ids = set()
            for child in children:
                file_id = matches.get(child)
                if file_id is not None:
                    child_file_ids.add(file_id)
            require_ids[parent] = child_file_ids
        return require_ids

    def match_parents(self, required_parents, missing_parents):
        """Map parent directories to file-ids.

        This is done by finding similarity between the file-ids of children of
        required parent directories and the file-ids of children of missing
        parent directories.
        """
        all_hits = []
        for file_id, file_id_children in missing_parents.items():
            for path, path_children in required_parents.items():
                hits = len(path_children.intersection(file_id_children))
                if hits > 0:
                    all_hits.append((hits, path, file_id))
        return self._match_hits(all_hits)
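
    # Illustration (added, not in the original): if a missing directory's
    # file-id had the children {'id-1', 'id-2'} in the basis tree, and a
    # required (unversioned) parent path now holds matched children
    # {'id-1', 'id-2', 'id-3'}, the overlap of 2 becomes that pairing's hit
    # count, and _match_hits() then picks the best one-to-one pairing.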

    def _find_missing_files(self, basis):
        missing_files = set()
        missing_parents = {}
        candidate_files = set()
        with ui_factory.nested_progress_bar() as task:
            iterator = self.tree.iter_changes(basis, want_unversioned=True,
                                              pb=task)
            for change in iterator:
                if change.kind[1] is None and change.versioned[1]:
                    if not self.tree.has_filename(
                            self.tree.id2path(change.parent_id[0])):
                        missing_parents.setdefault(
                            change.parent_id[0], set()).add(change.file_id)
                    if change.kind[0] == 'file':
                        missing_files.add(change.file_id)
                    else:
                        # other kinds are not handled
                        pass
                if change.versioned == (False, False):
                    if self.tree.is_ignored(change.path[1]):
                        continue
                    if change.kind[1] == 'file':
                        candidate_files.add(change.path[1])
                    if change.kind[1] == 'directory':
                        for _dir, children in self.tree.walkdirs(
                                change.path[1]):
                            for child in children:
                                if child[2] == 'file':
                                    candidate_files.add(child[0])
        return missing_files, missing_parents, candidate_files

    @classmethod
    def guess_renames(klass, from_tree, to_tree, dry_run=False):
        """Guess which files to rename, and perform the rename.

        We assume that unversioned files and missing files indicate that
        versioned files have been renamed outside of Bazaar.

        :param from_tree: A tree to compare from
        :param to_tree: A write-locked working tree.
        """
        required_parents = {}
        with ui_factory.nested_progress_bar() as task:
            pp = progress.ProgressPhase('Guessing renames', 4, task)
            with from_tree.lock_read():
                rn = klass(to_tree)
                pp.next_phase()
                missing_files, missing_parents, candidate_files = (
                    rn._find_missing_files(from_tree))
                pp.next_phase()
                rn.add_file_edge_hashes(from_tree, missing_files)
            pp.next_phase()
            matches = rn.file_match(candidate_files)
            parents_matches = matches
            while len(parents_matches) > 0:
                required_parents = rn.get_required_parents(
                    parents_matches)
                parents_matches = rn.match_parents(required_parents,
                                                   missing_parents)
                matches.update(parents_matches)
            pp.next_phase()
            delta = rn._make_inventory_delta(matches)
            for old, new, file_id, entry in delta:
                trace.note(gettext("{0} => {1}").format(old, new))
            if not dry_run:
                to_tree.add(required_parents)
                to_tree.apply_inventory_delta(delta)
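
    # Usage sketch (an illustration added here, not part of the original
    # module; ``brz mv --auto`` drives this code in roughly this way):
    #
    #     from breezy.workingtree import WorkingTree
    #     tree, _ = WorkingTree.open_containing('.')
    #     with tree.lock_tree_write():
    #         RenameMap.guess_renames(tree.basis_tree(), tree, dry_run=True)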

    def _make_inventory_delta(self, matches):
        delta = []
        file_id_matches = dict((f, p) for p, f in matches.items())
        file_id_query = []
        for f in matches.values():
            try:
                file_id_query.append(self.tree.id2path(f))
            except errors.NoSuchId:
                pass
        for old_path, entry in self.tree.iter_entries_by_dir(
                specific_files=file_id_query):
            new_path = file_id_matches[entry.file_id]
            parent_path, new_name = osutils.split(new_path)
            parent_id = matches.get(parent_path)
            if parent_id is None:
                parent_id = self.tree.path2id(parent_path)
                if parent_id is None:
                    added, ignored = self.tree.smart_add(
                        [parent_path], recurse=False)
                    if len(ignored) > 0 and ignored[0] == parent_path:
                        continue
                    else:
                        parent_id = self.tree.path2id(parent_path)
            if entry.name == new_name and entry.parent_id == parent_id:
                continue
            new_entry = entry.copy()
            new_entry.parent_id = parent_id
            new_entry.name = new_name
            delta.append((old_path, new_path, new_entry.file_id, new_entry))
        return delta