# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

from __future__ import absolute_import

from io import BytesIO

from . import (
    errors,
    osutils,
    progress,
    trace,
    )
from .i18n import gettext
# viewitems/viewvalues are assumed to come from the project's bundled six
# compatibility helpers.
from .six import (
    viewitems,
    viewvalues,
    )
from .ui import ui_factory


class RenameMap(object):
    """Determine a mapping of renames."""

    def __init__(self, tree):
        self.tree = tree
        self.edge_hashes = {}

    @staticmethod
    def iter_edge_hashes(lines):
        """Iterate through the hashes of line pairs (which make up an edge).

        The hash is truncated using a modulus to avoid excessive memory
        consumption by the hitcounts dict. A modulus of 10Mi means that the
        maximum number of keys is 10Mi. (Keys are normally 32 bits, e.g.
        4 Gi.)
        """
        modulus = 1024 * 1024 * 10
        for n in range(len(lines)):
            yield hash(tuple(lines[n:n+2])) % modulus
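
    # Informal example (kept as a comment only, since the hash values are
    # platform dependent):
    #
    #   lines = [b'a\n', b'b\n', b'c\n']
    #   edges = list(RenameMap.iter_edge_hashes(lines))
    #   # -> three hashes, for the pairs (b'a\n', b'b\n'), (b'b\n', b'c\n')
    #   #    and the trailing single-line edge (b'c\n',), each reduced
    #   #    modulo 10Mi.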

    def add_edge_hashes(self, lines, tag):
        """Update edge_hashes to include the given lines.

        :param lines: The lines to update the hashes for.
        :param tag: A tag uniquely associated with these lines (i.e. file-id)
        """
        for my_hash in self.iter_edge_hashes(lines):
            self.edge_hashes.setdefault(my_hash, set()).add(tag)

    def add_file_edge_hashes(self, tree, file_ids):
        """Update to reflect the hashes for files in the tree.

        :param tree: The tree containing the files.
        :param file_ids: A list of file_ids to perform the updates for.
        """
        desired_files = [(tree.id2path(f), f) for f in file_ids]
        with ui_factory.nested_progress_bar() as task:
            for num, (file_id, contents) in enumerate(
                    tree.iter_files_bytes(desired_files)):
                task.update(gettext('Calculating hashes'), num, len(file_ids))
                s = BytesIO()
                s.writelines(contents)
                s.seek(0)
                self.add_edge_hashes(s.readlines(), file_id)

    def hitcounts(self, lines):
        """Count the number of hash hits for each tag, for the given lines.

        Hits are weighted according to the number of tags the hash is
        associated with; more tags means that the hash is less rare and should
        carry less weight.

        :param lines: The lines to calculate hashes of.
        :return: a dict of {tag: hitcount}
        """
        hits = {}
        for my_hash in self.iter_edge_hashes(lines):
            tags = self.edge_hashes.get(my_hash)
            if tags is None:
                continue
            taglen = len(tags)
            for tag in tags:
                if tag not in hits:
                    hits[tag] = 0
                hits[tag] += 1.0 / taglen
        return hits
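
    # Weighting illustration (informal): if an edge hash is associated with
    # exactly one known file-id, a hit adds a full 1.0 to that file-id's
    # score; if the same hash is shared by four file-ids, each only gains
    # 0.25, so common boilerplate edges contribute little.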

    def get_all_hits(self, paths):
        """Find all the hit counts for the listed paths in the tree.

        :return: A list of tuples of count, path, file_id.
        """
        all_hits = []
        with ui_factory.nested_progress_bar() as task:
            for num, path in enumerate(paths):
                task.update(gettext('Determining hash hits'), num, len(paths))
                hits = self.hitcounts(self.tree.get_file_lines(path))
                all_hits.extend((v, path, k) for k, v in viewitems(hits))
        return all_hits

    def file_match(self, paths):
        """Return a mapping from the supplied paths to file_ids."""
        return self._match_hits(self.get_all_hits(paths))

    @staticmethod
    def _match_hits(hit_list):
        """Using a hit list, determine a path-to-fileid map.

        The hit list is a list of (count, path, file_id), where count is a
        (possibly float) number, with higher numbers indicating stronger
        matches.
        """
        seen_file_ids = set()
        path_map = {}
        for count, path, file_id in sorted(hit_list, reverse=True):
            if path in path_map or file_id in seen_file_ids:
                continue
            path_map[path] = file_id
            seen_file_ids.add(file_id)
        return path_map
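
    # Matching illustration (informal): sorting the (count, path, file_id)
    # tuples in reverse order considers the strongest hit first; once a path
    # or a file-id has been claimed, weaker hits involving either are
    # skipped, yielding a greedy one-to-one assignment.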

    def get_required_parents(self, matches):
        """Return a dict of all file parents that must be versioned.

        The keys are the required parents and the values are sets of their
        children's file-ids.
        """
        required_parents = {}
        for path in matches:
            while True:
                child = path
                path = osutils.dirname(path)
                if self.tree.is_versioned(path):
                    break
                required_parents.setdefault(path, []).append(child)
        require_ids = {}
        for parent, children in viewitems(required_parents):
            child_file_ids = set()
            for child in children:
                file_id = matches.get(child)
                if file_id is not None:
                    child_file_ids.add(file_id)
            require_ids[parent] = child_file_ids
        return require_ids
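
    # Illustration (informal, paths invented): if 'new/dir/a.txt' was matched
    # and neither 'new' nor 'new/dir' is versioned yet, 'new/dir' is recorded
    # as a required parent of 'new/dir/a.txt' and 'new' as a required parent
    # of 'new/dir'; each required parent is then mapped to the file-ids of
    # those of its children that were themselves matched.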

    def match_parents(self, required_parents, missing_parents):
        """Map parent directories to file-ids.

        This is done by finding similarity between the file-ids of children of
        required parent directories and the file-ids of children of missing
        parent directories.
        """
        all_hits = []
        for file_id, file_id_children in viewitems(missing_parents):
            for path, path_children in viewitems(required_parents):
                hits = len(path_children.intersection(file_id_children))
                if hits > 0:
                    all_hits.append((hits, path, file_id))
        return self._match_hits(all_hits)
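
    # Scoring illustration (informal): a required parent directory whose
    # matched children share, say, three file-ids with the children of a
    # vanished directory scores 3 against that directory's old file-id;
    # _match_hits() then keeps the strongest pairings.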

    def _find_missing_files(self, basis):
        missing_files = set()
        missing_parents = {}
        candidate_files = set()
        with ui_factory.nested_progress_bar() as task:
            iterator = self.tree.iter_changes(basis, want_unversioned=True,
                                              pb=task)
            for (file_id, paths, changed_content, versioned, parent, name,
                 kind, executable) in iterator:
                if kind[1] is None and versioned[1]:
                    if not self.tree.has_filename(
                            self.tree.id2path(parent[0])):
                        missing_parents.setdefault(parent[0], set()).add(
                            file_id)
                    if kind[0] == 'file':
                        missing_files.add(file_id)
                    else:
                        # other kinds are not handled
                        pass
                if versioned == (False, False):
                    if self.tree.is_ignored(paths[1]):
                        continue
                    if kind[1] == 'file':
                        candidate_files.add(paths[1])
                    if kind[1] == 'directory':
                        for _dir, children in self.tree.walkdirs(paths[1]):
                            for child in children:
                                if child[2] == 'file':
                                    candidate_files.add(child[0])
        return missing_files, missing_parents, candidate_files
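
    # Summary (informal): 'missing_files' holds the file-ids of versioned
    # files whose content has disappeared from disk, 'missing_parents' maps
    # the file-ids of vanished parent directories to the file-ids of their
    # missing children, and 'candidate_files' collects the unversioned,
    # unignored paths that may be the renamed copies.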

    @classmethod
    def guess_renames(klass, from_tree, to_tree, dry_run=False):
        """Guess which files to rename, and perform the rename.

        We assume that unversioned files and missing files indicate that
        versioned files have been renamed outside of Bazaar.

        :param from_tree: A tree to compare from
        :param to_tree: A write-locked working tree.
        """
        required_parents = {}
        with ui_factory.nested_progress_bar() as task:
            pp = progress.ProgressPhase('Guessing renames', 4, task)
            with from_tree.lock_read():
                rn = klass(to_tree)
                pp.next_phase()
                missing_files, missing_parents, candidate_files = (
                    rn._find_missing_files(from_tree))
                pp.next_phase()
                rn.add_file_edge_hashes(from_tree, missing_files)
            pp.next_phase()
            matches = rn.file_match(candidate_files)
            parents_matches = matches
            while len(parents_matches) > 0:
                required_parents = rn.get_required_parents(
                    parents_matches)
                parents_matches = rn.match_parents(required_parents,
                                                   missing_parents)
                matches.update(parents_matches)
            pp.next_phase()
            delta = rn._make_inventory_delta(matches)
            for old, new, file_id, entry in delta:
                trace.note(gettext("{0} => {1}").format(old, new))
            if not dry_run:
                to_tree.add(required_parents)
                to_tree.apply_inventory_delta(delta)
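
    # Usage sketch (informal; variable names are illustrative): callers
    # normally compare the basis tree against a write-locked working tree,
    # for example:
    #
    #   with work_tree.lock_tree_write():
    #       RenameMap.guess_renames(work_tree.basis_tree(), work_tree,
    #                               dry_run=True)
    #
    # With dry_run=True the proposed "old => new" renames are only reported
    # via trace.note(), not applied.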

    def _make_inventory_delta(self, matches):
        delta = []
        file_id_matches = dict((f, p) for p, f in viewitems(matches))
        file_id_query = []
        for f in viewvalues(matches):
            try:
                file_id_query.append(self.tree.id2path(f))
            except errors.NoSuchId:
                pass
        for old_path, entry in self.tree.iter_entries_by_dir(
                specific_files=file_id_query):
            new_path = file_id_matches[entry.file_id]
            parent_path, new_name = osutils.split(new_path)
            parent_id = matches.get(parent_path)
            if parent_id is None:
                parent_id = self.tree.path2id(parent_path)
                if parent_id is None:
                    added, ignored = self.tree.smart_add(
                        [parent_path], recurse=False)
                    if len(ignored) > 0 and ignored[0] == parent_path:
                        continue
                    else:
                        parent_id = self.tree.path2id(parent_path)
            if entry.name == new_name and entry.parent_id == parent_id:
                continue
            new_entry = entry.copy()
            new_entry.parent_id = parent_id
            new_entry.name = new_name
            delta.append((old_path, new_path, new_entry.file_id, new_entry))
        return delta
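
    # Each delta entry is a (old_path, new_path, file_id, new_inventory_entry)
    # tuple, the shape expected by apply_inventory_delta() in guess_renames()
    # above.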