    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = new_lines.text()
        old_texts = self.text()
        s = KnitSequenceMatcher(None, old_texts, new_texts)
        for tag, i1, i2, j1, j2 in s.get_opcodes():
            if tag == 'equal':
                continue
            # ofrom, oto, length, data
            yield i1, i2, j2 - j1, new_lines._lines[j1:j2]

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))
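
    # Illustrative sketch (not from the original source; the revision id is
    # invented): for an old content whose text() is ['a\n', 'b\n'] and a new
    # content whose _lines are [('rev-new', 'a\n'), ('rev-new', 'c\n')],
    # line_delta() yields one replace region:
    #
    #     [(1, 2, 1, [('rev-new', 'c\n')])]
    #
    # old lines [1:2) are replaced by a single line carrying its origin revid.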

    def parse_line_delta(self, lines, version_id):
        """Convert a line based delta into internal representation.

        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
        """
        result = []
        lines = iter(lines)
        next = lines.next

        cache = {}
        def cache_and_return(line):
            origin, text = line.split(' ', 1)
            return cache.setdefault(origin, origin), text

        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = [tuple(next().split(' ', 1)) for i in xrange(count)]
            result.append((start, end, count, contents))
        return result
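
    # Hedged example of the wire form parsed above (revision id invented):
    # the two lines '1,2,1\n' and 'rev-new c\n' decode to
    # [(1, 2, 1, [('rev-new', 'c\n')])]: the header carries start, end and
    # count, and each following line is 'origin text'.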

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return (line.split(' ', 1)[1] for line in lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next = lines.next
        for header in lines:
            header = header.split(',')
            count = int(header[2])
            for i in xrange(count):
                origin, text = next().split(' ', 1)
                yield text

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        return ['%s %s' % (o, t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend(origin + ' ' + text
                       for origin, text in lines)
        return out
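
    # Hedged sketch (invented revision id): lower_line_delta() inverts
    # parse_line_delta(), so [(1, 2, 1, [('rev-new', 'c\n')])] serializes
    # back to ['1,2,1\n', 'rev-new c\n'].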


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version_id):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - it's just a constant versionid.
        """
        return self.make(content, version_id)

    def parse_line_delta_iter(self, lines, version_id):
        cur = 0
        num_lines = len(lines)
        while cur < num_lines:
            header = lines[cur]
            cur += 1
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version_id] * c, lines[cur:cur+c])
            cur += c

    def parse_line_delta(self, lines, version_id):
        return list(self.parse_line_delta_iter(lines, version_id))
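
    # Hedged sketch (invented revision id): a plain delta stores no per-line
    # origins, so parse_line_delta(['1,2,1\n', 'c\n'], 'rev-new') returns
    # [(1, 2, 1, [('rev-new', 'c\n')])], pairing every content line with the
    # constant version_id passed in.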

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return iter(lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next = lines.next
        for header in lines:
            header = header.split(',')
            count = int(header[2])
            for i in xrange(count):
                yield next()

    def lower_fulltext(self, content):
        return content.text()

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           self.transport.abspath(self.filename))

    def _check_should_delta(self, first_parents):
        """Iterate back through the parent listing, looking for a fulltext.

        This is used when we want to decide whether to add a delta or a new
        fulltext. It searches for _max_delta_chain parents. When it finds a
        fulltext parent, it sees if the total size of the deltas leading up to
        it is large enough to indicate that we want a new full text anyway.

        Return True if we should create a new delta, False if we should use a
        full text.
        """
        delta_size = 0
        fulltext_size = None
        delta_parents = first_parents
        for count in xrange(self._max_delta_chain):
            parent = delta_parents[0]
            method = self._index.get_method(parent)
            pos, size = self._index.get_position(parent)
            if method == 'fulltext':
                fulltext_size = size
                break
            delta_size += size
            delta_parents = self._index.get_parents(parent)
        else:
            # We couldn't find a fulltext, so we must create a new one
            return False

        return fulltext_size > delta_size
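
    # Hedged numeric sketch (invented sizes): with two 150-byte deltas leading
    # back to a 250-byte fulltext, delta_size is 300 when the fulltext is
    # found, so fulltext_size > delta_size is False and the caller stores a
    # fresh fulltext instead of extending the chain.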

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, []) # should we check the lines ?

            text_map[version_id] = text
        return text_map, final_content

    def iter_lines_added_or_present_in_versions(self, version_ids=None,
                                                pb=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        else:
            version_ids = [osutils.safe_revision_id(v) for v in version_ids]
        if pb is None:
            pb = progress.DummyProgress()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = set(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get an in-component-order queue:
        for version_id in self.versions():
            if version_id in requested_versions:
                data_pos, length = self._index.get_position(version_id)
                version_id_records.append((version_id, data_pos, length))

        total = len(version_id_records)
        for version_idx, (version_id, data, sha_value) in \
            enumerate(self._data.read_records_iter(version_id_records)):
            pb.update('Walking content.', version_idx, total)
            method = self._index.get_method(version_id)
            assert method in ('fulltext', 'line-delta')
            if method == 'fulltext':
                line_iterator = self.factory.get_fulltext_content(data)
            else:
                line_iterator = self.factory.get_linedelta_content(data)
            for line in line_iterator:
                yield line

        pb.update('Walking content.', total, total)

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        try:
            fp = self._transport.get(self._filename)
            try:
                # _load_data may raise NoSuchFile if the target knit is
                # completely empty.
                self._load_data(fp)
            finally:
                fp.close()
        except NoSuchFile:
            if mode != 'w' or not create:
                raise
            elif delay_create:
                self._need_to_create = True
            else:
                self._transport.put_bytes_non_atomic(
                    self._filename, self.HEADER, mode=self._file_mode)

    def _load_data(self, fp):
        cache = self._cache
        history = self._history

        self.check_header(fp)
        # readlines reads the whole file at once:
        # bad for transports like http, good for local disk
        # we save 60 ms doing this one change (
        # from calling readline each time to calling
        # readlines once).
        # probably what we want for nice behaviour on
        # http is an incremental readlines that yields, or
        # a check for local vs non local indexes,
        history_top = len(history) - 1
        for line in fp.readlines():
            rec = line.split()
            if len(rec) < 5 or rec[-1] != ':':
                # corrupt line.
                # FIXME: in the future we should determine if it's a
                # short write - and ignore it
                # or a different failure, and raise. RBC 20060407
                continue

            parents = []
            for value in rec[4:-1]:
                if value[0] == '.':
                    # uncompressed reference
                    parent_id = value[1:]
                else:
                    parent_id = history[int(value)]
                parents.append(parent_id)

            version_id, options, pos, size = rec[:4]

            # See self._cache_version
            # only want the _history index to reference the 1st
            # index entry for version_id
            if version_id not in cache:
                history_top += 1
                index = history_top
                history.append(version_id)
            else:
                index = cache[version_id][5]
            cache[version_id] = (version_id,
                                 options.split(','),
                                 int(pos),
                                 int(size),
                                 parents,
                                 index)
            # end self._cache_version

    def get_graph(self):
        return [(vid, idx[4]) for vid, idx in self._cache.iteritems()]

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        cache = self._cache
        while pending:
            version = pending.pop()
            # trim ghosts
            try:
                parents = [p for p in cache[version][4] if p in cache]
            except KeyError:
                raise RevisionNotPresent(version, self._filename)
            # if not completed and not a ghost
            pending.update([p for p in parents if p not in graph])
            graph[version] = parents
        return topo_sort(graph.items())

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        self.check_versions_present(versions)
        cache = self._cache
        graph = {}
        pending = set(versions)
        while pending:
            version = pending.pop()
            try:
                parents = cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # if not completed and not a ghost
                pending.update([p for p in parents if p not in graph])
                graph[version] = parents
        return topo_sort(graph.items())
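
    # Hedged sketch (invented revision ids): if 'rev-b' names a parent 'rev-a'
    # that is absent from the index, get_ancestry(['rev-b']) trims the ghost
    # and returns ['rev-b'], while get_ancestry_with_ghosts(['rev-b']) fakes
    # an empty entry for it and returns ['rev-a', 'rev-b'].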

    def add_versions(self, versions):
        """Add multiple versions to the index.

        versions is a list of tuples:
        (version_id, options, pos, size, parents).
        """
        lines = []
        orig_history = self._history[:]
        orig_cache = self._cache.copy()

        try:
            for version_id, options, pos, size, parents in versions:
                line = "\n%s %s %s %s %s :" % (version_id,
                                               ','.join(options),
                                               pos,
                                               size,
                                               self._version_list_to_index(parents))
                assert isinstance(line, str), \
                    'content must be utf-8 encoded: %r' % (line,)
                lines.append(line)
                self._cache_version(version_id, options, pos, size, parents)
            if not self._need_to_create:
                self._transport.append_bytes(self._filename, ''.join(lines))
            else:
                sio = StringIO()
                sio.write(self.HEADER)
                sio.writelines(lines)
                sio.seek(0)
                self._transport.put_file_non_atomic(self._filename, sio,
                                    create_parent_dir=self._create_parent_dir,
                                    mode=self._file_mode,
                                    dir_mode=self._dir_mode)
                self._need_to_create = False
        except:
            # If any problems happen, restore the original values and re-raise
            self._history = orig_history
            self._cache = orig_cache
            raise
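
    # Hedged usage sketch (invented values): add_versions([('rev-2',
    # ['line-delta'], 423, 108, ['rev-1'])]) appends one index line in the
    # format shown in _load_data; the try/except above leaves self._cache and
    # self._history untouched if the transport write fails.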

    def has_version(self, version_id):
        """True if the version is in the index."""
        return version_id in self._cache

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        entry = self._cache[version_id]
        return entry[2], entry[3]

    def get_method(self, version_id):
        """Return compression method of specified version."""

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream,
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        try:
            rec = self._check_header(version_id, df.readline())
        except Exception, e:
            raise KnitCorrupt(self._filename,
                              "While reading {%s} got %s(%s)"
                              % (version_id, e.__class__.__name__, str(e)))
        return df, rec

    def _check_header(self, version_id, line):
        rec = line.split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename,
                              'unexpected number of elements in record header')
        if rec[1] != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r'
                              % (version_id, rec[1]))
        return rec
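
    # Hedged sketch of the header validated above (digest shortened, values
    # invented): a data record opens with a line like
    #
    #     version rev-2 2 f1d2d2f9ab...
    #
    # where rec[1] is the version id, rec[2] the content line count and
    # rec[3] the sha1 digest, assuming the 'version <id> <count> <digest>'
    # layout used by the knit data file.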

    def _parse_record(self, version_id, data):
        # profiling notes:
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df = GzipFile(mode='rb', fileobj=StringIO(data))

        try:
            record_contents = df.readlines()
        except Exception, e:
            raise KnitCorrupt(self._filename,
                              "While reading {%s} got %s(%s)"
                              % (version_id, e.__class__.__name__, str(e)))
        header = record_contents.pop(0)
        rec = self._check_header(version_id, header)

        last_line = record_contents.pop()
        if len(record_contents) != int(rec[2]):
            raise KnitCorrupt(self._filename,
                              'incorrect number of lines %s != %s'
                              ' for version {%s}'
                              % (len(record_contents), int(rec[2]),
                                 version_id))
        if last_line != 'end %s\n' % rec[1]:
            raise KnitCorrupt(self._filename,
                              'unexpected version end line %r, wanted %r'
                              % (last_line, version_id))
        df.close()
        return record_contents, rec[3]
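
    # Hedged sketch of a whole decompressed record (invented values):
    #
    #     version rev-2 2 f1d2d2f9ab...
    #     rev-1 a
    #     rev-2 c
    #     end rev-2
    #
    # the header and the 'end <version id>' trailer bracket exactly rec[2]
    # content lines, which _parse_record returns along with the digest.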