        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        cur = 0
        num_lines = len(lines)
        while cur < num_lines:
            header = lines[cur]
            cur += 1
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[cur:cur+c])
            cur += c

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines, version))
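
    # Illustrative sketch (not from the original source; factory name
    # assumed): the line-delta format parsed above is a 'start,end,count'
    # header line followed by count replacement lines, each hunk annotated
    # with the supplied version id:
    #
    #   >>> f = KnitPlainFactory()
    #   >>> list(f.parse_line_delta_iter(['1,3,2\n', 'new a\n', 'new b\n'], 'v2'))
    #   [(1, 3, 2, [('v2', 'new a\n'), ('v2', 'new b\n')])]
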
    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return iter(lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next = lines.next
        for header in lines:
            header = header.split(',')
            count = int(header[2])
            for i in xrange(count):
                yield next()

    def lower_fulltext(self, content):
        return content.text()
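
    # Illustrative sketch (input data assumed): get_linedelta_content skips
    # the hunk headers and yields only the content lines of each hunk:
    #
    #   >>> list(f.get_linedelta_content(['1,3,2\n', 'new a\n', 'new b\n']))
    #   ['new a\n', 'new b\n']
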
    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           self.transport.abspath(self.filename))

    def _check_should_delta(self, first_parents):
        """Iterate back through the parent listing, looking for a fulltext.

        This is used when we want to decide whether to add a delta or a new
        fulltext. It searches for _max_delta_chain parents. When it finds a
        fulltext parent, it sees if the total size of the deltas leading up to
        it is large enough to indicate that we want a new full text anyway.

        Return True if we should create a new delta, False if we should use a
        fulltext.
        """
        delta_size = 0
        fulltext_size = None
        delta_parents = first_parents
        for count in xrange(self._max_delta_chain):
            parent = delta_parents[0]
            method = self._index.get_method(parent)
            pos, size = self._index.get_position(parent)
            if method == 'fulltext':
                fulltext_size = size
                break
            delta_size += size
            delta_parents = self._index.get_parents(parent)
        else:
            # We couldn't find a fulltext, so we must create a new one
            return False

        return fulltext_size > delta_size
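
    # Worked sketch (numbers assumed, not from the original source): if the
    # chain behind a candidate delta is delta(800) -> delta(1200) ->
    # fulltext(10000), then delta_size is 2000 and fulltext_size is 10000,
    # so 10000 > 2000 returns True and the caller stores another delta.
    # Once the accumulated deltas outgrow the fulltext, the method returns
    # False and a fresh fulltext is stored instead.
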
    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, [])  # should we check the lines ?

        # but we need to set up a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = set(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get an in-component-order queue:
        for version_id in self.versions():
            if version_id in requested_versions:
                data_pos, length = self._index.get_position(version_id)
                version_id_records.append((version_id, data_pos, length))

        total = len(version_id_records)
        for version_idx, (version_id, data, sha_value) in \
            enumerate(self._data.read_records_iter(version_id_records)):
            pb.update('Walking content.', version_idx, total)
            method = self._index.get_method(version_id)
            assert method in ('fulltext', 'line-delta')
            if method == 'fulltext':
                line_iterator = self.factory.get_fulltext_content(data)
            else:
                line_iterator = self.factory.get_linedelta_content(data)
            for line in line_iterator:
                yield line

        pb.update('Walking content.', total, total)
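
        # Hedged usage sketch (caller code assumed; this fragment appears to
        # be the body of iter_lines_added_or_present_in_versions): a caller
        # wanting every distinct line in a set of versions can drain the
        # iterator into a set:
        #
        #   unique = set(knit.iter_lines_added_or_present_in_versions(
        #       ['version-a', 'version-b']))
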
def num_versions(self):

        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        pb = ui.ui_factory.nested_progress_bar()
        try:
            pb.update('read knit index', 0, 1)
            try:
                fp = self._transport.get(self._filename)
                try:
                    # _load_data may raise NoSuchFile if the target knit is
                    # completely empty.
                    self._load_data(fp)
                finally:
                    fp.close()
            except NoSuchFile:
                if mode != 'w' or not create:
                    raise
                elif delay_create:
                    self._need_to_create = True
                else:
                    self._transport.put_bytes_non_atomic(
                        self._filename, self.HEADER, mode=self._file_mode)
        finally:
            pb.update('read knit index', 1, 1)
            pb.finished()

    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted into FOO.

        NOTE: the function is retained here for clarity, and for possible
              use in partial index reads. However bulk processing now has
              it inlined in __init__ for inner-loop optimisation.
        """
        result = []
        for value in compressed_parents:
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result
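
    # Illustrative sketch for _parse_parents (values assumed): with
    # self._history == ['rev-1', 'rev-2'], the compressed parent list
    # ['1', '.ghost-id'] decodes to ['rev-2', 'ghost-id'] - a plain integer
    # indexes _history, while a leading '.' marks an uncompressed (possibly
    # ghost) reference stored literally.
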

    def _load_data(self, fp):
        cache = self._cache
        history = self._history
        decode_utf8 = cache_utf8.decode

        self.check_header(fp)
        # readlines reads the whole file at once:
        # bad for transports like http, good for local disk
        # we save 60 ms doing this one change (
        # from calling readline each time to calling
        # readlines once).
        # probably what we want for nice behaviour on
        # http is a incremental readlines that yields, or
        # a check for local vs non local indexes,
        history_top = len(history) - 1
        for line in fp.readlines():
            rec = line.split()
            if len(rec) < 5 or rec[-1] != ':':
                # corrupt line.
                # FIXME: in the future we should determine if its a
                # short write - and ignore it
                # or a different failure, and raise. RBC 20060407
                continue

            parents = []
            for value in rec[4:-1]:
                if value[0] == '.':
                    # uncompressed reference
                    parents.append(decode_utf8(value[1:]))
                else:
                    parents.append(history[int(value)])

            version_id, options, pos, size = rec[:4]
            version_id = decode_utf8(version_id)

            # See self._cache_version
            # only want the _history index to reference the 1st
            # index entry for version_id
            if version_id not in cache:
                history_top += 1
                index = history_top
                history.append(version_id)
            else:
                index = cache[version_id][5]
            cache[version_id] = (version_id,
                                 options.split(','),
                                 int(pos),
                                 int(size),
                                 parents,
                                 index)
            # end self._cache_version
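
    # Sketch of the cache layout built above (values assumed): each entry
    # is a 6-tuple, e.g.
    #
    #   cache['rev-1'] == ('rev-1', ['fulltext'], 0, 120, [], 0)
    #
    # i.e. (version_id, options, byte offset, byte length, parent list,
    # index of the first _history entry for this version) - matching the
    # [2], [3], [4] and [5] accesses elsewhere in this class.
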
    def get_graph(self):
        return [(vid, idx[4]) for vid, idx in self._cache.iteritems()]

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        cache = self._cache
        while pending:
            version = pending.pop()
            # trim ghosts
            try:
                parents = [p for p in cache[version][4] if p in cache]
            except KeyError:
                raise RevisionNotPresent(version, self._filename)
            # if not completed and not a ghost
            pending.update([p for p in parents if p not in graph])
            graph[version] = parents
        return topo_sort(graph.items())
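
    # Illustrative sketch (graph assumed): for a linear history
    # 'a' <- 'b' <- 'c', get_ancestry(['c']) builds the graph
    # {'c': ['b'], 'b': ['a'], 'a': []} and topo_sort returns
    # ['a', 'b', 'c']; parents missing from the index (ghosts) are
    # silently trimmed.
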
    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        self.check_versions_present(versions)
        cache = self._cache
        graph = {}
        pending = set(versions)
        while pending:
            version = pending.pop()
            try:
                parents = cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # if not completed
                pending.update([p for p in parents if p not in graph])
                graph[version] = parents
        return topo_sort(graph.items())
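
    # Unlike get_ancestry above, a ghost is kept rather than trimmed: if
    # 'b' names an absent parent 'a' (data assumed), 'a' appears in the
    # result with an empty parent list, so get_ancestry_with_ghosts(['b'])
    # returns ['a', 'b'].
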
        encode_utf8 = cache_utf8.encode
        orig_history = self._history[:]
        orig_cache = self._cache.copy()

        try:
            for version_id, options, pos, size, parents in versions:
                line = "\n%s %s %s %s %s :" % (encode_utf8(version_id),
                                               ','.join(options),
                                               pos,
                                               size,
                                               self._version_list_to_index(parents))
                assert isinstance(line, str), \
                    'content must be utf-8 encoded: %r' % (line,)
                lines.append(line)
                self._cache_version(version_id, options, pos, size, parents)
            if not self._need_to_create:
                self._transport.append_bytes(self._filename, ''.join(lines))
            else:
                sio = StringIO()
                sio.write(self.HEADER)
                sio.writelines(lines)
                sio.seek(0)
                self._transport.put_file_non_atomic(self._filename, sio,
                                    create_parent_dir=self._create_parent_dir,
                                    mode=self._file_mode,
                                    dir_mode=self._dir_mode)
                self._need_to_create = False
        except:
            # If any problems happen, restore the original values and re-raise
            self._history = orig_history
            self._cache = orig_cache
            raise
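
        # Sketch of the index line emitted above (values assumed): for a
        # 120-byte fulltext 'rev-2' at offset 0 whose single parent is
        # _history entry 0, the record is
        #
        #   rev-2 fulltext 0 120 0 :
        #
        # with parents written either as numeric _history indices or as
        # '.'-prefixed literal version ids - the inverse of what
        # _load_data / _parse_parents read back.
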
    def has_version(self, version_id):
        """True if the version is in the index."""
        return version_id in self._cache

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        entry = self._cache[version_id]
        return entry[2], entry[3]

    def get_method(self, version_id):
        """Return compression method of specified version."""

        as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        rec = self._check_header(version_id, df.readline())
        return df, rec

    def _check_header(self, version_id, line):
        rec = line.split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename,
                              'unexpected number of elements in record header')
        if cache_utf8.decode(rec[1]) != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r'
                              % (version_id, rec[1]))
        return rec

    def _parse_record(self, version_id, data):
        # profiling notes:
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df = GzipFile(mode='rb', fileobj=StringIO(data))

        record_contents = df.readlines()
        header = record_contents.pop(0)
        rec = self._check_header(version_id, header)

        last_line = record_contents.pop()
        assert len(record_contents) == int(rec[2])
        if last_line != 'end %s\n' % rec[1]:
            raise KnitCorrupt(self._filename,
                              'unexpected version end line %r, wanted %r'
                              % (last_line, version_id))
        df.close()
        return record_contents, rec[3]
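
        # Sketch of the uncompressed record layout parsed above (sample
        # values assumed):
        #
        #   version rev-1 2 <sha1>\n    (header validated by _check_header)
        #   line one\n
        #   line two\n
        #   end rev-1\n
        #
        # rec[2] carries the content line count and rec[3] the digest that
        # is returned alongside the content lines.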