        see parse_fulltext which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        return ['%s %s' % (o, t) for o, t in content._lines]
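
    # Illustrative sketch (not from the original source): annotated content
    # whose _lines are [('rev-1', 'hello\n')] lowers to ['rev-1 hello\n'],
    # assuming each stored text carries its trailing newline.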

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend(origin + ' ' + text
                       for origin, text in lines)
        return out
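
    # Illustrative sketch (not from the original source): a delta replacing
    # lines 0-1 with two lines attributed to 'rev-1' lowers to
    #   ['0,1,2\n', 'rev-1 first line\n', 'rev-1 second line\n']
    # i.e. a 'start,end,count' header followed by count origin-tagged lines.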


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version_id):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - it's just a constant versionid.
        """
        return self.make(content, version_id)

    def parse_line_delta_iter(self, lines, version_id):
        cur = 0
        num_lines = len(lines)
        while cur < num_lines:
            header = lines[cur]
            cur += 1
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version_id] * c, lines[cur:cur+c])
            cur += c
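
    # Illustrative sketch (not from the original source): parsing
    #   ['0,1,2\n', 'alpha\n', 'beta\n']
    # with version_id 'rev-1' yields one hunk:
    #   (0, 1, 2, [('rev-1', 'alpha\n'), ('rev-1', 'beta\n')])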

    def parse_line_delta(self, lines, version_id):
        return list(self.parse_line_delta_iter(lines, version_id))

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return iter(lines)

        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        try:
            fp = self._transport.get(self._filename)
            try:
                # _load_data may raise NoSuchFile if the target knit is
                # completely empty.
                self._load_data(fp)
            finally:
                fp.close()
        except NoSuchFile:
            if mode != 'w' or not create:
                raise
            elif delay_create:
                self._need_to_create = True
            else:
                self._transport.put_bytes_non_atomic(
                    self._filename, self.HEADER, mode=self._file_mode)

    def _load_data(self, fp):
        cache = self._cache
        history = self._history

        self.check_header(fp)
        # readlines reads the whole file at once:
        # bad for transports like http, good for local disk
        # we save 60 ms doing this one change (
        # from calling readline each time to calling
        # readlines once.
        # probably what we want for nice behaviour on
        # http is a incremental readlines that yields, or
        # a check for local vs non local indexes,
        history_top = len(history) - 1
        for line in fp.readlines():
            rec = line.split()
            if len(rec) < 5 or rec[-1] != ':':
                # FIXME: in the future we should determine if its a
                #        short write - and ignore it
                #        or a different failure, and raise. RBC 20060407
                continue

            parents = []
            for value in rec[4:-1]:
                if value[0] == '.':
                    # uncompressed reference
                    parent_id = value[1:]
                else:
                    parent_id = history[int(value)]
                parents.append(parent_id)

            version_id, options, pos, size = rec[:4]

            # See self._cache_version
            # only want the _history index to reference the 1st
            # index entry for version_id
            if version_id not in cache:
                history_top += 1
                index = history_top
                history.append(version_id)
            else:
                index = cache[version_id][5]
            cache[version_id] = (version_id,
                                 options.split(','),
                                 int(pos),
                                 int(size),
                                 parents,
                                 index)
            # end self._cache_version

    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted into FOO.

        NOTE: the function is retained here for clarity, and for possible
              use in partial index reads. However bulk processing now has
              it inlined in _load_data for inner-loop optimisation.
        """
        result = []
        for value in compressed_parents:
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result
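
    # Illustrative sketch (not from the original source): an index line such
    # as 'rev-2 fulltext 0 123 0 .ghost-rev :' is cached as version_id
    # 'rev-2', options ['fulltext'], pos 0, size 123 and parents
    # ['rev-1', 'ghost-rev'], assuming 'rev-1' is entry 0 in self._history
    # ('.'-prefixed parent values are uncompressed ghost references).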

    def get_graph(self):
        return [(vid, idx[4]) for vid, idx in self._cache.iteritems()]

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        cache = self._cache
        while pending:
            version = pending.pop()
            # trim ghosts
            try:
                parents = [p for p in cache[version][4] if p in cache]
            except KeyError:
                raise RevisionNotPresent(version, self._filename)
            # if not completed and not a ghost
            pending.update([p for p in parents if p not in graph])
            graph[version] = parents
        return topo_sort(graph.items())
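
    # Illustrative sketch (not from the original source): with cached
    # entries rev-1 (no parents) and rev-2 (parents rev-1 plus a ghost
    # absent from the cache), get_ancestry(['rev-2']) trims the ghost and
    # returns ['rev-1', 'rev-2'] in topological order.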

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        self.check_versions_present(versions)
        cache = self._cache
        graph = {}
        pending = set(versions)
        while pending:
            version = pending.pop()
            try:
                parents = cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # got the parents ok
                pending.update([p for p in parents if p not in graph])
                graph[version] = parents
        return topo_sort(graph.items())
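
    # Illustrative sketch (not from the original source): unlike
    # get_ancestry, the ghost parent above is kept here, entering the graph
    # with an empty parent list, so it appears in the returned ancestry.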

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream.
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        try:
            rec = self._check_header(version_id, df.readline())
        except Exception, e:
            raise KnitCorrupt(self._filename,
                              "While reading {%s} got %s(%s)"
                              % (version_id, e.__class__.__name__, str(e)))
        return df, rec

    def _check_header(self, version_id, line):
        rec = line.split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename,
                              'unexpected number of elements in record header')
        if rec[1] != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r'
                              % (version_id, rec[1]))
        return rec
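
    # Illustrative sketch (not from the original source): a well-formed
    # header line reads 'version rev-1 2 <sha1-hex>\n', assuming the knit
    # data format's 'version <id> <line-count> <digest>' layout: exactly
    # four fields, with the version id in rec[1].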

    def _parse_record(self, version_id, data):
        # profiling notes:
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df = GzipFile(mode='rb', fileobj=StringIO(data))
        try:
            record_contents = df.readlines()
        except Exception, e:
            raise KnitCorrupt(self._filename,
                              "While reading {%s} got %s(%s)"
                              % (version_id, e.__class__.__name__, str(e)))
        header = record_contents.pop(0)
        rec = self._check_header(version_id, header)

        last_line = record_contents.pop()
        if len(record_contents) != int(rec[2]):
            raise KnitCorrupt(self._filename,
                              'incorrect number of lines %s != %s'
                              ' for version {%s}'
                              % (len(record_contents), int(rec[2]),
                                 version_id))
        if last_line != 'end %s\n' % rec[1]:
            raise KnitCorrupt(self._filename,
                              'unexpected version end line %r, wanted %r'
                              % (last_line, version_id))
        df.close()
        return record_contents, rec[3]
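
    # Illustrative sketch (not from the original source): for the header
    # above, the decompressed record would be
    #   ['version rev-1 2 <sha1-hex>\n', 'line one\n', 'line two\n',
    #    'end rev-1\n']
    # and _parse_record returns (['line one\n', 'line two\n'], '<sha1-hex>').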