1121
1121
yield (self, next_sub_key, value)
1123
def get_ancestry(self, keys, ref_list_num, parent_map):
    """Iterate over the given keys and all parents that are found.

    :param keys: A sorted list of keys whose ancestry we want to return.
    :param ref_list_num: This index in the ref_lists is the parents we
        care about.
    :param parent_map: keys that we already know the parents to. When
        finding new keys we will add nodes to this dict.
    :return: (missing_keys, search_tips)
        parent_map is updated in place with a {key: parent_keys} entry
        for every key (and reachable parent) resolved on the pages read.
        missing_keys are keys where we found the LeafNode, but the key
        just isn't there.
        search_tips are parents that we found, which might exist in this
        index, but which we were unable to find immediately; callers
        should re-query this index for those keys.
    """
    if not self.key_count():
        # We use key_count() to trigger reading the root node and
        # determining info about this BTreeGraphIndex.
        # If we don't have any keys, then everything is missing.
        return keys, parent_map
    if ref_list_num >= self.node_ref_lists:
        raise ValueError('No ref list %d, index has %d ref lists'
            % (ref_list_num, self.node_ref_lists))

    # The main trick we are trying to accomplish is that when we find a
    # key listing its parents, we expect that the parent key is also likely
    # to sit on the same page. Allowing us to expand parents quickly
    # without suffering the full stack of bisecting, etc.
    nodes_and_keys = [(0, sorted(keys))]

    # Search through the internal nodes, until we get to the leaf nodes
    for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
        node_indexes = [idx for idx, s_keys in nodes_and_keys]
        nodes = self._get_internal_nodes(node_indexes)

        next_nodes_and_keys = []
        for node_index, sub_keys in nodes_and_keys:
            node = nodes[node_index]
            positions = self._multi_bisect_right(sub_keys, node.keys)
            node_offset = next_row_start + node.offset
            next_nodes_and_keys.extend([(node_offset + pos, s_keys)
                                        for pos, s_keys in positions])
        nodes_and_keys = next_nodes_and_keys
    # We should now be at the _LeafNodes
    node_indexes = [idx for idx, s_keys in nodes_and_keys]

    # TODO: We may *not* want to always read all the nodes in one
    #       big go. Consider setting a max size on this.

    # Should missing_keys be a set?
    missing_keys = set()
    # These are parent keys which could not be immediately resolved on the
    # page where the child was present. Note that we may already be
    # searching for that key, and it may actually be present [or known
    # missing] on one of the other pages we are reading.
    # TODO:
    #   We could try searching for them in the immediate previous or next
    #   page. If they occur "later" we could put them in a pending lookup
    #   set, and then for each node we read thereafter we could check to
    #   see if they are present.
    #   However, we don't know the impact of keeping this list of things
    #   that I'm going to search for every node I come across from here on.
    #   It doesn't handle the case when the parent key is missing on a
    #   page that we *don't* read. So we already have to handle being
    #   re-entrant for that.
    #   Since most keys contain a date string, they are more likely to be
    #   found earlier in the file than later, but we would know that right
    #   away (key < min_key), and wouldn't keep searching it on every other
    #   page that we read.
    #   Mostly, it is an idea, one which should be benchmarked.
    parents_not_on_page = set()
    nodes = self._get_leaf_nodes(node_indexes)
    for node_index, sub_keys in nodes_and_keys:
        # sub_keys is all of the keys we are looking for that should exist
        # on this page, if they aren't here, then they won't be found
        node = nodes[node_index]
        parents_to_check = set()
        for next_sub_key in sub_keys:
            if next_sub_key not in node.keys:
                # This one is just not present in the index at all
                missing_keys.add(next_sub_key)
            else:
                value, refs = node.keys[next_sub_key]
                parent_keys = refs[ref_list_num]
                parent_map[next_sub_key] = parent_keys
                parents_to_check.update(parent_keys)
        # Don't look for things we've already found
        parents_to_check = parents_to_check.difference(parent_map)
        # TODO: we should really track what the minimum and maximum key is
        #       on a given leaf node. We know it trivially when
        #       deserializing the leaf node, and it would tell us right
        #       away if a missing parent would be in this btree if it was
        #       going to be present at all.
        #       Also, if we knew a parent was supposed to come before this
        #       page, we could cheaply check to see if node_index - 1 was
        #       in nodes, or if node_index + 1 is in nodes and quickly look
        #       it up there.
        min_key = min(node.keys)
        max_key = max(node.keys)
        while parents_to_check:
            next_parents_to_check = set()
            for key in parents_to_check:
                if key in node.keys:
                    value, refs = node.keys[key]
                    parent_keys = refs[ref_list_num]
                    parent_map[key] = parent_keys
                    next_parents_to_check.update(parent_keys)
                else:
                    # Missing for some reason
                    if key < min_key or key > max_key:
                        # This parent key would be present on a different
                        # LeafNode, so leave it for the caller to re-query.
                        parents_not_on_page.add(key)
                    else:
                        # min_key/max_key are members of node.keys and this
                        # key is not, so it can never equal either bound.
                        assert key != min_key and key != max_key
                        # If it was going to be present, it would be on
                        # *this* page, so mark it missing.
                        missing_keys.add(key)
            parents_to_check = next_parents_to_check.difference(parent_map)
            # Some parents we will already know are missing from searching
            # other LeafNodes; is it faster to check now, or just wait?
            # search_tips = search_tips.difference(missing_keys)
    # parents_not_on_page could have been found on a different page, or be
    # known to be missing. So cull out everything that has already been
    # found.
    search_tips = parents_not_on_page.difference(
        parent_map).difference(missing_keys)
    return missing_keys, search_tips
1123
1259
def iter_entries_prefix(self, keys):
1124
1260
"""Iterate over keys within the index using prefix matching.