100
100
--- Format 1 had the following different definition: ---
101
101
rows = dirname, NULL, basename, NULL, MINIKIND, NULL, fileid_utf8, NULL,
102
WHOLE NUMBER (* size *), NULL, packed stat, NULL, sha1|symlink target,
102
WHOLE NUMBER (* size *), NULL, packed stat, NULL, sha1|symlink target,
104
104
PARENT ROW = NULL, revision_utf8, NULL, MINIKIND, NULL, dirname, NULL,
105
105
basename, NULL, WHOLE NUMBER (* size *), NULL, "y" | "n", NULL,
130
130
operations for the less common but still occurs a lot status/diff/commit
131
131
on specific files). Operations on specific files involve a scan for all
132
132
the children of a path, *in every involved tree*, which the current
133
format did not accommodate.
133
format did not accommodate.
136
136
Design priorities:
149
149
Memory representation:
150
150
vector of all directories, and vector of the childen ?
152
root_entrie = (direntry for root, [parent_direntries_for_root]),
152
root_entrie = (direntry for root, [parent_direntries_for_root]),
154
154
('', ['data for achild', 'data for bchild', 'data for cchild'])
155
155
('dir', ['achild', 'cchild', 'echild'])
158
158
- in-order for serialisation - this is 'dirblock' grouping.
159
159
- insertion of a file '/a' affects only the '/' child-vector, that is, to
160
160
insert 10K elements from scratch does not generates O(N^2) memoves of a
161
single vector, rather each individual, which tends to be limited to a
162
manageable number. Will scale badly on trees with 10K entries in a
161
single vector, rather each individual, which tends to be limited to a
162
manageable number. Will scale badly on trees with 10K entries in a
163
163
single directory. compare with Inventory.InventoryDirectory which has
164
164
a dictionary for the children. No bisect capability, can only probe for
165
165
exact matches, or grab all elements and sort.
166
166
- What's the risk of error here? Once we have the base format being processed
167
we should have a net win regardless of optimality. So we are going to
167
we should have a net win regardless of optimality. So we are going to
168
168
go with what seems reasonable.
331
331
# IN_MEMORY_UNMODIFIED indicates that what we have in memory
332
332
# is the same as is on disk
333
333
# IN_MEMORY_MODIFIED indicates that we have a modified version
334
# of what is on disk.
334
# of what is on disk.
335
335
# In future we will add more granularity, for instance _dirblock_state
336
336
# will probably support partially-in-memory as a separate variable,
337
337
# allowing for partially-in-memory unmodified and partially-in-memory
338
338
# modified states.
339
339
self._header_state = DirState.NOT_IN_MEMORY
340
340
self._dirblock_state = DirState.NOT_IN_MEMORY
341
# If true, an error has been detected while updating the dirstate, and
341
# If true, an error has been detected while updating the dirstate, and
342
342
# for safety we're not going to commit to disk.
343
343
self._changes_aborted = False
344
344
self._dirblocks = []
374
374
"""Add a path to be tracked.
376
376
:param path: The path within the dirstate - '' is the root, 'foo' is the
377
path foo within the root, 'foo/bar' is the path bar within foo
377
path foo within the root, 'foo/bar' is the path bar within foo
379
379
:param file_id: The file id of the path being added.
380
:param kind: The kind of the path, as a string like 'file',
380
:param kind: The kind of the path, as a string like 'file',
381
381
'directory', etc.
382
382
:param stat: The output of os.lstat for the path.
383
383
:param fingerprint: The sha value of the file,
405
405
# in the parent, or according to the special treatment for the root
406
406
if basename == '.' or basename == '..':
407
407
raise errors.InvalidEntryName(path)
408
# now that we've normalised, we need the correct utf8 path and
408
# now that we've normalised, we need the correct utf8 path and
409
409
# dirname and basename elements. This single encode and split should be
410
410
# faster than three separate encodes.
411
411
utf8path = (dirname + '/' + basename).strip('/').encode('utf8')
415
415
raise AssertionError(
416
416
"must be a utf8 file_id not %s" % (type(file_id), ))
417
417
# Make sure the file_id does not exist in this tree
418
file_id_entry = self._get_entry(0, fileid_utf8=file_id)
419
file_id_entry = self._get_entry(0, fileid_utf8=file_id, include_deleted=True)
419
420
if file_id_entry != (None, None):
420
path = osutils.pathjoin(file_id_entry[0][0], file_id_entry[0][1])
421
kind = DirState._minikind_to_kind[file_id_entry[1][0][0]]
422
info = '%s:%s' % (kind, path)
423
raise errors.DuplicateFileId(file_id, info)
421
if file_id_entry[1][0][0] == 'a':
422
if file_id_entry[0] != (dirname, basename, file_id):
423
# set the old name's current operation to rename
424
self.update_minimal(file_id_entry[0],
430
rename_from = file_id_entry[0][0:2]
432
path = osutils.pathjoin(file_id_entry[0][0], file_id_entry[0][1])
433
kind = DirState._minikind_to_kind[file_id_entry[1][0][0]]
434
info = '%s:%s' % (kind, path)
435
raise errors.DuplicateFileId(file_id, info)
424
436
first_key = (dirname, basename, '')
425
437
block_index, present = self._find_block_index_from_key(first_key)
427
439
# check the path is not in the tree
428
440
block = self._dirblocks[block_index][1]
429
441
entry_index, _ = self._find_entry_index(first_key, block)
430
while (entry_index < len(block) and
442
while (entry_index < len(block) and
431
443
block[entry_index][0][0:2] == first_key[0:2]):
432
444
if block[entry_index][1][0][0] not in 'ar':
433
445
# this path is in the dirstate in the current tree.
453
465
packed_stat = pack_stat(stat)
454
466
parent_info = self._empty_parent_info()
455
467
minikind = DirState._kind_to_minikind[kind]
468
if rename_from is not None:
470
old_path_utf8 = '%s/%s' % rename_from
472
old_path_utf8 = rename_from[1]
473
parent_info[0] = ('r', old_path_utf8, 0, False, '')
456
474
if kind == 'file':
457
475
entry_data = entry_key, [
458
476
(minikind, fingerprint, size, False, packed_stat),
918
936
def _discard_merge_parents(self):
919
937
"""Discard any parents trees beyond the first.
921
939
Note that if this fails the dirstate is corrupted.
923
941
After this function returns the dirstate contains 2 trees, neither of
993
1011
raise AssertionError("bad dirname %r" % dirname)
994
1012
block_index, present = self._find_block_index_from_key((dirname, '', ''))
996
## In future, when doing partial parsing, this should load and
1014
## In future, when doing partial parsing, this should load and
997
1015
# populate the entire block.
998
1016
self._dirblocks.insert(block_index, (dirname, []))
999
1017
return block_index
1011
1029
if new_entries[0][0][0:2] != ('', ''):
1012
1030
raise AssertionError(
1013
1031
"Missing root row %r" % (new_entries[0][0],))
1014
# The two blocks here are deliberate: the root block and the
1032
# The two blocks here are deliberate: the root block and the
1015
1033
# contents-of-root block.
1016
1034
self._dirblocks = [('', []), ('', [])]
1017
1035
current_block = self._dirblocks[0][1]
1141
1159
# one to use it. we use _right here because there are two
1142
1160
# '' blocks - the root, and the contents of root
1143
1161
# we always have a minimum of 2 in self._dirblocks: root and
1144
# root-contents, and for '', we get 2 back, so this is
1162
# root-contents, and for '', we get 2 back, so this is
1145
1163
# simple and correct:
1146
1164
present = (block_index < len(self._dirblocks) and
1147
1165
self._dirblocks[block_index][0] == key[0])
1570
1588
# already in memory. However, this really needs to be done at a
1571
1589
# higher level, because there either won't be anything on disk,
1572
1590
# or the thing on disk will be a file.
1573
return os.readlink(abspath)
1591
return os.readlink(abspath.encode(osutils._fs_enc))
1575
1593
def get_ghosts(self):
1576
1594
"""Return a list of the parent tree revision ids that are ghosts."""
1722
1740
entry_index += 1
1723
1741
return block_index, entry_index, True, False
1725
def _get_entry(self, tree_index, fileid_utf8=None, path_utf8=None):
1743
def _get_entry(self, tree_index, fileid_utf8=None, path_utf8=None, include_deleted=False):
1726
1744
"""Get the dirstate entry for path in tree tree_index.
1728
1746
If either file_id or path is supplied, it is used as the key to lookup.
1737
1755
:param fileid_utf8: A utf8 file_id to look up.
1738
1756
:param path_utf8: An utf8 path to be looked up.
1757
:param include_deleted: If True, and performing a lookup via
1758
fileid_utf8 rather than path_utf8, return an entry for deleted
1739
1760
:return: The dirstate entry tuple for path, or (None, None)
1741
1762
self._read_dirblocks_if_needed()
1778
1799
entry = self._dirblocks[block_index][1][entry_index]
1779
1800
if entry[1][tree_index][0] in 'fdlt':
1780
# this is the result we are looking for: the
1801
# this is the result we are looking for: the
1781
1802
# real home of this file_id in this tree.
1783
1804
if entry[1][tree_index][0] == 'a':
1784
1805
# there is no home for this entry in this tree
1785
1808
return None, None
1786
1809
if entry[1][tree_index][0] != 'r':
1787
1810
raise AssertionError(
1862
1885
def _iter_child_entries(self, tree_index, path_utf8):
1863
1886
"""Iterate over all the entries that are children of path_utf.
1865
This only returns entries that are present (not in 'a', 'r') in
1888
This only returns entries that are present (not in 'a', 'r') in
1866
1889
tree_index. tree_index data is not refreshed, so if tree 0 is used,
1867
1890
results may differ from that obtained if paths were statted to
1868
1891
determine what ones were directories.
1957
1980
def _read_dirblocks_if_needed(self):
1958
1981
"""Read in all the dirblocks from the file if they are not in memory.
1960
1983
This populates self._dirblocks, and sets self._dirblock_state to
1961
1984
IN_MEMORY_UNMODIFIED. It is not currently ready for incremental block
2088
2111
:param parent_ids: A list of parent tree revision ids.
2089
2112
:param dirblocks: A list containing one tuple for each directory in the
2090
tree. Each tuple contains the directory path and a list of entries
2113
tree. Each tuple contains the directory path and a list of entries
2091
2114
found in that directory.
2093
2116
# our memory copy is now authoritative.
2127
2150
"""Set the parent trees for the dirstate.
2129
2152
:param trees: A list of revision_id, tree tuples. tree must be provided
2130
even if the revision_id refers to a ghost: supply an empty tree in
2153
even if the revision_id refers to a ghost: supply an empty tree in
2132
2155
:param ghosts: A list of the revision_ids that are ghosts at the time
2135
# TODO: generate a list of parent indexes to preserve to save
2158
# TODO: generate a list of parent indexes to preserve to save
2136
2159
# processing specific parent trees. In the common case one tree will
2137
2160
# be preserved - the left most parent.
2138
2161
# TODO: if the parent tree is a dirstate, we might want to walk them
2143
2166
# map and then walk the new parent trees only, mapping them into the
2144
2167
# dirstate. Walk the dirstate at the same time to remove unreferenced
2147
# sketch: loop over all entries in the dirstate, cherry picking
2170
# sketch: loop over all entries in the dirstate, cherry picking
2148
2171
# entries from the parent trees, if they are not ghost trees.
2149
2172
# after we finish walking the dirstate, all entries not in the dirstate
2150
2173
# are deletes, so we want to append them to the end as per the design
2155
2178
# links. We dont't trivially use the inventory from other trees
2156
2179
# because this leads to either double touching, or to accessing
2157
2180
# missing keys,
2158
# - find other keys containing a path
2159
# We accumulate each entry via this dictionary, including the root
2181
# - find other keys containing a path
2182
# We accumulate each entry via this dictionary, including the root
2162
2185
# we could do parallel iterators, but because file id data may be
2166
2189
# parent, but for now the common cases are adding a new parent (merge),
2167
2190
# and replacing completely (commit), and commit is more common: so
2168
2191
# optimise merge later.
2170
2193
# ---- start generation of full tree mapping data
2171
2194
# what trees should we use?
2172
2195
parent_trees = [tree for rev_id, tree in trees if rev_id not in ghosts]
2173
# how many trees do we end up with
2196
# how many trees do we end up with
2174
2197
parent_count = len(parent_trees)
2176
2199
# one: the current tree
2181
2204
by_path[entry[0]] = [entry[1][0]] + \
2182
2205
[DirState.NULL_PARENT_DETAILS] * parent_count
2183
2206
id_index[entry[0][2]] = set([entry[0]])
2185
2208
# now the parent trees:
2186
2209
for tree_index, tree in enumerate(parent_trees):
2187
2210
# the index is off by one, adjust it.
2201
2224
# avoid checking all known paths for the id when generating a
2202
2225
# new entry at this path: by adding the id->path mapping last,
2203
2226
# all the mappings are valid and have correct relocation
2204
# records where needed.
2227
# records where needed.
2205
2228
file_id = entry.file_id
2206
2229
path_utf8 = path.encode('utf8')
2207
2230
dirname, basename = osutils.split(path_utf8)
2218
2241
# This is the vertical axis in the matrix, all pointing
2219
2242
# to the real path.
2220
2243
by_path[entry_key][tree_index] = ('r', path_utf8, 0, False, '')
2221
# by path consistency: Insert into an existing path record (trivial), or
2244
# by path consistency: Insert into an existing path record (trivial), or
2222
2245
# add a new one with relocation pointers for the other tree indexes.
2223
2246
if new_entry_key in id_index[file_id]:
2224
2247
# there is already an entry where this data belongs, just insert it.
2237
2260
new_details.append(DirState.NULL_PARENT_DETAILS)
2239
2262
# grab any one entry, use it to find the right path.
2240
# TODO: optimise this to reduce memory use in highly
2263
# TODO: optimise this to reduce memory use in highly
2241
2264
# fragmented situations by reusing the relocation
2243
2266
a_key = iter(id_index[file_id]).next()
2276
2299
return sorted(entry_list, key=_key)
2278
2301
def set_state_from_inventory(self, new_inv):
2279
"""Set new_inv as the current state.
2302
"""Set new_inv as the current state.
2281
2304
This API is called by tree transform, and will usually occur with
2282
2305
existing parent trees.
2288
2311
"set_state_from_inventory called; please mutate the tree instead")
2289
2312
self._read_dirblocks_if_needed()
2291
# Two iterators: current data and new data, both in dirblock order.
2314
# Two iterators: current data and new data, both in dirblock order.
2292
2315
# We zip them together, which tells about entries that are new in the
2293
2316
# inventory, or removed in the inventory, or present in both and
2296
2319
# You might think we could just synthesize a new dirstate directly
2297
2320
# since we're processing it in the right order. However, we need to
2446
2469
:param minikind: The type for the entry ('f' == 'file', 'd' ==
2447
2470
'directory'), etc.
2448
2471
:param executable: Should the executable bit be set?
2449
:param fingerprint: Simple fingerprint for new entry: sha1 for files,
2472
:param fingerprint: Simple fingerprint for new entry: sha1 for files,
2450
2473
referenced revision id for subtrees, etc.
2451
2474
:param packed_stat: Packed stat value for new entry.
2452
2475
:param size: Size information for new entry
2497
2520
num_present_parents = self._num_present_parents()
2498
2521
for lookup_index in xrange(1, num_present_parents + 1):
2499
2522
# grab any one entry, use it to find the right path.
2500
# TODO: optimise this to reduce memory use in highly
2523
# TODO: optimise this to reduce memory use in highly
2501
2524
# fragmented situations by reusing the relocation
2503
2526
update_block_index, present = \
2520
2543
block.insert(entry_index, new_entry)
2521
2544
existing_keys.add(key)
2523
# Does the new state matter?
2546
# Does the new state matter?
2524
2547
block[entry_index][1][0] = new_details
2525
2548
# parents cannot be affected by what we do.
2526
# other occurences of this id can be found
2549
# other occurences of this id can be found
2527
2550
# from the id index.
2529
2552
# tree index consistency: All other paths for this id in this tree
2562
2585
def _validate(self):
2563
2586
"""Check that invariants on the dirblock are correct.
2565
This can be useful in debugging; it shouldn't be necessary in
2588
This can be useful in debugging; it shouldn't be necessary in
2568
2591
This must be called with a lock held.
2637
2660
# For each file id, for each tree: either
2638
2661
# the file id is not present at all; all rows with that id in the
2639
2662
# key have it marked as 'absent'
2640
# OR the file id is present under exactly one name; any other entries
2663
# OR the file id is present under exactly one name; any other entries
2641
2664
# that mention that id point to the correct name.
2643
2666
# We check this with a dict per tree pointing either to the present
2690
2713
# absent; should not occur anywhere else
2691
2714
this_tree_map[file_id] = None, this_path
2692
2715
elif minikind == 'r':
2693
# relocation, must occur at expected location
2716
# relocation, must occur at expected location
2694
2717
this_tree_map[file_id] = tree_state[1], this_path
2696
2719
this_tree_map[file_id] = this_path, this_path
3158
3181
search_specific_files = self.search_specific_files
3159
3182
searched_specific_files = self.searched_specific_files
3160
3183
splitpath = osutils.splitpath
3162
3185
# compare source_index and target_index at or under each element of search_specific_files.
3163
3186
# follow the following comparison table. Note that we only want to do diff operations when
3164
# the target is fdl because thats when the walkdirs logic will have exposed the pathinfo
3187
# the target is fdl because thats when the walkdirs logic will have exposed the pathinfo
3165
3188
# for the target.
3168
3191
# Source | Target | disk | action
3169
3192
# r | fdlt | | add source to search, add id path move and perform
3170
3193
# | | | diff check on source-target
3171
# r | fdlt | a | dangling file that was present in the basis.
3194
# r | fdlt | a | dangling file that was present in the basis.
3173
3196
# r | a | | add source to search
3175
3198
# r | r | | this path is present in a non-examined tree, skip.
3176
3199
# r | r | a | this path is present in a non-examined tree, skip.
3177
3200
# a | fdlt | | add new id
3285
3308
raise AssertionError()
3286
3309
del current_dir_info[1][bzr_index]
3287
3310
# walk until both the directory listing and the versioned metadata
3289
3312
if (block_index < len(self.state._dirblocks) and
3290
3313
osutils.is_inside(current_root, self.state._dirblocks[block_index][0])):
3291
3314
current_block = self.state._dirblocks[block_index]
3452
3475
if current_path_info[2] in ('directory'):
3453
3476
del current_dir_info[1][path_index]
3454
3477
path_index -= 1
3455
# dont descend the disk iterator into any tree
3478
# dont descend the disk iterator into any tree
3457
3480
if current_path_info[2] == 'tree-reference':
3458
3481
del current_dir_info[1][path_index]