30
30
dirstate format = header line, full checksum, row count, parent details,
31
31
ghost_details, entries;
32
header line = "#bazaar dirstate flat format 2", NL;
32
header line = "#bazaar dirstate flat format 3", NL;
33
33
full checksum = "crc32: ", ["-"], WHOLE_NUMBER, NL;
34
row count = "num_entries: ", digit, NL;
34
row count = "num_entries: ", WHOLE_NUMBER, NL;
35
35
parent_details = WHOLE NUMBER, {REVISION_ID}* NL;
36
36
ghost_details = WHOLE NUMBER, {REVISION_ID}*, NL;
162
162
manageable number. Will scale badly on trees with 10K entries in a
163
163
single directory. compare with Inventory.InventoryDirectory which has
164
164
a dictionary for the children. No bisect capability, can only probe for
165
exact matches, or grab all elements and sorta.
166
- Whats the risk of error here? Once we have the base format being processed
165
exact matches, or grab all elements and sort.
166
- What's the risk of error here? Once we have the base format being processed
167
167
we should have a net win regardless of optimality. So we are going to
168
go with what seems reasonably.
168
go with what seems reasonable.
171
maybe we should do a test profile of these core structure - 10K simulated searches/lookups/etc?
171
Maybe we should do a test profile of the core structure - 10K simulated
172
searches/lookups/etc?
173
174
Objects for each row?
174
175
The lifetime of Dirstate objects is current per lock, but see above for
223
class _Bisector(object):
224
"""This just keeps track of information as we are bisecting."""
227
223
def pack_stat(st, _encode=binascii.b2a_base64, _pack=struct.pack):
228
224
"""Convert stat values into a packed representation."""
229
225
# jam 20060614 it isn't really worth removing more entries if we
230
226
# are going to leave it in packed form.
231
227
# With only st_mtime and st_mode filesize is 5.5M and read time is 275ms
232
# With all entries filesize is 5.9M and read time is mabye 280ms
228
# With all entries, filesize is 5.9M and read time is maybe 280ms
233
229
# well within the noise margin
235
231
# base64 encoding always adds a final newline, so strip it off
462
458
if self._id_index:
463
459
self._id_index.setdefault(entry_key[2], set()).add(entry_key)
465
def _bisect(self, dir_name_list):
461
def _bisect(self, paths):
466
462
"""Bisect through the disk structure for specific rows.
468
:param dir_name_list: A list of (dir, name) pairs.
469
:return: A dict mapping (dir, name) => entry for found entries. Missing
464
:param paths: A list of paths to find
465
:return: A dict mapping path => entries for found entries. Missing
470
466
entries will not be in the map.
467
The list is not sorted, and entries will be populated
468
based on when they were read.
472
470
self._requires_lock()
473
471
# We need the file pointer to be right after the initial header block
596
600
# Either we will find them here, or we can mark them as
599
if middle_files[0] == first_dir_name:
603
if middle_files[0] == first_path:
600
604
# We might need to go before this location
601
pre.append(first_dir_name)
602
if middle_files[-1] == last_dir_name:
603
post.insert(0, last_dir_name)
605
pre.append(first_path)
606
if middle_files[-1] == last_path:
607
post.insert(0, last_path)
605
609
# Find out what paths we have
606
paths = {first_dir_name:[first_fields]}
607
# last_dir_name might == first_dir_name so we need to be
610
paths = {first_path:[first_fields]}
611
# last_path might == first_path so we need to be
608
612
# careful if we should append rather than overwrite
609
613
if last_entry_num != first_entry_num:
610
paths.setdefault(last_dir_name, []).append(last_fields)
614
paths.setdefault(last_path, []).append(last_fields)
611
615
for num in xrange(first_entry_num+1, last_entry_num):
612
616
# TODO: jam 20070223 We are already splitting here, so
613
617
# shouldn't we just split the whole thing rather
614
618
# than doing the split again in add_one_record?
615
619
fields = entries[num].split('\0')
616
dir_name = (fields[1], fields[2])
617
paths.setdefault(dir_name, []).append(fields)
621
path = fields[1] + '/' + fields[2]
624
paths.setdefault(path, []).append(fields)
619
for dir_name in middle_files:
620
for fields in paths.get(dir_name, []):
626
for path in middle_files:
627
for fields in paths.get(path, []):
621
628
# offset by 1 because of the opening '\0'
622
629
# consider changing fields_to_entry to avoid the
623
630
# extra list slice
624
631
entry = fields_to_entry(fields[1:])
625
found.setdefault(dir_name, []).append(entry)
632
found.setdefault(path, []).append(entry)
627
634
# Now we have split up everything into pre, middle, and post, and
628
635
# we have handled everything that fell in 'middle'.
867
874
if dir_name[0] in pending_dirs:
868
875
# This entry will be found in the dir search
870
# TODO: We need to check if this entry has
871
# already been found. Otherwise we might be
872
# hitting infinite recursion.
873
877
if dir_name not in found_dir_names:
874
paths_to_search.add(dir_name)
878
paths_to_search.add(tree_info[1])
875
879
# Now we have a list of paths to look for directly, and
876
880
# directory blocks that need to be read.
877
881
# newly_found is mixing the keys between (dir, name) and path
1328
1332
:return: A tuple describing where the path is located, or should be
1329
1333
inserted. The tuple contains four fields: the block index, the row
1330
index, anda two booleans are True when the directory is present, and
1331
when the entire path is present. There is no guarantee that either
1334
index, the directory is present (boolean), the entire path is
1335
present (boolean). There is no guarantee that either
1332
1336
coordinate is currently reachable unless the found field for it is
1333
1337
True. For instance, a directory not present in the searched tree
1334
1338
may be returned with a value one greater than the current highest
1551
1555
self._read_header_if_needed()
1552
1556
if self._dirblock_state == DirState.NOT_IN_MEMORY:
1553
# move the _state_file pointer to after the header (in case bisect
1554
# has been called in the mean time)
1555
self._state_file.seek(self._end_of_header)
1556
text = self._state_file.read()
1557
# TODO: check the crc checksums. crc_measured = zlib.crc32(text)
1559
fields = text.split('\0')
1560
# Remove the last blank entry
1561
trailing = fields.pop()
1562
assert trailing == ''
1563
# consider turning fields into a tuple.
1565
# skip the first field which is the trailing null from the header.
1567
# Each line now has an extra '\n' field which is not used
1568
# so we just skip over it
1570
# 3 fields for the key
1571
# + number of fields per tree_data (5) * tree count
1573
num_present_parents = self._num_present_parents()
1574
tree_count = 1 + num_present_parents
1575
entry_size = self._fields_per_entry()
1576
expected_field_count = entry_size * self._num_entries
1577
field_count = len(fields)
1578
# this checks our adjustment, and also catches file too short.
1579
assert field_count - cur == expected_field_count, \
1580
'field count incorrect %s != %s, entry_size=%s, '\
1581
'num_entries=%s fields=%r' % (
1582
field_count - cur, expected_field_count, entry_size,
1583
self._num_entries, fields)
1585
if num_present_parents == 1:
1586
# Bind external functions to local names
1588
# We access all fields in order, so we can just iterate over
1589
# them. Grab an straight iterator over the fields. (We use an
1590
# iterator because we don't want to do a lot of additions, nor
1591
# do we want to do a lot of slicing)
1592
next = iter(fields).next
1593
# Move the iterator to the current position
1594
for x in xrange(cur):
1596
# The two blocks here are deliberate: the root block and the
1597
# contents-of-root block.
1598
self._dirblocks = [('', []), ('', [])]
1599
current_block = self._dirblocks[0][1]
1600
current_dirname = ''
1601
append_entry = current_block.append
1602
for count in xrange(self._num_entries):
1606
if dirname != current_dirname:
1607
# new block - different dirname
1609
current_dirname = dirname
1610
self._dirblocks.append((current_dirname, current_block))
1611
append_entry = current_block.append
1612
# we know current_dirname == dirname, so re-use it to avoid
1613
# creating new strings
1614
entry = ((current_dirname, name, file_id),
1617
next(), # fingerprint
1618
_int(next()), # size
1619
next() == 'y', # executable
1620
next(), # packed_stat or revision_id
1624
next(), # fingerprint
1625
_int(next()), # size
1626
next() == 'y', # executable
1627
next(), # packed_stat or revision_id
1631
assert trailing == '\n'
1632
# append the entry to the current block
1634
self._split_root_dirblock_into_contents()
1636
fields_to_entry = self._get_fields_to_entry()
1637
entries = [fields_to_entry(fields[pos:pos+entry_size])
1638
for pos in xrange(cur, field_count, entry_size)]
1639
self._entries_to_current_state(entries)
1640
# To convert from format 2 => format 3
1641
# self._dirblocks = sorted(self._dirblocks,
1642
# key=lambda blk:blk[0].split('/'))
1643
# To convert from format 3 => format 2
1644
# self._dirblocks = sorted(self._dirblocks)
1645
self._dirblock_state = DirState.IN_MEMORY_UNMODIFIED
1557
_read_dirblocks(self)
1647
1559
def _read_header(self):
1648
1560
"""This reads in the metadata header, and the parent ids.
1700
1612
We reuse the existing file, because that prevents race conditions with
1701
1613
file creation, and use oslocks on it to prevent concurrent modification
1702
and reads - because dirstates incremental data aggretation is not
1614
and reads - because dirstate's incremental data aggregation is not
1703
1615
compatible with reading a modified file, and replacing a file in use by
1704
another process is impossible on windows.
1616
another process is impossible on Windows.
1706
1618
A dirstate in read only mode should be smart enough though to validate
1707
1619
that the file has not changed, and otherwise discard its cache and
2025
1937
def _make_absent(self, current_old):
2026
1938
"""Mark current_old - an entry - as absent for tree 0.
2028
:return: True if this was the last details entry for they entry key:
1940
:return: True if this was the last details entry for the entry key:
2029
1941
that is, if the underlying block has had the entry removed, thus
2030
1942
shrinking in length.
2032
1944
# build up paths that this id will be left at after the change is made,
2033
1945
# so we can update their cross references in tree 0
2034
1946
all_remaining_keys = set()
2035
# Dont check the working tree, because its going.
1947
# Dont check the working tree, because it's going.
2036
1948
for details in current_old[1][1:]:
2037
1949
if details[0] not in ('a', 'r'): # absent, relocated
2038
1950
all_remaining_keys.add(current_old[0])
2367
2279
self._split_path_cache = {}
2369
2281
def _requires_lock(self):
2370
"""Checks that a lock is currently held by someone on the dirstate"""
2282
"""Check that a lock is currently held by someone on the dirstate."""
2371
2283
if not self._lock_token:
2372
2284
raise errors.ObjectNotLocked(self)
2375
def bisect_dirblock(dirblocks, dirname, lo=0, hi=None, cache={}):
2376
"""Return the index where to insert dirname into the dirblocks.
2378
The return value idx is such that all directories blocks in dirblock[:idx]
2379
have names < dirname, and all blocks in dirblock[idx:] have names >=
2382
Optional args lo (default 0) and hi (default len(dirblocks)) bound the
2383
slice of a to be searched.
2388
dirname_split = cache[dirname]
2390
dirname_split = dirname.split('/')
2391
cache[dirname] = dirname_split
2394
# Grab the dirname for the current dirblock
2395
cur = dirblocks[mid][0]
2397
cur_split = cache[cur]
2399
cur_split = cur.split('/')
2400
cache[cur] = cur_split
2401
if cur_split < dirname_split: lo = mid+1
2287
# Try to load the compiled form if possible
2289
from bzrlib._dirstate_helpers_c import (
2290
_read_dirblocks_c as _read_dirblocks,
2291
bisect_dirblock_c as bisect_dirblock,
2292
_bisect_path_left_c as _bisect_path_left,
2293
_bisect_path_right_c as _bisect_path_right,
2294
cmp_by_dirs_c as cmp_by_dirs,
2297
from bzrlib._dirstate_helpers_py import (
2298
_read_dirblocks_py as _read_dirblocks,
2299
bisect_dirblock_py as bisect_dirblock,
2300
_bisect_path_left_py as _bisect_path_left,
2301
_bisect_path_right_py as _bisect_path_right,
2302
cmp_by_dirs_py as cmp_by_dirs,