1
# Copyright (C) 2006, 2007 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""DirState objects record the state of a directory and its bzr metadata.
19
Pseduo EBNF grammar for the disk format:
20
MINIKIND = "f" | "d" | "l";
23
WHOLE NUMBER = {digit}, digit;
25
dirstate format = header line, full checksum, row count, parent details,
27
header line = "#bazaar dirstate flat format 1", NL;
28
full checksum = "adler32: ", ["-"], WHOLE NUMBER, NL;
29
row count = "num_entries: ", digit, NL;
30
parent_details = WHOLE NUMBER, NULL, NL; XXX: complete this line
31
ghost_details = WHOLE NUMBER, NULL, {GHOST_ID NULL}*, NL;
32
rows = dirname, NULL, basename, NULL, MINIKIND, NULL, fileid_utf8, NULL,
33
WHOLE NUMBER (* size *), NULL, packed stat, NULL, sha1|symlink target,
35
PARENT ROW = NULL, revision_utf8, NULL, MINIKIND, NULL, dirname, NULL,
36
basename, NULL, WHOLE NUMBER (* size *), NULL, "y" | "n", NULL,
39
PARENT ROW's are emitted for every parent that is not in the ghosts details
40
line. That is, if the parents are foo, bar, baz, and the ghosts are bar, then
41
each row will have a PARENT ROW for foo and baz, but not for bar.
46
1) Fast end to end use for bzr's top 5 uses cases.
47
2) fall back current object model as needed.
48
3) scale usably to the largest trees known today - say 50K entries.
52
Eventually reuse dirstate objects across locks IFF the dirstate file has not
53
been modified, but will require that we flush/ignore cached stat-hit data
54
because we wont want to restat all files on disk just because a lock was
55
acquired, yet we cannot trust the data after the previous lock was released.
57
Memory representation:
58
vecter of all directories, and vector of the childen ?
60
root_row = (direntry for root, [parent_direntries_for_root]),
62
('', ['data for achild', 'data for bchild', 'data for cchild'])
63
('dir', ['achild', 'cchild', 'echild'])
65
- single bisect to find N subtrees from a path spec
66
- in-order for serialisation - this is 'dirblock' grouping.
67
- insertion of a file '/a' affects only the '/' child-vector, that is, to
68
insert 10K elements from scratch does not generates O(N^2) memoves of a
69
single vector, rather each individual, which tends to be limited to a
70
manageable number. Will scale badly on trees with 10K entries in a
71
single directory. compare with Inventory.InventoryDirectory which has
72
a dictionary for the children. No bisect capability, can only probe for
73
exact matches, or grab all elements and sorta.
74
- Whats the risk of error here? Once we have the base format being processed
75
we should have a net win regardless of optimality. So we are going to
76
go with what seems reasonably.
79
maybe we should do a test profile of these core structure - 10K simulated searches/lookups/etc?
82
The lifetime of Dirstate objects is current per lock, but see above for
83
possible extensions. The lifetime of a row from a dirstate is expected to be
84
very short in the optimistic case: which we are optimising for. For instance,
85
subtree status will determine from analysis of the disk data what rows need to
86
be examined at all, and will be able to determine from a single row whether
87
that file has altered or not, so we are aiming to process tens of thousands of
88
entries each second within the dirstate context, before exposing anything to
89
the larger codebase. This suggests we want the time for a single file
90
comparison to be < 0.1 milliseconds. That would give us 10000 paths per second
91
processed, and to scale to 100 thousand we'll another order of magnitude to do
92
that. Now, as the lifetime for all unchanged entries is the time to parse, stat
93
the file on disk, and then immediately discard, the overhead of object creation
94
becomes a significant cost.
96
Figures: Creating a tuple from from 3 elements was profiled at 0.0625
97
microseconds, whereas creating a object which is subclassed from tuple was
98
0.500 microseconds, and creating an object with 3 elements and slots was 3
99
microseconds long. 0.1 milliseconds is 100 microseconds, and ideally we'll get
100
down to 10 microseconds for the total processing - having 33% of that be object
101
creation is a huge overhead. There is a potential cost in using tuples within
102
each row which is that the conditional code to do comparisons may be slower
103
than method invocation, but method invocation is known to be slow due to stack
104
frame creation, so avoiding methods in these tight inner loops in unfortunately
105
desirable. We can consider a pyrex version of this with objects in future if
123
import bzrlib.inventory
124
from bzrlib import osutils
125
from bzrlib.osutils import (
133
class DirState(object):
134
"""Record directory and metadata state for fast access.
136
A dirstate is a specialised data structure for managing local working
137
tree state information. Its not yet well defined whether it is platform
138
specific, and if it is how we detect/parameterise that.
141
_kind_to_minikind = {'file':'f', 'directory':'d', 'symlink':'l'}
142
_minikind_to_kind = {'f':'file', 'd':'directory', 'l':'symlink'}
143
_to_yesno = {True:'y', False: 'n'} # TODO profile the performance gain
144
# of using int conversion rather than a dict here. AND BLAME ANDREW IF
148
IN_MEMORY_UNMODIFIED = 1
149
IN_MEMORY_MODIFIED = 2
152
NULL_PARENT_ROW = ('', 'file', '/', 'RECYCLED.BIN', 0, False, '')
155
"""Create a DirState object.
159
:attr _root_row: The root row of the directory/file information,
160
- contains the path to / - '', ''
161
- kind of 'directory',
162
- the file id of the root in utf8
165
- and no sha information.
167
# _header_state and _dirblock_state represent the current state
168
# of the dirstate metadata and the per-row data respectiely.
169
# NOT_IN_MEMORY indicates that no data is in memory
170
# IN_MEMORY_UNMODIFIED indicates that what we have in memory
171
# is the same as is on disk
172
# IN_MEMORY_MODIFIED indicates that we have a modified version
173
# of what is on disk.
174
# In future we will add more granularity, for instance _dirblock_state
175
# will probably support partially-in-memory as a separate variable,
176
# allowing for partially-in-memory unmodified and partially-in-memory
178
self._header_state = DirState.NOT_IN_MEMORY
179
self._dirblock_state = DirState.NOT_IN_MEMORY
183
self._root_row = None
184
self._state_file=None
186
def add(self, path, file_id, kind, stat, link_or_sha1):
187
"""Add a path to be tracked.
189
:param path: The path within the dirstate - '' is the root, 'foo' is the
190
path foo within the root, 'foo/bar' is the path bar within foo
192
:param file_id: The file id of the path being added.
193
:param kind: The kind of the path.
194
:param stat: The output of os.lstate for the path.
195
:param link_or_sha1: The sha value of the file, or the target of a
196
symlink. '' for directories.
199
# find the block its in.
200
# find the location in the block.
201
# check its not there
203
#------- copied from bzrlib.inventory.make_entry
204
# --- normalized_filename wants a unicode basename only, so get one.
205
dirname, basename = os.path.split(path)
206
# we dont import normalized_filename directly because we want to be
207
# able to change the implementation at runtime for tests.
208
norm_name, can_access = osutils.normalized_filename(basename)
209
if norm_name != basename:
213
raise errors.InvalidNormalization(path)
214
# now that we've normalised, we need the correct utf8 path and
215
# dirname and basename elements. This single encode and split should be
216
# faster than three separate encodes.
217
utf8path = (dirname + '/' + basename).strip('/').encode('utf8')
218
dirname, basename = os.path.split(utf8path)
219
self._read_dirblocks_if_needed()
220
block_index = self._find_dirblock_index(dirname)
222
# some parent path has not been added - its an error to add this
224
raise errors.NotVersionedError(path, str(self))
225
block = self._dirblocks[block_index][1]
228
packed_stat = DirState.NULLSTAT
231
packed_stat = pack_stat(stat)
232
parent_info = self._empty_parent_info()
234
row_data = ((dirname, basename, kind, file_id.encode('utf8'),
235
size, packed_stat, link_or_sha1), parent_info)
236
elif kind == 'directory':
237
row_data = ((dirname, basename, kind, file_id.encode('utf8'),
238
0, packed_stat, ''), parent_info)
239
elif kind == 'symlink':
240
row_data = ((dirname, basename, kind, file_id.encode('utf8'),
241
size, packed_stat, link_or_sha1), parent_info)
243
raise errors.BzrError('unknown kind %r' % kind)
244
row_index = bisect.bisect_left(block, row_data)
245
if len(block) > row_index:
246
assert block[row_index][0][1] != basename, \
247
"basename %r already added" % basename
248
block.insert(row_index, row_data)
250
if kind == 'directory':
251
# insert a new dirblock
252
self._ensure_block(block_index, row_index, utf8path)
253
self._dirblock_state = DirState.IN_MEMORY_MODIFIED
255
def add_deleted(self, fileid_utf8, parents):
256
"""Add fileid_utf8 with parents as deleted."""
257
self._read_dirblocks_if_needed()
258
new_row = self._make_deleted_row(fileid_utf8, parents)
259
block_index = self._find_dirblock_index(new_row[0][0])
261
# no deleted block yet.
262
bisect.insort_left(self._dirblocks, (new_row[0][0], []))
263
block_index = self._find_dirblock_index(new_row[0][0])
264
block = self._dirblocks[block_index][1]
265
row_index = bisect.insort_left(block, new_row)
266
self._dirblock_state = DirState.IN_MEMORY_MODIFIED
268
def _empty_parent_info(self):
269
return [DirState.NULL_PARENT_ROW] * (len(self._parents) -
272
def _ensure_block(self, parent_block_index, parent_row_index, dirname):
273
"""Enssure a block for dirname exists.
275
This function exists to let callers which know that there is a
276
directory dirname ensure that the block for it exists. This block can
277
fail to exist because of demand loading, or because a directory had no
278
children. In either case it is not an error. It is however an error to
279
call this if there is no parent entry for the directory, and thus the
280
function requires the coordinates of such an entry to be provided.
282
The root row is special cased and can be indicated with a parent block
285
:param parent_block_index: The index of the block in which dirname's row
287
:param parent_row_index: The index in the parent block where the row
289
:param dirname: The utf8 dirname to ensure there is a block for.
290
:return: The index for the block.
292
# the basename of the directory must be the end of its full name.
293
if not (parent_block_index == -1 and
294
parent_block_index == -1 and dirname == ''):
295
assert dirname.endswith(
296
self._dirblocks[parent_block_index][1][parent_row_index][0][1])
297
## In future, when doing partial parsing, this should load and
298
# populate the entire block.
299
index = bisect.bisect_left(self._dirblocks, (dirname, []))
300
if (index == len(self._dirblocks) or
301
self._dirblocks[index][0] != dirname):
302
self._dirblocks.insert(index, (dirname, []))
305
def _find_dirblock_index(self, dirname):
306
"""Find the dirblock index for dirname.
308
:return: -1 if the dirname is not present, or the index in
309
self._dirblocks for it otherwise.
311
block_index = bisect.bisect_left(self._dirblocks, (dirname, []))
312
if (block_index == len(self._dirblocks) or
313
self._dirblocks[block_index][0] != dirname):
318
def from_tree(tree, dir_state_filename):
319
"""Create a dirstate from a bzr Tree.
321
:param tree: The tree which should provide parent information and
324
# XXX: aka the big ugly.
326
result._state_file = open(dir_state_filename, 'wb+')
328
_encode = base64.encodestring
330
parent_ids = tree.get_parent_ids()
331
num_parents = len(parent_ids)
333
raise ValueError('Cannot handle more than 3 parents')
336
for parent_id in parent_ids:
337
parent_trees.append(tree.branch.repository.revision_tree(parent_id))
339
# FIXME: is this utf8 safe?
341
to_minikind = DirState._kind_to_minikind
342
to_yesno = DirState._to_yesno
344
st = os.lstat(tree.basedir)
347
, 'directory', tree.inventory.root.file_id.encode('utf8')
348
, 0 # no point having a size for dirs.
353
for parent_tree in parent_trees:
354
root_parents.append((
355
parent_tree.inventory.root.revision.encode('utf8'),
363
root_row = (root_info, root_parents)
365
for dirinfo, block in tree.walkdirs():
366
# dirinfo is path, id
368
# add the row for this block
370
dirblocks.append((dirinfo[0], block_row))
371
for relpath, name, kind, st, fileid, versionedkind in block:
373
# unversioned file, skip
375
# TODO? factor out this loop body as a helper function ?
377
dirname, basename = os.path.split(relpath.encode('utf8'))
379
s = tree.get_file_sha1(fileid, relpath)
380
elif kind == 'directory':
381
if name in ('.bzr', '.hg', 'CVS', '.svn', '_svn'):
382
raise Exception('skipping dirs not supported yet')
383
# Skip this, and all children
384
to_remove.append((relpath, name, kind, st, abspath))
388
elif kind == 'symlink':
389
# sha value of the link target ?!
390
s = os.readlink(abspath)
392
for count in xrange(num_parents):
394
result._parent_info(parent_trees[count], fileid))
395
row_data = (dirname.encode('utf8'), basename.encode('utf8'),
396
kind, fileid.encode('utf8'), st.st_size, pack_stat(st),
398
block_row.append((row_data, parent_info))
400
# It isn't safe to remove entries while we are iterating
401
# over the same list, so remove them now
402
for entry in to_remove:
405
#lines.append(result._get_parents_line(parent_ids))
406
#lines.append(result._get_ghosts_line([]))
407
result._set_data(parent_ids, root_row, dirblocks)
411
def get_ghosts(self):
412
"""Return a list of the parent tree revision ids that are ghosts."""
413
self._read_header_if_needed()
417
"""Serialise the entire dirstate to a sequence of lines."""
418
if (self._header_state == DirState.IN_MEMORY_UNMODIFIED and
419
self._dirblock_state == DirState.IN_MEMORY_UNMODIFIED):
420
# read whats on disk.
421
self._state_file.seek(0)
422
return self._state_file.readlines()
424
lines.append(self._get_parents_line(self.get_parent_ids()))
425
lines.append(self._get_ghosts_line(self._ghosts))
426
# append the root line which is special cased
427
lines.extend(map(self._row_to_line, self._iter_rows()))
428
return self._get_output_lines(lines)
430
def _get_ghosts_line(self, ghost_ids):
431
"""Create a line for the state file for ghost information."""
432
return '\0'.join([str(len(ghost_ids))] +
433
[g.encode('utf8') for g in ghost_ids])
435
def _get_parents_line(self, parent_ids):
436
"""Create a line for the state file for parents information."""
437
return '\0'.join([str(len(parent_ids))] +
438
[p.encode('utf8') for p in parent_ids])
440
def get_parent_ids(self):
441
"""Return a list of the parent tree ids for the directory state."""
442
self._read_header_if_needed()
443
return list(self._parents)
445
def _get_row(self, path_utf8):
446
"""Get the dirstate row for path.
448
:param path_utf8: An utf8 path to be looked up.
449
:return: The dirstate row tuple for path, or (None, None)
451
self._read_dirblocks_if_needed()
453
return self._root_row
454
dirname, basename = os.path.split(path_utf8)
455
block_index, row_index, dir_present, file_present = \
456
self._get_block_row_index(dirname, basename)
459
row = self._dirblocks[block_index][1][row_index]
460
assert row[0][3], 'unversioned row?!?!'
463
def _get_block_row_index(self, dirname, basename):
464
"""Get the coordinates for a path in the state structure.
466
:param dirname: The utf8 dirname to lookup.
467
:param basename: The utf8 basename to lookup.
468
:return: A tuple describing where the path is located, or should be
469
inserted. The tuple contains four fields: the block index, the row
470
index, anda two booleans are True when the directory is present, and
471
when the entire path is present. There is no guarantee that either
472
coordinate is currently reachable unless the found field for it is
473
True. For instance, a directory not present in the state may be
474
returned with a value one greater than the current highest block
475
offset. The directory present field will always be True when the
476
path present field is True.
478
assert not (dirname == '' and basename == ''), 'blackhole lookup error'
479
self._read_dirblocks_if_needed()
480
block_index = bisect.bisect_left(self._dirblocks, (dirname, []))
481
if (block_index == len(self._dirblocks) or
482
self._dirblocks[block_index][0] != dirname):
483
# no such directory - return the dir index and 0 for the row.
484
return block_index, 0, False, False
485
block = self._dirblocks[block_index][1] # access the rows only
486
search = ((dirname, basename), [])
487
row_index = bisect.bisect_left(block, search)
488
if row_index == len(block) or block[row_index][0][1] != basename:
489
return block_index, row_index, True, False
490
return block_index, row_index, True, True
493
def initialize(path):
494
"""Create a new dirstate on path.
496
The new dirstate will be an empty tree - that is it has no parents,
497
and only a root node - which has id ROOT_ID.
499
:param path: The name of the file for the dirstate.
500
:return: A DirState object.
502
# This constructs a new DirState object on a path, sets the _state_file
503
# to a new empty file for that path. It then calls _set_data() with our
504
# stock empty dirstate information - a root with ROOT_ID, no children,
505
# and no parents. Finally it calls save() to ensure that this data will
508
result._state_file = open(path, 'wb+')
509
# a new root directory, with a pack_stat (the x's) that is just noise and will
510
# never match the output of base64 encode.
511
root_row_data = ('', '', 'directory', bzrlib.inventory.ROOT_ID, 0,
512
DirState.NULLSTAT, '')
514
root_row = (root_row_data, root_parents)
515
empty_tree_dirblocks = [('', [])] # root dir contents - no entries.
516
result._set_data([], root_row, empty_tree_dirblocks)
520
result._state_file.close()
524
def _iter_rows(self):
525
"""Iterate over all the row data in the dirstate.
527
Each yelt item is a tuple of (row_data, parent_data_list).
529
self._read_dirblocks_if_needed()
531
for directory in self._dirblocks:
532
for row in directory[1]:
535
def _get_output_lines(self, lines):
536
"""format lines for final output.
538
:param lines: A sequece of lines containing the parents list and the
541
output_lines = ['#bazaar dirstate flat format 1\n']
542
lines.append('') # a final newline
543
inventory_text = '\0\n\0'.join(lines)
544
output_lines.append('adler32: %s\n' % (zlib.adler32(inventory_text),))
545
# -3, 1 for num parents, 1 for ghosts, 1 for final newline
546
num_entries = len(lines)-3
547
output_lines.append('num_entries: %s\n' % (num_entries,))
548
output_lines.append(inventory_text)
551
def _make_deleted_row(self, fileid_utf8, parents):
552
"""Return a deleted for for fileid_utf8."""
553
return ('/', 'RECYCLED.BIN', 'file', fileid_utf8, 0, DirState.NULLSTAT,
558
"""Construct a DirState on the file at path path."""
560
result._state_file = open(path, 'rb+')
563
def _parent_info(self, parent_tree, fileid):
564
"""Generate the parent information for file_id in parent_tree."""
565
# FIXME: This is probably expensive - we encode the path that in the
566
# common case will be the same across all parents, twice.
567
# also, id2path is slow in inventory, and we should make that fast.
569
parent_entry = parent_tree.inventory[fileid]
570
except errors.NoSuchId:
571
# this parent does not have fileid - return a bogus entry, which
572
# will be filtered out on iteration etc.
573
# an empty revision id is bogus and safe to put on disk
574
# we make it be a 'file', just because. We give it the
575
# deleted file path dirname and basename, 0 size, not executable
577
return DirState.NULL_PARENT_ROW
578
parent_path = parent_tree.inventory.id2path(fileid)
579
dirname, basename = os.path.split(parent_path.encode('utf8'))
580
return (parent_entry.revision.encode('utf8'),
582
# FIXME: set these from the parent
584
parent_entry.text_size or 0,
585
parent_entry.executable,
586
parent_entry.text_sha1 or '',
589
def _read_dirblocks_if_needed(self):
590
"""Read in all the dirblocks from the file if they are not in memory."""
591
self._read_header_if_needed()
592
if self._dirblock_state == DirState.NOT_IN_MEMORY:
593
# the _state_file pointer will be positioned at the start of the
595
text = self._state_file.read()
596
# TODO: check the adler checksums. adler_measured = zlib.adler32(text)
598
fields = text.split('\0')
599
# Remove the last blank entry
600
trailing = fields.pop()
601
assert trailing == ''
602
# consider turning fields into a tuple.
604
# skip the first field which is the trailing null from the header.
606
# Each line now has an extra '\n' field which is not used
607
# so we just skip over it
608
# number of fields per dir_entry
609
# + number of fields per parent_entry
611
num_present_parents = len(self._parents) - len(self._ghosts)
612
entry_size = 7 + (7 * (num_present_parents)) + 1
613
expected_field_count = entry_size * self._num_entries
614
if len(fields) - cur > expected_field_count:
615
fields = fields[:expected_field_count + cur]
616
trace.mutter('Unexpectedly long dirstate field count!')
617
print "XXX: incorrectly truncated dirstate file bug triggered."
618
field_count = len(fields)
619
# this checks our adjustment, and also catches file too short.
620
assert field_count - cur == expected_field_count, \
621
'field count incorrect %s != %s, entry_size=%s, '\
622
'num_entries=%s fields=%r' % (
623
field_count - cur, expected_field_count, entry_size,
624
self._num_entries, fields)
626
# Fast path the case where there are 1 or 2 parents
627
if num_present_parents == 0:
628
entries = [(fields[pos:pos+7], []) for pos in xrange(cur, field_count, entry_size)]
629
elif num_present_parents == 1:
630
entries = [(fields[pos:pos+7], [fields[pos+7:pos+14],])
631
for pos in xrange(cur, field_count, entry_size)]
632
elif num_present_parents == 2:
633
entries = [(fields[pos:pos+7], [
634
fields[pos+7:pos+14],
635
fields[pos+14:pos+21],])
636
for pos in xrange(cur, field_count, entry_size)]
640
tuple([fields[chunk:chunk+7] for
641
chunk in xrange(pos + 7, pos+entry_size-1, 7)]))
642
for pos in xrange(cur, field_count, entry_size)
645
assert len(entries) == self._num_entries, '%s != %s entries' % (len(entries),
648
def _line_to_row(line):
649
"""Convert a freshly read line's size and minikind for use."""
650
# convert the minikind to kind
651
line[0][2] = self._minikind_to_kind[line[0][2]]
652
# convert the size to an int
653
line[0][4] = int(line[0][4])
654
for parent in line[1]:
655
parent[1] = self._minikind_to_kind[parent[1]]
656
parent[4] = int(parent[4])
657
parent[5] = parent[5] == 'y'
658
return tuple(line[0]), map(tuple, line[1])
659
new_rows = map(_line_to_row, entries)
660
self._rows_to_current_state(new_rows)
661
self._dirblock_state = DirState.IN_MEMORY_UNMODIFIED
663
def _read_header(self):
664
"""This reads in the metadata header, and the parent ids.
666
After reading in, the file should be positioned at the null
667
just before the start of the first record in the file.
669
:return: (expected adler checksum, number of entries, parent list)
672
parent_line = self._state_file.readline()
673
info = parent_line.split('\0')
674
num_parents = int(info[0])
675
assert num_parents == len(info)-2, 'incorrect parent info line'
676
self._parents = [p.decode('utf8') for p in info[1:-1]]
678
ghost_line = self._state_file.readline()
679
info = ghost_line.split('\0')
680
num_ghosts = int(info[1])
681
assert num_ghosts == len(info)-3, 'incorrect ghost info line'
682
self._ghosts = [p.decode('utf8') for p in info[2:-1]]
683
self._header_state = DirState.IN_MEMORY_UNMODIFIED
685
def _read_header_if_needed(self):
686
"""Read the header of the dirstate file if needed."""
687
if self._header_state == DirState.NOT_IN_MEMORY:
690
def _read_prelude(self):
691
"""Read in the prelude header of the dirstate file
693
This only reads in the stuff that is not connected to the adler
694
checksum. The position will be correct to read in the rest of
695
the file and check the checksum after this point.
696
The next entry in the file should be the number of parents,
697
and their ids. Followed by a newline.
699
header = self._state_file.readline()
700
assert header == '#bazaar dirstate flat format 1\n', \
701
'invalid header line: %r' % (header,)
702
adler_line = self._state_file.readline()
703
assert adler_line.startswith('adler32: '), 'missing adler32 checksum'
704
self.adler_expected = int(adler_line[len('adler32: '):-1])
705
num_entries_line = self._state_file.readline()
706
assert num_entries_line.startswith('num_entries: '), 'missing num_entries line'
707
self._num_entries = int(num_entries_line[len('num_entries: '):-1])
709
def _row_to_line(self, row):
710
"""Serialize row to a NULL delimited line ready for _get_output_lines.
712
:param row: A row_tuple as defined in the module docstring.
714
entire_row = list(row[0])
715
for parent_number, parent_data in enumerate(row[1]):
716
# (revision, kind, dirname, basename, size, executable_bool, sha1)
717
entire_row.extend(parent_data)
718
# minikind conversion of the parent
719
parent_offset = 7 + parent_number * 7
720
entire_row[parent_offset + 1] = DirState._kind_to_minikind[parent_data[1]]
721
entire_row[parent_offset + 4] = str(parent_data[4])
722
entire_row[parent_offset + 5] = DirState._to_yesno[parent_data[5]]
723
# conversion from memory to disk-ready format:
724
# minikind conversion of the current row type.
725
entire_row[2] = DirState._kind_to_minikind[entire_row[2]]
726
entire_row[4] = str(entire_row[4])
727
# minikind of parents
728
return '\0'.join(entire_row)
730
def _rows_to_current_state(self, new_rows):
731
"""Load new_rows into self._root_row and self.dirblocks.
733
Process new_rows into the current state object, making them the active
736
:param new_rows: A sorted list of rows. This function does not sort
737
to prevent unneeded overhead when callers have a sorted list
741
assert new_rows[0][0][0:2] == ('', ''), \
742
"Incorrect root row %r" % new_rows[0][0]
743
self._root_row = new_rows[0]
744
self._dirblocks = [('', [])]
745
for row in new_rows[1:]:
746
if row[0][0] != self._dirblocks[-1][0]:
748
self._dirblocks.append((row[0][0], []))
749
# append the row to the current block
750
self._dirblocks[-1][1].append(row)
753
"""Save any pending changes created during this session.
755
We reuse the existing file, because that prevents race conditions with
756
file creation, and we expect to be using oslocks on it in the near
757
future to prevent concurrent modification and reads - because dirstates
758
incremental data aggretation is not compatible with reading a modified
759
file, and replacing a file in use by another process is impossible on
762
A dirstate in read only mode should be smart enough though to validate
763
that the file has not changed, and otherwise discard its cache and
764
start over, to allow for fine grained read lock duration, so 'status'
765
wont block 'commit' - for example.
767
if (self._header_state == DirState.IN_MEMORY_MODIFIED or
768
self._dirblock_state == DirState.IN_MEMORY_MODIFIED):
769
self._state_file.seek(0)
770
self._state_file.writelines(self.get_lines())
771
self._state_file.truncate()
772
self._state_file.flush()
773
self._header_state = DirState.IN_MEMORY_UNMODIFIED
774
self._dirblock_state = DirState.IN_MEMORY_UNMODIFIED
776
def _set_data(self, parent_ids, root_row, dirblocks):
777
"""Set the full dirstate data in memory.
779
This is an internal function used to completely replace the objects
780
in memory state. It puts the dirstate into state 'full-dirty'.
782
:param parent_ids: A list of parent tree revision ids.
783
:param root_row: The root row - a tuple of the root direntry and the
784
list of matching direntries from the parent_ids trees.
785
:param dirblocks: A list containing one tuple for each directory in the
786
tree. Each tuple contains the directory path and a list of
787
row data in the same format as root_row.
789
# our memory copy is now authoritative.
790
self._dirblocks = dirblocks
791
self._root_row = root_row
792
self._header_state = DirState.IN_MEMORY_MODIFIED
793
self._dirblock_state = DirState.IN_MEMORY_MODIFIED
794
self._parents = list(parent_ids)
796
def set_path_id(self, path, new_id):
797
"""Change the id of path to new_id.
799
:param path: The path inside the tree to set - '' is the root, 'foo'
800
is the path foo in the root.
801
:param new_id: The new id to assign to the path. If unicode, it will
802
be encoded to utf8. In future this will be deprecated: avoid using
803
unicode ids if possible.
805
# TODO: start warning here.
806
if new_id.__class__ == unicode:
807
new_id = new_id.encode('utf8')
808
self._read_dirblocks_if_needed()
810
import pdb;pdb.set_trace()
812
raise NotImplementedError(self.set_path_id)
813
# TODO: check new id is unique
814
if new_id == self._root_row[0][3]:
815
# the root id is unchanged
817
root_info, root_parents = self._root_row
818
if len(root_parents):
819
self.add_deleted(root_info[3], root_parents)
820
self._root_row = ((root_info[0:3] + (new_id, ) + root_info[4:7]),
821
self._empty_parent_info())
822
self._dirblock_state = DirState.IN_MEMORY_MODIFIED
824
def set_parent_trees(self, trees, ghosts):
825
"""Set the parent trees for the dirstate.
827
:param trees: A list of revision_id, tree tuples. tree must be provided
828
even if the revision_id refers to a ghost: supply an empty tree in
830
:param ghosts: A list of the revision_ids that are ghosts at the time
833
# TODO: generate a list of parent indexes to preserve to save
834
# processing specific parent trees. In the common case one tree will
835
# be preserved - the left most parent.
836
# TODO: if the parent tree is a dirstate, we might want to walk them
837
# all by path in parallel for 'optimal' common-case performance.
838
# generate new root row.
839
self._read_dirblocks_if_needed()
840
old_root = self._root_row
841
root_info = self._root_row[0]
842
new_parent_count = len(trees)
843
# sketch: loop over all rows in the dirstate, cherry picking
844
# entries from the parent trees, if they are not ghosts.
845
# after we finish walking the dirstate, all entries not in the dirstate
846
# are deletes, so we want to append them to the end as per the design
847
# discussions. So do a set difference on ids with the parents to
848
# get deletes, and add them to the end.
850
# skip ghost trees, as they dont get represented.
851
parent_trees = [tree for rev_id, tree in trees if rev_id not in ghosts]
852
parent_tree_count = len(parent_trees)
853
# loop over existing entries in the dirstate.
855
for entry, old_parents in self._iter_rows():
857
checked_ids.add(file_id)
858
new_parents = [None] * parent_tree_count
859
for position, parent_tree in enumerate(parent_trees):
860
# revision_utf8, KIND, dirname, basename, size, executable, sha
861
new_parents[position] = self._parent_info(parent_tree, file_id)
862
assert None not in new_parents
863
new_rows.append((entry, new_parents))
864
# get additional ids that are present in parents and not in this tree.
866
for tree in parent_trees:
867
deleted_ids.update(set(tree.inventory._byid).difference(checked_ids))
868
# add the deleted ids to the dirstate. deleted files are represented as
869
# a file with dirname '', basename ''
870
for file_id in deleted_ids:
871
# add these ids to the deleted block
872
checked_ids.add(file_id)
873
# deleted items have a synthetic entry.
874
entry = ('/', 'RECYCLED.BIN', 'file', file_id.encode('utf8'), 0,
875
DirState.NULLSTAT, '')
876
new_parents = [None] * parent_tree_count
877
for position, parent_tree in enumerate(parent_trees):
878
# revision_utf8, KIND, dirname, basename, size, executable, sha
879
new_parents[position] = self._parent_info(parent_tree, file_id)
880
assert None not in new_parents
881
new_rows.append((entry, new_parents))
884
new_rows = sorted(new_rows)
885
self._rows_to_current_state(new_rows)
886
self._parents = [rev_id for rev_id, tree in trees]
887
self._ghosts = list(ghosts)
888
self._header_state = DirState.IN_MEMORY_MODIFIED
889
self._dirblock_state = DirState.IN_MEMORY_MODIFIED
891
def set_state_from_inventory(self, new_inv):
892
"""Set new_inv as the current state.
894
:param new_inv: The inventory object to set current state from.
896
self._read_dirblocks_if_needed()
898
# generate a byid index of the dirstate
900
for row, parents in self._iter_rows():
901
parent_rows[row[3]] = parents
903
num_present_parents = len(self._parents) - len(self._ghosts)
904
# walk the new inventory in directory order, copying parent data
907
for path, entry in new_inv.iter_entries_by_dir():
908
dirname, basename = os.path.split(path.encode('utf8'))
910
fileid_utf8 = entry.file_id.encode('utf8')
912
size = entry.text_size or 0
913
sha1 = entry.text_sha1 or ''
914
elif kind == 'symlink':
916
sha1 = (entry.symlink_target or '').encode('utf8')
921
parents = parent_rows[fileid_utf8]
922
del parent_rows[fileid_utf8]
924
parents = [DirState.NULL_PARENT_ROW] * num_present_parents
925
new_row = (dirname, basename, kind, fileid_utf8, size, DirState.NULLSTAT, sha1), parents
926
new_rows.append(new_row)
927
# append deleted data to the end of the tree as usual.
928
for fileid_utf8, parents in parent_rows.items():
930
# this row was only present in the old state, had no parents
932
# deleted items have a synthetic entry.
933
new_row = ('/', 'RECYCLED.BIN', 'file', fileid_utf8, 0,
934
DirState.NULLSTAT, ''), parents
935
new_rows.append(new_row)
937
# sort all the rows (the ones in parents not in current may be unsorted)
938
new_rows = sorted(new_rows)
939
self._rows_to_current_state(new_rows)
940
self._dirblock_state = DirState.IN_MEMORY_MODIFIED
943
def pack_stat(st, _encode=base64.encodestring, _pack=struct.pack):
944
"""Convert stat values into a packed representation."""
945
# jam 20060614 it isn't really worth removing more entries if we
946
# are going to leave it in packed form.
947
# With only st_mtime and st_mode filesize is 5.5M and read time is 275ms
948
# With all entries filesize is 5.9M and read time is mabye 280ms
949
# well within the noise margin
951
# base64.encode always adds a final newline, so strip it off
952
return _encode(_pack('>llllll'
953
, st.st_size, st.st_mtime, st.st_ctime
954
, st.st_dev, st.st_ino, st.st_mode))[:-1]