# Copyright (C) 2006, 2007 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""DirState objects record the state of a directory and its bzr metadata.

Pseudo EBNF grammar for the disk format:
MINIKIND = "f" | "d" | "l";
NL = "\n";
NULL = "\0";
WHOLE NUMBER = {digit}, digit;

dirstate format = header line, full checksum, row count, parent details,
    ghost_details, rows;
header line = "#bazaar dirstate flat format 1", NL;
full checksum = "adler32: ", ["-"], WHOLE NUMBER, NL;
row count = "num_entries: ", digit, NL;
parent_details = WHOLE NUMBER, NULL, NL; XXX: complete this line
ghost_details = WHOLE NUMBER, NULL, {GHOST_ID NULL}*, NL;
rows = dirname, NULL, basename, NULL, MINIKIND, NULL, fileid_utf8, NULL,
    WHOLE NUMBER (* size *), NULL, packed stat, NULL, sha1|symlink target,
PARENT ROW = NULL, revision_utf8, NULL, MINIKIND, NULL, dirname, NULL,
    basename, NULL, WHOLE NUMBER (* size *), NULL, "y" | "n", NULL,

PARENT ROWs are emitted for every parent that is not in the ghost details
line. That is, if the parents are foo, bar, baz, and the ghosts are bar, then
each row will have a PARENT ROW for foo and baz, but not for bar.
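
As an illustrative sketch only (all values invented, "\0" standing for a NUL
byte and spaces added purely for readability), a file row with one non-ghost
parent would serialise roughly as:

    dir \0 name.txt \0 f \0 file-id \0 12 \0 <packed stat> \0 <sha1>
    \0 revid \0 f \0 dir \0 name.txt \0 12 \0 n \0 ...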
Design priorities:
1) Fast end to end use for bzr's top 5 use cases.
2) fall back to the current object model as needed.
3) scale usably to the largest trees known today - say 50K entries.

Locking:
Eventually reuse dirstate objects across locks IFF the dirstate file has not
been modified, but that will require that we flush/ignore cached stat-hit data
because we won't want to restat all files on disk just because a lock was
acquired, yet we cannot trust the data after the previous lock was released.
Memory representation:
 vector of all directories, and vector of the children ?
   root_row = (direntry for root, [parent_direntries_for_root]),
   dirblocks = [
   ('', ['data for achild', 'data for bchild', 'data for cchild'])
   ('dir', ['achild', 'cchild', 'echild'])
   ]
   - single bisect to find N subtrees from a path spec (see the sketch after
     this list)
   - in-order for serialisation - this is 'dirblock' grouping.
   - insertion of a file '/a' affects only the '/' child-vector, that is, to
     insert 10K elements from scratch does not generate O(N^2) memmoves of a
     single vector; rather each individual child-vector is touched, and each
     tends to stay a manageable size. Will scale badly on trees with 10K
     entries in a single directory. Compare with Inventory.InventoryDirectory
     which has a dictionary for the children. No bisect capability, can only
     probe for exact matches, or grab all elements and sort them.
   - What's the risk of error here? Once we have the base format being
     processed we should have a net win regardless of optimality. So we are
     going to go with what seems reasonable.
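
As a minimal sketch of the bisect lookup (illustrative only: the helper name
and sample data are invented, but the logic mirrors _find_dirblock_index
below):

    import bisect

    def find_dirblock(dirblocks, dirname):
        # dirblocks is sorted by directory name; bisect_left finds the
        # candidate slot, and a name match means the block exists.
        index = bisect.bisect_left(dirblocks, (dirname, []))
        if index == len(dirblocks) or dirblocks[index][0] != dirname:
            return -1
        return index

    dirblocks = [('', []), ('dir', []), ('dir/sub', [])]
    find_dirblock(dirblocks, 'dir')      # -> 1
    find_dirblock(dirblocks, 'missing')  # -> -1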

Maybe we should do a test profile of these core structures - 10K simulated
searches/lookups/etc?

The lifetime of Dirstate objects is currently per lock, but see above for
possible extensions. The lifetime of a row from a dirstate is expected to be
very short in the optimistic case: which we are optimising for. For instance,
subtree status will determine from analysis of the disk data what rows need to
be examined at all, and will be able to determine from a single row whether
that file has altered or not, so we are aiming to process tens of thousands of
entries each second within the dirstate context, before exposing anything to
the larger codebase. This suggests we want the time for a single file
comparison to be < 0.1 milliseconds. That would give us 10000 paths per second
processed, and to scale to 100 thousand we'll need another order of magnitude
on top of that. Now, as the lifetime for all unchanged entries is the time to
parse, stat the file on disk, and then immediately discard, the overhead of
object creation becomes a significant cost.

Figures: Creating a tuple from 3 elements was profiled at 0.0625
microseconds, whereas creating an object which is subclassed from tuple was
0.500 microseconds, and creating an object with 3 elements and slots was 3
microseconds. 0.1 milliseconds is 100 microseconds, and ideally we'll get
down to 10 microseconds for the total processing - having roughly a third of
that be object creation is a huge overhead. There is a potential cost in using
tuples within each row, which is that the conditional code to do comparisons
may be slower than method invocation, but method invocation is known to be
slow due to stack frame creation, so avoiding methods in these tight inner
loops is unfortunately desirable. We can consider a pyrex version of this with
objects in future if needed.
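
A minimal sketch of how such figures can be reproduced (illustrative only;
absolute numbers vary by machine and interpreter):

    import timeit

    # Each result is total seconds for 1,000,000 runs, i.e. roughly
    # microseconds per creation. Variables defeat constant folding.
    print timeit.Timer("(a, b, c)", "a = b = c = 'x'").timeit()
    print timeit.Timer("Row((a, b, c))",
                       "class Row(tuple): pass\na = b = c = 'x'").timeit()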
"""

import base64
import bisect
import os
import struct
import zlib

import bzrlib.inventory
from bzrlib import errors, trace
from bzrlib.osutils import pathjoin, sha_file, sha_string, walkdirs


class DirState(object):
    """Record directory and metadata state for fast access.

    A dirstate is a specialised data structure for managing local working
    tree state information. It's not yet well defined whether it is platform
    specific, and if it is, how we detect/parameterise that.
    """

    _kind_to_minikind = {'file': 'f', 'directory': 'd', 'symlink': 'l'}
    _minikind_to_kind = {'f': 'file', 'd': 'directory', 'l': 'symlink'}
    _to_yesno = {True: 'y', False: 'n'}  # TODO profile the performance gain
    # of using int conversion rather than a dict here. AND BLAME ANDREW IF

    NOT_IN_MEMORY = 0
    IN_MEMORY_UNMODIFIED = 1
    IN_MEMORY_MODIFIED = 2

    # A packed-stat value no real stat will ever pack to (width assumed to be
    # 32, matching pack_stat output; see the "the x's" comment in initialize).
    NULLSTAT = 'x' * 32

    def __init__(self):
        """Create a DirState object.

        :attr _root_row: The root row of the directory/file information,
            - contains the path to / - '', ''
            - kind of 'directory',
            - the file id of the root in utf8
            - and no sha information.
        """
        # _header_state and _dirblock_state represent the current state
        # of the dirstate metadata and the per-row data respectively.
        # NOT_IN_MEMORY indicates that no data is in memory
        # IN_MEMORY_UNMODIFIED indicates that what we have in memory
        # is the same as is on disk
        # IN_MEMORY_MODIFIED indicates that we have a modified version
        # of what is on disk.
        # In future we will add more granularity, for instance _dirblock_state
        # will probably support partially-in-memory as a separate variable,
        # allowing for partially-in-memory unmodified and partially-in-memory
        # modified states.
        self._header_state = DirState.NOT_IN_MEMORY
        self._dirblock_state = DirState.NOT_IN_MEMORY
        self._dirblocks = []
        self._ghosts = []
        self._parents = []
        self._root_row = None
        self._state_file = None

    def add(self, path, file_id, kind, stat, link_or_sha1):
        """Add a path to be tracked.

        :param path: The path within the dirstate - '' is the root, 'foo' is
            the path foo within the root, 'foo/bar' is the path bar within foo
            within the root.
        :param file_id: The file id of the path being added.
        :param kind: The kind of the path.
        :param stat: The output of os.lstat for the path.
        :param link_or_sha1: The sha value of the file, or the target of a
            symlink. '' for directories.
        """
        # find the block it's in.
        # find the location in the block.
        # check it's not there
        self._read_dirblocks_if_needed()
        utf8path = path.encode('utf8')
        dirname, basename = os.path.split(utf8path)
        block_index = self._find_dirblock_index(dirname)
        if block_index == -1:
            # some parent path has not been added - it's an error to add this
            # child
            raise errors.NotVersionedError(path, str(self))
        block = self._dirblocks[block_index][1]
        if stat is None:
            size = 0
            packed_stat = DirState.NULLSTAT
        else:
            size = stat.st_size
            packed_stat = pack_stat(stat)
        if kind == 'file':
            row_data = ((dirname, basename, kind, file_id.encode('utf8'),
                size, packed_stat, link_or_sha1), [])
        elif kind == 'directory':
            row_data = ((dirname, basename, kind, file_id.encode('utf8'),
                0, packed_stat, ''), [])
        elif kind == 'symlink':
            row_data = ((dirname, basename, kind, file_id.encode('utf8'),
                size, packed_stat, link_or_sha1), [])
        else:
            raise errors.BzrError('unknown kind %r' % kind)
        row_index = bisect.bisect_left(block, row_data)
        if len(block) > row_index:
            assert block[row_index][0][1] != basename, \
                "basename %r already added" % basename
        block.insert(row_index, row_data)

        if kind == 'directory':
            # insert a new dirblock for the new directory's own children
            bisect.insort_left(self._dirblocks, (utf8path, []))
        self._dirblock_state = DirState.IN_MEMORY_MODIFIED
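
        # Illustrative use (a sketch, not from the original code; the paths
        # and ids below are invented):
        #   state = DirState.initialize('dirstate')
        #   state.add('subdir', 'subdir-id', 'directory',
        #             os.lstat('subdir'), '')
        #   state.add('subdir/file', 'file-id', 'file',
        #             os.lstat('subdir/file'), sha_file(open('subdir/file')))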

    def add_deleted(self, fileid_utf8, parents):
        """Add fileid_utf8 with parents as deleted."""
        self._read_dirblocks_if_needed()
        new_row = self._make_deleted_row(fileid_utf8, parents)
        block_index = self._find_dirblock_index(new_row[0][0])
        if block_index == -1:
            # no deleted block yet.
            bisect.insort_left(self._dirblocks, (new_row[0][0], []))
            block_index = self._find_dirblock_index(new_row[0][0])
        block = self._dirblocks[block_index][1]
        bisect.insort_left(block, new_row)
        self._dirblock_state = DirState.IN_MEMORY_MODIFIED

    def _find_dirblock_index(self, dirname):
        """Find the dirblock index for dirname.

        :return: -1 if the dirname is not present, or the index in
            self._dirblocks for it otherwise.
        """
        block_index = bisect.bisect_left(self._dirblocks, (dirname, []))
        if (block_index == len(self._dirblocks) or
            self._dirblocks[block_index][0] != dirname):
            return -1
        return block_index

    @staticmethod
    def from_tree(tree, dir_state_filename):
        """Create a dirstate from a bzr Tree.

        :param tree: The tree which should provide parent information and
            inventory data.
        :param dir_state_filename: The name of the file to write the dirstate
            to.
        """
        # XXX: aka the big ugly.
        result = DirState()
        result._state_file = open(dir_state_filename, 'wb+')

        _encode = base64.encodestring

        parent_ids = tree.get_parent_ids()
        num_parents = len(parent_ids)
        if num_parents > 3:
            raise ValueError('Cannot handle more than 3 parents')

        parent_trees = []
        for parent_id in parent_ids:
            parent_trees.append(tree.branch.repository.revision_tree(parent_id))

        # FIXME: is this utf8 safe?
        to_minikind = DirState._kind_to_minikind
        to_yesno = DirState._to_yesno

        st = os.lstat(tree.basedir)
        root_info = (
            '', ''  # No path
            , 'directory', tree.inventory.root.file_id.encode('utf8')
            , 0  # no point having a size for dirs.
            , pack_stat(st)
            , ''  # No sha
            )
        root_parents = []
        for parent_tree in parent_trees:
            root_parents.append((
                parent_tree.inventory.root.revision.encode('utf8'),
                'directory', '', '', 0, False, ''))

        root_row = (root_info, root_parents)
        dirblocks = []
        for dirinfo, block in tree.walkdirs():
            # dirinfo is path, id
            to_remove = []
            # add the row for this block
            block_row = []
            dirblocks.append((dirinfo[0], block_row))
            for relpath, name, kind, st, fileid, versionedkind in block:
                if fileid is None:
                    # unversioned file, skip
                    continue
                # TODO? factor out this loop body as a helper function ?
                abspath = pathjoin(tree.basedir, relpath)
                dirname, basename = os.path.split(relpath.encode('utf8'))
                if kind == 'file':
                    s = tree.get_file_sha1(fileid, relpath)
                elif kind == 'directory':
                    if name in ('.bzr', '.hg', 'CVS', '.svn', '_svn'):
                        raise Exception('skipping dirs not supported yet')
                        # Skip this, and all children
                        to_remove.append((relpath, name, kind, st, abspath))
                        continue
                    s = ''
                elif kind == 'symlink':
                    # sha value of the link target ?!
                    s = os.readlink(abspath)
                parent_info = []
                for count in xrange(num_parents):
                    parent_info.append(
                        result._parent_info(parent_trees[count], fileid))
                # dirname and basename are already utf8 encoded by the split
                # above, so use them directly.
                row_data = (dirname, basename, kind, fileid.encode('utf8'),
                    st.st_size, pack_stat(st), s)
                block_row.append((row_data, parent_info))

            # It isn't safe to remove entries while we are iterating
            # over the same list, so remove them now
            for entry in to_remove:
                block.remove(entry)

        #lines.append(result._get_parents_line(parent_ids))
        #lines.append(result._get_ghosts_line([]))
        result._set_data(parent_ids, root_row, dirblocks)
        result.save()
        return result

    def get_ghosts(self):
        """Return a list of the parent tree revision ids that are ghosts."""
        self._read_header_if_needed()
        return self._ghosts

    def get_lines(self):
        """Serialise the entire dirstate to a sequence of lines."""
        if (self._header_state == DirState.IN_MEMORY_UNMODIFIED and
            self._dirblock_state == DirState.IN_MEMORY_UNMODIFIED):
            # read what's on disk.
            self._state_file.seek(0)
            return self._state_file.readlines()
        lines = []
        lines.append(self._get_parents_line(self.get_parent_ids()))
        lines.append(self._get_ghosts_line(self._ghosts))
        # append the root line which is special cased
        lines.extend(map(self._row_to_line, self._iter_rows()))
        return self._get_output_lines(lines)

    def _get_ghosts_line(self, ghost_ids):
        """Create a line for the state file for ghost information."""
        return '\0'.join([str(len(ghost_ids))] + ghost_ids)

    def _get_parents_line(self, parent_ids):
        """Create a line for the state file for parents information."""
        return '\0'.join([str(len(parent_ids))] + parent_ids)
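
        # For example (illustrative revision ids), two parents serialise as
        # '2\x00revid-1\x00revid-2'. Note that _get_output_lines joins lines
        # with '\0\n\0', so on re-reading, every line after the first carries
        # a leading empty field.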

    def get_parent_ids(self):
        """Return a list of the parent tree ids for the directory state."""
        self._read_header_if_needed()
        return list(self._parents)

    @staticmethod
    def initialize(path):
        """Create a new dirstate on path.

        The new dirstate will be an empty tree - that is it has no parents,
        and only a root node - which has id ROOT_ID.

        :param path: The name of the file for the dirstate.
        :return: A DirState object.
        """
        # This constructs a new DirState object on a path, sets the _state_file
        # to a new empty file for that path. It then calls _set_data() with our
        # stock empty dirstate information - a root with ROOT_ID, no children,
        # and no parents. Finally it calls save() to ensure that this data will
        # persist.
        result = DirState()
        result._state_file = open(path, 'wb+')
        # a new root directory, with a pack_stat (the x's) that is just noise
        # and will never match the output of base64 encode.
        root_row_data = ('', '', 'directory', bzrlib.inventory.ROOT_ID, 0,
            DirState.NULLSTAT, '')
        root_parents = []
        root_row = (root_row_data, root_parents)
        empty_tree_dirblocks = [('', [])]  # root dir contents - no entries.
        result._set_data([], root_row, empty_tree_dirblocks)
        try:
            result.save()
        except:
            result._state_file.close()
            raise
        return result

    def _iter_rows(self):
        """Iterate over all the row data in the dirstate.

        Each yielded item is a tuple of (row_data, parent_data_list).
        """
        self._read_dirblocks_if_needed()
        yield self._root_row
        for directory in self._dirblocks:
            for row in directory[1]:
                yield row

    def _get_output_lines(self, lines):
        """Format lines for final output.

        :param lines: A sequence of lines containing the parents list and the
            entry rows.
        """
        output_lines = ['#bazaar dirstate flat format 1\n']
        lines.append('')  # a final newline
        inventory_text = '\0\n\0'.join(lines)
        output_lines.append('adler32: %s\n' % (zlib.adler32(inventory_text),))
        # -3, 1 for num parents, 1 for ghosts, 1 for final newline
        num_entries = len(lines)-3
        output_lines.append('num_entries: %s\n' % (num_entries,))
        output_lines.append(inventory_text)
        return output_lines
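
        # The assembled file looks roughly like this (a sketch; the checksum,
        # ids and rows are invented):
        #   #bazaar dirstate flat format 1
        #   adler32: -1327947603
        #   num_entries: 2
        #   2\0revid-1\0revid-2\0
        #   \0...NUL-separated rows...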

    def _make_deleted_row(self, fileid_utf8, parents):
        """Return a deleted row for fileid_utf8."""
        return ('/', 'RECYCLED.BIN', 'file', fileid_utf8, 0, DirState.NULLSTAT,
            ''), parents

    @staticmethod
    def on_file(path):
        """Construct a DirState on the file at path path."""
        result = DirState()
        result._state_file = open(path, 'rb+')
        return result

    def _parent_info(self, parent_tree, fileid):
        """Generate the parent information for file_id in parent_tree."""
        # FIXME: This is probably expensive - we encode the path that, in the
        # common case, will be the same across all parents, twice.
        # Also, id2path is slow in inventory, and we should make that fast.
        try:
            parent_entry = parent_tree.inventory[fileid]
        except errors.NoSuchId:
            # this parent does not have fileid - return a bogus entry, which
            # will be filtered out on iteration etc.
            # an empty revision id is bogus and safe to put on disk
            # we make it be a 'file', just because. We give it the
            # deleted file path dirname and basename, 0 size, not executable
            # and no sha.
            return ('', 'file', '/', 'RECYCLED.BIN', 0, False, '')
        parent_path = parent_tree.inventory.id2path(fileid)
        dirname, basename = os.path.split(parent_path.encode('utf8'))
        return (parent_entry.revision.encode('utf8'),
            parent_entry.kind,
            # FIXME: set these from the parent
            dirname, basename,
            parent_entry.text_size or 0,
            parent_entry.executable,
            parent_entry.text_sha1 or '',
            )

    def _read_dirblocks_if_needed(self):
        """Read in all the dirblocks from the file if they are not in memory."""
        self._read_header_if_needed()
        if self._dirblock_state == DirState.NOT_IN_MEMORY:
            # the _state_file pointer will be positioned at the start of the
            # dirblocks.
            text = self._state_file.read()
            # TODO: check the adler checksums. adler_measured = zlib.adler32(text)

            fields = text.split('\0')
            # Remove the last blank entry
            trailing = fields.pop()
            assert trailing == ''
            # consider turning fields into a tuple.

            # skip the first field which is the trailing null from the header.
            cur = 1
            # Each line now has an extra '\n' field which is not used
            # so we just skip over it
            # number of fields per dir_entry
            # + number of fields per parent_entry
            # + 1 for the unused trailing newline field
            num_present_parents = len(self._parents) - len(self._ghosts)
            entry_size = 7 + (7 * (num_present_parents)) + 1
            expected_field_count = entry_size * self._num_entries
            if len(fields) - cur > expected_field_count:
                fields = fields[:expected_field_count + cur]
                trace.mutter('Unexpectedly long dirstate field count!')
                print "XXX: incorrectly truncated dirstate file bug triggered."
            field_count = len(fields)
            # this checks our adjustment, and also catches file too short.
            assert field_count - cur == expected_field_count, \
                'field count incorrect %s != %s, entry_size=%s, '\
                'num_entries=%s fields=%r' % (
                    field_count - cur, expected_field_count, entry_size,
                    self._num_entries, fields)

            # Fast path the case where there are 0, 1 or 2 parents
            if num_present_parents == 0:
                entries = [(fields[pos:pos+7], [])
                    for pos in xrange(cur, field_count, entry_size)]
            elif num_present_parents == 1:
                entries = [(fields[pos:pos+7], [fields[pos+7:pos+14],])
                    for pos in xrange(cur, field_count, entry_size)]
            elif num_present_parents == 2:
                entries = [(fields[pos:pos+7], [
                    fields[pos+7:pos+14],
                    fields[pos+14:pos+21],])
                    for pos in xrange(cur, field_count, entry_size)]
            else:
                entries = [(fields[pos:pos+7],
                    tuple([fields[chunk:chunk+7] for
                        chunk in xrange(pos + 7, pos+entry_size-1, 7)]))
                    for pos in xrange(cur, field_count, entry_size)]

            assert len(entries) == self._num_entries, '%s != %s entries' % (
                len(entries), self._num_entries)

            def _line_to_row(line):
                """Convert a freshly read line's size and minikind for use."""
                # convert the minikind to kind
                line[0][2] = self._minikind_to_kind[line[0][2]]
                # convert the size to an int
                line[0][4] = int(line[0][4])
                for parent in line[1]:
                    parent[1] = self._minikind_to_kind[parent[1]]
                    parent[4] = int(parent[4])
                    parent[5] = parent[5] == 'y'
                return tuple(line[0]), map(tuple, line[1])
            new_rows = map(_line_to_row, entries)
            self._rows_to_current_state(new_rows)
            self._dirblock_state = DirState.IN_MEMORY_UNMODIFIED

    def _read_header(self):
        """This reads in the metadata header, and the parent ids.

        After reading in, the file should be positioned at the null
        just before the start of the first record in the file.

        :return: (expected adler checksum, number of entries, parent list)
        """
        self._read_prelude()
        parent_line = self._state_file.readline()
        info = parent_line.split('\0')
        num_parents = int(info[0])
        assert num_parents == len(info)-2, 'incorrect parent info line'
        self._parents = [p.decode('utf8') for p in info[1:-1]]

        ghost_line = self._state_file.readline()
        info = ghost_line.split('\0')
        num_ghosts = int(info[1])
        assert num_ghosts == len(info)-3, 'incorrect ghost info line'
        self._ghosts = [p.decode('utf8') for p in info[2:-1]]
        self._header_state = DirState.IN_MEMORY_UNMODIFIED

    def _read_header_if_needed(self):
        """Read the header of the dirstate file if needed."""
        if self._header_state == DirState.NOT_IN_MEMORY:
            self._read_header()

    def _read_prelude(self):
        """Read in the prelude header of the dirstate file.

        This only reads in the stuff that is not connected to the adler
        checksum. The position will be correct to read in the rest of
        the file and check the checksum after this point.
        The next entry in the file should be the number of parents,
        and their ids, followed by a newline.
        """
        header = self._state_file.readline()
        assert header == '#bazaar dirstate flat format 1\n', \
            'invalid header line: %r' % (header,)
        adler_line = self._state_file.readline()
        assert adler_line.startswith('adler32: '), 'missing adler32 checksum'
        self.adler_expected = int(adler_line[len('adler32: '):-1])
        num_entries_line = self._state_file.readline()
        assert num_entries_line.startswith('num_entries: '), \
            'missing num_entries line'
        self._num_entries = int(num_entries_line[len('num_entries: '):-1])

    def _row_to_line(self, row):
        """Serialize row to a NULL delimited line ready for _get_output_lines.

        :param row: A row_tuple as defined in the module docstring.
        """
        entire_row = list(row[0])
        for parent_number, parent_data in enumerate(row[1]):
            # (revision, kind, dirname, basename, size, executable_bool, sha1)
            entire_row.extend(parent_data)
            # minikind conversion of the parent
            parent_offset = 7 + parent_number * 7
            entire_row[parent_offset + 1] = DirState._kind_to_minikind[parent_data[1]]
            entire_row[parent_offset + 4] = str(parent_data[4])
            entire_row[parent_offset + 5] = DirState._to_yesno[parent_data[5]]
        # conversion from memory to disk-ready format:
        # minikind conversion of the current row type.
        entire_row[2] = DirState._kind_to_minikind[entire_row[2]]
        entire_row[4] = str(entire_row[4])
        # minikind of parents
        return '\0'.join(entire_row)
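
        # Illustrative example (invented values): a row with no parents,
        # (('d', 'file.txt', 'file', 'file-id', 12, packed, sha), []),
        # becomes the NUL-joined fields
        # 'd', 'file.txt', 'f', 'file-id', '12', packed, sha.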

    def _rows_to_current_state(self, new_rows):
        """Load new_rows into self._root_row and self._dirblocks.

        Process new_rows into the current state object, making them the
        active state.

        :param new_rows: A sorted list of rows. This function does not sort
            to prevent unneeded overhead when callers have a sorted list
            already.
        """
        assert new_rows[0][0][0:2] == ('', ''), \
            "Incorrect root row %r" % new_rows[0][0]
        self._root_row = new_rows[0]
        self._dirblocks = [('', [])]
        for row in new_rows[1:]:
            if row[0][0] != self._dirblocks[-1][0]:
                # new block
                self._dirblocks.append((row[0][0], []))
            # append the row to the current block
            self._dirblocks[-1][1].append(row)

    def save(self):
        """Save any pending changes created during this session.

        We reuse the existing file, because that prevents race conditions with
        file creation, and we expect to be using oslocks on it in the near
        future to prevent concurrent modification and reads - because
        dirstate's incremental data aggregation is not compatible with reading
        a modified file, and replacing a file in use by another process is
        impossible on windows.

        A dirstate in read only mode should be smart enough though to validate
        that the file has not changed, and otherwise discard its cache and
        start over, to allow for fine grained read lock duration, so 'status'
        won't block 'commit' - for example.
        """
        if (self._header_state == DirState.IN_MEMORY_MODIFIED or
            self._dirblock_state == DirState.IN_MEMORY_MODIFIED):
            self._state_file.seek(0)
            self._state_file.writelines(self.get_lines())
            self._state_file.truncate()
            self._state_file.flush()
            self._header_state = DirState.IN_MEMORY_UNMODIFIED
            self._dirblock_state = DirState.IN_MEMORY_UNMODIFIED

    def _set_data(self, parent_ids, root_row, dirblocks):
        """Set the full dirstate data in memory.

        This is an internal function used to completely replace the objects
        in memory state. It puts the dirstate into state 'full-dirty'.

        :param parent_ids: A list of parent tree revision ids.
        :param root_row: The root row - a tuple of the root direntry and the
            list of matching direntries from the parent_ids trees.
        :param dirblocks: A list containing one tuple for each directory in
            the tree. Each tuple contains the directory path and a list of
            row data in the same format as root_row.
        """
        # our memory copy is now authoritative.
        self._dirblocks = dirblocks
        self._root_row = root_row
        self._header_state = DirState.IN_MEMORY_MODIFIED
        self._dirblock_state = DirState.IN_MEMORY_MODIFIED
        self._parents = list(parent_ids)

    def set_path_id(self, path, new_id):
        """Change the id of path to new_id.

        :param path: The path inside the tree to set - '' is the root, 'foo'
            is the path foo in the root.
        :param new_id: The new id to assign to the path. If unicode, it will
            be encoded to utf8. In future this will be deprecated: avoid using
            unicode ids if possible.
        """
        # TODO: start warning here.
        if new_id.__class__ == unicode:
            new_id = new_id.encode('utf8')
        self._read_dirblocks_if_needed()
        if new_id == self._root_row[0][3]:
            # the root id is unchanged
            return
        if len(path) or len(self._parents):
            import pdb; pdb.set_trace()
            raise NotImplementedError(self.set_path_id)
        root_info, root_parents = self._root_row
        self._root_row = (root_info[0:3] + (new_id,) + root_info[4:7]), root_parents
        self._dirblock_state = DirState.IN_MEMORY_MODIFIED

    def set_parent_trees(self, trees, ghosts):
        """Set the parent trees for the dirstate.

        :param trees: A list of revision_id, tree tuples. tree must be provided
            even if the revision_id refers to a ghost: supply an empty tree in
            that case.
        :param ghosts: A list of the revision_ids that are ghosts at the time
            of setting.
        """
        # TODO: generate a list of parent indexes to preserve to save
        # processing specific parent trees. In the common case one tree will
        # be preserved - the left most parent.
        # TODO: if the parent tree is a dirstate, we might want to walk them
        # all by path in parallel for 'optimal' common-case performance.
        # generate new root row.
        self._read_dirblocks_if_needed()
        old_root = self._root_row
        root_info = self._root_row[0]
        new_parent_count = len(trees)
        # sketch: loop over all rows in the dirstate, cherry picking
        # entries from the parent trees, if they are not ghosts.
        # after we finish walking the dirstate, all entries not in the dirstate
        # are deletes, so we want to append them to the end as per the design
        # discussions. So do a set difference on ids with the parents to
        # get deletes, and add them to the end.
        new_rows = []
        # skip ghost trees, as they don't get represented.
        parent_trees = [tree for rev_id, tree in trees if rev_id not in ghosts]
        parent_tree_count = len(parent_trees)
        # loop over existing entries in the dirstate.
        checked_ids = set()
        for entry, old_parents in self._iter_rows():
            file_id = entry[3]
            checked_ids.add(file_id)
            new_parents = [None] * parent_tree_count
            for position, parent_tree in enumerate(parent_trees):
                # revision_utf8, KIND, dirname, basename, size, executable, sha
                new_parents[position] = self._parent_info(parent_tree, file_id)
            assert None not in new_parents
            new_rows.append((entry, new_parents))
        # get additional ids that are present in parents and not in this tree.
        deleted_ids = set()
        for tree in parent_trees:
            deleted_ids.update(set(tree.inventory._byid).difference(checked_ids))
        # add the deleted ids to the dirstate. deleted files are represented
        # as a file with dirname '/' and basename 'RECYCLED.BIN'.
        for file_id in deleted_ids:
            # add these ids to the deleted block
            checked_ids.add(file_id)
            # deleted items have a synthetic entry.
            entry = ('/', 'RECYCLED.BIN', 'file', file_id.encode('utf8'), 0,
                DirState.NULLSTAT, '')
            new_parents = [None] * parent_tree_count
            for position, parent_tree in enumerate(parent_trees):
                # revision_utf8, KIND, dirname, basename, size, executable, sha
                new_parents[position] = self._parent_info(parent_tree, file_id)
            assert None not in new_parents
            new_rows.append((entry, new_parents))

        new_rows = sorted(new_rows)
        self._rows_to_current_state(new_rows)
        self._parents = [rev_id for rev_id, tree in trees]
        self._ghosts = list(ghosts)
        self._header_state = DirState.IN_MEMORY_MODIFIED
        self._dirblock_state = DirState.IN_MEMORY_MODIFIED

    def set_state_from_inventory(self, new_inv):
        """Set new_inv as the current state.

        :param new_inv: The inventory object to set current state from.
        """
        self._read_dirblocks_if_needed()
        new_rows = []
        # generate a byid index of the dirstate
        parent_rows = {}
        for row, parents in self._iter_rows():
            parent_rows[row[3]] = parents
        # walk the new inventory in directory order, copying parent data
        # from the byid index.
        for path, entry in new_inv.iter_entries_by_dir():
            dirname, basename = os.path.split(path.encode('utf8'))
            kind = entry.kind
            fileid_utf8 = entry.file_id.encode('utf8')
            if kind == 'file':
                size = entry.text_size or 0
                sha1 = entry.text_sha1 or ''
            elif kind == 'symlink':
                size = 0
                sha1 = (entry.symlink_target or '').encode('utf8')
            else:
                size = 0
                sha1 = ''
            try:
                parents = parent_rows[fileid_utf8]
                del parent_rows[fileid_utf8]
            except KeyError:
                parents = []
            new_row = (dirname, basename, kind, fileid_utf8, size,
                DirState.NULLSTAT, sha1), parents
            new_rows.append(new_row)
        # append deleted data to the end of the tree as usual.
        for fileid_utf8, parents in parent_rows.items():
            if not parents:
                # this row was only present in the old state, had no parents
                continue
            # deleted items have a synthetic entry.
            new_row = ('/', 'RECYCLED.BIN', 'file', fileid_utf8, 0,
                DirState.NULLSTAT, ''), parents
            new_rows.append(new_row)

        # sort all the rows (the ones in parents not in current may be
        # unsorted)
        new_rows = sorted(new_rows)
        self._rows_to_current_state(new_rows)
        self._dirblock_state = DirState.IN_MEMORY_MODIFIED


def pack_stat(st, _encode=base64.encodestring, _pack=struct.pack):
    """Convert stat values into a packed representation."""
    # jam 20060614 it isn't really worth removing more entries if we
    # are going to leave it in packed form.
    # With only st_mtime and st_mode filesize is 5.5M and read time is 275ms
    # With all entries filesize is 5.9M and read time is maybe 280ms
    # well within the noise margin

    # base64.encode always adds a final newline, so strip it off
    return _encode(_pack('>llllll'
        , st.st_size, st.st_mtime, st.st_ctime
        , st.st_dev, st.st_ino, st.st_mode))[:-1]
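

if __name__ == '__main__':
    # Illustrative only, not part of the original module: show what a packed
    # stat looks like for this file (32 base64 characters, newline stripped).
    print pack_stat(os.lstat(__file__))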