1
# Copyright (C) 2007, 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Helper functions for DirState.
19
This is the python implementation for DirState functions.
22
from bzrlib import errors
23
from bzrlib.dirstate import DirState
26
# Give Pyrex some function definitions for it to understand.
27
# All of these are just hints to Pyrex, so that it can try to convert python
28
# objects into similar C objects. (such as PyInt => int).
29
# In anything defined 'cdef extern from XXX' the real C header will be
30
# imported, and the real definition will be used from there. So these are just
31
# hints, and do not need to match exactly to the C definitions.
34
ctypedef unsigned long size_t
36
cdef extern from "_dirstate_helpers_c.h":
40
cdef extern from "stdlib.h":
41
unsigned long int strtoul(char *nptr, char **endptr, int base)
44
# These functions allow us access to a bit of the 'bare metal' of python
45
# objects, rather than going through the object abstraction. (For example,
46
# PyList_Append, rather than getting the 'append' attribute of the object, and
47
# creating a tuple, and then using PyCallObject).
48
# Functions that return (or take) a void* are meant to grab a C PyObject*. This
49
# differs from the Pyrex 'object'. If you declare a variable as 'object' Pyrex
50
# will automatically Py_INCREF and Py_DECREF when appropriate. But for some
51
# inner loops, we don't need to do that at all, as the reference only lasts for
53
cdef extern from "Python.h":
54
int PyList_Append(object lst, object item) except -1
55
void *PyList_GetItem_object_void "PyList_GET_ITEM" (object lst, int index)
56
int PyList_CheckExact(object)
58
void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
60
char *PyString_AsString(object p)
61
char *PyString_AS_STRING_void "PyString_AS_STRING" (void *p)
62
object PyString_FromString(char *)
63
object PyString_FromStringAndSize(char *, Py_ssize_t)
64
int PyString_Size(object p)
65
int PyString_GET_SIZE_void "PyString_GET_SIZE" (void *p)
66
int PyString_CheckExact(object p)
69
cdef extern from "string.h":
70
int strncmp(char *s1, char *s2, int len)
71
void *memchr(void *s, int c, size_t len)
72
int memcmp(void *b1, void *b2, size_t len)
73
# ??? memrchr is a GNU extension :(
74
# void *memrchr(void *s, int c, size_t len)
77
cdef void* _my_memrchr(void *s, int c, size_t n):
78
# memrchr seems to be a GNU extension, so we have to implement it ourselves
91
def _py_memrchr(s, c):
92
"""Just to expose _my_memrchr for testing.
94
:param s: The Python string to search
95
:param c: The character to search for
96
:return: The offset to the last instance of 'c' in s
103
_s = PyString_AsString(s)
104
length = PyString_Size(s)
106
_c = PyString_AsString(c)
107
assert PyString_Size(c) == 1,\
108
'Must be a single character string, not %s' % (c,)
109
found = _my_memrchr(_s, _c[0], length)
112
return <char*>found - <char*>_s
114
cdef object safe_string_from_size(char *s, Py_ssize_t size):
116
raise AssertionError(
117
'tried to create a string with an invalid size: %d @0x%x'
119
return PyString_FromStringAndSize(s, size)
122
cdef int _is_aligned(void *ptr):
123
"""Is this pointer aligned to an integer size offset?
125
:return: 1 if this pointer is aligned, 0 otherwise.
127
return ((<intptr_t>ptr) & ((sizeof(int))-1)) == 0
130
cdef int _cmp_by_dirs(char *path1, int size1, char *path2, int size2):
131
cdef unsigned char *cur1
132
cdef unsigned char *cur2
133
cdef unsigned char *end1
134
cdef unsigned char *end2
140
if path1 == path2 and size1 == size2:
143
end1 = <unsigned char*>path1+size1
144
end2 = <unsigned char*>path2+size2
146
# Use 32-bit comparisons for the matching portion of the string.
147
# Almost all CPU's are faster at loading and comparing 32-bit integers,
148
# than they are at 8-bit integers.
149
# 99% of the time, these will be aligned, but in case they aren't just skip
151
if _is_aligned(path1) and _is_aligned(path2):
152
cur_int1 = <int*>path1
153
cur_int2 = <int*>path2
154
end_int1 = <int*>(path1 + size1 - (size1 % sizeof(int)))
155
end_int2 = <int*>(path2 + size2 - (size2 % sizeof(int)))
157
while cur_int1 < end_int1 and cur_int2 < end_int2:
158
if cur_int1[0] != cur_int2[0]:
160
cur_int1 = cur_int1 + 1
161
cur_int2 = cur_int2 + 1
163
cur1 = <unsigned char*>cur_int1
164
cur2 = <unsigned char*>cur_int2
166
cur1 = <unsigned char*>path1
167
cur2 = <unsigned char*>path2
169
while cur1 < end1 and cur2 < end2:
170
if cur1[0] == cur2[0]:
171
# This character matches, just go to the next one
175
# The current characters do not match
177
return -1 # Reached the end of path1 segment first
178
elif cur2[0] == c'/':
179
return 1 # Reached the end of path2 segment first
180
elif cur1[0] < cur2[0]:
185
# We reached the end of at least one of the strings
187
return 1 # Not at the end of cur1, must be at the end of cur2
189
return -1 # At the end of cur1, but not at cur2
190
# We reached the end of both strings
194
def cmp_by_dirs_c(path1, path2):
195
"""Compare two paths directory by directory.
197
This is equivalent to doing::
199
cmp(path1.split('/'), path2.split('/'))
201
The idea is that you should compare path components separately. This
202
differs from plain ``cmp(path1, path2)`` for paths like ``'a-b'`` and
203
``a/b``. "a-b" comes after "a" but would come before "a/b" lexically.
205
:param path1: first path
206
:param path2: second path
207
:return: negative number if ``path1`` comes first,
208
0 if paths are equal,
209
and positive number if ``path2`` sorts first
211
if not PyString_CheckExact(path1):
212
raise TypeError("'path1' must be a plain string, not %s: %r"
213
% (type(path1), path1))
214
if not PyString_CheckExact(path2):
215
raise TypeError("'path2' must be a plain string, not %s: %r"
216
% (type(path2), path2))
217
return _cmp_by_dirs(PyString_AsString(path1),
218
PyString_Size(path1),
219
PyString_AsString(path2),
220
PyString_Size(path2))
223
def _cmp_path_by_dirblock_c(path1, path2):
224
"""Compare two paths based on what directory they are in.
226
This generates a sort order, such that all children of a directory are
227
sorted together, and grandchildren are in the same order as the
228
children appear. But all grandchildren come after all children.
230
In other words, all entries in a directory are sorted together, and
231
directorys are sorted in cmp_by_dirs order.
233
:param path1: first path
234
:param path2: the second path
235
:return: negative number if ``path1`` comes first,
237
and a positive number if ``path2`` sorts first
239
if not PyString_CheckExact(path1):
240
raise TypeError("'path1' must be a plain string, not %s: %r"
241
% (type(path1), path1))
242
if not PyString_CheckExact(path2):
243
raise TypeError("'path2' must be a plain string, not %s: %r"
244
% (type(path2), path2))
245
return _cmp_path_by_dirblock(PyString_AsString(path1),
246
PyString_Size(path1),
247
PyString_AsString(path2),
248
PyString_Size(path2))
251
cdef int _cmp_path_by_dirblock(char *path1, int path1_len,
252
char *path2, int path2_len):
253
"""Compare two paths by what directory they are in.
255
see ``_cmp_path_by_dirblock_c`` for details.
258
cdef int dirname1_len
260
cdef int dirname2_len
262
cdef int basename1_len
264
cdef int basename2_len
268
if path1_len == 0 and path2_len == 0:
271
if path1 == path2 and path1_len == path2_len:
280
basename1 = <char*>_my_memrchr(path1, c'/', path1_len)
282
if basename1 == NULL:
284
basename1_len = path1_len
289
dirname1_len = basename1 - path1
290
basename1 = basename1 + 1
291
basename1_len = path1_len - dirname1_len - 1
293
basename2 = <char*>_my_memrchr(path2, c'/', path2_len)
295
if basename2 == NULL:
297
basename2_len = path2_len
302
dirname2_len = basename2 - path2
303
basename2 = basename2 + 1
304
basename2_len = path2_len - dirname2_len - 1
306
cmp_val = _cmp_by_dirs(dirname1, dirname1_len,
307
dirname2, dirname2_len)
311
cur_len = basename1_len
312
if basename2_len < basename1_len:
313
cur_len = basename2_len
315
cmp_val = memcmp(basename1, basename2, cur_len)
318
if basename1_len == basename2_len:
320
if basename1_len < basename2_len:
325
def _bisect_path_left_c(paths, path):
326
"""Return the index where to insert path into paths.
328
This uses a path-wise comparison so we get::
338
:param paths: A list of paths to search through
339
:param path: A single path to insert
340
:return: An offset where 'path' can be inserted.
341
:seealso: bisect.bisect_left
352
if not PyList_CheckExact(paths):
353
raise TypeError("you must pass a python list for 'paths' not: %s %r"
354
% (type(paths), paths))
355
if not PyString_CheckExact(path):
356
raise TypeError("you must pass a string for 'path' not: %s %r"
357
% (type(path), path))
362
path_cstr = PyString_AsString(path)
363
path_size = PyString_Size(path)
366
_mid = (_lo + _hi) / 2
367
cur = PyList_GetItem_object_void(paths, _mid)
368
cur_cstr = PyString_AS_STRING_void(cur)
369
cur_size = PyString_GET_SIZE_void(cur)
370
if _cmp_path_by_dirblock(cur_cstr, cur_size, path_cstr, path_size) < 0:
377
def _bisect_path_right_c(paths, path):
378
"""Return the index where to insert path into paths.
380
This uses a path-wise comparison so we get::
390
:param paths: A list of paths to search through
391
:param path: A single path to insert
392
:return: An offset where 'path' can be inserted.
393
:seealso: bisect.bisect_right
404
if not PyList_CheckExact(paths):
405
raise TypeError("you must pass a python list for 'paths' not: %s %r"
406
% (type(paths), paths))
407
if not PyString_CheckExact(path):
408
raise TypeError("you must pass a string for 'path' not: %s %r"
409
% (type(path), path))
414
path_cstr = PyString_AsString(path)
415
path_size = PyString_Size(path)
418
_mid = (_lo + _hi) / 2
419
cur = PyList_GetItem_object_void(paths, _mid)
420
cur_cstr = PyString_AS_STRING_void(cur)
421
cur_size = PyString_GET_SIZE_void(cur)
422
if _cmp_path_by_dirblock(path_cstr, path_size, cur_cstr, cur_size) < 0:
429
def bisect_dirblock_c(dirblocks, dirname, lo=0, hi=None, cache=None):
430
"""Return the index where to insert dirname into the dirblocks.
432
The return value idx is such that all directories blocks in dirblock[:idx]
433
have names < dirname, and all blocks in dirblock[idx:] have names >=
436
Optional args lo (default 0) and hi (default len(dirblocks)) bound the
437
slice of a to be searched.
442
cdef char *dirname_cstr
443
cdef int dirname_size
448
if not PyList_CheckExact(dirblocks):
449
raise TypeError("you must pass a python list for 'dirblocks' not: %s %r"
450
% (type(dirblocks), dirblocks))
451
if not PyString_CheckExact(dirname):
452
raise TypeError("you must pass a string for dirname not: %s %r"
453
% (type(dirname), dirname))
460
dirname_cstr = PyString_AsString(dirname)
461
dirname_size = PyString_Size(dirname)
464
_mid = (_lo + _hi) / 2
465
# Grab the dirname for the current dirblock
466
# cur = dirblocks[_mid][0]
467
cur = PyTuple_GetItem_void_void(
468
PyList_GetItem_object_void(dirblocks, _mid), 0)
469
cur_cstr = PyString_AS_STRING_void(cur)
470
cur_size = PyString_GET_SIZE_void(cur)
471
if _cmp_by_dirs(cur_cstr, cur_size, dirname_cstr, dirname_size) < 0:
479
"""Maintain the current location, and return fields as you parse them."""
481
cdef object state # The DirState object
482
cdef object text # The overall string object
483
cdef char *text_cstr # Pointer to the beginning of text
484
cdef int text_size # Length of text
486
cdef char *end_cstr # End of text
487
cdef char *cur_cstr # Pointer to the current record
488
cdef char *next # Pointer to the end of this record
490
def __init__(self, text, state):
493
self.text_cstr = PyString_AsString(text)
494
self.text_size = PyString_Size(text)
495
self.end_cstr = self.text_cstr + self.text_size
496
self.cur_cstr = self.text_cstr
498
cdef char *get_next(self, int *size) except NULL:
499
"""Return a pointer to the start of the next field."""
501
cdef Py_ssize_t extra_len
503
if self.cur_cstr == NULL:
504
raise AssertionError('get_next() called when cur_str is NULL')
505
elif self.cur_cstr >= self.end_cstr:
506
raise AssertionError('get_next() called when there are no chars'
509
self.cur_cstr = <char*>memchr(next, c'\0', self.end_cstr - next)
510
if self.cur_cstr == NULL:
511
extra_len = self.end_cstr - next
512
raise errors.DirstateCorrupt(self.state,
513
'failed to find trailing NULL (\\0).'
514
' Trailing garbage: %r'
515
% safe_string_from_size(next, extra_len))
516
size[0] = self.cur_cstr - next
517
self.cur_cstr = self.cur_cstr + 1
520
cdef object get_next_str(self):
521
"""Get the next field as a Python string."""
524
next = self.get_next(&size)
525
return safe_string_from_size(next, size)
527
cdef int _init(self) except -1:
528
"""Get the pointer ready.
530
This assumes that the dirstate header has already been read, and we
531
already have the dirblock string loaded into memory.
532
This just initializes our memory pointers, etc for parsing of the
537
# The first field should be an empty string left over from the Header
538
first = self.get_next(&size)
539
if first[0] != c'\0' and size == 0:
540
raise AssertionError('First character should be null not: %s'
544
cdef object _get_entry(self, int num_trees, void **p_current_dirname,
546
"""Extract the next entry.
548
This parses the next entry based on the current location in
550
Each entry can be considered a "row" in the total table. And each row
551
has a fixed number of columns. It is generally broken up into "key"
552
columns, then "current" columns, and then "parent" columns.
554
:param num_trees: How many parent trees need to be parsed
555
:param p_current_dirname: A pointer to the current PyString
556
representing the directory name.
557
We pass this in as a void * so that pyrex doesn't have to
558
increment/decrement the PyObject reference counter for each
560
We use a pointer so that _get_entry can update it with the new
562
:param new_block: This is to let the caller know that it needs to
563
create a new directory block to store the next entry.
565
cdef object path_name_file_id_key
566
cdef char *entry_size_cstr
567
cdef unsigned long int entry_size
568
cdef char* executable_cstr
569
cdef int is_executable
570
cdef char* dirname_cstr
575
cdef object fingerprint
578
# Read the 'key' information (dirname, name, file_id)
579
dirname_cstr = self.get_next(&cur_size)
580
# Check to see if we have started a new directory block.
581
# If so, then we need to create a new dirname PyString, so that it can
582
# be used in all of the tuples. This saves time and memory, by re-using
583
# the same object repeatedly.
585
# Do the cheap 'length of string' check first. If the string is a
586
# different length, then we *have* to be a different directory.
587
if (cur_size != PyString_GET_SIZE_void(p_current_dirname[0])
588
or strncmp(dirname_cstr,
589
# Extract the char* from our current dirname string. We
590
# know it is a PyString, so we can use
591
# PyString_AS_STRING, we use the _void version because
592
# we are tricking Pyrex by using a void* rather than an
594
PyString_AS_STRING_void(p_current_dirname[0]),
596
dirname = safe_string_from_size(dirname_cstr, cur_size)
597
p_current_dirname[0] = <void*>dirname
602
# Build up the key that will be used.
603
# By using <object>(void *) Pyrex will automatically handle the
604
# Py_INCREF that we need.
605
path_name_file_id_key = (<object>p_current_dirname[0],
610
# Parse all of the per-tree information. current has the information in
611
# the same location as parent trees. The only difference is that 'info'
612
# is a 'packed_stat' for current, while it is a 'revision_id' for
614
# minikind, fingerprint, and info will be returned as regular python
616
# entry_size and is_executable will be parsed into a python Long and
617
# python Boolean, respectively.
618
# TODO: jam 20070718 Consider changin the entry_size conversion to
619
# prefer python Int when possible. They are generally faster to
620
# work with, and it will be rare that we have a file >2GB.
621
# Especially since this code is pretty much fixed at a max of
624
for i from 0 <= i < num_trees:
625
minikind = self.get_next_str()
626
fingerprint = self.get_next_str()
627
entry_size_cstr = self.get_next(&cur_size)
628
entry_size = strtoul(entry_size_cstr, NULL, 10)
629
executable_cstr = self.get_next(&cur_size)
630
is_executable = (executable_cstr[0] == c'y')
631
info = self.get_next_str()
632
PyList_Append(trees, (
634
fingerprint, # fingerprint
636
is_executable,# executable
637
info, # packed_stat or revision_id
640
# The returned tuple is (key, [trees])
641
ret = (path_name_file_id_key, trees)
642
# Ignore the trailing newline, but assert that it does exist, this
643
# ensures that we always finish parsing a line on an end-of-entry
645
trailing = self.get_next(&cur_size)
646
if cur_size != 1 or trailing[0] != c'\n':
647
raise errors.DirstateCorrupt(self.state,
648
'Bad parse, we expected to end on \\n, not: %d %s: %s'
649
% (cur_size, safe_string_from_size(trailing, cur_size),
653
def _parse_dirblocks(self):
654
"""Parse all dirblocks in the state file."""
656
cdef object current_block
658
cdef void * current_dirname
660
cdef int expected_entry_count
663
num_trees = self.state._num_present_parents() + 1
664
expected_entry_count = self.state._num_entries
666
# Ignore the first record
670
dirblocks = [('', current_block), ('', [])]
671
self.state._dirblocks = dirblocks
673
current_dirname = <void*>obj
677
# TODO: jam 2007-05-07 Consider pre-allocating some space for the
678
# members, and then growing and shrinking from there. If most
679
# directories have close to 10 entries in them, it would save a
680
# few mallocs if we default our list size to something
681
# reasonable. Or we could malloc it to something large (100 or
682
# so), and then truncate. That would give us a malloc + realloc,
683
# rather than lots of reallocs.
684
while self.cur_cstr < self.end_cstr:
685
entry = self._get_entry(num_trees, ¤t_dirname, &new_block)
687
# new block - different dirname
689
PyList_Append(dirblocks,
690
(<object>current_dirname, current_block))
691
PyList_Append(current_block, entry)
692
entry_count = entry_count + 1
693
if entry_count != expected_entry_count:
694
raise errors.DirstateCorrupt(self.state,
695
'We read the wrong number of entries.'
696
' We expected to read %s, but read %s'
697
% (expected_entry_count, entry_count))
698
self.state._split_root_dirblock_into_contents()
701
def _read_dirblocks_c(state):
702
"""Read in the dirblocks for the given DirState object.
704
This is tightly bound to the DirState internal representation. It should be
705
thought of as a member function, which is only separated out so that we can
706
re-write it in pyrex.
708
:param state: A DirState object.
710
:postcondition: The dirblocks will be loaded into the appropriate fields in
713
state._state_file.seek(state._end_of_header)
714
text = state._state_file.read()
715
# TODO: check the crc checksums. crc_measured = zlib.crc32(text)
717
reader = Reader(text, state)
719
reader._parse_dirblocks()
720
state._dirblock_state = DirState.IN_MEMORY_UNMODIFIED