1
# Copyright (C) 2007 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Helper functions for DirState.
19
This is the python implementation for DirState functions.
22
from bzrlib.dirstate import DirState
26
ctypedef unsigned size_t
28
cdef extern from "stdint.h":
32
cdef extern from "stdlib.h":
33
unsigned long int strtoul(char *nptr, char **endptr, int base)
36
cdef extern from "Python.h":
37
int PyList_Append(object lst, object item) except -1
38
void *PyList_GetItem_object_void "PyList_GET_ITEM" (object lst, int index)
39
int PyList_CheckExact(object)
41
void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
43
char *PyString_AsString(object p)
44
char *PyString_AS_STRING_void "PyString_AS_STRING" (void *p)
45
object PyString_FromString(char *)
46
object PyString_FromStringAndSize(char *, int)
47
int PyString_Size(object p)
48
int PyString_GET_SIZE_void "PyString_GET_SIZE" (void *p)
49
int PyString_CheckExact(object p)
52
cdef extern from "string.h":
53
int strncmp(char *s1, char *s2, int len)
54
void *memchr(void *s, int c, size_t len)
55
int memcmp(void *b1, void *b2, size_t len)
56
# ??? memrchr is a GNU extension :(
57
# void *memrchr(void *s, int c, size_t len)
60
cdef void* _my_memrchr(void *s, int c, size_t n):
61
# memrchr seems to be a GNU extension, so we have to implement it ourselves
74
def _py_memrchr(s, c):
75
"""Just to expose _my_memrchr for testing.
77
:param s: The Python string to search
78
:param c: The character to search for
79
:return: The offset to the last instance of 'c' in s
86
_s = PyString_AsString(s)
87
length = PyString_Size(s)
89
_c = PyString_AsString(c)
90
assert PyString_Size(c) == 1,\
91
'Must be a single character string, not %s' % (c,)
92
found = _my_memrchr(_s, _c[0], length)
98
cdef int _is_aligned(void *ptr):
99
"""Is this pointer aligned to an integer size offset?
101
:return: 1 if this pointer is aligned, 0 otherwise.
103
return ((<intptr_t>ptr) & ((sizeof(int))-1)) == 0
106
cdef int _cmp_by_dirs(char *path1, int size1, char *path2, int size2):
107
cdef unsigned char *cur1
108
cdef unsigned char *cur2
109
cdef unsigned char *end1
110
cdef unsigned char *end2
116
if path1 == path2 and size1 == size2:
119
end1 = <unsigned char*>path1+size1
120
end2 = <unsigned char*>path2+size2
122
# Use 32-bit comparisons for the matching portion of the string.
123
# Almost all CPU's are faster at loading and comparing 32-bit integers,
124
# than they are at 8-bit integers.
125
# 99% of the time, these will be aligned, but in case they aren't just skip
127
if _is_aligned(path1) and _is_aligned(path2):
128
cur_int1 = <int*>path1
129
cur_int2 = <int*>path2
130
end_int1 = <int*>(path1 + size1 - (size1 % sizeof(int)))
131
end_int2 = <int*>(path2 + size2 - (size2 % sizeof(int)))
133
while cur_int1 < end_int1 and cur_int2 < end_int2:
134
if cur_int1[0] != cur_int2[0]:
136
cur_int1 = cur_int1 + 1
137
cur_int2 = cur_int2 + 1
139
cur1 = <unsigned char*>cur_int1
140
cur2 = <unsigned char*>cur_int2
142
cur1 = <unsigned char*>path1
143
cur2 = <unsigned char*>path2
145
while cur1 < end1 and cur2 < end2:
146
if cur1[0] == cur2[0]:
147
# This character matches, just go to the next one
151
# The current characters do not match
153
return -1 # Reached the end of path1 segment first
154
elif cur2[0] == c'/':
155
return 1 # Reached the end of path2 segment first
156
elif cur1[0] < cur2[0]:
161
# We reached the end of at least one of the strings
163
return 1 # Not at the end of cur1, must be at the end of cur2
165
return -1 # At the end of cur1, but not at cur2
166
# We reached the end of both strings
170
def cmp_by_dirs_c(path1, path2):
171
"""Compare two paths directory by directory.
173
This is equivalent to doing::
175
cmp(path1.split('/'), path2.split('/'))
177
The idea is that you should compare path components separately. This
178
differs from plain ``cmp(path1, path2)`` for paths like ``'a-b'`` and
179
``a/b``. "a-b" comes after "a" but would come before "a/b" lexically.
181
:param path1: first path
182
:param path2: second path
183
:return: positive number if ``path1`` comes first,
184
0 if paths are equal,
185
and negative number if ``path2`` sorts first
187
if not PyString_CheckExact(path1):
188
raise TypeError("'path1' must be a plain string, not %s: %r"
189
% (type(path1), path1))
190
if not PyString_CheckExact(path2):
191
raise TypeError("'path2' must be a plain string, not %s: %r"
192
% (type(path2), path2))
193
return _cmp_by_dirs(PyString_AsString(path1),
194
PyString_Size(path1),
195
PyString_AsString(path2),
196
PyString_Size(path2))
199
def _cmp_path_by_dirblock_c(path1, path2):
200
"""Compare two paths based on what directory they are in.
202
This generates a sort order, such that all children of a directory are
203
sorted together, and grandchildren are in the same order as the
204
children appear. But all grandchildren come after all children.
206
In other words, all entries in a directory are sorted together, and
207
directorys are sorted in cmp_by_dirs order.
209
:param path1: first path
210
:param path2: the second path
211
:return: positive number if ``path1`` comes first,
213
and a negative number if ``path2`` sorts first
215
if not PyString_CheckExact(path1):
216
raise TypeError("'path1' must be a plain string, not %s: %r"
217
% (type(path1), path1))
218
if not PyString_CheckExact(path2):
219
raise TypeError("'path2' must be a plain string, not %s: %r"
220
% (type(path2), path2))
221
return _cmp_path_by_dirblock(PyString_AsString(path1),
222
PyString_Size(path1),
223
PyString_AsString(path2),
224
PyString_Size(path2))
227
cdef int _cmp_path_by_dirblock(char *path1, int path1_len,
228
char *path2, int path2_len):
229
"""Compare two paths by what directory they are in.
231
see ``_cmp_path_by_dirblock_c`` for details.
234
cdef int dirname1_len
236
cdef int dirname2_len
238
cdef int basename1_len
240
cdef int basename2_len
244
if path1_len == 0 and path2_len == 0:
247
if path1 == path2 and path1_len == path2_len:
256
basename1 = <char*>_my_memrchr(path1, c'/', path1_len)
258
if basename1 == NULL:
260
basename1_len = path1_len
265
dirname1_len = basename1 - path1
266
basename1 = basename1 + 1
267
basename1_len = path1_len - dirname1_len - 1
269
basename2 = <char*>_my_memrchr(path2, c'/', path2_len)
271
if basename2 == NULL:
273
basename2_len = path2_len
278
dirname2_len = basename2 - path2
279
basename2 = basename2 + 1
280
basename2_len = path2_len - dirname2_len - 1
282
cmp_val = _cmp_by_dirs(dirname1, dirname1_len,
283
dirname2, dirname2_len)
287
cur_len = basename1_len
288
if basename2_len < basename1_len:
289
cur_len = basename2_len
291
cmp_val = memcmp(basename1, basename2, cur_len)
294
if basename1_len == basename2_len:
296
if basename1_len < basename2_len:
301
def _bisect_path_left_c(paths, path):
302
"""Return the index where to insert path into paths.
304
This uses a path-wise comparison so we get::
314
:param paths: A list of paths to search through
315
:param path: A single path to insert
316
:return: An offset where 'path' can be inserted.
317
:seealso: bisect.bisect_left
328
if not PyList_CheckExact(paths):
329
raise TypeError("you must pass a python list for 'paths' not: %s %r"
330
% (type(paths), paths))
331
if not PyString_CheckExact(path):
332
raise TypeError("you must pass a string for 'path' not: %s %r"
333
% (type(path), path))
338
path_cstr = PyString_AsString(path)
339
path_size = PyString_Size(path)
342
_mid = (_lo + _hi) / 2
343
cur = PyList_GetItem_object_void(paths, _mid)
344
cur_cstr = PyString_AS_STRING_void(cur)
345
cur_size = PyString_GET_SIZE_void(cur)
346
if _cmp_path_by_dirblock(cur_cstr, cur_size, path_cstr, path_size) < 0:
353
def _bisect_path_right_c(paths, path):
354
"""Return the index where to insert path into paths.
356
This uses a path-wise comparison so we get::
366
:param paths: A list of paths to search through
367
:param path: A single path to insert
368
:return: An offset where 'path' can be inserted.
369
:seealso: bisect.bisect_right
380
if not PyList_CheckExact(paths):
381
raise TypeError("you must pass a python list for 'paths' not: %s %r"
382
% (type(paths), paths))
383
if not PyString_CheckExact(path):
384
raise TypeError("you must pass a string for 'path' not: %s %r"
385
% (type(path), path))
390
path_cstr = PyString_AsString(path)
391
path_size = PyString_Size(path)
394
_mid = (_lo + _hi) / 2
395
cur = PyList_GetItem_object_void(paths, _mid)
396
cur_cstr = PyString_AS_STRING_void(cur)
397
cur_size = PyString_GET_SIZE_void(cur)
398
if _cmp_path_by_dirblock(path_cstr, path_size, cur_cstr, cur_size) < 0:
405
def bisect_dirblock_c(dirblocks, dirname, lo=0, hi=None, cache=None):
406
"""Return the index where to insert dirname into the dirblocks.
408
The return value idx is such that all directories blocks in dirblock[:idx]
409
have names < dirname, and all blocks in dirblock[idx:] have names >=
412
Optional args lo (default 0) and hi (default len(dirblocks)) bound the
413
slice of a to be searched.
418
cdef char *dirname_cstr
419
cdef int dirname_size
424
if not PyList_CheckExact(dirblocks):
425
raise TypeError("you must pass a python list for 'dirblocks' not: %s %r"
426
% (type(dirblocks), dirblocks))
427
if not PyString_CheckExact(dirname):
428
raise TypeError("you must pass a string for dirname not: %s %r"
429
% (type(dirname), dirname))
436
dirname_cstr = PyString_AsString(dirname)
437
dirname_size = PyString_Size(dirname)
440
_mid = (_lo + _hi) / 2
441
# Grab the dirname for the current dirblock
442
# cur = dirblocks[_mid][0]
443
cur = PyTuple_GetItem_void_void(
444
PyList_GetItem_object_void(dirblocks, _mid), 0)
445
cur_cstr = PyString_AS_STRING_void(cur)
446
cur_size = PyString_GET_SIZE_void(cur)
447
if _cmp_by_dirs(cur_cstr, cur_size, dirname_cstr, dirname_size) < 0:
455
"""Maintain the current location, and return fields as you parse them."""
457
cdef object text # The overall string object
458
cdef char *text_cstr # Pointer to the beginning of text
459
cdef int text_size # Length of text
461
cdef char *end_cstr # End of text
462
cdef char *cur_cstr # Pointer to the current record
463
cdef char *next # Pointer to the end of this record
465
def __new__(self, text):
467
self.text_cstr = PyString_AsString(text)
468
self.text_size = PyString_Size(text)
469
self.end_cstr = self.text_cstr + self.text_size
470
self.cur_cstr = self.text_cstr
472
cdef char *get_next(self, int *size):
473
"""Return a pointer to the start of the next field."""
476
self.cur_cstr = <char*>memchr(next, c'\0', self.end_cstr-next)
477
size[0] = self.cur_cstr - next
478
self.cur_cstr = self.cur_cstr + 1
481
cdef object get_next_str(self):
482
"""Get the next field as a Python string."""
485
next = self.get_next(&size)
486
return PyString_FromStringAndSize(next, size)
488
cdef int _init(self) except -1:
489
"""Get the pointer ready.
491
This assumes that the dirstate header has already been read, and we
492
already have the dirblock string loaded into memory.
493
This just initializes our memory pointers, etc for parsing of the
498
# The first field should be an empty string left over from the Header
499
first = self.get_next(&size)
500
if first[0] != c'\0' and size == 0:
501
raise AssertionError('First character should be null not: %s'
505
cdef object _get_entry(self, int num_trees, void **p_current_dirname,
507
"""Extract the next entry.
509
This parses the next entry based on the current location in
511
Each entry can be considered a "row" in the total table. And each row
512
has a fixed number of columns. It is generally broken up into "key"
513
columns, then "current" columns, and then "parent" columns.
515
:param num_trees: How many parent trees need to be parsed
516
:param p_current_dirname: A pointer to the current PyString
517
representing the directory name.
518
We pass this in as a void * so that pyrex doesn't have to
519
increment/decrement the PyObject reference counter for each
521
We use a pointer so that _get_entry can update it with the new
523
:param new_block: This is to let the caller know that it needs to
524
create a new directory block to store the next entry.
526
cdef object path_name_file_id_key
527
cdef char *entry_size_cstr
528
cdef unsigned long int entry_size
529
cdef char* executable_cstr
530
cdef int is_executable
531
cdef char* dirname_cstr
536
cdef object fingerprint
539
# Read the 'key' information (dirname, name, file_id)
540
dirname_cstr = self.get_next(&cur_size)
541
# Check to see if we have started a new directory block.
542
# If so, then we need to create a new dirname PyString, so that it can
543
# be used in all of the tuples. This saves time and memory, by re-using
544
# the same object repeatedly.
546
# Do the cheap 'length of string' check first. If the string is a
547
# different length, then we *have* to be a different directory.
548
if (cur_size != PyString_GET_SIZE_void(p_current_dirname[0])
549
or strncmp(dirname_cstr,
550
# Extract the char* from our current dirname string. We
551
# know it is a PyString, so we can use
552
# PyString_AS_STRING, we use the _void version because
553
# we are tricking Pyrex by using a void* rather than an
555
PyString_AS_STRING_void(p_current_dirname[0]),
557
dirname = PyString_FromStringAndSize(dirname_cstr, cur_size)
558
p_current_dirname[0] = <void*>dirname
563
# Build up the key that will be used.
564
# By using <object>(void *) Pyrex will automatically handle the
565
# Py_INCREF that we need.
566
path_name_file_id_key = (<object>p_current_dirname[0],
571
# Parse all of the per-tree information. current has the information in
572
# the same location as parent trees. The only difference is that 'info'
573
# is a 'packed_stat' for current, while it is a 'revision_id' for
575
# minikind, fingerprint, and info will be returned as regular python
577
# entry_size and is_executable will be parsed into a python Long and
578
# python Boolean, respectively.
579
# TODO: jam 20070718 Consider changin the entry_size conversion to
580
# prefer python Int when possible. They are generally faster to
581
# work with, and it will be rare that we have a file >2GB.
582
# Especially since this code is pretty much fixed at a max of
585
for i from 0 <= i < num_trees:
586
minikind = self.get_next_str()
587
fingerprint = self.get_next_str()
588
entry_size_cstr = self.get_next(&cur_size)
589
entry_size = strtoul(entry_size_cstr, NULL, 10)
590
executable_cstr = self.get_next(&cur_size)
591
is_executable = (executable_cstr[0] == c'y')
592
info = self.get_next_str()
593
PyList_Append(trees, (
595
fingerprint, # fingerprint
597
is_executable,# executable
598
info, # packed_stat or revision_id
601
# The returned tuple is (key, [trees])
602
ret = (path_name_file_id_key, trees)
603
# Ignore the trailing newline, but assert that it does exist, this
604
# ensures that we always finish parsing a line on an end-of-entry
606
trailing = self.get_next(&cur_size)
607
if cur_size != 1 or trailing[0] != c'\n':
608
raise AssertionError(
609
'Bad parse, we expected to end on \\n, not: %d %s: %s'
610
% (cur_size, PyString_FromStringAndSize(trailing, cur_size),
614
def _parse_dirblocks(self, state):
615
"""Parse all dirblocks in the state file."""
617
cdef object current_block
619
cdef void * current_dirname
621
cdef int expected_entry_count
624
num_trees = state._num_present_parents() + 1
625
expected_entry_count = state._num_entries
627
# Ignore the first record
631
state._dirblocks = [('', current_block), ('', [])]
633
current_dirname = <void*>obj
637
# TODO: jam 2007-05-07 Consider pre-allocating some space for the
638
# members, and then growing and shrinking from there. If most
639
# directories have close to 10 entries in them, it would save a
640
# few mallocs if we default our list size to something
641
# reasonable. Or we could malloc it to something large (100 or
642
# so), and then truncate. That would give us a malloc + realloc,
643
# rather than lots of reallocs.
644
while self.cur_cstr < self.end_cstr:
645
entry = self._get_entry(num_trees, ¤t_dirname, &new_block)
647
# new block - different dirname
649
PyList_Append(state._dirblocks,
650
(<object>current_dirname, current_block))
651
PyList_Append(current_block, entry)
652
entry_count = entry_count + 1
653
if entry_count != expected_entry_count:
654
raise AssertionError('We read the wrong number of entries.'
655
' We expected to read %s, but read %s'
656
% (expected_entry_count, entry_count))
657
state._split_root_dirblock_into_contents()
660
def _read_dirblocks_c(state):
661
"""Read in the dirblocks for the given DirState object.
663
This is tightly bound to the DirState internal representation. It should be
664
thought of as a member function, which is only separated out so that we can
665
re-write it in pyrex.
667
:param state: A DirState object.
669
:postcondition: The dirblocks will be loaded into the appropriate fields in
672
state._state_file.seek(state._end_of_header)
673
text = state._state_file.read()
674
# TODO: check the crc checksums. crc_measured = zlib.crc32(text)
676
reader = Reader(text)
678
reader._parse_dirblocks(state)
679
state._dirblock_state = DirState.IN_MEMORY_UNMODIFIED