1
# Copyright (C) 2007, 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Helper functions for DirState.
19
This is the python implementation for DirState functions.
22
from bzrlib import errors
23
from bzrlib.dirstate import DirState
27
cdef extern from "python-compat.h":
30
# Give Pyrex some function definitions for it to understand.
31
# All of these are just hints to Pyrex, so that it can try to convert python
32
# objects into similar C objects. (such as PyInt => int).
33
# In anything defined 'cdef extern from XXX' the real C header will be
34
# imported, and the real definition will be used from there. So these are just
35
# hints, and do not need to match exactly to the C definitions.
38
ctypedef unsigned long size_t
40
cdef extern from "_dirstate_helpers_c.h":
44
cdef extern from "stdlib.h":
45
unsigned long int strtoul(char *nptr, char **endptr, int base)
48
# These functions allow us access to a bit of the 'bare metal' of python
49
# objects, rather than going through the object abstraction. (For example,
50
# PyList_Append, rather than getting the 'append' attribute of the object, and
51
# creating a tuple, and then using PyCallObject).
52
# Functions that return (or take) a void* are meant to grab a C PyObject*. This
53
# differs from the Pyrex 'object'. If you declare a variable as 'object' Pyrex
54
# will automatically Py_INCREF and Py_DECREF when appropriate. But for some
55
# inner loops, we don't need to do that at all, as the reference only lasts for
57
cdef extern from "Python.h":
58
ctypedef int Py_ssize_t
59
int PyList_Append(object lst, object item) except -1
60
void *PyList_GetItem_object_void "PyList_GET_ITEM" (object lst, int index)
61
int PyList_CheckExact(object)
63
void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
65
char *PyString_AsString(object p)
66
char *PyString_AS_STRING_void "PyString_AS_STRING" (void *p)
67
object PyString_FromString(char *)
68
object PyString_FromStringAndSize(char *, Py_ssize_t)
69
int PyString_Size(object p)
70
int PyString_GET_SIZE_void "PyString_GET_SIZE" (void *p)
71
int PyString_CheckExact(object p)
74
cdef extern from "string.h":
75
int strncmp(char *s1, char *s2, int len)
76
void *memchr(void *s, int c, size_t len)
77
int memcmp(void *b1, void *b2, size_t len)
78
# ??? memrchr is a GNU extension :(
79
# void *memrchr(void *s, int c, size_t len)
82
cdef void* _my_memrchr(void *s, int c, size_t n):
83
# memrchr seems to be a GNU extension, so we have to implement it ourselves
96
def _py_memrchr(s, c):
97
"""Just to expose _my_memrchr for testing.
99
:param s: The Python string to search
100
:param c: The character to search for
101
:return: The offset to the last instance of 'c' in s
108
_s = PyString_AsString(s)
109
length = PyString_Size(s)
111
_c = PyString_AsString(c)
112
assert PyString_Size(c) == 1,\
113
'Must be a single character string, not %s' % (c,)
114
found = _my_memrchr(_s, _c[0], length)
117
return <char*>found - <char*>_s
119
cdef object safe_string_from_size(char *s, Py_ssize_t size):
121
raise AssertionError(
122
'tried to create a string with an invalid size: %d @0x%x'
124
return PyString_FromStringAndSize(s, size)
127
cdef int _is_aligned(void *ptr):
128
"""Is this pointer aligned to an integer size offset?
130
:return: 1 if this pointer is aligned, 0 otherwise.
132
return ((<intptr_t>ptr) & ((sizeof(int))-1)) == 0
135
cdef int _cmp_by_dirs(char *path1, int size1, char *path2, int size2):
136
cdef unsigned char *cur1
137
cdef unsigned char *cur2
138
cdef unsigned char *end1
139
cdef unsigned char *end2
145
if path1 == path2 and size1 == size2:
148
end1 = <unsigned char*>path1+size1
149
end2 = <unsigned char*>path2+size2
151
# Use 32-bit comparisons for the matching portion of the string.
152
# Almost all CPU's are faster at loading and comparing 32-bit integers,
153
# than they are at 8-bit integers.
154
# 99% of the time, these will be aligned, but in case they aren't just skip
156
if _is_aligned(path1) and _is_aligned(path2):
157
cur_int1 = <int*>path1
158
cur_int2 = <int*>path2
159
end_int1 = <int*>(path1 + size1 - (size1 % sizeof(int)))
160
end_int2 = <int*>(path2 + size2 - (size2 % sizeof(int)))
162
while cur_int1 < end_int1 and cur_int2 < end_int2:
163
if cur_int1[0] != cur_int2[0]:
165
cur_int1 = cur_int1 + 1
166
cur_int2 = cur_int2 + 1
168
cur1 = <unsigned char*>cur_int1
169
cur2 = <unsigned char*>cur_int2
171
cur1 = <unsigned char*>path1
172
cur2 = <unsigned char*>path2
174
while cur1 < end1 and cur2 < end2:
175
if cur1[0] == cur2[0]:
176
# This character matches, just go to the next one
180
# The current characters do not match
182
return -1 # Reached the end of path1 segment first
183
elif cur2[0] == c'/':
184
return 1 # Reached the end of path2 segment first
185
elif cur1[0] < cur2[0]:
190
# We reached the end of at least one of the strings
192
return 1 # Not at the end of cur1, must be at the end of cur2
194
return -1 # At the end of cur1, but not at cur2
195
# We reached the end of both strings
199
def cmp_by_dirs_c(path1, path2):
200
"""Compare two paths directory by directory.
202
This is equivalent to doing::
204
cmp(path1.split('/'), path2.split('/'))
206
The idea is that you should compare path components separately. This
207
differs from plain ``cmp(path1, path2)`` for paths like ``'a-b'`` and
208
``a/b``. "a-b" comes after "a" but would come before "a/b" lexically.
210
:param path1: first path
211
:param path2: second path
212
:return: negative number if ``path1`` comes first,
213
0 if paths are equal,
214
and positive number if ``path2`` sorts first
216
if not PyString_CheckExact(path1):
217
raise TypeError("'path1' must be a plain string, not %s: %r"
218
% (type(path1), path1))
219
if not PyString_CheckExact(path2):
220
raise TypeError("'path2' must be a plain string, not %s: %r"
221
% (type(path2), path2))
222
return _cmp_by_dirs(PyString_AsString(path1),
223
PyString_Size(path1),
224
PyString_AsString(path2),
225
PyString_Size(path2))
228
def _cmp_path_by_dirblock_c(path1, path2):
229
"""Compare two paths based on what directory they are in.
231
This generates a sort order, such that all children of a directory are
232
sorted together, and grandchildren are in the same order as the
233
children appear. But all grandchildren come after all children.
235
In other words, all entries in a directory are sorted together, and
236
directorys are sorted in cmp_by_dirs order.
238
:param path1: first path
239
:param path2: the second path
240
:return: negative number if ``path1`` comes first,
242
and a positive number if ``path2`` sorts first
244
if not PyString_CheckExact(path1):
245
raise TypeError("'path1' must be a plain string, not %s: %r"
246
% (type(path1), path1))
247
if not PyString_CheckExact(path2):
248
raise TypeError("'path2' must be a plain string, not %s: %r"
249
% (type(path2), path2))
250
return _cmp_path_by_dirblock(PyString_AsString(path1),
251
PyString_Size(path1),
252
PyString_AsString(path2),
253
PyString_Size(path2))
256
cdef int _cmp_path_by_dirblock(char *path1, int path1_len,
257
char *path2, int path2_len):
258
"""Compare two paths by what directory they are in.
260
see ``_cmp_path_by_dirblock_c`` for details.
263
cdef int dirname1_len
265
cdef int dirname2_len
267
cdef int basename1_len
269
cdef int basename2_len
273
if path1_len == 0 and path2_len == 0:
276
if path1 == path2 and path1_len == path2_len:
285
basename1 = <char*>_my_memrchr(path1, c'/', path1_len)
287
if basename1 == NULL:
289
basename1_len = path1_len
294
dirname1_len = basename1 - path1
295
basename1 = basename1 + 1
296
basename1_len = path1_len - dirname1_len - 1
298
basename2 = <char*>_my_memrchr(path2, c'/', path2_len)
300
if basename2 == NULL:
302
basename2_len = path2_len
307
dirname2_len = basename2 - path2
308
basename2 = basename2 + 1
309
basename2_len = path2_len - dirname2_len - 1
311
cmp_val = _cmp_by_dirs(dirname1, dirname1_len,
312
dirname2, dirname2_len)
316
cur_len = basename1_len
317
if basename2_len < basename1_len:
318
cur_len = basename2_len
320
cmp_val = memcmp(basename1, basename2, cur_len)
323
if basename1_len == basename2_len:
325
if basename1_len < basename2_len:
330
def _bisect_path_left_c(paths, path):
331
"""Return the index where to insert path into paths.
333
This uses a path-wise comparison so we get::
343
:param paths: A list of paths to search through
344
:param path: A single path to insert
345
:return: An offset where 'path' can be inserted.
346
:seealso: bisect.bisect_left
357
if not PyList_CheckExact(paths):
358
raise TypeError("you must pass a python list for 'paths' not: %s %r"
359
% (type(paths), paths))
360
if not PyString_CheckExact(path):
361
raise TypeError("you must pass a string for 'path' not: %s %r"
362
% (type(path), path))
367
path_cstr = PyString_AsString(path)
368
path_size = PyString_Size(path)
371
_mid = (_lo + _hi) / 2
372
cur = PyList_GetItem_object_void(paths, _mid)
373
cur_cstr = PyString_AS_STRING_void(cur)
374
cur_size = PyString_GET_SIZE_void(cur)
375
if _cmp_path_by_dirblock(cur_cstr, cur_size, path_cstr, path_size) < 0:
382
def _bisect_path_right_c(paths, path):
383
"""Return the index where to insert path into paths.
385
This uses a path-wise comparison so we get::
395
:param paths: A list of paths to search through
396
:param path: A single path to insert
397
:return: An offset where 'path' can be inserted.
398
:seealso: bisect.bisect_right
409
if not PyList_CheckExact(paths):
410
raise TypeError("you must pass a python list for 'paths' not: %s %r"
411
% (type(paths), paths))
412
if not PyString_CheckExact(path):
413
raise TypeError("you must pass a string for 'path' not: %s %r"
414
% (type(path), path))
419
path_cstr = PyString_AsString(path)
420
path_size = PyString_Size(path)
423
_mid = (_lo + _hi) / 2
424
cur = PyList_GetItem_object_void(paths, _mid)
425
cur_cstr = PyString_AS_STRING_void(cur)
426
cur_size = PyString_GET_SIZE_void(cur)
427
if _cmp_path_by_dirblock(path_cstr, path_size, cur_cstr, cur_size) < 0:
434
def bisect_dirblock_c(dirblocks, dirname, lo=0, hi=None, cache=None):
435
"""Return the index where to insert dirname into the dirblocks.
437
The return value idx is such that all directories blocks in dirblock[:idx]
438
have names < dirname, and all blocks in dirblock[idx:] have names >=
441
Optional args lo (default 0) and hi (default len(dirblocks)) bound the
442
slice of a to be searched.
447
cdef char *dirname_cstr
448
cdef int dirname_size
453
if not PyList_CheckExact(dirblocks):
454
raise TypeError("you must pass a python list for 'dirblocks' not: %s %r"
455
% (type(dirblocks), dirblocks))
456
if not PyString_CheckExact(dirname):
457
raise TypeError("you must pass a string for dirname not: %s %r"
458
% (type(dirname), dirname))
465
dirname_cstr = PyString_AsString(dirname)
466
dirname_size = PyString_Size(dirname)
469
_mid = (_lo + _hi) / 2
470
# Grab the dirname for the current dirblock
471
# cur = dirblocks[_mid][0]
472
cur = PyTuple_GetItem_void_void(
473
PyList_GetItem_object_void(dirblocks, _mid), 0)
474
cur_cstr = PyString_AS_STRING_void(cur)
475
cur_size = PyString_GET_SIZE_void(cur)
476
if _cmp_by_dirs(cur_cstr, cur_size, dirname_cstr, dirname_size) < 0:
484
"""Maintain the current location, and return fields as you parse them."""
486
cdef object state # The DirState object
487
cdef object text # The overall string object
488
cdef char *text_cstr # Pointer to the beginning of text
489
cdef int text_size # Length of text
491
cdef char *end_cstr # End of text
492
cdef char *cur_cstr # Pointer to the current record
493
cdef char *next # Pointer to the end of this record
495
def __init__(self, text, state):
498
self.text_cstr = PyString_AsString(text)
499
self.text_size = PyString_Size(text)
500
self.end_cstr = self.text_cstr + self.text_size
501
self.cur_cstr = self.text_cstr
503
cdef char *get_next(self, int *size) except NULL:
504
"""Return a pointer to the start of the next field."""
506
cdef Py_ssize_t extra_len
508
if self.cur_cstr == NULL:
509
raise AssertionError('get_next() called when cur_str is NULL')
510
elif self.cur_cstr >= self.end_cstr:
511
raise AssertionError('get_next() called when there are no chars'
514
self.cur_cstr = <char*>memchr(next, c'\0', self.end_cstr - next)
515
if self.cur_cstr == NULL:
516
extra_len = self.end_cstr - next
517
raise errors.DirstateCorrupt(self.state,
518
'failed to find trailing NULL (\\0).'
519
' Trailing garbage: %r'
520
% safe_string_from_size(next, extra_len))
521
size[0] = self.cur_cstr - next
522
self.cur_cstr = self.cur_cstr + 1
525
cdef object get_next_str(self):
526
"""Get the next field as a Python string."""
529
next = self.get_next(&size)
530
return safe_string_from_size(next, size)
532
cdef int _init(self) except -1:
533
"""Get the pointer ready.
535
This assumes that the dirstate header has already been read, and we
536
already have the dirblock string loaded into memory.
537
This just initializes our memory pointers, etc for parsing of the
542
# The first field should be an empty string left over from the Header
543
first = self.get_next(&size)
544
if first[0] != c'\0' and size == 0:
545
raise AssertionError('First character should be null not: %s'
549
cdef object _get_entry(self, int num_trees, void **p_current_dirname,
551
"""Extract the next entry.
553
This parses the next entry based on the current location in
555
Each entry can be considered a "row" in the total table. And each row
556
has a fixed number of columns. It is generally broken up into "key"
557
columns, then "current" columns, and then "parent" columns.
559
:param num_trees: How many parent trees need to be parsed
560
:param p_current_dirname: A pointer to the current PyString
561
representing the directory name.
562
We pass this in as a void * so that pyrex doesn't have to
563
increment/decrement the PyObject reference counter for each
565
We use a pointer so that _get_entry can update it with the new
567
:param new_block: This is to let the caller know that it needs to
568
create a new directory block to store the next entry.
570
cdef object path_name_file_id_key
571
cdef char *entry_size_cstr
572
cdef unsigned long int entry_size
573
cdef char* executable_cstr
574
cdef int is_executable
575
cdef char* dirname_cstr
580
cdef object fingerprint
583
# Read the 'key' information (dirname, name, file_id)
584
dirname_cstr = self.get_next(&cur_size)
585
# Check to see if we have started a new directory block.
586
# If so, then we need to create a new dirname PyString, so that it can
587
# be used in all of the tuples. This saves time and memory, by re-using
588
# the same object repeatedly.
590
# Do the cheap 'length of string' check first. If the string is a
591
# different length, then we *have* to be a different directory.
592
if (cur_size != PyString_GET_SIZE_void(p_current_dirname[0])
593
or strncmp(dirname_cstr,
594
# Extract the char* from our current dirname string. We
595
# know it is a PyString, so we can use
596
# PyString_AS_STRING, we use the _void version because
597
# we are tricking Pyrex by using a void* rather than an
599
PyString_AS_STRING_void(p_current_dirname[0]),
601
dirname = safe_string_from_size(dirname_cstr, cur_size)
602
p_current_dirname[0] = <void*>dirname
607
# Build up the key that will be used.
608
# By using <object>(void *) Pyrex will automatically handle the
609
# Py_INCREF that we need.
610
path_name_file_id_key = (<object>p_current_dirname[0],
615
# Parse all of the per-tree information. current has the information in
616
# the same location as parent trees. The only difference is that 'info'
617
# is a 'packed_stat' for current, while it is a 'revision_id' for
619
# minikind, fingerprint, and info will be returned as regular python
621
# entry_size and is_executable will be parsed into a python Long and
622
# python Boolean, respectively.
623
# TODO: jam 20070718 Consider changin the entry_size conversion to
624
# prefer python Int when possible. They are generally faster to
625
# work with, and it will be rare that we have a file >2GB.
626
# Especially since this code is pretty much fixed at a max of
629
for i from 0 <= i < num_trees:
630
minikind = self.get_next_str()
631
fingerprint = self.get_next_str()
632
entry_size_cstr = self.get_next(&cur_size)
633
entry_size = strtoul(entry_size_cstr, NULL, 10)
634
executable_cstr = self.get_next(&cur_size)
635
is_executable = (executable_cstr[0] == c'y')
636
info = self.get_next_str()
637
PyList_Append(trees, (
639
fingerprint, # fingerprint
641
is_executable,# executable
642
info, # packed_stat or revision_id
645
# The returned tuple is (key, [trees])
646
ret = (path_name_file_id_key, trees)
647
# Ignore the trailing newline, but assert that it does exist, this
648
# ensures that we always finish parsing a line on an end-of-entry
650
trailing = self.get_next(&cur_size)
651
if cur_size != 1 or trailing[0] != c'\n':
652
raise errors.DirstateCorrupt(self.state,
653
'Bad parse, we expected to end on \\n, not: %d %s: %s'
654
% (cur_size, safe_string_from_size(trailing, cur_size),
658
def _parse_dirblocks(self):
659
"""Parse all dirblocks in the state file."""
661
cdef object current_block
663
cdef void * current_dirname
665
cdef int expected_entry_count
668
num_trees = self.state._num_present_parents() + 1
669
expected_entry_count = self.state._num_entries
671
# Ignore the first record
675
dirblocks = [('', current_block), ('', [])]
676
self.state._dirblocks = dirblocks
678
current_dirname = <void*>obj
682
# TODO: jam 2007-05-07 Consider pre-allocating some space for the
683
# members, and then growing and shrinking from there. If most
684
# directories have close to 10 entries in them, it would save a
685
# few mallocs if we default our list size to something
686
# reasonable. Or we could malloc it to something large (100 or
687
# so), and then truncate. That would give us a malloc + realloc,
688
# rather than lots of reallocs.
689
while self.cur_cstr < self.end_cstr:
690
entry = self._get_entry(num_trees, ¤t_dirname, &new_block)
692
# new block - different dirname
694
PyList_Append(dirblocks,
695
(<object>current_dirname, current_block))
696
PyList_Append(current_block, entry)
697
entry_count = entry_count + 1
698
if entry_count != expected_entry_count:
699
raise errors.DirstateCorrupt(self.state,
700
'We read the wrong number of entries.'
701
' We expected to read %s, but read %s'
702
% (expected_entry_count, entry_count))
703
self.state._split_root_dirblock_into_contents()
706
def _read_dirblocks_c(state):
707
"""Read in the dirblocks for the given DirState object.
709
This is tightly bound to the DirState internal representation. It should be
710
thought of as a member function, which is only separated out so that we can
711
re-write it in pyrex.
713
:param state: A DirState object.
715
:postcondition: The dirblocks will be loaded into the appropriate fields in
718
state._state_file.seek(state._end_of_header)
719
text = state._state_file.read()
720
# TODO: check the crc checksums. crc_measured = zlib.crc32(text)
722
reader = Reader(text, state)
724
reader._parse_dirblocks()
725
state._dirblock_state = DirState.IN_MEMORY_UNMODIFIED