18
18
"""Pyrex extensions to btree node parsing."""
20
from __future__ import absolute_import
23
21
cdef extern from "python-compat.h":
26
from libc.stdlib cimport (
30
from libc.string cimport (
37
from cpython.bytes cimport (
42
PyBytes_FromStringAndSize,
46
from cpython.list cimport (
49
from cpython.object cimport (
52
from cpython.mem cimport (
56
from cpython.ref cimport (
59
from cpython.tuple cimport (
67
from ._str_helpers cimport (
69
safe_interned_string_from_size,
70
safe_string_from_size,
73
from .._static_tuple_c cimport (
74
import_static_tuple_c,
79
StaticTuple_CheckExact,
24
cdef extern from "stdlib.h":
25
ctypedef unsigned size_t
27
cdef extern from "Python.h":
28
ctypedef int Py_ssize_t # Required for older pyrex versions
29
ctypedef struct PyObject:
31
int PyList_Append(object lst, object item) except -1
33
char *PyString_AsString(object p) except NULL
34
object PyString_FromStringAndSize(char *, Py_ssize_t)
35
PyObject *PyString_FromStringAndSize_ptr "PyString_FromStringAndSize" (char *, Py_ssize_t)
36
int PyString_CheckExact(object s)
37
int PyString_CheckExact_ptr "PyString_CheckExact" (PyObject *)
38
Py_ssize_t PyString_Size(object p)
39
Py_ssize_t PyString_GET_SIZE_ptr "PyString_GET_SIZE" (PyObject *)
40
char * PyString_AS_STRING_ptr "PyString_AS_STRING" (PyObject *)
41
char * PyString_AS_STRING(object)
42
Py_ssize_t PyString_GET_SIZE(object)
43
int PyString_AsStringAndSize_ptr(PyObject *, char **buf, Py_ssize_t *len)
44
void PyString_InternInPlace(PyObject **)
45
int PyTuple_CheckExact(object t)
46
object PyTuple_New(Py_ssize_t n_entries)
47
void PyTuple_SET_ITEM(object, Py_ssize_t offset, object) # steals the ref
48
Py_ssize_t PyTuple_GET_SIZE(object t)
49
PyObject *PyTuple_GET_ITEM_ptr_object "PyTuple_GET_ITEM" (object tpl, int index)
50
void Py_INCREF(object)
51
void Py_DECREF_ptr "Py_DECREF" (PyObject *)
53
cdef extern from "string.h":
54
void *memcpy(void *dest, void *src, size_t n)
55
void *memchr(void *s, int c, size_t n)
57
# void *memrchr(void *s, int c, size_t n)
58
int strncmp(char *s1, char *s2, size_t n)
60
# It seems we need to import the definitions so that the pyrex compiler has
61
# local names to access them.
62
from _static_tuple_c cimport StaticTuple, \
63
import_static_tuple_c, StaticTuple_New, \
64
StaticTuple_Intern, StaticTuple_SET_ITEM, StaticTuple_CheckExact
67
# TODO: Find some way to import this from _dirstate_helpers
68
cdef void* _my_memrchr(void *s, int c, size_t n): # cannot_raise
69
# memrchr seems to be a GNU extension, so we have to implement it ourselves
70
# It is not present in any win32 standard library
83
# TODO: Import this from _dirstate_helpers when it is merged
84
cdef object safe_string_from_size(char *s, Py_ssize_t size):
87
'tried to create a string with an invalid size: %d @0x%x'
89
return PyString_FromStringAndSize(s, size)
92
cdef object safe_interned_string_from_size(char *s, Py_ssize_t size):
96
'tried to create a string with an invalid size: %d @0x%x'
98
py_str = PyString_FromStringAndSize_ptr(s, size)
99
PyString_InternInPlace(&py_str)
100
result = <object>py_str
101
# Casting a PyObject* to an <object> triggers an INCREF from Pyrex, so we
102
# DECREF it to avoid geting immortal strings
103
Py_DECREF_ptr(py_str)
106
from bzrlib import _static_tuple_c
87
107
# This sets up the StaticTuple C_API functionality
88
108
import_static_tuple_c()
293
def _parse_leaf_lines(data, key_length, ref_list_length):
294
parser = BTreeLeafParser(data, key_length, ref_list_length)
313
def _parse_leaf_lines(bytes, key_length, ref_list_length):
314
parser = BTreeLeafParser(bytes, key_length, ref_list_length)
295
315
return parser.parse()
298
# TODO: We can go from 8 byte offset + 4 byte length to a simple lookup,
299
# because the block_offset + length is likely to be repeated. However,
300
# the big win there is to cache across pages, and not just one page
301
# Though if we did cache in a page, we could certainly use a short int.
302
# And this goes from 40 bytes to 30 bytes.
303
# One slightly ugly option would be to cache block offsets in a global.
304
# However, that leads to thread-safety issues, etc.
305
ctypedef struct gc_chk_sha1_record:
306
unsigned long long block_offset
307
unsigned int block_length
308
unsigned int record_start
309
unsigned int record_end
313
cdef int _unhexbuf[256]
315
_hexbuf = b'0123456789abcdef'
317
cdef _populate_unhexbuf():
319
for i from 0 <= i < 256:
321
for i from 0 <= i < 10: # 0123456789 => map to the raw number
322
_unhexbuf[(i + c'0')] = i
323
for i from 10 <= i < 16: # abcdef => 10, 11, 12, 13, 14, 15, 16
324
_unhexbuf[(i - 10 + c'a')] = i
325
for i from 10 <= i < 16: # ABCDEF => 10, 11, 12, 13, 14, 15, 16
326
_unhexbuf[(i - 10 + c'A')] = i
330
cdef int _unhexlify_sha1(char *as_hex, char *as_bin): # cannot_raise
331
"""Take the hex sha1 in as_hex and make it binary in as_bin
333
Same as binascii.unhexlify, but working on C strings, not Python objects.
340
# binascii does this using isupper() and tolower() and ?: syntax. I'm
341
# guessing a simple lookup array should be faster.
343
for i from 0 <= i < 20:
344
top = _unhexbuf[<unsigned char>(as_hex[j])]
346
bot = _unhexbuf[<unsigned char>(as_hex[j])]
348
if top == -1 or bot == -1:
350
as_bin[i] = <unsigned char>((top << 4) + bot);
354
def _py_unhexlify(as_hex):
355
"""For the test infrastructure, just thunks to _unhexlify_sha1"""
356
if not PyBytes_CheckExact(as_hex) or PyBytes_GET_SIZE(as_hex) != 40:
357
raise ValueError('not a 40-byte hex digest')
358
as_bin = PyBytes_FromStringAndSize(NULL, 20)
359
if _unhexlify_sha1(PyBytes_AS_STRING(as_hex), PyBytes_AS_STRING(as_bin)):
364
cdef void _hexlify_sha1(char *as_bin, char *as_hex): # cannot_raise
369
for i from 0 <= i < 20:
371
as_hex[j] = _hexbuf[(c>>4)&0xf]
373
as_hex[j] = _hexbuf[(c)&0xf]
377
def _py_hexlify(as_bin):
378
"""For test infrastructure, thunk to _hexlify_sha1"""
379
if len(as_bin) != 20 or not PyBytes_CheckExact(as_bin):
380
raise ValueError('not a 20-byte binary digest')
381
as_hex = PyBytes_FromStringAndSize(NULL, 40)
382
_hexlify_sha1(PyBytes_AS_STRING(as_bin), PyBytes_AS_STRING(as_hex))
386
cdef int _key_to_sha1(key, char *sha1): # cannot_raise
387
"""Map a key into its sha1 content.
389
:param key: A tuple of style ('sha1:abcd...',)
390
:param sha1: A char buffer of 20 bytes
391
:return: 1 if this could be converted, 0 otherwise
396
if StaticTuple_CheckExact(key) and StaticTuple_GET_SIZE(key) == 1:
397
p_val = <PyObject *>StaticTuple_GET_ITEM(key, 0)
398
elif PyTuple_CheckExact(key) and PyTuple_GET_SIZE(key) == 1:
399
p_val = PyTuple_GET_ITEM(key, 0)
401
# Not a tuple or a StaticTuple
403
if (PyBytes_CheckExact(<object>p_val)
404
and PyBytes_GET_SIZE(<object>p_val) == 45):
405
c_val = PyBytes_AS_STRING(<object>p_val)
408
if strncmp(c_val, b'sha1:', 5) != 0:
410
if not _unhexlify_sha1(c_val + 5, sha1):
415
def _py_key_to_sha1(key):
416
"""Map a key to a simple sha1 string.
418
This is a testing thunk to the C function.
420
as_bin_sha = PyBytes_FromStringAndSize(NULL, 20)
421
if _key_to_sha1(key, PyBytes_AS_STRING(as_bin_sha)):
426
cdef StaticTuple _sha1_to_key(char *sha1):
427
"""Compute a ('sha1:abcd',) key for a given sha1."""
431
hexxed = PyBytes_FromStringAndSize(NULL, 45)
432
c_buf = PyBytes_AS_STRING(hexxed)
433
memcpy(c_buf, b'sha1:', 5)
434
_hexlify_sha1(sha1, c_buf+5)
435
key = StaticTuple_New(1)
437
StaticTuple_SET_ITEM(key, 0, hexxed)
438
# This is a bit expensive. To parse 120 keys takes 48us, to return them all
439
# can be done in 66.6us (so 18.6us to build them all).
440
# Adding simple hash() here brings it to 76.6us (so computing the hash
441
# value of 120keys is 10us), Intern is 86.9us (another 10us to look and add
442
# them to the intern structure.)
443
# However, since we only intern keys that are in active use, it is probably
444
# a win. Since they would have been read from elsewhere anyway.
445
# We *could* hang the PyObject form off of the gc_chk_sha1_record for ones
446
# that we have deserialized. Something to think about, at least.
447
key = StaticTuple_Intern(key)
451
def _py_sha1_to_key(sha1_bin):
452
"""Test thunk to check the sha1 mapping."""
453
if not PyBytes_CheckExact(sha1_bin) or PyBytes_GET_SIZE(sha1_bin) != 20:
454
raise ValueError('sha1_bin must be a str of exactly 20 bytes')
455
return _sha1_to_key(PyBytes_AS_STRING(sha1_bin))
458
cdef unsigned int _sha1_to_uint(char *sha1): # cannot_raise
459
cdef unsigned int val
460
# Must be in MSB, because that is how the content is sorted
461
val = (((<unsigned int>(sha1[0]) & 0xff) << 24)
462
| ((<unsigned int>(sha1[1]) & 0xff) << 16)
463
| ((<unsigned int>(sha1[2]) & 0xff) << 8)
464
| ((<unsigned int>(sha1[3]) & 0xff) << 0))
468
cdef _format_record(gc_chk_sha1_record *record):
469
# This is inefficient to go from a logical state back to a bytes object,
470
# but it makes things work a bit better internally for now.
471
if record.block_offset >= 0xFFFFFFFF:
472
# Could use %llu which was added to Python 2.7 but it oddly is missing
473
# from the Python 3 equivalent functions, so hack still needed. :(
474
block_offset_str = b'%d' % record.block_offset
475
value = PyBytes_FromFormat(
476
'%s %u %u %u', PyBytes_AS_STRING(block_offset_str),
477
record.block_length, record.record_start, record.record_end)
479
value = PyBytes_FromFormat(
480
'%lu %u %u %u', <unsigned long>record.block_offset,
481
record.block_length, record.record_start, record.record_end)
485
cdef class GCCHKSHA1LeafNode:
486
"""Track all the entries for a given leaf node."""
488
cdef gc_chk_sha1_record *records
489
cdef public object last_key
490
cdef gc_chk_sha1_record *last_record
491
cdef public int num_records
492
# This is the number of bits to shift to get to the interesting byte. A
493
# value of 24 means that the very first byte changes across all keys.
494
# Anything else means that there is a common prefix of bits that we can
495
# ignore. 0 means that at least the first 3 bytes are identical, though
496
# that is going to be very rare
497
cdef public unsigned char common_shift
498
# This maps an interesting byte to the first record that matches.
499
# Equivalent to bisect.bisect_left(self.records, sha1), though only taking
500
# into account that one byte.
501
cdef unsigned char offsets[257]
503
def __sizeof__(self):
505
sizeof(GCCHKSHA1LeafNode) +
506
sizeof(gc_chk_sha1_record) * self.num_records)
508
def __dealloc__(self):
509
if self.records != NULL:
510
PyMem_Free(self.records)
513
def __init__(self, bytes):
514
self._parse_bytes(bytes)
516
self.last_record = NULL
520
if self.num_records > 0:
521
return _sha1_to_key(self.records[0].sha1)
526
if self.num_records > 0:
527
return _sha1_to_key(self.records[self.num_records-1].sha1)
530
cdef StaticTuple _record_to_value_and_refs(self,
531
gc_chk_sha1_record *record):
532
"""Extract the refs and value part of this record."""
533
cdef StaticTuple value_and_refs
534
cdef StaticTuple empty
535
value_and_refs = StaticTuple_New(2)
536
value = _format_record(record)
538
StaticTuple_SET_ITEM(value_and_refs, 0, value)
540
empty = StaticTuple_New(0)
542
StaticTuple_SET_ITEM(value_and_refs, 1, empty)
543
return value_and_refs
545
cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record):
546
"""Turn a given record back into a fully fledged item.
548
cdef StaticTuple item
550
cdef StaticTuple value_and_refs
552
key = _sha1_to_key(record.sha1)
553
item = StaticTuple_New(2)
555
StaticTuple_SET_ITEM(item, 0, key)
556
value_and_refs = self._record_to_value_and_refs(record)
557
Py_INCREF(value_and_refs)
558
StaticTuple_SET_ITEM(item, 1, value_and_refs)
561
cdef gc_chk_sha1_record* _lookup_record(self, char *sha1) except? NULL:
562
"""Find a gc_chk_sha1_record that matches the sha1 supplied."""
563
cdef int lo, hi, mid, the_cmp
566
# TODO: We can speed up misses by comparing this sha1 to the common
567
# bits, and seeing if the common prefix matches, if not, we don't
568
# need to search for anything because it cannot match
569
# Use the offset array to find the closest fit for this entry
570
# follow that up with bisecting, since multiple keys can be in one
572
# Bisecting dropped us from 7000 comparisons to 582 (4.8/key), using
573
# the offset array dropped us from 23us to 20us and 156 comparisions
575
offset = self._offset_for_sha1(sha1)
576
lo = self.offsets[offset]
577
hi = self.offsets[offset+1]
579
# if hi == 255 that means we potentially ran off the end of the
580
# list, so push it up to num_records
581
# note that if 'lo' == 255, that is ok, because we can start
582
# searching from that part of the list.
583
hi = self.num_records
587
the_cmp = memcmp(self.records[mid].sha1, sha1, 20)
589
return &self.records[mid]
596
def __contains__(self, key):
598
cdef gc_chk_sha1_record *record
599
if _key_to_sha1(key, sha1):
600
# If it isn't a sha1 key, then it won't be in this leaf node
601
record = self._lookup_record(sha1)
604
self.last_record = record
608
def __getitem__(self, key):
610
cdef gc_chk_sha1_record *record
612
if self.last_record != NULL and key is self.last_key:
613
record = self.last_record
614
elif _key_to_sha1(key, sha1):
615
record = self._lookup_record(sha1)
617
raise KeyError('key %r is not present' % (key,))
618
return self._record_to_value_and_refs(record)
621
return self.num_records
626
for i from 0 <= i < self.num_records:
627
PyList_Append(result, _sha1_to_key(self.records[i].sha1))
633
for i from 0 <= i < self.num_records:
634
item = self._record_to_item(&self.records[i])
635
PyList_Append(result, item)
638
cdef int _count_records(self, char *c_content, char *c_end): # cannot_raise
639
"""Count how many records are in this section."""
645
while c_cur != NULL and c_cur < c_end:
646
c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur);
650
num_records = num_records + 1
653
cdef _parse_bytes(self, data):
654
"""Parse the bytes 'data' into content."""
658
cdef Py_ssize_t n_bytes
661
cdef gc_chk_sha1_record *cur_record
663
if not PyBytes_CheckExact(data):
664
raise TypeError('We only support parsing byte strings.')
665
# Pass 1, count how many records there will be
666
n_bytes = PyBytes_GET_SIZE(data)
667
c_bytes = PyBytes_AS_STRING(data)
668
c_end = c_bytes + n_bytes
669
if strncmp(c_bytes, b'type=leaf\n', 10):
670
raise ValueError("bytes did not start with 'type=leaf\\n': %r"
673
num_records = self._count_records(c_cur, c_end)
674
# Now allocate the memory for these items, and go to town
675
self.records = <gc_chk_sha1_record*>PyMem_Malloc(num_records *
676
(sizeof(unsigned short) + sizeof(gc_chk_sha1_record)))
677
self.num_records = num_records
678
cur_record = self.records
680
while c_cur != NULL and c_cur < c_end and entry < num_records:
681
c_cur = self._parse_one_entry(c_cur, c_end, cur_record)
682
cur_record = cur_record + 1
684
if (entry != self.num_records
686
or cur_record != self.records + self.num_records):
687
raise ValueError('Something went wrong while parsing.')
688
# Pass 3: build the offset map
689
self._compute_common()
691
cdef char *_parse_one_entry(self, char *c_cur, char *c_end,
692
gc_chk_sha1_record *cur_record) except NULL:
693
"""Read a single sha record from the bytes.
695
:param c_cur: The pointer to the start of bytes
696
:param cur_record: Record to populate
699
if strncmp(c_cur, 'sha1:', 5):
700
raise ValueError('line did not start with sha1: %r'
701
% (safe_string_from_size(c_cur, 10),))
703
c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)
704
if c_next == NULL or (c_next - c_cur != 40):
705
raise ValueError('Line did not contain 40 hex bytes')
706
if not _unhexlify_sha1(c_cur, cur_record.sha1):
707
raise ValueError('We failed to unhexlify')
709
if c_cur[0] != c'\0':
710
raise ValueError('only 1 null, not 2 as expected')
712
cur_record.block_offset = strtoull(c_cur, &c_next, 10)
713
if c_cur == c_next or c_next[0] != c' ':
714
raise ValueError('Failed to parse block offset')
716
cur_record.block_length = strtoul(c_cur, &c_next, 10)
717
if c_cur == c_next or c_next[0] != c' ':
718
raise ValueError('Failed to parse block length')
720
cur_record.record_start = strtoul(c_cur, &c_next, 10)
721
if c_cur == c_next or c_next[0] != c' ':
722
raise ValueError('Failed to parse block length')
724
cur_record.record_end = strtoul(c_cur, &c_next, 10)
725
if c_cur == c_next or c_next[0] != c'\n':
726
raise ValueError('Failed to parse record end')
730
cdef int _offset_for_sha1(self, char *sha1) except -1:
731
"""Find the first interesting 8-bits of this sha1."""
733
cdef unsigned int as_uint
734
as_uint = _sha1_to_uint(sha1)
735
this_offset = (as_uint >> self.common_shift) & 0xFF
738
def _get_offset_for_sha1(self, sha1):
739
return self._offset_for_sha1(PyBytes_AS_STRING(sha1))
741
cdef _compute_common(self):
742
cdef unsigned int first
743
cdef unsigned int this
744
cdef unsigned int common_mask
745
cdef unsigned char common_shift
747
cdef int offset, this_offset
749
# The idea with the offset map is that we should be able to quickly
750
# jump to the key that matches a gives sha1. We know that the keys are
751
# in sorted order, and we know that a lot of the prefix is going to be
752
# the same across them.
753
# By XORing the records together, we can determine what bits are set in
755
if self.num_records < 2:
756
# Everything is in common if you have 0 or 1 leaves
757
# So we'll always just shift to the first byte
758
self.common_shift = 24
760
common_mask = 0xFFFFFFFF
761
first = _sha1_to_uint(self.records[0].sha1)
762
for i from 0 < i < self.num_records:
763
this = _sha1_to_uint(self.records[i].sha1)
764
common_mask = (~(first ^ this)) & common_mask
766
while common_mask & 0x80000000 and common_shift > 0:
767
common_mask = common_mask << 1
768
common_shift = common_shift - 1
769
self.common_shift = common_shift
771
max_offset = self.num_records
772
# We cap this loop at 254 records. All the other offsets just get
773
# filled with 0xff as the singleton saying 'too many'.
774
# It means that if we have >255 records we have to bisect the second
775
# half of the list, but this is going to be very rare in practice.
778
for i from 0 <= i < max_offset:
779
this_offset = self._offset_for_sha1(self.records[i].sha1)
780
while offset <= this_offset:
781
self.offsets[offset] = i
784
self.offsets[offset] = max_offset
787
def _get_offsets(self):
790
for i from 0 <= i < 257:
791
PyList_Append(result, self.offsets[i])
795
def _parse_into_chk(bytes, key_length, ref_list_length):
796
"""Parse into a format optimized for chk records."""
797
assert key_length == 1
798
assert ref_list_length == 0
799
return GCCHKSHA1LeafNode(bytes)
802
318
def _flatten_node(node, reference_lists):
803
319
"""Convert a node into the serialized form.