315
329
return parser.parse()
332
# TODO: We can go from 8 byte offset + 4 byte length to a simple lookup,
333
# because the block_offset + length is likely to be repeated. However,
334
# the big win there is to cache across pages, and not just one page
335
# Though if we did cache in a page, we could certainly use a short int.
336
# And this goes from 40 bytes to 30 bytes.
337
# One slightly ugly option would be to cache block offsets in a global.
338
# However, that leads to thread-safety issues, etc.
339
ctypedef struct gc_chk_sha1_record:
340
long long block_offset
341
unsigned int block_length
342
unsigned int record_start
343
unsigned int record_end
347
cdef int _unhexbuf[256]
349
_hexbuf = '0123456789abcdef'
351
cdef _populate_unhexbuf():
353
for i from 0 <= i < 256:
355
for i from 0 <= i < 10: # 0123456789 => map to the raw number
356
_unhexbuf[(i + c'0')] = i
357
for i from 10 <= i < 16: # abcdef => 10, 11, 12, 13, 14, 15, 16
358
_unhexbuf[(i - 10 + c'a')] = i
359
for i from 10 <= i < 16: # ABCDEF => 10, 11, 12, 13, 14, 15, 16
360
_unhexbuf[(i - 10 + c'A')] = i
364
cdef int _unhexlify_sha1(char *as_hex, char *as_bin): # cannot_raise
365
"""Take the hex sha1 in as_hex and make it binary in as_bin
367
Same as binascii.unhexlify, but working on C strings, not Python objects.
374
# binascii does this using isupper() and tolower() and ?: syntax. I'm
375
# guessing a simple lookup array should be faster.
377
for i from 0 <= i < 20:
378
top = _unhexbuf[<unsigned char>(as_hex[j])]
380
bot = _unhexbuf[<unsigned char>(as_hex[j])]
382
if top == -1 or bot == -1:
384
as_bin[i] = <unsigned char>((top << 4) + bot);
388
def _py_unhexlify(as_hex):
389
"""For the test infrastructure, just thunks to _unhexlify_sha1"""
390
if len(as_hex) != 40 or not PyString_CheckExact(as_hex):
391
raise ValueError('not a 40-byte hex digest')
392
as_bin = PyString_FromStringAndSize(NULL, 20)
393
if _unhexlify_sha1(PyString_AS_STRING(as_hex), PyString_AS_STRING(as_bin)):
398
cdef void _hexlify_sha1(char *as_bin, char *as_hex): # cannot_raise
403
for i from 0 <= i < 20:
405
as_hex[j] = _hexbuf[(c>>4)&0xf]
407
as_hex[j] = _hexbuf[(c)&0xf]
411
def _py_hexlify(as_bin):
412
"""For test infrastructure, thunk to _hexlify_sha1"""
413
if len(as_bin) != 20 or not PyString_CheckExact(as_bin):
414
raise ValueError('not a 20-byte binary digest')
415
as_hex = PyString_FromStringAndSize(NULL, 40)
416
_hexlify_sha1(PyString_AS_STRING(as_bin), PyString_AS_STRING(as_hex))
420
cdef int _key_to_sha1(key, char *sha1): # cannot_raise
421
"""Map a key into its sha1 content.
423
:param key: A tuple of style ('sha1:abcd...',)
424
:param sha1: A char buffer of 20 bytes
425
:return: 1 if this could be converted, 0 otherwise
430
if StaticTuple_CheckExact(key) and StaticTuple_GET_SIZE(key) == 1:
431
p_val = <PyObject *>StaticTuple_GET_ITEM(key, 0)
432
elif (PyTuple_CheckExact(key) and PyTuple_GET_SIZE(key) == 1):
433
p_val = PyTuple_GET_ITEM_ptr_object(key, 0)
435
# Not a tuple or a StaticTuple
437
if (PyString_CheckExact_ptr(p_val) and PyString_GET_SIZE_ptr(p_val) == 45):
438
c_val = PyString_AS_STRING_ptr(p_val)
441
if strncmp(c_val, 'sha1:', 5) != 0:
443
if not _unhexlify_sha1(c_val + 5, sha1):
448
def _py_key_to_sha1(key):
449
"""Map a key to a simple sha1 string.
451
This is a testing thunk to the C function.
453
as_bin_sha = PyString_FromStringAndSize(NULL, 20)
454
if _key_to_sha1(key, PyString_AS_STRING(as_bin_sha)):
459
cdef StaticTuple _sha1_to_key(char *sha1):
460
"""Compute a ('sha1:abcd',) key for a given sha1."""
464
hexxed = PyString_FromStringAndSize(NULL, 45)
465
c_buf = PyString_AS_STRING(hexxed)
466
memcpy(c_buf, 'sha1:', 5)
467
_hexlify_sha1(sha1, c_buf+5)
468
key = StaticTuple_New(1)
470
StaticTuple_SET_ITEM(key, 0, hexxed)
471
# This is a bit expensive. To parse 120 keys takes 48us, to return them all
472
# can be done in 66.6us (so 18.6us to build them all).
473
# Adding simple hash() here brings it to 76.6us (so computing the hash
474
# value of 120keys is 10us), Intern is 86.9us (another 10us to look and add
475
# them to the intern structure.)
476
# However, since we only intern keys that are in active use, it is probably
477
# a win. Since they would have been read from elsewhere anyway.
478
# We *could* hang the PyObject form off of the gc_chk_sha1_record for ones
479
# that we have deserialized. Something to think about, at least.
480
key = StaticTuple_Intern(key)
484
def _py_sha1_to_key(sha1_bin):
485
"""Test thunk to check the sha1 mapping."""
486
if not PyString_CheckExact(sha1_bin) or PyString_GET_SIZE(sha1_bin) != 20:
487
raise ValueError('sha1_bin must be a str of exactly 20 bytes')
488
return _sha1_to_key(PyString_AS_STRING(sha1_bin))
491
cdef unsigned int _sha1_to_uint(char *sha1): # cannot_raise
492
cdef unsigned int val
493
# Must be in MSB, because that is how the content is sorted
494
val = (((<unsigned int>(sha1[0]) & 0xff) << 24)
495
| ((<unsigned int>(sha1[1]) & 0xff) << 16)
496
| ((<unsigned int>(sha1[2]) & 0xff) << 8)
497
| ((<unsigned int>(sha1[3]) & 0xff) << 0))
501
cdef _format_record(gc_chk_sha1_record *record):
502
# This is inefficient to go from a logical state back to a
503
# string, but it makes things work a bit better internally for now.
504
if record.block_offset >= 0xFFFFFFFF:
505
# %llu is what we really want, but unfortunately it was only added
506
# in python 2.7... :(
507
block_offset_str = str(record.block_offset)
508
value = PyString_FromFormat('%s %u %u %u',
509
PyString_AS_STRING(block_offset_str),
511
record.record_start, record.record_end)
513
value = PyString_FromFormat('%lu %u %u %u',
514
<unsigned long>record.block_offset,
516
record.record_start, record.record_end)
520
cdef class GCCHKSHA1LeafNode:
521
"""Track all the entries for a given leaf node."""
523
cdef gc_chk_sha1_record *records
524
cdef public object last_key
525
cdef gc_chk_sha1_record *last_record
526
cdef public int num_records
527
# This is the number of bits to shift to get to the interesting byte. A
528
# value of 24 means that the very first byte changes across all keys.
529
# Anything else means that there is a common prefix of bits that we can
530
# ignore. 0 means that at least the first 3 bytes are identical, though
531
# that is going to be very rare
532
cdef public unsigned char common_shift
533
# This maps an interesting byte to the first record that matches.
534
# Equivalent to bisect.bisect_left(self.records, sha1), though only taking
535
# into account that one byte.
536
cdef unsigned char offsets[257]
538
def __sizeof__(self):
539
# :( Why doesn't Pyrex let me do a simple sizeof(GCCHKSHA1LeafNode)
540
# like Cython? Explicitly enumerating everything here seems to leave my
541
# size off by 2 (286 bytes vs 288 bytes actual). I'm guessing it is an
542
# alignment/padding issue. Oh well- at least we scale properly with
543
# num_records and are very close to correct, which is what I care
545
# If we ever decide to require cython:
546
# return (sizeof(GCCHKSHA1LeafNode)
547
# + sizeof(gc_chk_sha1_record)*self.num_records)
548
return (sizeof(PyObject) + sizeof(void*) + sizeof(int)
549
+ sizeof(gc_chk_sha1_record*) + sizeof(PyObject *)
550
+ sizeof(gc_chk_sha1_record*) + sizeof(char)
551
+ sizeof(unsigned char)*257
552
+ sizeof(gc_chk_sha1_record)*self.num_records)
554
def __dealloc__(self):
555
if self.records != NULL:
556
PyMem_Free(self.records)
559
def __init__(self, bytes):
560
self._parse_bytes(bytes)
562
self.last_record = NULL
566
if self.num_records > 0:
567
return _sha1_to_key(self.records[0].sha1)
572
if self.num_records > 0:
573
return _sha1_to_key(self.records[self.num_records-1].sha1)
576
cdef StaticTuple _record_to_value_and_refs(self,
577
gc_chk_sha1_record *record):
578
"""Extract the refs and value part of this record."""
579
cdef StaticTuple value_and_refs
580
cdef StaticTuple empty
581
value_and_refs = StaticTuple_New(2)
582
value = _format_record(record)
584
StaticTuple_SET_ITEM(value_and_refs, 0, value)
586
empty = StaticTuple_New(0)
588
StaticTuple_SET_ITEM(value_and_refs, 1, empty)
589
return value_and_refs
591
cdef StaticTuple _record_to_item(self, gc_chk_sha1_record *record):
592
"""Turn a given record back into a fully fledged item.
594
cdef StaticTuple item
596
cdef StaticTuple value_and_refs
598
key = _sha1_to_key(record.sha1)
599
item = StaticTuple_New(2)
601
StaticTuple_SET_ITEM(item, 0, key)
602
value_and_refs = self._record_to_value_and_refs(record)
603
Py_INCREF(value_and_refs)
604
StaticTuple_SET_ITEM(item, 1, value_and_refs)
607
cdef gc_chk_sha1_record* _lookup_record(self, char *sha1) except? NULL:
608
"""Find a gc_chk_sha1_record that matches the sha1 supplied."""
609
cdef int lo, hi, mid, the_cmp
612
# TODO: We can speed up misses by comparing this sha1 to the common
613
# bits, and seeing if the common prefix matches, if not, we don't
614
# need to search for anything because it cannot match
615
# Use the offset array to find the closest fit for this entry
616
# follow that up with bisecting, since multiple keys can be in one
618
# Bisecting dropped us from 7000 comparisons to 582 (4.8/key), using
619
# the offset array dropped us from 23us to 20us and 156 comparisions
621
offset = self._offset_for_sha1(sha1)
622
lo = self.offsets[offset]
623
hi = self.offsets[offset+1]
625
# if hi == 255 that means we potentially ran off the end of the
626
# list, so push it up to num_records
627
# note that if 'lo' == 255, that is ok, because we can start
628
# searching from that part of the list.
629
hi = self.num_records
633
the_cmp = memcmp(self.records[mid].sha1, sha1, 20)
635
return &self.records[mid]
642
def __contains__(self, key):
644
cdef gc_chk_sha1_record *record
645
if _key_to_sha1(key, sha1):
646
# If it isn't a sha1 key, then it won't be in this leaf node
647
record = self._lookup_record(sha1)
650
self.last_record = record
654
def __getitem__(self, key):
656
cdef gc_chk_sha1_record *record
658
if self.last_record != NULL and key is self.last_key:
659
record = self.last_record
660
elif _key_to_sha1(key, sha1):
661
record = self._lookup_record(sha1)
663
raise KeyError('key %r is not present' % (key,))
664
return self._record_to_value_and_refs(record)
667
return self.num_records
672
for i from 0 <= i < self.num_records:
673
PyList_Append(result, _sha1_to_key(self.records[i].sha1))
679
for i from 0 <= i < self.num_records:
680
item = self._record_to_item(&self.records[i])
681
PyList_Append(result, item)
684
cdef int _count_records(self, char *c_content, char *c_end): # cannot_raise
685
"""Count how many records are in this section."""
691
while c_cur != NULL and c_cur < c_end:
692
c_cur = <char *>memchr(c_cur, c'\n', c_end - c_cur);
696
num_records = num_records + 1
699
cdef _parse_bytes(self, bytes):
700
"""Parse the string 'bytes' into content."""
704
cdef Py_ssize_t n_bytes
707
cdef gc_chk_sha1_record *cur_record
709
if not PyString_CheckExact(bytes):
710
raise TypeError('We only support parsing plain 8-bit strings.')
711
# Pass 1, count how many records there will be
712
n_bytes = PyString_GET_SIZE(bytes)
713
c_bytes = PyString_AS_STRING(bytes)
714
c_end = c_bytes + n_bytes
715
if strncmp(c_bytes, 'type=leaf\n', 10):
716
raise ValueError("bytes did not start with 'type=leaf\\n': %r"
719
num_records = self._count_records(c_cur, c_end)
720
# Now allocate the memory for these items, and go to town
721
self.records = <gc_chk_sha1_record*>PyMem_Malloc(num_records *
722
(sizeof(unsigned short) + sizeof(gc_chk_sha1_record)))
723
self.num_records = num_records
724
cur_record = self.records
726
while c_cur != NULL and c_cur < c_end and entry < num_records:
727
c_cur = self._parse_one_entry(c_cur, c_end, cur_record)
728
cur_record = cur_record + 1
730
if (entry != self.num_records
732
or cur_record != self.records + self.num_records):
733
raise ValueError('Something went wrong while parsing.')
734
# Pass 3: build the offset map
735
self._compute_common()
737
cdef char *_parse_one_entry(self, char *c_cur, char *c_end,
738
gc_chk_sha1_record *cur_record) except NULL:
739
"""Read a single sha record from the bytes.
740
:param c_cur: The pointer to the start of bytes
744
if strncmp(c_cur, 'sha1:', 5):
745
raise ValueError('line did not start with sha1: %r'
746
% (safe_string_from_size(c_cur, 10),))
748
c_next = <char *>memchr(c_cur, c'\0', c_end - c_cur)
749
if c_next == NULL or (c_next - c_cur != 40):
750
raise ValueError('Line did not contain 40 hex bytes')
751
if not _unhexlify_sha1(c_cur, cur_record.sha1):
752
raise ValueError('We failed to unhexlify')
754
if c_cur[0] != c'\0':
755
raise ValueError('only 1 null, not 2 as expected')
757
cur_record.block_offset = strtoll(c_cur, &c_next, 10)
758
if c_cur == c_next or c_next[0] != c' ':
759
raise ValueError('Failed to parse block offset')
761
cur_record.block_length = strtoul(c_cur, &c_next, 10)
762
if c_cur == c_next or c_next[0] != c' ':
763
raise ValueError('Failed to parse block length')
765
cur_record.record_start = strtoul(c_cur, &c_next, 10)
766
if c_cur == c_next or c_next[0] != c' ':
767
raise ValueError('Failed to parse block length')
769
cur_record.record_end = strtoul(c_cur, &c_next, 10)
770
if c_cur == c_next or c_next[0] != c'\n':
771
raise ValueError('Failed to parse record end')
775
cdef int _offset_for_sha1(self, char *sha1) except -1:
776
"""Find the first interesting 8-bits of this sha1."""
778
cdef unsigned int as_uint
779
as_uint = _sha1_to_uint(sha1)
780
this_offset = (as_uint >> self.common_shift) & 0xFF
783
def _get_offset_for_sha1(self, sha1):
784
return self._offset_for_sha1(PyString_AS_STRING(sha1))
786
cdef _compute_common(self):
787
cdef unsigned int first
788
cdef unsigned int this
789
cdef unsigned int common_mask
790
cdef unsigned char common_shift
792
cdef int offset, this_offset
794
# The idea with the offset map is that we should be able to quickly
795
# jump to the key that matches a gives sha1. We know that the keys are
796
# in sorted order, and we know that a lot of the prefix is going to be
797
# the same across them.
798
# By XORing the records together, we can determine what bits are set in
800
if self.num_records < 2:
801
# Everything is in common if you have 0 or 1 leaves
802
# So we'll always just shift to the first byte
803
self.common_shift = 24
805
common_mask = 0xFFFFFFFF
806
first = _sha1_to_uint(self.records[0].sha1)
807
for i from 0 < i < self.num_records:
808
this = _sha1_to_uint(self.records[i].sha1)
809
common_mask = (~(first ^ this)) & common_mask
811
while common_mask & 0x80000000 and common_shift > 0:
812
common_mask = common_mask << 1
813
common_shift = common_shift - 1
814
self.common_shift = common_shift
816
max_offset = self.num_records
817
# We cap this loop at 254 records. All the other offsets just get
818
# filled with 0xff as the singleton saying 'too many'.
819
# It means that if we have >255 records we have to bisect the second
820
# half of the list, but this is going to be very rare in practice.
823
for i from 0 <= i < max_offset:
824
this_offset = self._offset_for_sha1(self.records[i].sha1)
825
while offset <= this_offset:
826
self.offsets[offset] = i
829
self.offsets[offset] = max_offset
832
def _get_offsets(self):
835
for i from 0 <= i < 257:
836
PyList_Append(result, self.offsets[i])
840
def _parse_into_chk(bytes, key_length, ref_list_length):
841
"""Parse into a format optimized for chk records."""
842
assert key_length == 1
843
assert ref_list_length == 0
844
return GCCHKSHA1LeafNode(bytes)
318
847
def _flatten_node(node, reference_lists):
319
848
"""Convert a node into the serialized form.