483
483
return _sha1_to_key(PyString_AS_STRING(sha1_bin))
486
cdef unsigned int _sha1_to_uint(char *sha1):
487
cdef unsigned int val
488
# Must be in MSB, because that is how the content is sorted
489
val = (((<unsigned int>(sha1[0]) & 0xff) << 24)
490
| ((<unsigned int>(sha1[1]) & 0xff) << 16)
491
| ((<unsigned int>(sha1[2]) & 0xff) << 8)
492
| ((<unsigned int>(sha1[3]) & 0xff) << 0))
486
496
cdef class GCCHKSHA1LeafNode:
487
497
"""Track all the entries for a given leaf node."""
489
499
cdef public int num_entries
490
500
cdef gc_chk_sha1_record *entries
491
# This is for the mini-index. We look at all the keys and use whatever byte
492
# is first unique across all stored keys (this is often the first byte)
493
# we then store the entries offset for the first record that matches that
494
# byte. This does assume that we'll never have more than 32k entries, but
495
# that doesn't seem to be a terrible assumption (we should have ~100)
496
cdef public short interesting_byte
501
# This is for the mini-index.
502
# We look at all the keys and determine how many start bits are identical.
503
# We then strip those bits, and use the next 8 bits as 'interesting bits'.
504
# offsets is then the bisect() information for a key with those bits (where
505
# would you insert a key starting with these bits)
506
cdef public long bisect_steps
507
cdef public unsigned int common_mask
508
cdef public unsigned int common_bits
509
# The first N bits of all entries in this node have the same content
510
cdef public short common_shift
497
511
cdef short offsets[257]
499
513
def __sizeof__(self):
568
582
cdef gc_chk_sha1_record* _lookup_record(self, char *sha1):
569
583
"""Find a gc_chk_sha1_record that matches the sha1 supplied."""
570
584
cdef int lo, hi, mid, the_cmp
573
588
hi = self.num_entries
585
600
# TODO: Consider improving this, but for now it seems good enough. (We
586
601
# could create a small index so we would have less to bisect,
603
# bisecting does reduce the total number of memcmp calls from 7260 for
605
offset = self._offset_for_sha1(sha1)
606
lo = self.offsets[offset]
607
hi = self.offsets[offset+1]
589
609
mid = (lo + hi) / 2
590
the_cmp = memcmp(sha1, self.entries[mid].sha1, 20)
610
the_cmp = memcmp(self.entries[mid].sha1, sha1, 20)
611
self.bisect_steps += 1
613
# print mid, offset, self._offsets[offset], self._offsets[offset+1]
592
614
return &self.entries[mid]
593
615
elif the_cmp < 0:
599
621
def __contains__(self, key):
707
729
or cur_record != self.entries + self.num_entries):
708
730
raise ValueError('Something went wrong while parsing.')
709
731
# Pass 3: build the offset map
732
self._compute_common()
734
cdef int _offset_for_sha1(self, char *sha1) except -1:
735
"""Find the first interesting 8-bits of this sha1."""
737
this_offset = ((_sha1_to_uint(sha1) & ~self.common_mask)
738
>> (24 - self.common_shift))
739
# assert 0 < this_offset < 256
742
cdef _compute_common(self):
743
cdef unsigned int first
744
cdef unsigned int this
745
cdef unsigned int common_mask
746
cdef short common_shift
748
cdef int offset, this_offset
710
749
# The idea with the offset map is that we should be able to quickly
711
750
# jump to the key that matches a gives sha1. We know that the keys are
712
751
# in sorted order, and we know that a lot of the prefix is going to be
713
752
# the same across them.
753
# By XORing the entries together, we can determine what bits are set in
755
if self.num_entries < 2:
756
self.common_mask = 0x0
757
self.common_shift = 0
759
common_mask = 0xFFFFFFFF
760
first = _sha1_to_uint(self.entries[0].sha1)
761
for i from 0 < i < self.num_entries:
762
this = _sha1_to_uint(self.entries[i].sha1)
763
common_mask = (~(first ^ this)) & common_mask
765
self.common_mask = common_mask
766
while common_mask & 0x80000000:
767
common_mask = common_mask << 1
769
self.common_shift = common_shift
770
self.common_bits = first & self.common_mask
772
for i from 0 <= i < self.num_entries:
773
this_offset = self._offset_for_sha1(self.entries[i].sha1)
774
while offset <= this_offset:
775
self.offsets[offset] = i
778
self.offsets[offset] = self.num_entries
781
property _test_offsets:
786
for i from 0 <= i < 257:
787
result.append(self.offsets[i])
716
791
def _parse_into_chk(bytes, key_length, ref_list_length):