1
# Copyright (C) 2009, 2010 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Definition of a class that is similar to Set with some small changes."""
19
from __future__ import absolute_import
21
cdef extern from "python-compat.h":
24
cdef extern from "Python.h":
25
ctypedef unsigned long size_t
26
ctypedef long (*hashfunc)(PyObject*) except -1
27
ctypedef object (*richcmpfunc)(PyObject *, PyObject *, int)
28
ctypedef int (*visitproc)(PyObject *, void *)
29
ctypedef int (*traverseproc)(PyObject *, visitproc, void *)
31
void Py_INCREF(PyObject *)
32
void Py_DECREF(PyObject *)
33
ctypedef struct PyTypeObject:
35
richcmpfunc tp_richcompare
36
traverseproc tp_traverse
38
PyTypeObject *Py_TYPE(PyObject *)
39
# Note: *Don't* use hash(), Pyrex 0.9.8.5 thinks it returns an 'int', and
40
# thus silently truncates to 32-bits on 64-bit machines.
41
long PyObject_Hash(PyObject *) except -1
43
void *PyMem_Malloc(size_t nbytes)
44
void PyMem_Free(void *)
45
void memset(void *, int, size_t)
48
# Dummy is an object used to mark nodes that have been deleted. Since
49
# collisions require us to move a node to an alternative location, if we just
50
# set an entry to NULL on delete, we won't find any relocated nodes.
51
# We have to use _dummy_obj because we need to keep a refcount to it, but we
52
# also use _dummy as a pointer, because it avoids having to put <PyObject*> all
54
cdef object _dummy_obj
57
_dummy = <PyObject *>_dummy_obj
60
cdef object _NotImplemented
61
_NotImplemented = NotImplemented
64
cdef int _is_equal(PyObject *this, long this_hash, PyObject *other) except -1:
69
other_hash = PyObject_Hash(other)
70
if other_hash != this_hash:
73
# This implements a subset of the PyObject_RichCompareBool functionality.
75
# 1) Doesn't try to do anything with old-style classes
76
# 2) Assumes that both objects have a tp_richcompare implementation, and
77
# that if that is not enough to compare equal, then they are not
78
# equal. (It doesn't try to cast them both to some intermediate form
79
# that would compare equal.)
80
res = Py_TYPE(this).tp_richcompare(this, other, Py_EQ)
81
if res is _NotImplemented:
82
res = Py_TYPE(other).tp_richcompare(other, this, Py_EQ)
83
if res is _NotImplemented:
90
cdef public api class SimpleSet [object SimpleSetObject, type SimpleSet_Type]:
91
"""This class can be used to track canonical forms for objects.
93
It is similar in function to the interned dictionary that is used by
96
1) It assumes that hash(obj) is cheap, so does not need to inline a copy
98
2) It only stores one reference to the object, rather than 2 (key vs
101
As such, it uses 1/3rd the amount of memory to store a pointer to the
104
# Attributes are defined in the .pxd file
105
DEF DEFAULT_SIZE=1024
108
cdef Py_ssize_t size, n_bytes
111
self._mask = size - 1
114
n_bytes = sizeof(PyObject*) * size;
115
self._table = <PyObject **>PyMem_Malloc(n_bytes)
116
if self._table == NULL:
118
memset(self._table, 0, n_bytes)
120
def __sizeof__(self):
121
# Note: Pyrex doesn't allow sizeof(class) so we re-implement it here.
127
# Note that we might get alignment, etc, wrong, but at least this is
128
# better than no estimate at all
129
# return sizeof(SimpleSet) + (self._mask + 1) * (sizeof(PyObject*))
130
return (sizeof(PyObject) + sizeof(void*)
131
+ 3*sizeof(Py_ssize_t) + sizeof(PyObject**)
132
+ (self._mask + 1) * sizeof(PyObject*))
134
def __dealloc__(self):
135
if self._table != NULL:
136
PyMem_Free(self._table)
151
def _memory_size(self):
152
"""Return the number of bytes of memory consumed by this class."""
153
return sizeof(self) + (sizeof(PyObject*)*(self._mask + 1))
158
def _test_lookup(self, key):
161
slot = _lookup(self, key)
164
elif slot[0] == _dummy:
167
res = <object>slot[0]
168
return <int>(slot - self._table), res
170
def __contains__(self, key):
171
"""Is key present in this SimpleSet."""
174
slot = _lookup(self, key)
175
if slot[0] == NULL or slot[0] == _dummy:
179
cdef PyObject *_get(self, object key) except? NULL:
180
"""Return the object (or nothing) define at the given location."""
183
slot = _lookup(self, key)
184
if slot[0] == NULL or slot[0] == _dummy:
188
def __getitem__(self, key):
189
"""Return a stored item that is equivalent to key."""
190
cdef PyObject *py_val
192
py_val = self._get(key)
194
raise KeyError("Key %s is not present" % key)
195
val = <object>(py_val)
198
cdef int _insert_clean(self, PyObject *key) except -1:
199
"""Insert a key into self.table.
201
This is only meant to be used during times like '_resize',
202
as it makes a lot of assuptions about keys not already being present,
203
and there being no dummy entries.
205
cdef size_t i, n_lookup
207
cdef PyObject **table, **slot
213
the_hash = PyObject_Hash(key)
215
for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
216
slot = &table[i & mask]
219
self._fill = self._fill + 1
220
self._used = self._used + 1
223
raise RuntimeError('ran out of slots.')
225
def _py_resize(self, min_used):
226
"""Do not use this directly, it is only exposed for testing."""
227
return self._resize(min_used)
229
cdef Py_ssize_t _resize(self, Py_ssize_t min_used) except -1:
230
"""Resize the internal table.
232
The final table will be big enough to hold at least min_used entries.
233
We will copy the data from the existing table over, leaving out dummy
236
:return: The new size of the internal table
238
cdef Py_ssize_t new_size, n_bytes, remaining
239
cdef PyObject **new_table, **old_table, **slot
241
new_size = DEFAULT_SIZE
242
while new_size <= min_used and new_size > 0:
243
new_size = new_size << 1
244
# We rolled over our signed size field
247
# Even if min_used == self._mask + 1, and we aren't changing the actual
248
# size, we will still run the algorithm so that dummy entries are
251
# if new_size < self._used:
252
# raise RuntimeError('cannot shrink SimpleSet to something'
253
# ' smaller than the number of used slots.')
254
n_bytes = sizeof(PyObject*) * new_size;
255
new_table = <PyObject **>PyMem_Malloc(n_bytes)
256
if new_table == NULL:
259
old_table = self._table
260
self._table = new_table
261
memset(self._table, 0, n_bytes)
262
self._mask = new_size - 1
264
remaining = self._fill
267
# Moving everything to the other table is refcount neutral, so we don't
271
if slot[0] == NULL: # unused slot
273
elif slot[0] == _dummy: # dummy slot
274
remaining = remaining - 1
276
remaining = remaining - 1
277
self._insert_clean(slot[0])
279
PyMem_Free(old_table)
283
"""Similar to set.add(), start tracking this key.
285
There is one small difference, which is that we return the object that
286
is stored at the given location. (which is closer to the
287
dict.setdefault() functionality.)
289
return self._add(key)
291
cdef object _add(self, key):
292
cdef PyObject **slot, *py_key
295
py_key = <PyObject *>key
296
if (Py_TYPE(py_key).tp_richcompare == NULL
297
or Py_TYPE(py_key).tp_hash == NULL):
298
raise TypeError('Types added to SimpleSet must implement'
299
' both tp_richcompare and tp_hash')
301
# We need at least one empty slot
302
assert self._used < self._mask
303
slot = _lookup(self, key)
304
if (slot[0] == NULL):
306
self._fill = self._fill + 1
307
self._used = self._used + 1
310
elif (slot[0] == _dummy):
312
self._used = self._used + 1
315
# No else: clause. If _lookup returns a pointer to
316
# a live object, then we already have a value at this location.
317
retval = <object>(slot[0])
318
# PySet and PyDict use a 2-3rds full algorithm, we'll follow suit
319
if added and (self._fill * 3) >= ((self._mask + 1) * 2):
320
# However, we always work for a load factor of 2:1
321
self._resize(self._used * 2)
322
# Even if we resized and ended up moving retval into a different slot,
323
# it is still the value that is held at the slot equivalent to 'key',
324
# so we can still return it
327
def discard(self, key):
328
"""Remove key from the set, whether it exists or not.
330
:return: False if the item did not exist, True if it did
332
if self._discard(key):
336
cdef int _discard(self, key) except -1:
337
cdef PyObject **slot, *py_key
339
slot = _lookup(self, key)
340
if slot[0] == NULL or slot[0] == _dummy:
342
self._used = self._used - 1
345
# PySet uses the heuristic: If more than 1/5 are dummies, then resize
347
# if ((so->_fill - so->_used) * 5 < so->mask)
348
# However, we are planning on using this as an interning structure, in
349
# which we will be putting a lot of objects. And we expect that large
350
# groups of them are going to have the same lifetime.
351
# Dummy entries hurt a little bit because they cause the lookup to keep
352
# searching, but resizing is also rather expensive
353
# For now, we'll just use their algorithm, but we may want to revisit
355
if ((self._fill - self._used) * 5 > self._mask):
356
self._resize(self._used * 2)
360
return _SimpleSet_iterator(self)
363
cdef class _SimpleSet_iterator:
364
"""Iterator over the SimpleSet structure."""
368
cdef Py_ssize_t _used # track if things have been mutated while iterating
369
cdef Py_ssize_t len # number of entries left
371
def __init__(self, obj):
374
self._used = self.set._used
375
self.len = self.set._used
381
cdef Py_ssize_t mask, i
386
if self.set._used != self._used:
387
# Force this exception to continue to be raised
389
raise RuntimeError("Set size changed during iteration")
390
if not SimpleSet_Next(self.set, &self.pos, &key):
394
the_key = <object>key # INCREF
395
self.len = self.len - 1
398
def __length_hint__(self):
399
if self.set is not None and self._used == self.set._used:
405
cdef api SimpleSet SimpleSet_New():
406
"""Create a new SimpleSet object."""
410
cdef SimpleSet _check_self(object self):
411
"""Check that the parameter is not None.
413
Pyrex/Cython will do type checking, but only to ensure that an object is
414
either the right type or None. You can say "object foo not None" for pure
415
python functions, but not for C functions.
416
So this is just a helper for all the apis that need to do the check.
418
cdef SimpleSet true_self
420
raise TypeError('self must not be None')
425
cdef PyObject **_lookup(SimpleSet self, object key) except NULL:
426
"""Find the slot where 'key' would fit.
428
This is the same as a dicts 'lookup' function.
430
:param key: An object we are looking up
431
:param hash: The hash for key
432
:return: The location in self.table where key should be put.
433
location == NULL is an exception, but (*location) == NULL just
434
indicates the slot is empty and can be used.
436
# This uses Quadratic Probing:
437
# http://en.wikipedia.org/wiki/Quadratic_probing
439
# This leads to probe locations at:
442
# h2 = h0 + 3 = h1 + 1 + 1
443
# h3 = h0 + 6 = h2 + 1 + 2
444
# h4 = h0 + 10 = h2 + 1 + 3
445
# Note that all of these are '& mask', but that is computed *after* the
447
# This differs from the algorithm used by Set and Dict. Which, effectively,
448
# use double-hashing, and a step size that starts large, but dwindles to
449
# stepping one-by-one.
450
# This gives more 'locality' in that if you have a collision at offset X,
451
# the first fallback is X+1, which is fast to check. However, that means
452
# that an object w/ hash X+1 will also check there, and then X+2 next.
453
# However, for objects with differing hashes, their chains are different.
454
# The former checks X, X+1, X+3, ... the latter checks X+1, X+2, X+4, ...
455
# So different hashes diverge quickly.
456
# A bigger problem is that we *only* ever use the lowest bits of the hash
457
# So all integers (x + SIZE*N) will resolve into the same bucket, and all
458
# use the same collision resolution. We may want to try to find a way to
459
# incorporate the upper bits of the hash with quadratic probing. (For
460
# example, X, X+1, X+3+some_upper_bits, X+6+more_upper_bits, etc.)
461
cdef size_t i, n_lookup
464
cdef PyObject **table, **slot, *cur, **free_slot, *py_key
466
py_key = <PyObject *>key
467
# Note: avoid using hash(obj) because of a bug w/ pyrex 0.9.8.5 and 64-bit
468
# (it treats hash() as returning an 'int' rather than a 'long')
469
key_hash = PyObject_Hash(py_key)
474
for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
475
slot = &table[i & mask]
479
if free_slot != NULL:
480
# Did we find an earlier _dummy entry?
485
# Found an exact pointer to the key
488
if free_slot == NULL:
490
elif _is_equal(py_key, key_hash, cur):
491
# Both py_key and cur belong in this slot, return it
494
raise AssertionError('should never get here')
497
cdef api PyObject **_SimpleSet_Lookup(object self, object key) except NULL:
498
"""Find the slot where 'key' would fit.
500
This is the same as a dicts 'lookup' function. This is a private
501
api because mutating what you get without maintaing the other invariants
504
:param key: An object we are looking up
505
:param hash: The hash for key
506
:return: The location in self._table where key should be put
507
should never be NULL, but may reference a NULL (PyObject*)
509
return _lookup(_check_self(self), key)
512
cdef api object SimpleSet_Add(object self, object key):
513
"""Add a key to the SimpleSet (set).
515
:param self: The SimpleSet to add the key to.
516
:param key: The key to be added. If the key is already present,
517
self will not be modified
518
:return: The current key stored at the location defined by 'key'.
519
This may be the same object, or it may be an equivalent object.
520
(consider dict.setdefault(key, key))
522
return _check_self(self)._add(key)
525
cdef api int SimpleSet_Contains(object self, object key) except -1:
526
"""Is key present in self?"""
527
return (key in _check_self(self))
530
cdef api int SimpleSet_Discard(object self, object key) except -1:
531
"""Remove the object referenced at location 'key'.
533
:param self: The SimpleSet being modified
534
:param key: The key we are checking on
535
:return: 1 if there was an object present, 0 if there was not, and -1 on
538
return _check_self(self)._discard(key)
541
cdef api PyObject *SimpleSet_Get(SimpleSet self, object key) except? NULL:
542
"""Get a pointer to the object present at location 'key'.
544
This returns an object which is equal to key which was previously added to
545
self. This returns a borrowed reference, as it may also return NULL if no
546
value is present at that location.
548
:param key: The value we are looking for
549
:return: The object present at that location
551
return _check_self(self)._get(key)
554
cdef api Py_ssize_t SimpleSet_Size(object self) except -1:
555
"""Get the number of active entries in 'self'"""
556
return _check_self(self)._used
559
cdef api int SimpleSet_Next(object self, Py_ssize_t *pos,
560
PyObject **key) except -1:
561
"""Walk over items in a SimpleSet.
563
:param pos: should be initialized to 0 by the caller, and will be updated
565
:param key: Will return a borrowed reference to key
566
:return: 0 if nothing left, 1 if we are returning a new value
568
cdef Py_ssize_t i, mask
569
cdef SimpleSet true_self
570
cdef PyObject **table
571
true_self = _check_self(self)
575
mask = true_self._mask
576
table= true_self._table
577
while (i <= mask and (table[i] == NULL or table[i] == _dummy)):
587
cdef int SimpleSet_traverse(SimpleSet self, visitproc visit,
588
void *arg) except -1:
589
"""This is an implementation of 'tp_traverse' that hits the whole table.
591
Cython/Pyrex don't seem to let you define a tp_traverse, and they only
592
define one for you if you have an 'object' attribute. Since they don't
593
support C arrays of objects, we access the PyObject * directly.
596
cdef PyObject *next_key
600
while SimpleSet_Next(self, &pos, &next_key):
601
ret = visit(next_key, arg)
606
# It is a little bit ugly to do this, but it works, and means that Meliae can
607
# dump the total memory consumed by all child objects.
608
(<PyTypeObject *>SimpleSet).tp_traverse = <traverseproc>SimpleSet_traverse