1
# Copyright (C) 2009, 2010 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
# cython: language_level=3
19
"""Definition of a class that is similar to Set with some small changes."""
21
from cpython.object cimport (
31
from cpython.mem cimport (
35
from cpython.ref cimport (
39
from libc.string cimport memset
42
# Dummy is an object used to mark nodes that have been deleted. Since
43
# collisions require us to move a node to an alternative location, if we just
44
# set an entry to NULL on delete, we won't find any relocated nodes.
45
# We have to use _dummy_obj because we need to keep a refcount to it, but we
46
# also use _dummy as a pointer, because it avoids having to put <PyObject*> all
48
cdef object _dummy_obj
51
_dummy = <PyObject *>_dummy_obj
54
cdef object _NotImplemented
55
_NotImplemented = NotImplemented
58
cdef int _is_equal(object this, long this_hash, object other) except -1:
61
other_hash = PyObject_Hash(other)
62
if other_hash != this_hash:
65
# This implements a subset of the PyObject_RichCompareBool functionality.
67
# 1) Doesn't try to do anything with old-style classes
68
# 2) Assumes that both objects have a tp_richcompare implementation, and
69
# that if that is not enough to compare equal, then they are not
70
# equal. (It doesn't try to cast them both to some intermediate form
71
# that would compare equal.)
72
res = Py_TYPE(this).tp_richcompare(this, other, Py_EQ)
73
if res is _NotImplemented:
74
res = Py_TYPE(other).tp_richcompare(other, this, Py_EQ)
75
if res is _NotImplemented:
82
cdef public api class SimpleSet [object SimpleSetObject, type SimpleSet_Type]:
83
"""This class can be used to track canonical forms for objects.
85
It is similar in function to the interned dictionary that is used by
88
1) It assumes that hash(obj) is cheap, so does not need to inline a copy
90
2) It only stores one reference to the object, rather than 2 (key vs
93
As such, it uses 1/3rd the amount of memory to store a pointer to the
96
# Attributes are defined in the .pxd file
100
cdef Py_ssize_t size, n_bytes
103
self._mask = size - 1
106
n_bytes = sizeof(PyObject*) * size;
107
self._table = <PyObject **>PyMem_Malloc(n_bytes)
108
if self._table == NULL:
110
memset(self._table, 0, n_bytes)
112
def __sizeof__(self):
113
# Note: Pyrex doesn't allow sizeof(class) so we re-implement it here.
119
# Note that we might get alignment, etc, wrong, but at least this is
120
# better than no estimate at all
121
# return sizeof(SimpleSet) + (self._mask + 1) * (sizeof(PyObject*))
122
return (sizeof(PyObject) + sizeof(void*)
123
+ 3*sizeof(Py_ssize_t) + sizeof(PyObject**)
124
+ (self._mask + 1) * sizeof(PyObject*))
126
def __dealloc__(self):
127
if self._table != NULL:
128
PyMem_Free(self._table)
143
def _memory_size(self):
144
"""Return the number of bytes of memory consumed by this class."""
145
return sizeof(self) + (sizeof(PyObject*)*(self._mask + 1))
150
def _test_lookup(self, key):
153
slot = _lookup(self, key)
156
elif slot[0] == _dummy:
159
res = <object>slot[0]
160
return <int>(slot - self._table), res
162
def __contains__(self, key):
163
"""Is key present in this SimpleSet."""
166
slot = _lookup(self, key)
167
if slot[0] == NULL or slot[0] == _dummy:
171
cdef PyObject *_get(self, object key) except? NULL:
172
"""Return the object (or nothing) define at the given location."""
175
slot = _lookup(self, key)
176
if slot[0] == NULL or slot[0] == _dummy:
180
def __getitem__(self, key):
181
"""Return a stored item that is equivalent to key."""
182
cdef PyObject *py_val
184
py_val = self._get(key)
186
raise KeyError("Key %s is not present" % key)
187
val = <object>(py_val)
190
cdef int _insert_clean(self, PyObject *key) except -1:
191
"""Insert a key into self.table.
193
This is only meant to be used during times like '_resize',
194
as it makes a lot of assuptions about keys not already being present,
195
and there being no dummy entries.
197
cdef size_t i, n_lookup
199
cdef PyObject **table
206
the_hash = PyObject_Hash(<object>key)
208
for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
209
slot = &table[i & mask]
212
self._fill = self._fill + 1
213
self._used = self._used + 1
216
raise RuntimeError('ran out of slots.')
218
def _py_resize(self, min_used):
219
"""Do not use this directly, it is only exposed for testing."""
220
return self._resize(min_used)
222
cdef Py_ssize_t _resize(self, Py_ssize_t min_used) except -1:
223
"""Resize the internal table.
225
The final table will be big enough to hold at least min_used entries.
226
We will copy the data from the existing table over, leaving out dummy
229
:return: The new size of the internal table
231
cdef Py_ssize_t new_size, n_bytes, remaining
232
cdef PyObject **new_table
233
cdef PyObject **old_table
236
new_size = DEFAULT_SIZE
237
while new_size <= min_used and new_size > 0:
238
new_size = new_size << 1
239
# We rolled over our signed size field
242
# Even if min_used == self._mask + 1, and we aren't changing the actual
243
# size, we will still run the algorithm so that dummy entries are
246
# if new_size < self._used:
247
# raise RuntimeError('cannot shrink SimpleSet to something'
248
# ' smaller than the number of used slots.')
249
n_bytes = sizeof(PyObject*) * new_size;
250
new_table = <PyObject **>PyMem_Malloc(n_bytes)
251
if new_table == NULL:
254
old_table = self._table
255
self._table = new_table
256
memset(self._table, 0, n_bytes)
257
self._mask = new_size - 1
259
remaining = self._fill
262
# Moving everything to the other table is refcount neutral, so we don't
266
if slot[0] == NULL: # unused slot
268
elif slot[0] == _dummy: # dummy slot
269
remaining = remaining - 1
271
remaining = remaining - 1
272
self._insert_clean(slot[0])
274
PyMem_Free(old_table)
277
cpdef object add(self, key):
278
"""Similar to set.add(), start tracking this key.
280
There is one small difference, which is that we return the object that
281
is stored at the given location. (which is closer to the
282
dict.setdefault() functionality.)
287
if (Py_TYPE(key).tp_richcompare == NULL
288
or Py_TYPE(key).tp_hash == NULL):
289
raise TypeError('Types added to SimpleSet must implement'
290
' both tp_richcompare and tp_hash')
292
# We need at least one empty slot
293
assert self._used < self._mask
294
slot = _lookup(self, key)
295
if (slot[0] == NULL):
297
self._fill = self._fill + 1
298
self._used = self._used + 1
299
slot[0] = <PyObject *>key
301
elif (slot[0] == _dummy):
303
self._used = self._used + 1
304
slot[0] = <PyObject *>key
306
# No else: clause. If _lookup returns a pointer to
307
# a live object, then we already have a value at this location.
308
retval = <object>(slot[0])
309
# PySet and PyDict use a 2-3rds full algorithm, we'll follow suit
310
if added and (self._fill * 3) >= ((self._mask + 1) * 2):
311
# However, we always work for a load factor of 2:1
312
self._resize(self._used * 2)
313
# Even if we resized and ended up moving retval into a different slot,
314
# it is still the value that is held at the slot equivalent to 'key',
315
# so we can still return it
318
cpdef bint discard(self, key) except -1:
319
"""Remove key from the set, whether it exists or not.
321
:return: False if the item did not exist, True if it did
325
slot = _lookup(self, key)
326
if slot[0] == NULL or slot[0] == _dummy:
328
self._used = self._used - 1
329
Py_DECREF(<object>slot[0])
331
# PySet uses the heuristic: If more than 1/5 are dummies, then resize
333
# if ((so->_fill - so->_used) * 5 < so->mask)
334
# However, we are planning on using this as an interning structure, in
335
# which we will be putting a lot of objects. And we expect that large
336
# groups of them are going to have the same lifetime.
337
# Dummy entries hurt a little bit because they cause the lookup to keep
338
# searching, but resizing is also rather expensive
339
# For now, we'll just use their algorithm, but we may want to revisit
341
if ((self._fill - self._used) * 5 > self._mask):
342
self._resize(self._used * 2)
346
return _SimpleSet_iterator(self)
349
cdef class _SimpleSet_iterator:
350
"""Iterator over the SimpleSet structure."""
354
cdef Py_ssize_t _used # track if things have been mutated while iterating
355
cdef Py_ssize_t len # number of entries left
357
def __init__(self, obj):
360
self._used = self.set._used
361
self.len = self.set._used
367
cdef Py_ssize_t mask, i
372
if self.set._used != self._used:
373
# Force this exception to continue to be raised
375
raise RuntimeError("Set size changed during iteration")
376
if not SimpleSet_Next(self.set, &self.pos, &key):
380
the_key = <object>key # INCREF
381
self.len = self.len - 1
384
def __length_hint__(self):
385
if self.set is not None and self._used == self.set._used:
390
cdef api SimpleSet SimpleSet_New():
391
"""Create a new SimpleSet object."""
395
cdef SimpleSet _check_self(object self):
396
"""Check that the parameter is not None.
398
Pyrex/Cython will do type checking, but only to ensure that an object is
399
either the right type or None. You can say "object foo not None" for pure
400
python functions, but not for C functions.
401
So this is just a helper for all the apis that need to do the check.
403
cdef SimpleSet true_self
405
raise TypeError('self must not be None')
410
cdef PyObject **_lookup(SimpleSet self, object key) except NULL:
411
"""Find the slot where 'key' would fit.
413
This is the same as a dicts 'lookup' function.
415
:param key: An object we are looking up
416
:param hash: The hash for key
417
:return: The location in self.table where key should be put.
418
location == NULL is an exception, but (*location) == NULL just
419
indicates the slot is empty and can be used.
421
# This uses Quadratic Probing:
422
# http://en.wikipedia.org/wiki/Quadratic_probing
424
# This leads to probe locations at:
427
# h2 = h0 + 3 = h1 + 1 + 1
428
# h3 = h0 + 6 = h2 + 1 + 2
429
# h4 = h0 + 10 = h2 + 1 + 3
430
# Note that all of these are '& mask', but that is computed *after* the
432
# This differs from the algorithm used by Set and Dict. Which, effectively,
433
# use double-hashing, and a step size that starts large, but dwindles to
434
# stepping one-by-one.
435
# This gives more 'locality' in that if you have a collision at offset X,
436
# the first fallback is X+1, which is fast to check. However, that means
437
# that an object w/ hash X+1 will also check there, and then X+2 next.
438
# However, for objects with differing hashes, their chains are different.
439
# The former checks X, X+1, X+3, ... the latter checks X+1, X+2, X+4, ...
440
# So different hashes diverge quickly.
441
# A bigger problem is that we *only* ever use the lowest bits of the hash
442
# So all integers (x + SIZE*N) will resolve into the same bucket, and all
443
# use the same collision resolution. We may want to try to find a way to
444
# incorporate the upper bits of the hash with quadratic probing. (For
445
# example, X, X+1, X+3+some_upper_bits, X+6+more_upper_bits, etc.)
446
cdef size_t i, n_lookup
449
cdef PyObject **table
452
cdef PyObject **free_slot
454
key_hash = PyObject_Hash(key)
459
for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
460
slot = &table[i & mask]
464
if free_slot != NULL:
465
# Did we find an earlier _dummy entry?
469
if cur == <PyObject *>key:
470
# Found an exact pointer to the key
473
if free_slot == NULL:
475
elif _is_equal(key, key_hash, <object>cur):
476
# Both py_key and cur belong in this slot, return it
479
raise AssertionError('should never get here')
482
cdef api PyObject **_SimpleSet_Lookup(object self, object key) except NULL:
483
"""Find the slot where 'key' would fit.
485
This is the same as a dicts 'lookup' function. This is a private
486
api because mutating what you get without maintaing the other invariants
489
:param key: An object we are looking up
490
:param hash: The hash for key
491
:return: The location in self._table where key should be put
492
should never be NULL, but may reference a NULL (PyObject*)
494
return _lookup(_check_self(self), key)
497
cdef api object SimpleSet_Add(object self, object key):
498
"""Add a key to the SimpleSet (set).
500
:param self: The SimpleSet to add the key to.
501
:param key: The key to be added. If the key is already present,
502
self will not be modified
503
:return: The current key stored at the location defined by 'key'.
504
This may be the same object, or it may be an equivalent object.
505
(consider dict.setdefault(key, key))
507
return _check_self(self).add(key)
510
cdef api int SimpleSet_Contains(object self, object key) except -1:
511
"""Is key present in self?"""
512
return (key in _check_self(self))
515
cdef api int SimpleSet_Discard(object self, object key) except -1:
516
"""Remove the object referenced at location 'key'.
518
:param self: The SimpleSet being modified
519
:param key: The key we are checking on
520
:return: 1 if there was an object present, 0 if there was not, and -1 on
523
return _check_self(self).discard(key)
526
cdef api PyObject *SimpleSet_Get(SimpleSet self, object key) except? NULL:
527
"""Get a pointer to the object present at location 'key'.
529
This returns an object which is equal to key which was previously added to
530
self. This returns a borrowed reference, as it may also return NULL if no
531
value is present at that location.
533
:param key: The value we are looking for
534
:return: The object present at that location
536
return _check_self(self)._get(key)
539
cdef api Py_ssize_t SimpleSet_Size(object self) except -1:
540
"""Get the number of active entries in 'self'"""
541
return _check_self(self)._used
544
cdef api int SimpleSet_Next(object self, Py_ssize_t *pos,
545
PyObject **key) except -1:
546
"""Walk over items in a SimpleSet.
548
:param pos: should be initialized to 0 by the caller, and will be updated
550
:param key: Will return a borrowed reference to key
551
:return: 0 if nothing left, 1 if we are returning a new value
553
cdef Py_ssize_t i, mask
554
cdef SimpleSet true_self
555
cdef PyObject **table
556
true_self = _check_self(self)
560
mask = true_self._mask
561
table= true_self._table
562
while (i <= mask and (table[i] == NULL or table[i] == _dummy)):
572
cdef int SimpleSet_traverse(SimpleSet self, visitproc visit,
573
void *arg) except -1:
574
"""This is an implementation of 'tp_traverse' that hits the whole table.
576
Cython/Pyrex don't seem to let you define a tp_traverse, and they only
577
define one for you if you have an 'object' attribute. Since they don't
578
support C arrays of objects, we access the PyObject * directly.
581
cdef PyObject *next_key
585
while SimpleSet_Next(self, &pos, &next_key):
586
ret = visit(next_key, arg)
591
# It is a little bit ugly to do this, but it works, and means that Meliae can
592
# dump the total memory consumed by all child objects.
593
(<PyTypeObject *>SimpleSet).tp_traverse = <traverseproc>SimpleSet_traverse