optimizing large dictionaries

Christian Heimes lists at cheimes.de
Thu Jan 15 17:22:48 EST 2009


> class MyClass:
> 
>   def __str__(self):
>     return "%s-%s-%s" %(self.field1, self.field2, self.field3)
> 
>   def __repr__(self):
>     return str(self)
> 
>   def __hash__(self):
>     return hash(str(self))
> 
> 
> is there anything that can be done to speed up this simple code? right
> now it is taking well over 15 minutes to process, on a 3 GHz machine
> with lots of RAM (though this is all taking CPU power, not RAM at this
> point.)

class MyClass(object):
    # a new style class with slots saves some memory
    __slots__ = ("field1", "field2", "field2")

    def __hash__(self):
        # In general this is faster than your approach because
        # it requires fewer function calls. However, all fields
        # must be hashable.
        return hash((self.field1, self.field2, self.field3))

    def __eq__(self, other):
        # you MUST provide a matching __eq__ when you define your
        # own __hash__. The dict uses it to resolve collisions,
        # i.e. when hash(a) == hash(b).
        if not isinstance(other, MyClass):
            return NotImplemented
        return (self.field1 == other.field1
            and self.field2 == other.field2
            and self.field3 == other.field3)

    def __str__(self):
        return "%s-%s-%s" %(self.field1, self.field2, self.field3)

    # faster than your approach because it avoids an extra function
    # lookup. You can omit this alias, too; it's not required.
    __repr__ = __str__
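
For illustration, here is a small, hypothetical usage sketch (the field
values and the dict are made up) showing why the tuple-based __hash__
and the matching __eq__ matter when instances are used as dict keys:

a = MyClass()
a.field1, a.field2, a.field3 = "spam", 42, "eggs"
b = MyClass()
b.field1, b.field2, b.field3 = "spam", 42, "eggs"

counts = {}
counts[a] = 1
# b hashes to the same value and compares equal to a, so it hits
# the same dict slot instead of creating a second entry
counts[b] = counts.get(b, 0) + 1
print(counts[a])   # -> 2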


Instead of the try/except clause I recommend collections.defaultdict.
defaultdict takes a callable as its factory function. Since int() returns
0, it has the same effect as your code.

from collections import defaultdict
elt = defaultdict(int)
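
For example (with made-up keys, and assuming elt is counting occurrences
as in your try/except version), the counting loop then simply becomes:

for key in ("a", "b", "a"):
    elt[key] += 1      # missing keys start at int() == 0
# which is equivalent to the try/except version:
#     try:
#         elt[key] += 1
#     except KeyError:
#         elt[key] = 1
print(elt["a"])   # -> 2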

Christian



