don't need dictionary's keys - hash table?

Peter Otten __peter__ at web.de
Thu Jul 13 04:02:55 EDT 2006


Terry Hancock wrote:

> Nick Vatamaniuc wrote:
>>  The original post just said that he wanted to index some values by
>>  their urls and didn't really care about the urls themselves, so I
>>  suggested that he just use the hash of the key as the key. The
>>  dictionary will then take a hash of that [note that:
>>  hash(string)=hash(hash(string)) ] then the dictionary will not keep
>>  the reference to the urls and GC will collect them. So instead of
>>  dic[urlstring']=value he will do dic[hash(urlstring)]=value. But then
>>  to retrieve he will have to do old_value=dic[hash(urlstring]].
> 
> Note that it is trivial to catch collisions on entry and correct them:
> 
> key = hash(url_string)
> i      = 0
> if d.has_key(key):
>    while d.has_key(key):
>        i += 1
>    d[key + repr(i)] = val
> else:
>    d[key] = val
> 
> or something along those lines. No need to keep the original string.

How do you intend to get the value back out of the dictionary?

candidates = []
for key in xrange(hash(url)):
    try:
        candidates.append(d[key])
    except KeyError:
        break

Instead, I would put all values with the same hash into a container, along
the lines of

d.setdefault(hash(url), []).append(value))

Here is a simple implementation that can be used to put the idea to a test.
It keeps all but the first url for a cluster with identical hashes.

class Cluster(object):
    """A cluster of dictionary values with the same hash(key).

    Helper for Dict.
    """
    def __init__(self, default):
        self.pairs = {}
        self.default = default
    def __setitem__(self, key, value):
        assert key not in self.pairs
        self.pairs[key] = value
    def __getitem__(self, key):
        return self.pairs.get(key, self.default)
    def __repr__(self):
        return "Cluster(default=%r, pairs=%r)" % (self.default, self.pairs)
    def itervalues(self):
        yield self.default
        for value in self.pairs.itervalues():
            yield value

class Dict(object):
    """A dictionary that stores hashes instead of keys as long as there are
    no collisions.

    The value entered first becomes the default for a cluster of values with
    the same hash. It follows that you cannot implement a reliable
    containment test and must make sure by exterior means that only existing
    keys are looked up.
    """
    def __init__(self):
        self.dict = {}
    def __setitem__(self, key, value):
        short_key = hash(key)
        try:
            cluster = self.dict[short_key]
        except KeyError:
            self.dict[short_key] = value
        else:
            if isinstance(cluster, Cluster):
               cluster[key] = value
            else:
                cluster = Cluster(cluster)
                cluster[key] = value
                self.dict[short_key] = cluster
    def __getitem__(self, key):
        cluster = self.dict[hash(key)]
        if not isinstance(cluster, Cluster):
            return cluster
        return cluster[key]
    def itervalues(self):
        for value in self.dict.itervalues():
            if isinstance(value, Cluster):
                for v in value.itervalues():
                    yield v
            else:
                yield value
    def __repr__(self):
        return "Dict(%s)" % repr(self.dict)[1:-1]

if __name__ == "__main__":
    class BadHash(object):
        """Key with user-defined hash.

        Allows enforcing hash collisions.
        """
        def __init__(self, value, hash):
            self.value = value
            self.hash = hash
        def __hash__(self):
            return hash(self.hash)
        def __eq__(self, other):
            return self.value == other.value
        def __repr__(self):
            return "BadHash(value=%r, hash=%r)" % (self.value, self.hash)
    d = Dict()
    for i, v in enumerate("alpha beta gamma delta".split()):
        d[BadHash(v, i)] = v
    for v in "sigma tau omega".split():
        d[BadHash(v, 42)] = v
    print d
    assert d[BadHash("gamma", 2)] == "gamma"

    assert d[BadHash("sigma", 42)] == "sigma"
    assert d[BadHash("tau", 42)] == "tau"

    # but be warned that
    assert d[BadHash("whatever", 42)] == "sigma"

    expected = sorted("alpha beta gamma delta sigma tau omega".split())
    assert list(sorted(d.itervalues())) == expected

I fear that the memory footprint of a url is not large enough to warrant the
approach, though.

Peter







More information about the Python-list mailing list