Performance of list.index - how to speed up a silly algorithm?

Fri Apr 30 03:20:26 EDT 2010

MRAB wrote:

> The .index method does a linear search, checking on average 1/2 of the
> items in the list. That's why it's so slow.
> 
> In order to avoid that you could build a dict of each value in
> dimension_values[col_idx] and its index in a single pass so that it
> becomes a quick lookup.

For example:

from itertools import count, izip

def iterator_factory():
    # columns = ["color", "size", "weight", "value"]
    rows = [
         ["Yellow", "Big", 2, 4],
         ["Blue", "Big", 3, -4],
         ["Blue", "Small", 10, 55],
         #...
    ]

    return iter(rows)

class Indexer(object):
    def __init__(self):
        self.lookup = {}
        self.counter = count()
    def __call__(self, value):
        try:
            return self.lookup[value]
        except KeyError:
            result = self.lookup[value] = next(self.counter)
            return result
    def to_list(self):
        d = self.lookup
        reverse = dict(izip(d.itervalues(), d.iterkeys()))
        assert len(reverse) == len(d)
        return [reverse[i] for i in xrange(len(reverse))]

def pairs(rows, dimension_cols, measure_cols, indexers):
    for row in rows:
        dv = [indexer(row[col])
              for indexer, col in izip(indexers, dimension_cols)]
        mv = [row[col] for col in measure_cols]
        yield dv, mv

def main():
    # dimension_names = ["color", "size"]
    dimension_cols = [0, 1]

    # measure_names = ["weight", "value"]
    measure_cols = [2, 3]

    indexers = [Indexer() for _ in dimension_cols]

    facts = pairs(iterator_factory(),
                  dimension_cols, measure_cols, indexers)
    facts = list(facts)

    print facts
    for i, indexer in enumerate(indexers):
        print "%d: %s" % (i, indexer.to_list())

if __name__ == "__main__":
    main()