naive string question

Wed Mar 5 02:24:51 EST 2003

Quoth Létezõ:
> Your problem can be solved with an impressive list comprehension:
> 
> b= [(i,p) for i,c,p in zip(range(1,len(a)),a[1:],a[:-1]) if c!=p]
> 
> The above solution is more pythonical, but could be slower. I did not
> measured their speed.

Once more unto the benchmark, dear friends, and stop the gaps with
our poor-performing functions!

    2.43  1.000  original: Kurowski's original version.
    2.55  1.049  listcomp: Viktor's impressive list comprehension.
    2.03  0.835  listcomp2: A more literal translation of the original.
    1.84  0.757  optimized: An optimized version of the original.
    4.86  2.000  regex: For the regular expression addict.
    4.47  1.840  reducer: An abomination.

(Processor seconds on my machine to run each function 10,000 times
on Kurowski's sample string; ratios of those times; name and brief
description of the implementation.  On my machine, the times vary
by up to a few hundredths of a second on successive runs.)

The last two entries are for amusement only.

Code:

#!/usr/bin/env python
from __future__ import division
import re
import time

def original(a):
    """Kurowski's original version."""
    b = []
    for i in range(1, len(a)):
        if a[i] != a[i-1]:
            b.append((i, a[i-1]))
    return b

def listcomp(a):
    """Viktor's impressive list comprehension."""
    return [(i,p) for i,c,p in zip(range(1,len(a)),a[1:],a[:-1]) if c!=p]

def listcomp2(a):
    """A more literal translation of the original."""
    return [(i, a[i-1]) for i in range(1, len(a)) if a[i] != a[i-1]]

def optimized(a):
    """An optimized version of the original.

    Optimizations include moving a name lookup out of the loop,
    remembering the value currently being repeated instead of fetching
    a[i-1], and iterating over the string instead of indexing into it.
    (It turns out we can also get away with iterating over a instead
    of a[1:], and thereby avoid making a whole new string.  For the
    test string, this is slightly faster despite the extra run of the
    loop; this should hold except for tiny strings.)
    """
    b = []
    if a:
        append = b.append
        prev = a[0]
        i = 0
        for cur in a:
            if cur != prev:
                append((i, prev))
                prev = cur
            i += 1
    return b

_pat = re.compile(r'(.)\1*')
def regex(a):
    """For the regular expression addict."""
    return [(match.end(), a[match.start()])
        for match in _pat.finditer(a)][:-1]

def reducer(a):
    """An abomination."""
    b = []
    reduce(lambda x,y: x[1] != y[1] and b.append(x) or y, 
        zip(range(1, len(a)+1), a), (0, a and a[0]))
    return b

def test(f):
    for input, expected in [
            # based on the behaviour of the original
            ('', []),
            ('a', []),
            ('aa', []),
            ('ab', [(1, 'a')]),
            ('abb', [(1, 'a')]),
            ('abbc', [(1, 'a'), (3, 'b')]),
            ('aaa-ab-a-bbbaa', [(3, 'a'), (4, '-'), (5, 'a'),
                (6, 'b'), (7, '-'), (8, 'a'), (9, '-'), (12, 'b')]),
            ]:
        actual = f(input)
        if actual != expected:
            assert 0, '%s(%r) == %r, not %r' % (f.__name__,
                input, actual, expected)

basis = None
for f in (original, listcomp, listcomp2, optimized, regex, reducer):
    test(f)
    start = time.clock()
    for _ in xrange(10000):
        result = f('aaa-ab-a-bbbaa')
    end = time.clock()
    elapsed = end - start
    if basis is None:
        basis = elapsed
    print '%.2f  %.3f  %s' % (elapsed, elapsed/basis,
        f.__name__ + ': ' + f.__doc__.splitlines()[0])

-- 
Steven Taschuk                          staschuk at telusplanet.net
"Its force is immeasurable.  Even Computer cannot determine it."
                            -- _Space: 1999_ episode "Black Sun"