Refactor a buffered class...
George Sakkis
george.sakkis at gmail.com
Thu Sep 7 10:08:56 EDT 2006
Michael Spencer wrote:
> George Sakkis wrote:
> > Michael Spencer wrote:
> >
> >> Here's a small update to the generator that allows optional handling of the head
> >> and the tail:
> >>
> >> def chunker(s, chunk_size=3, sentry=".", keep_first = False, keep_last = False):
> >> buffer=[]
> ...
> >
> > And here's a (probably) more efficient version, using a deque as a
> > buffer:
> >
>
> Perhaps the deque-based solution is more efficient under some conditions, but
> it's significantly slower for all the cases I tested:
First off, if you're going to profile something, it's better to use the
standard timeit module than a quick, dirty, and most likely buggy
handmade function. From "Dive Into Python": "The most important thing
you need to know about optimizing Python code is that you shouldn't
write your own timing function.". As it turns out, none of chunk_size,
words_per_group and word_length are taken into account in your tests;
they all have their default values. By the way, word_length is
irrelevant since the input is already a sequence, not a big string to
be split.
Second, the output of the two functions is different, so you're not
comparing apples to apples:
>>> s = " this . is a . test to . check if it . works . well . it looks . like ."
>>> for c in chunkerMS(s.split(), keep_last=True, keep_first=True): print c
...
['this', '.']
['this', '.', 'is', 'a', '.']
['this', '.', 'is', 'a', '.', 'test', 'to', '.']
['is', 'a', '.', 'test', 'to', '.', 'check', 'if', 'it', '.']
['test', 'to', '.', 'check', 'if', 'it', '.', 'works', '.']
['check', 'if', 'it', '.', 'works', '.', 'well', '.']
['works', '.', 'well', '.', 'it', 'looks', '.']
['well', '.', 'it', 'looks', '.', 'like', '.']
['it', 'looks', '.', 'like', '.']
['like', '.']
>>> for c in chunkerGS(s.split(), keep_last=True, keep_first=True): print c
...
['this']
['this', 'is a']
['this', 'is a', 'test to']
['is a', 'test to', 'check if it']
['test to', 'check if it', 'works']
['check if it', 'works', 'well']
['works', 'well', 'it looks']
['well', 'it looks', 'like']
['it looks', 'like']
['like']
Third -- and most important for the measured difference -- the
performance hit in my function came from joining the words of each
group (['check', 'if', 'it'] -> 'check if it') every time it is
yielded. If the groups are left unjoined as in Michael's version, the
results are quite different:
* chunk_size=3
chunkerGS: 0.98 seconds
chunkerMS: 1.04 seconds
* chunk_size=30
chunkerGS: 0.98 seconds
chunkerMS: 1.17 seconds
* chunk_size=300
chunkerGS: 0.98 seconds
chunkerMS: 2.44 seconds
As expected, the deque solution is not affected by chunk_size. Here is
the full test:
# chunkers.py
from itertools import islice
from collections import deque
def chunkerMS(s, chunk_size=3, sentry=".", keep_first = False,
              keep_last = False):
    """Yield a sliding buffer over `s` covering up to `chunk_size`
    sentry-terminated groups.

    The SAME mutable list object is yielded on every step, so callers that
    want to keep a snapshot must copy it (e.g. ``list(chunk)``).

    With `keep_first`, the partially-filled buffer is also yielded while the
    initial window is still being built; with `keep_last`, the shrinking tail
    windows are yielded after the input is exhausted.
    """
    buffer = []
    sentry_count = 0
    for item in s:
        buffer.append(item)
        if item == sentry:
            sentry_count += 1
            if sentry_count < chunk_size:
                # Still filling the initial window of chunk_size groups.
                if keep_first:
                    yield buffer
            else:
                yield buffer
                # Slide the window: drop the oldest group, sentry included.
                del buffer[:buffer.index(sentry) + 1]
    if keep_last:
        while buffer:
            yield buffer
            if sentry in buffer:
                del buffer[:buffer.index(sentry) + 1]
            else:
                # Bug fix: a trailing partial group (input not ending with
                # `sentry`) used to raise ValueError via buffer.index(sentry);
                # just clear the buffer and terminate instead.
                del buffer[:]
def chunkerGS(seq, sentry='.', chunk_size=3, keep_first=False,
              keep_last=False):
    """Slide a window of up to `chunk_size` sentry-delimited groups over
    `seq`, yielding the window (a deque of group lists) at each step.

    The same deque object is yielded every time, so copy it if you need a
    snapshot. `keep_first` also yields the growing head windows; `keep_last`
    also yields the shrinking tail windows.
    """
    window = deque()
    groups = itersplit(seq, sentry)
    # Warm-up: collect the first chunk_size-1 groups into the window.
    for group in islice(groups, chunk_size - 1):
        window.append(group)
        if keep_first:
            yield window
    # Steady state: one group enters, one group leaves, per step.
    for group in groups:
        window.append(group)
        yield window
        window.popleft()
    # Drain the remaining tail windows if requested.
    if keep_last:
        while window:
            yield window
            window.popleft()
def itersplit(seq, sentry='.'):
    """Yield successive lists of items from `seq`, splitting on each
    occurrence of `sentry` (the sentry itself is not included)."""
    group = []
    for item in seq:
        if item == sentry:
            yield group
            group = []
        else:
            group.append(item)
    # Emit a trailing group only if the input did not end with a sentry.
    if group:
        yield group
def make_seq(groups=1000, words_per_group=3, sentry='.'):
    """Build a flat benchmark sequence: `groups` repetitions of
    `words_per_group` dummy words each followed by one `sentry`."""
    return (['w'] * words_per_group + [sentry]) * groups
if __name__ == '__main__':
    import timeit
    # Benchmark both chunkers over increasing window sizes. The statement
    # and setup strings import from this file as the `chunkers` module, so
    # this must be run with the file saved as chunkers.py.
    # Note: print is written in the parenthesized single-argument form,
    # which produces identical output under Python 2 and Python 3 (the
    # original `print "..."` statements were Python-2-only).
    for chunk_size in 3, 30, 300:
        print("* chunk_size=%d" % chunk_size)
        for func in chunkerGS, chunkerMS:
            name = func.__name__
            t = timeit.Timer(
                "for p in %s(s, chunk_size=%d, keep_last=True,"
                "keep_first=True):pass" %
                (name, chunk_size),
                "from chunkers import make_seq,chunkerGS,chunkerMS;"
                "s=make_seq(groups=5000, words_per_group=500)")
            # min() over several repeats is the conventional timeit reading:
            # it discounts interference from other system activity.
            print("%s: %.2f seconds" % (name, min(t.repeat(3, 1))))
More information about the Python-list
mailing list