[pypy-commit] pypy default: Make itertools.tee shared data a linked list
cbjadwani
noreply at buildbot.pypy.org
Fri Oct 26 12:16:12 CEST 2012
Author: Chirag Jadwani <chirag.jadwani at gmail.com>
Branch:
Changeset: r58441:c1a38c26cf83
Date: 2012-08-02 01:55 +0530
http://bitbucket.org/pypy/pypy/changeset/c1a38c26cf83/
Log: Make itertools.tee shared data a linked list
diff --git a/lib_pypy/itertools.py b/lib_pypy/itertools.py
--- a/lib_pypy/itertools.py
+++ b/lib_pypy/itertools.py
@@ -773,40 +773,56 @@
raise StopIteration()
return value
-
+
class _TeeData(object):
- """Holds cached values for TeeObjects"""
+ """Holds cached values shared by _TeeObjects
+
+ _TeeData instances form a linked list in which each instance (node) caches
+ at most CHUNK_SIZE items.
+ """
+ CHUNK_SIZE = 64
def __init__(self, iterator):
- self.data = []
- self._iter = iterator
+ self.data = [None] * _TeeData.CHUNK_SIZE
+ self.iterator = iterator # must be an iterator, not an iterable
+ self.num_read = 0
+ self.next_link = None
def __getitem__(self, i):
- # iterates until 'i' if not done yet
- while i >= len(self.data):
- value = next(self._iter)
- self.data.append(value)
+ if i == self.num_read:
+ item = next(self.iterator)
+ self.data[i] = item
+ self.num_read += 1
+ assert i < self.num_read
return self.data[i]
+ def get_next_link(self):
+ assert self.num_read == _TeeData.CHUNK_SIZE
+ if self.next_link is None:
+ self.next_link = _TeeData(self.iterator)
+ return self.next_link
+
class _TeeObject(object):
"""Iterables / Iterators as returned by the tee() function"""
- def __init__(self, iterable=None, tee_data=None):
- if tee_data:
- self.tee_data = tee_data
- self.pos = 0
- # <=> Copy constructor
- elif isinstance(iterable, _TeeObject):
+ def __init__(self, iterable):
+ if isinstance(iterable, _TeeObject):
self.tee_data = iterable.tee_data
self.pos = iterable.pos
else:
self.tee_data = _TeeData(iter(iterable))
self.pos = 0
-
+
def next(self):
+ assert self.pos <= _TeeData.CHUNK_SIZE
+
+ if self.pos == _TeeData.CHUNK_SIZE:
+ self.tee_data = self.tee_data.get_next_link()
+ self.pos = 0
+
data = self.tee_data[self.pos]
self.pos += 1
return data
-
+
def __iter__(self):
return self
@@ -814,34 +830,38 @@
@builtinify
def tee(iterable, n=2):
"""Return n independent iterators from a single iterable.
+
Note : once tee() has made a split, the original iterable
should not be used anywhere else; otherwise, the iterable could get
advanced without the tee objects being informed.
-
+
Note : this member of the toolkit may require significant auxiliary
storage (depending on how much temporary data needs to be stored).
In general, if one iterator is going to use most or all of the
data before the other iterator, it is faster to use list() instead
of tee()
-
+
Equivalent to :
-
+
def tee(iterable, n=2):
- def gen(next, data={}, cnt=[0]):
- for i in count():
- if i == cnt[0]:
- item = data[i] = next()
- cnt[0] += 1
- else:
- item = data.pop(i)
- yield item
it = iter(iterable)
- return tuple([gen(it.next) for i in range(n)])
+ deques = [collections.deque() for i in range(n)]
+ def gen(mydeque):
+ while True:
+ if not mydeque: # when the local deque is empty
+ newval = next(it) # fetch a new value and
+ for d in deques: # load it to all the deques
+ d.append(newval)
+ yield mydeque.popleft()
+ return tuple(gen(d) for d in deques)
"""
if n < 0:
raise ValueError('n must be >= 0')
+ if n == 0:
+ return ()
if isinstance(iterable, _TeeObject):
# a,b = tee(range(10)) ; c,d = tee(a) ; self.assert_(a is c)
- return tuple([iterable] + [_TeeObject(iterable) for i in xrange(n-1)])
- tee_data = _TeeData(iter(iterable))
- return tuple([_TeeObject(tee_data=tee_data) for i in xrange(n)])
+ tee_obj = iterable
+ else:
+ tee_obj = _TeeObject(iterable)
+ return tuple([tee_obj] + [_TeeObject(tee_obj) for i in xrange(n-1)])
More information about the pypy-commit
mailing list