Flattening lists

Thu Feb 5 11:49:25 EST 2009

Baolong zhen <netzhen at gmail.com> wrote:
> On Thu, Feb 5, 2009 at 10:17 PM, mk <mrkafk at gmail.com> wrote:
>
> > Brian Allen Vanderburg II wrote:
> > >> def flatten(x):
> > >>     res = []
> > >>     for el in x:
> > >>         if isinstance(el,list):
> > >>             res.extend(flatten(el))
> > >>         else:
> > >>             res.append(el)
> > >>     return res
> >
> > >
> > > I think it may be just a 'little' more efficient to do this:
> > >
> > > def flatten(x, res=None):
> > >    if res is None:
> > >       res = []
> > >
> > >    for el in x:
> > >       if isinstance(el, (tuple, list)):
> > >          flatten(el, res)
> > >       else:
> > >          res.append(el)
> > >
> > >    return res
> >
> >
> > Hmm why should it be more efficient? extend operation should not be very
> > costly?
>
> less list creation.

(Took me a while to find the content of your post because
you top posted it.  I've taken the liberty of correcting that.)

This is all premature optimization, except for the goopy code, which is
presumably used enough to make it worth optimizing.  And guess what?
The goopy code wins.  What the people theorizing about the speed of
extend vs list creation miss is that the things with high overhead in the
above functions are (1) isinstance and (2) the recursive function call.
The goopy code avoids this by using type and is, and by unrolling the
lowest level without a function call.  On the other hand, extend
_is_ faster than append if you aren't creating a new list, so the
goopy code can be optimized a little more.

I remembered the bit about high function call overhead, but the rest
of it measured:

temp.py
---------------------------------------------------------------------------------
from __future__ import print_function
from timeit import Timer

def flatten1(x):
    res = []
    for el in x:
        if isinstance(el, list):
            res.extend(flatten1(el))
        else:
            res.append(el)
    return res

def flatten1a(x):
    res = []
    for el in x:
        if isinstance(el, (tuple, list)):
            res.extend(flatten1a(el))
        else:
            res.append(el)
    return res

def flatten1b(x):
    res = []
    for el in x:
        if type(el) is list or type(el) is tuple:
            res.extend(flatten1b(el))
        else:
            res.append(el)
    return res

def flatten2(x, res=None):
    if res is None:
       res = []
    for el in x:
       if isinstance(el, list):
          flatten2(el, res)
       else:
          res.append(el)
    return res

def flatten2a(x, res=None):
    if res is None:
       res = []
    for el in x:
       if isinstance(el, (tuple, list)):
          flatten2a(el, res)
       else:
          res.append(el)
    return res

def flatten2b(x, res=None):
    if res is None:
       res = []
    for el in x:
       if type(el) is list or type(el) is tuple:
          flatten2b(el, res)
       else:
          res.append(el)
    return res

def flatten3z(seq):
  lst = []
  for x in seq:
    if type(x) is list or type(x) is tuple:
      for val in x:
        lst.append(val)
    else:
      lst.append(x)
  return lst

def flatten3(seq):
  lst = []
  for el in seq:
    if type(el) == list or type(el) is tuple:
      lst.extend(flatten3z(el))
    else:
      lst.append(el)
  return lst

def flatten3y(seq):
  lst = []
  for x in seq:
    if type(x) is list or type(x) is tuple:
      lst.extend(x)
    else:
      lst.append(x)
  return lst

def flatten3a(seq):
  lst = []
  for el in seq:
    if type(el) == list or type(el) is tuple:
      lst.extend(flatten3y(el))
    else:
      lst.append(el)
  return lst

l = [[1, 2, 3], 5, [7, 8], 3, [9, [10, 11, 12], 4, [9, [10, 5, 7]], 1, [[[[[5, 4], 3], 4, 3], 3, 1, 45], 9], 10]]

cases = dict(
    base = Timer(),
    c1 = Timer("flatten1(l)", "from temp import flatten1, l"),
    c1a = Timer("flatten1a(l)", "from temp import flatten1a, l"),
    c1b = Timer("flatten1b(l)", "from temp import flatten1b, l"),
    c2 = Timer("flatten2(l)", "from temp import flatten2, l"),
    c2a = Timer("flatten2a(l)", "from temp import flatten2a, l"),
    c2b = Timer("flatten2b(l)", "from temp import flatten2b, l"),
    c3 = Timer("flatten3(l)", "from temp import flatten3, l"),
    c3a = Timer("flatten3a(l)", "from temp import flatten3a, l"),
)

if __name__=="__main__":
    for (name, case) in sorted(cases.items()):
        print("{0:4s} {1}".format(name, case.timeit()))

-----------------------------------------------------------------------------

It is also interesting to note that python3.0 is faster in this
particular case (unless there are timing vagrancies on my machine, which
is possible, though the results were fairly consistent over several runs
of the script).  The second run below is using python2.6.1.

>src/python/Python-3.0/python temp.py
base 0.0278329849243
c1   30.4776289463
c1a  44.3886289597
c1b  32.5621030331
c2   25.6131818295
c2a  39.0944678783
c2b  27.1573381424
c3   15.346280098
c3a  14.3178970814

>python temp.py 
base 0.0288269519806
c1   35.8193409443
c1a  52.3054969311
c1b  36.3652667999
c2   32.3255820274
c2a  47.71251297
c2b  32.6813359261
c3   16.6060569286
c3a  15.1056449413

--RDM