shuffle the lines of a large file

Peter Otten __peter__ at web.de
Fri Mar 11 12:07:20 EST 2005


Simon Brunning wrote:

> I couldn't resist. ;-)

Me neither...
 
> import random
> 
> def randomLines(filename, lines=1):
>     selected_lines = list(None for line_no in xrange(lines))
> 
>     for line_index, line in enumerate(open(filename)):
>         for selected_line_index in xrange(lines):
>             if random.uniform(0, line_index) < 1:
>                 selected_lines[selected_line_index] = line
> 
>     return selected_lines
> 
> This has the advantage that every line had the same chance of being
> picked regardless of its length. There is the chance that it'll pick
> the same line more than once, though.


The idea is simple: have a chain of "pickers" that pass all items that are
not picked on to the next picker. The current implementation, however, may
hit the recursion limit for large samples.

import random

class PickOne(object):
    """Pick a random item from an iterable and store it in the 'picked'
attribute.
       As an iterator, yield all items that are not picked.

       Helper for the pick() function.
    """
    def __init__(self, iterable):
        self.iterable = iterable
    def __iter__(self):
        it = enumerate(self.iterable)
        index, self.picked = it.next()
        for index, item in it:
            if random.uniform(0, index) < 1:
                yield self.picked
                self.picked = item
            else:
                yield item

def pick(iterable, N, accept_less=False):
    """Pick N random items from an iterable.
    """
    # Set up the PickOne(PickOne( ... PickOne(iterable) ... )) chain
    keepers = []
    for i in xrange(N):
        iterable = PickOne(iterable)
        keepers.append(iterable)

    # We are interested in the side effects, i. e. the PickOne instances
    # picking random items from the initial iterable
    for item in iterable:
        pass 

    # The following could be
    # return [k.picked for k in keepers]
    # but we want to handle the case len(iterable) < N, too
    result = []
    for keeper in keepers:
        try:
            picked = keeper.picked
        except AttributeError:
            if accept_less:
                return result
            raise ValueError("sequence contains less items than shall be
picked")
        result.append(picked)
    return result

def demo():
    """Pick random lines from a set of files.
    """
    import optparse
    from itertools import chain, izip, repeat, count

    parser = optparse.OptionParser()
    parser.usage = ("%prog [nl] file1 file2 ... fileN\n\n"
        "Choose random lines from files given on the command line.")
    parser.add_option("-n", "--count", type="int", default=10,
        help="number of lines that shall be picked (default=10)")
    parser.add_option("-l", "--less", action="store_true",
        help="accept less picked lines than specified, if there are less
total lines in the file(s)")
    options, args = parser.parse_args()

    index_name_line = chain(*[izip(count(), repeat(fn), file(fn)) for fn in
args])
    try:
        picked = pick(index_name_line, options.count, options.less)
    except ValueError:
        raise SystemExit("Cannot pick more lines than found in the file(s)")

    print "".join([("%3d <%s> %s" % ifl) for ifl in picked])

if __name__ == "__main__":
    demo()




More information about the Python-list mailing list