[Tutor] Re: [Tutor]Reading and Writing (No Arithmetic?) [linecache?]

Michael P. Reilly arcege@speakeasy.net
Sat, 19 Jan 2002 11:47:09 -0500


--7JfCtLOvnd9MIVvH
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Sat, Jan 19, 2002 at 08:28:12AM -0500, Blake Winton wrote:
> * Danny Yoo <dyoo@hkn.eecs.berkeley.edu> [020118 22:32]:
> > On Fri, 18 Jan 2002, Paul Sidorsky wrote:
> > To get a random line from a file still requires that we count how many
> > lines are in a file, but that can be a quick linear scan.
> 
> Alternately, couldn't we just go to a random byte in the file, scan
> backwards for the first newline, then forwards for the next newline,
> and that would give us a random line?
> 
> I guess it would tend to weight longer lines heavier, but I don't
> think that's a terrible restriction, given that it should be
> stupid-fast, and take up very little memory.

Better to take a full block, not just a byte.  It makes it easier to
find the newline without seeking too much.

This may be a slightly long-winded bit of code, but it seems to work well.
All you need to do is get a function to give a random number.

Except for long lines, you would read in one or two blocks (default is 1k).
The function would scan from the location in the file backwards for the
first newline and forwards for the first newline, getting more blocks
if needed.

from random import random
from randline import getstring_percent
print getstring_percent(filename, random() * 100)

The assumption made with my code is that it won't be used more than once
or twice, so building a cache would be wasteful.

  -Arcege

PS: Yikes... this took some effort with my ailment.. but I'm glad I can
still do it. :)

--7JfCtLOvnd9MIVvH
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline; filename="randline.py"

#!/usr/bin/python
"""Get a line from a given file at the specified position.  One function
is by a float percentage (0-100), the other function is by byte position
in the file.  The default block size is 1k.

getstring_percent(filename, percentage, [blocksize=1k])  returns
  line at that percentage, including the newline
getstring_location(filename, byte_location, [blocksize=1k]) returns
  line at that location (i.e. seek value), including the newline
"""

import os, string

def getstring_percent(fname, percent, blocksize = 1024):
  """Return the line found a given percentage of the way through a file.

  fname     -- path of the file to read
  percent   -- position in the file as a number from 0 to 100; it may be
               a float.  0 selects the first line and 100 the last one.
  blocksize -- read size in bytes, passed through to getstring_location

  The percentage is turned into a byte offset using the file size, and
  the line containing that byte is fetched with getstring_location.

  Raises IndexError if percent is outside 0-100; an empty file has no
  valid offset, so getstring_location raises IndexError for it."""
  if percent < 0 or percent > 100:
    raise IndexError("out-of-range")
  size = os.path.getsize(fname)
  # size * percent is a byte *count*, so subtract one to get a zero-based
  # offset; clamp at 0 so that 0% (or a tiny percentage of a small file)
  # selects the first line instead of producing a negative seek position
  byte_location = max(int(size * (percent / 100.0)) - 1, 0)
  return getstring_location(fname, byte_location, blocksize)

def getstring_location(fname, byte_location, blocksize = 1024):
  """Return the line of the file that contains a given byte offset.

  fname         -- path of the file to read
  byte_location -- zero-based byte offset (a seek value) into the file;
                   a float is truncated to an integer
  blocksize     -- number of bytes fetched per read (default 1k)

  The result runs from just after the previous newline (or the start of
  the file) through the next newline (or the end of the file), and it
  includes that newline when there is one.  If byte_location lands on a
  newline, the line which that newline terminates is returned.

  The file is opened in binary mode so that seek offsets agree with the
  size reported by os.path.getsize.  The line is returned exactly as
  read from the file: a plain str on Python 2, bytes on Python 3.

  Raises IndexError if byte_location is negative or not less than the
  file size, and ValueError if blocksize is less than 1."""
  if blocksize < 1:
    raise ValueError("blocksize must be positive")
  byte_location = int(byte_location)
  size = os.path.getsize(fname)
  if byte_location < 0 or byte_location >= size:
    raise IndexError("out-of-range")
  blockno, charpos = divmod(byte_location, blocksize)
  newline = b'\n'

  with open(fname, 'rb') as f:
    f.seek(blockno * blocksize)  # advance to the block we want
    first = f.read(blocksize)
    blocklist = [first]

    # find the start of the line: the last newline strictly before the
    # requested position.  Keep prepending earlier blocks until one holds
    # a newline or block 0 has been searched (never seek below zero).
    begin = first.rfind(newline, 0, charpos)
    curblock = blockno
    while begin == -1 and curblock > 0:
      curblock = curblock - 1
      f.seek(curblock * blocksize)  # rewind back one block
      block = f.read(blocksize)
      blocklist.insert(0, block)
      begin = block.rfind(newline)
    # begin == -1 here means the line starts at the beginning of the file;
    # otherwise begin is an index into blocklist[0]

    # find the end of the line: the first newline at or after the
    # requested position.  Later blocks are searched from their first
    # byte, and an empty read means the end of the file was reached.
    end = first.find(newline, charpos)
    curblock = blockno
    while end == -1:
      curblock = curblock + 1
      f.seek(curblock * blocksize)
      block = f.read(blocksize)
      if not block:
        break
      blocklist.append(block)
      end = block.find(newline)
    # end == -1 here means the line runs to the end of the file;
    # otherwise end is an index into blocklist[-1]

  # prune the data down to the line itself.  The tail is cut first so
  # that, when only one block was read, the index in begin is still valid
  # for the head cut (begin is always smaller than end).
  if end != -1:
    blocklist[-1] = blocklist[-1][:end + 1]
  blocklist[0] = blocklist[0][begin + 1:]  # begin == -1 keeps it all

  return b''.join(blocklist)  # join all the blocks

def _test(value, fname):
  """Print the line selected by value, first as a byte offset and then
  as a percentage.

  value -- number used for both lookups
  fname -- path of the file to read

  Each lookup prints the value followed by the repr of the line found.
  An IndexError from either lookup is reported on its own output line
  instead of being propagated."""
  # the formatting is written as single-argument print calls so the
  # module parses on Python 3 as well as Python 2
  try:
    # a byte offset has to be a whole number, so drop any fraction
    line = getstring_location(fname, int(value))
    print('%s %s' % (value, repr(line)))
  except IndexError as err:
    print('%s is %s' % (value, err))
  try:
    line = getstring_percent(fname, value)
    print('%s %s' % (value, repr(line)))
  except IndexError as err:
    print('%g%% is %s' % (value, err))

# Command line use: randline.py [filename [value]]
# With no filename the script reads itself; with no value (or one that
# is not a number) it uses 100.
if __name__ == '__main__':
  import sys
  if len(sys.argv) > 1:
    fname = sys.argv[1]
  else:
    fname = sys.argv[0]  # use the script itself
  try:
    value = float(sys.argv[2])
  except (IndexError, ValueError):
    # argument missing or not a number; other errors should surface
    value = 100
  _test(value, fname)


--7JfCtLOvnd9MIVvH--