regex over files

Robin Becker robin at reportlab.com
Fri Apr 29 05:33:28 EDT 2005


Peter Otten wrote:
> Robin Becker wrote:
> 
> 
>>#sscan1.py thanks to Skip
>>import sys, time, mmap, os, re
>>fn = sys.argv[1]
>>fh=os.open(fn,os.O_BINARY|os.O_RDONLY)
>>s=mmap.mmap(fh,0,access=mmap.ACCESS_READ)
>>l=n=0
>>t0 = time.time()
>>for mat in re.split("XXXXX", s):
> 
> 
> re.split() returns a list, not a generator, and this list may consume a lot
> of memory.
> 
> 
>>     n += 1
>>     l += len(mat)
>>t1 = time.time()
>>
>>print "fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0))
> 
> 
> I wrote a generator replacement for re.split(), but as you might expect, the
> performance is nowhere near re.split(). For your large data it might help
> somewhat because of its smaller memory footprint.
> 
> def splititer(regex, data):
>     # like re.split(), but never yields the separators.
>     if not hasattr(regex, "finditer"):
>         regex = re.compile(regex)
>     start = 0
>     for match in regex.finditer(data):
>         end, new_start = match.span()
>         yield data[start:end]
>         start = new_start
>     yield data[start:]
> 
> Peter
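
First, a quick sanity check of my own that Peter's generator agrees with
re.split() -- pasting his function into a toy test; this holds as long as the
pattern has no capturing groups:

# checksplit.py -- throwaway agreement test, not part of the timings
import re

def splititer(regex, data):
     # like re.split(), but never yields the separators.
     if not hasattr(regex, "finditer"):
         regex = re.compile(regex)
     start = 0
     for match in regex.finditer(data):
         end, new_start = match.span()
         yield data[start:end]
         start = new_start
     yield data[start:]

for data in ["aaXXXXXbbbXXXXXcc", "XXXXXaa", "aaXXXXX", "plain"]:
     assert list(splititer("XXXXX", data)) == re.split("XXXXX", data)
print "ok"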

OK, now the split scan times are much more comparable for 200MB (which is what
I have freely available according to Task Manager), but things start getting
bad at 300MB.

C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=23.05

C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=27.63

C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=28.13

C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=22.66

C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=45.45

C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=32.14

C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=33.17

C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=45.27


Here sscan0.py is Bengt's adaptive buffer splitter and sscan2.py is Peter's
generator splitter.
C:\code\reportlab\demos\gadflypaper>cat \tmp\sscan0.py
import sys, time, re
fn = sys.argv[1]
rxo = re.compile('XXXXX')

def frxsplit(path, rxo, chunksize=4096):
     # Read the file in fixed-size chunks and split the growing buffer on
     # the regex.  Everything but the last piece is complete and can be
     # yielded; the last piece is carried over into the next buffer in case
     # a separator straddles a chunk boundary.
     buffer = ''
     for chunk in iter((lambda f=open(path,'rb'): f.read(chunksize)),''):
         buffer += chunk
         pieces = rxo.split(buffer)
         for piece in pieces[:-1]:
             yield piece
         buffer = pieces[-1]
     yield buffer
l=n=0
t0 = time.time()
for mat in frxsplit(fn,rxo):
     n += 1
     l += len(mat)
t1 = time.time()

print "fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0))


C:\code\reportlab\demos\gadflypaper>cat \tmp\sscan2.py
import sys, time, mmap, os, re
def splititer(regex, data):
     # like re.split(), but never yields the separators.
     if not hasattr(regex, "finditer"):
         regex = re.compile(regex)
     start = 0
     for match in regex.finditer(data):
         end, new_start = match.span()
         yield data[start:end]
         start = new_start
     yield data[start:]
fn = sys.argv[1]
fh=os.open(fn,os.O_BINARY|os.O_RDONLY)
s=mmap.mmap(fh,0,access=mmap.ACCESS_READ)
l=n=0
t0 = time.time()
for mat in splititer("XXXXX", s):
     n += 1
     l += len(mat)
t1 = time.time()

print "fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0))



-- 
Robin Becker



