regex over files
Robin Becker
robin at reportlab.com
Fri Apr 29 05:33:28 EDT 2005
Peter Otten wrote:
> Robin Becker wrote:
>
>
>>#sscan1.py thanks to Skip
>>import sys, time, mmap, os, re
>>fn = sys.argv[1]
>>fh=os.open(fn,os.O_BINARY|os.O_RDONLY)
>>s=mmap.mmap(fh,0,access=mmap.ACCESS_READ)
>>l=n=0
>>t0 = time.time()
>>for mat in re.split("XXXXX", s):
>
>
> re.split() returns a list, not a generator, and this list may consume a lot
> of memory.
>
>
>>n += 1
>>l += len(mat)
>>t1 = time.time()
>>
>>print "fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0))
>
>
> I wrote a generator replacement for re.split(), but as you might expect, the
> performance is nowhere near re.split(). For your large data it might help
> somewhat because of its smaller memory footprint.
>
> def splititer(regex, data):
> # like re.split(), but never yields the separators.
> if not hasattr(regex, "finditer"):
> regex = re.compile(regex)
> start = 0
> for match in regex.finditer(data):
> end, new_start = match.span()
> yield data[start:end]
> start = new_start
> yield data[start:]
>
> Peter
OK, now the split scan times are much more comparable for 200 MB (which is what I
have freely available according to Task Manager), but things start getting bad
for 300 MB.
C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=23.05
C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=27.63
C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=28.13
C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=22.66
C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=45.45
C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=32.14
C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=33.17
C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=45.27
Here, sscan0.py is Bengt's adaptive buffer splitter and sscan2.py is Peter's
generator splitter.
C:\code\reportlab\demos\gadflypaper>cat \tmp\sscan0.py
import sys, time, re
# Path of the data file to scan, taken from the command line.
fn = sys.argv[1]
# Separator pattern, compiled once up front so the per-chunk split is cheap.
rxo = re.compile('XXXXX')
def frxsplit(path, rxo, chunksize=4096):
    """Lazily yield the pieces of the file at *path* split on matches of
    the compiled regex *rxo*, reading at most *chunksize* bytes at a time
    so the whole file never has to sit in memory.

    Fix vs. original: the file object was created inside an ``iter()``
    lambda and never closed (a file-descriptor leak); it is now opened
    explicitly and closed in a ``finally`` block.

    NOTE(review): this chunked approach is only correct for fixed-length
    patterns such as 'XXXXX'; a variable-length pattern (e.g. 'X+') could
    straddle a chunk boundary and be split incorrectly — confirm before
    reusing with other regexes.
    """
    f = open(path, 'rb')
    try:
        buffer = b''
        while True:
            chunk = f.read(chunksize)
            if not chunk:
                # EOF reached; fall through to flush the remainder.
                break
            buffer += chunk
            pieces = rxo.split(buffer)
            # All pieces except the last are complete; the last fragment
            # may still be cut mid-piece (or mid-separator), so keep it
            # as the carry-over buffer for the next chunk.
            for piece in pieces[:-1]:
                yield piece
            buffer = pieces[-1]
        # Whatever is left after EOF is the final piece.
        yield buffer
    finally:
        f.close()
l=n=0
t0 = time.time()
for mat in frxsplit(fn,rxo):
n += 1
l += len(mat)
t1 = time.time()
print "fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0))
C:\code\reportlab\demos\gadflypaper>cat \tmp\sscan2.py
import sys, time, mmap, os, re
def splititer(regex, data):
    """Generator version of re.split(): yield the stretches of *data*
    between matches of *regex*, never yielding the separators themselves.

    *regex* may be either a pattern string or an already-compiled
    pattern object (anything with a ``finditer`` method).
    """
    pattern = regex if hasattr(regex, "finditer") else re.compile(regex)
    prev_end = 0
    for m in pattern.finditer(data):
        sep_start, sep_end = m.span()
        # Emit everything since the previous separator, then skip past
        # this one.
        yield data[prev_end:sep_start]
        prev_end = sep_end
    # Tail after the final separator (the whole of data if no match).
    yield data[prev_end:]
fn = sys.argv[1]
fh=os.open(fn,os.O_BINARY|os.O_RDONLY)
s=mmap.mmap(fh,0,access=mmap.ACCESS_READ)
l=n=0
t0 = time.time()
for mat in splititer("XXXXX", s):
n += 1
l += len(mat)
t1 = time.time()
print "fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0))
--
Robin Becker
More information about the Python-list
mailing list