Regex speed
Reinhold Birkenfeld
reinhold-birkenfeld-nospam at wolke7.net
Sat Oct 30 09:33:10 EDT 2004
Peter Hansen wrote:
> Reinhold Birkenfeld wrote:
>> OK, thank you. I now got rid of all the regexes, and - surprise,
>> surprise - the speeds are almost equal. The bitter thing about it is
>> that there are now twelve LOC more in Python that don't make
>> understanding the code easier.
>>
>> So the Perl regex engine seems to be highly optimized, at least for
>> simple expressions.
>
> Perhaps it's not just your regexes... as Andrew said, "You should post
> the actual code ..." (Preferably both the Perl and the Python, if you
> are trying to make this into a them vs. us kind of thing, as you
> seem to be doing.)
Well, commenting out the regex substitutions in both versions leads to
equal execution times, as I mentioned earlier, so it has to be my regexes.
Anyway, for your pleasure <wink>, the code (it takes a list of newsgroup
postings and calculates a ratio followups/postings):
------------------------------------------------------------------
At first the perl version (not coded by me):
------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;
die unless @ARGV;
my (%post,%fup);
while (<STDIN>)
{
chomp;
my (undef,$from,$mid,$ref) = split /\|/;
$from =~ s|\s*<.*>\s*||;
$from = $1 if $from =~ m|\((.*)\)|;
$from =~ s|^"(.*)"$|$1|;
my $arr = $post{$from} ||= [];
push @$arr,$mid;
$fup{$ref}++;
}
my @arr;
for (keys %post)
{
my $arr = $post{$_};
my $m = 0;
$m += $fup{$_} || 0 for @$arr;
my $n = @$arr;
push @arr,[$m/$n,$n,$m,$_];
}
@arr = sort { $b->[1] <=> $a->[1] } @arr;
$#arr = $ARGV[0]-1;
print "Pos Relevanz Posts F'ups Poster\n";
print "--- -------- ----- ----- ------------------\n";
my $i = 1;
for (sort { $b->[0] <=> $a->[0] || $b->[1] <=> $a->[1] } @arr)
{
printf "%2d) %2.4f %4d %4d %s\n",$i++,@$_;
}
------------------------------------------------------------------
Original Python translation (with regexes):
------------------------------------------------------------------
#!/usr/bin/python
import sys, os, re
if len(sys.argv) == 1:
sys.exit(0)
post = {}
fup = {}
re1 = re.compile(r"\s*<.*>\s*")
re2 = re.compile(r".*\((.*)\).*")
re3 = re.compile(r'^"(.*)"$')
for line in sys.stdin:
line = line.rstrip("\n")
try:
nix, fro, mid, ref = line.split("|")
except: continue
fro = re1.sub("", fro)
fro = re2.sub(r"\1", fro)
fro = re3.sub(r"\1", fro)
post.setdefault(fro, []).append(mid)
if ref in fup: fup[ref] += 1
else: fup[ref] = 1
res = []
for name, arr in post.iteritems():
m = 0
for item in arr:
if item in fup:
m += fup[item]
n = len(arr)
res.append((float(m)/n, n, m, name))
res.sort(lambda x, y: cmp(y[1], x[1]))
del res[int(sys.argv[1]):]
print "Pos Relevanz Posts F'ups Poster"
print "--- -------- ----- ----- ------------------"
res.sort(lambda x, y: cmp(y[0], x[0]) or cmp(y[1], x[1]))
i = 1
for item in res:
print "%2d)" % i, " %2.4f %4d %4d %s" % item
i += 1
------------------------------------------------------------------
Optimized Python (without regexes):
------------------------------------------------------------------
#!/usr/bin/python
import sys, os
if len(sys.argv) == 1:
sys.exit(0)
post = {}
fup = {}
def inparens(text):
i = text.find("(")
if i == -1: return text
j = text.rfind(")")
if j < i: return text
return text[i+1:j]
def removeangles(text):
i = text.find("<")
if i == -1: return text
j = text.rfind(">")
if j < i: return text
return text[0:i].rstrip() + text[j+1:].lstrip()
for line in sys.stdin:
line = line.rstrip("\n")
try:
nix, fro, mid, ref = line.split("|")
except: continue
fro = inparens(removeangles(fro))
if fro.startswith('"'):
if fro.endswith('"'):
fro = fro[1:-1]
post.setdefault(fro, []).append(mid)
if ref in fup: fup[ref] += 1
else: fup[ref] = 1
res = []
for name, arr in post.iteritems():
m = 0
for item in arr:
if item in fup:
m += fup[item]
n = len(arr)
res.append((float(m)/n, n, m, name))
res.sort(lambda x, y: cmp(y[1], x[1]))
del res[int(sys.argv[1]):]
print "Pos Relevanz Posts F'ups Poster"
print "--- -------- ----- ----- ------------------"
res.sort(lambda x, y: cmp(y[0], x[0]) or cmp(y[1], x[1]))
i = 1
for item in res:
print "%2d)" % i, " %2.4f %4d %4d %s" % item
i += 1
------------------------------------------------------------------
Reinhold
--
[Windows ist wie] die Bahn: Man muss sich um nichts kuemmern, zahlt fuer
jede Kleinigkeit einen Aufpreis, der Service ist mies, Fremde koennen
jederzeit einsteigen, es ist unflexibel und zu allen anderen Verkehrs-
mitteln inkompatibel. -- Florian Diesch in dcoulm
More information about the Python-list
mailing list