Regex speed

Sat Oct 30 09:33:10 EDT 2004

Peter Hansen wrote:
> Reinhold Birkenfeld wrote:
>> OK, thank you. I now got rid of all the regexes, and - surprise,
>> surprise - the speeds are almost equal. The bitter thing about it is
>> that there are now twelve LOC more in Python that don't make
>> understanding the code easier.
>> 
>> So the Perl regex engine seems to be highly optimized, at least for
>> simple expressions.
> 
> Perhaps it's not just your regexes... as Andrew said, "You should post
> the actual code ..."  (Preferably both the Perl and the Python, if you
> are trying to make this into a them vs. us kind of thing, as you
> seem to be doing.)

Well, commenting out the regex substitutions in both versions leads to
equal execution times, as I mentioned earlier, so it has to be my regexes.

Anyway, for your pleasure <wink>, the code (it takes a list of newsgroup
postings and calculates a ratio followups/postings):

------------------------------------------------------------------
At first the perl version (not coded by me):
------------------------------------------------------------------

#!/usr/bin/perl -w

use strict;

die unless @ARGV;

my (%post,%fup);
while (<STDIN>)
{
        chomp;
        my (undef,$from,$mid,$ref) = split /\|/;

        $from =~ s|\s*<.*>\s*||;
        $from = $1 if $from =~ m|\((.*)\)|;
        $from =~ s|^"(.*)"$|$1|;

        my $arr = $post{$from} ||= [];
        push @$arr,$mid;

        $fup{$ref}++;
}

my @arr;
for (keys %post)
{
        my $arr = $post{$_};

        my $m = 0;
        $m += $fup{$_} || 0 for @$arr;

        my $n = @$arr;
        push @arr,[$m/$n,$n,$m,$_];
}

@arr = sort { $b->[1] <=> $a->[1] } @arr;
$#arr = $ARGV[0]-1;

print "Pos Relevanz Posts F'ups Poster\n";
print "--- -------- ----- ----- ------------------\n";

my $i = 1;
for (sort { $b->[0] <=> $a->[0] || $b->[1] <=> $a->[1] } @arr)
{
        printf "%2d)   %2.4f  %4d  %4d %s\n",$i++,@$_;
}

------------------------------------------------------------------
Original Python translation (with regexes):
------------------------------------------------------------------

#!/usr/bin/python

import sys, os, re

if len(sys.argv) == 1:
    sys.exit(0)

post = {}
fup  = {}

re1 = re.compile(r"\s*<.*>\s*")
re2 = re.compile(r".*\((.*)\).*")
re3 = re.compile(r'^"(.*)"$')

for line in sys.stdin:
    line = line.rstrip("\n")
    try:
        nix, fro, mid, ref = line.split("|")
    except: continue

    fro = re1.sub("", fro)
    fro = re2.sub(r"\1", fro)
    fro = re3.sub(r"\1", fro)

    post.setdefault(fro, []).append(mid)

    if ref in fup: fup[ref] += 1
    else: fup[ref] = 1

res = []

for name, arr in post.iteritems():
    m = 0
    for item in arr:
        if item in fup:
            m += fup[item]
    n = len(arr)
    res.append((float(m)/n, n, m, name))

res.sort(lambda x, y: cmp(y[1], x[1]))
del res[int(sys.argv[1]):]

print "Pos Relevanz Posts F'ups Poster"
print "--- -------- ----- ----- ------------------"

res.sort(lambda x, y: cmp(y[0], x[0]) or cmp(y[1], x[1]))
i = 1
for item in res:
    print "%2d)" % i, "  %2.4f  %4d  %4d %s" % item
    i += 1

------------------------------------------------------------------
Optimized Python (without regexes):
------------------------------------------------------------------

#!/usr/bin/python

import sys, os

if len(sys.argv) == 1:
    sys.exit(0)

post = {}
fup  = {}

def inparens(text):
    i = text.find("(")
    if i == -1: return text
    j = text.rfind(")")
    if j < i: return text
    return text[i+1:j]

def removeangles(text):
    i = text.find("<")
    if i == -1: return text
    j = text.rfind(">")
    if j < i: return text
    return text[0:i].rstrip() + text[j+1:].lstrip()

for line in sys.stdin:
    line = line.rstrip("\n")
    try:
        nix, fro, mid, ref = line.split("|")
    except: continue

    fro = inparens(removeangles(fro))
    if fro.startswith('"'):
        if fro.endswith('"'):
            fro = fro[1:-1]

    post.setdefault(fro, []).append(mid)

    if ref in fup: fup[ref] += 1
    else: fup[ref] = 1

res = []

for name, arr in post.iteritems():
    m = 0
    for item in arr:
        if item in fup:
            m += fup[item]
    n = len(arr)
    res.append((float(m)/n, n, m, name))

res.sort(lambda x, y: cmp(y[1], x[1]))
del res[int(sys.argv[1]):]

print "Pos Relevanz Posts F'ups Poster"
print "--- -------- ----- ----- ------------------"

res.sort(lambda x, y: cmp(y[0], x[0]) or cmp(y[1], x[1]))
i = 1
for item in res:
    print "%2d)" % i, "  %2.4f  %4d  %4d %s" % item
    i += 1

------------------------------------------------------------------

Reinhold

-- 
[Windows ist wie] die Bahn: Man muss sich um nichts kuemmern, zahlt fuer
jede Kleinigkeit einen Aufpreis, der Service ist mies, Fremde koennen
jederzeit einsteigen, es ist unflexibel und zu allen anderen Verkehrs-
mitteln inkompatibel.               -- Florian Diesch in dcoulm