Perl vs. Python for text manipulation
Paul Prescod
paul at prescod.net
Sat Jan 24 16:19:08 EST 2004
Cameron Laird wrote:
> ...
> The comparison with Perl in particular interests me. I
> often encounter characterizations of Perl (or Ruby--I
> regard them as equivalent in this dimension) as the par-
> agon of text manipulation. It's not, of course, as Icon
> conclusively demonstrates, at least for me; but I don't
> even see Perl as distinctly superior to Python in text
> mangling. I recognize that Python REs can feel a bit
> cumbersome, in comparison to Perl, because they essenti-
> ally demand the extra step of explicit compilation. Is
> that all that people mean, though, when they talk about
> Perl's superiority for text mangling? Is there more to
> it?
Here's an anecdotal sample where I think that the Perl and the Python are
roughly the same. I translated the Perl to Python. Note that the Python
is a little more complicated primarily because of Python string
interpolation and not regular expressions.
http://www.cs.sfu.ca/~cameron/REX.html
# REX/Perl 1.0
# Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
# Technical Report TR 1998-17, School of Computing Science, Simon Fraser
# University, November, 1998.
# Copyright (c) 1998, Robert D. Cameron.
# The following code may be freely used and distributed provided that
# this copyright and citation notice remains intact and that modifications
# or additions are clearly identified.
# Regex fragments for the shallow parser. Each variable holds a piece of
# regex source in a double-quoted string, so later definitions interpolate
# earlier ones by name -- the order of these assignments matters.
# Backslashes are doubled because they pass through string-literal escaping
# before reaching the regex engine.

# A run of one or more characters other than '<'.
$TextSE = "[^<]+";
# Everything up to and including the next '-'.
$UntilHyphen = "[^-]*-";
# Everything up to and including the next "--".
$Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
# Text through the next "--", then an optional '>'.
$CommentCE = "$Until2Hyphens>?";
# Everything up to and including the next run of two or more ']'.
$UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
# Text through the next "]]>" (runs of ']' not followed by '>' are skipped).
$CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
# One or more whitespace characters (space, newline, tab, carriage return).
$S = "[ \\n\\t\\r]+";
# First character of a name: ASCII letter, '_' or ':', or any character
# outside the range \x00-\x7F.
$NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
# Subsequent name characters: additionally allows digits, '.' and '-'.
$NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
$Name = "(?:$NameStrt)(?:$NameChar)*";
# A double-quoted or single-quoted string.
$QuoteSE = "\"[^\"]*\"|'[^']*'";
# Whitespace and a name, followed by any number of whitespace-separated
# names or quoted strings.
$DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
# Quoted strings and runs of characters other than ] " ' > <, ended by '>'.
$MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
# Exactly one whitespace character.
$S1 = "[\\n\\r\\t ]";
# Everything up to and including the next run of one or more '?'.
$UntilQMs = "[^?]*\\?+";
# Either an immediate "?>", or one whitespace character followed by text
# through the next "?>".
$PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
# One item of a bracketed DOCTYPE subset: a "<!--" comment, another "<!"
# declaration, a "<?Name ... ?>" item, a "%Name;" reference, or whitespace.
$DT_ItemSE =
"<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
# DOCTYPE identification, an optional bracketed list of items, and an
# optional closing '>'.
$DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
# What may follow "<!": "--" (comment), "[CDATA[" or "DOCTYPE", each with an
# optional remainder.
$DeclCE =
"--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
# What may follow "<?": a name with an optional tail.
$PI_CE = "$Name(?:$PI_Tail)?";
# What may follow "</": a name, optional whitespace, optional '>'.
$EndTagCE = "$Name(?:$S)?>?";
# A quoted attribute value that contains no '<'.
$AttValSE = "\"[^<\"]*\"|'[^<']*'";
# A name followed by zero or more name=value attributes, then optional
# whitespace, optional '/', and optional '>'.
$ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
# '<' followed by one of the forms above. Every alternative is wrapped in
# (?:...)?, so a lone '<' still matches.
$MarkupSPE =
"<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
# The complete pattern: a text run or one markup item.
$XML_SPE = "$TextSE|$MarkupSPE";
# Match $XML_SPE globally against an XML document.
#   Argument: the document text (first parameter).
#   Returns:  the result of the /g match; in list context this is every
#             successive text run or markup item, in document order.
sub ShallowParse {
    my $document = shift;
    return $document =~ /$XML_SPE/g;
}
===============
import re
class recollector:
    """Registry of named regex fragments built up by %-interpolation.

    Fragments are registered one at a time with ``add``.  A fragment may
    refer to any previously registered fragment as ``%(name)s``; the
    reference is expanded when the fragment is added, so ``res`` always
    holds fully expanded pattern text.
    """

    def __init__(self):
        # Maps fragment name -> expanded pattern text.
        self.res = {}

    def add(self, name, reg):
        """Register *reg* under *name*, expanding ``%(other)s`` references.

        *reg* is used as a %-format string with ``self.res`` as the
        mapping, so a literal percent sign must be written ``%%``.

        Raises ``KeyError`` if *reg* refers to a name that has not been
        added yet -- fragments must be added in dependency order.
        """
        self.res[name] = reg % self.res
# Build the shallow-parsing patterns.  Each a(name, pattern) call stores
# the pattern in collector.res under `name` after expanding %(other)s
# references to earlier entries, so the calls must stay in dependency order.
# The names mirror the variables of the Perl version.
collector = recollector()
a = collector.add
# Text runs, and the pieces used to find the end of comments and CDATA.
a( "TextSE" ,"[^<]+" )
a( "UntilHyphen" ,"[^-]*-" )
a( "Until2Hyphens" ,"%(UntilHyphen)s(?:[^-]%(UntilHyphen)s)*-" )
a( "CommentCE" ,"%(Until2Hyphens)s>?" )
a( "UntilRSBs" ,"[^\\]]*](?:[^\\]]+])*]+" )
a( "CDATA_CE" ,"%(UntilRSBs)s(?:[^\\]>]%(UntilRSBs)s)*>" )
# Whitespace, names and quoted strings.
a( "S" ,"[ \\n\\t\\r]+" )
a( "NameStrt" ,"[A-Za-z_:]|[^\\x00-\\x7F]" )
a( "NameChar" ,"[A-Za-z0-9_:.-]|[^\\x00-\\x7F]" )
a( "Name" ,"(?:%(NameStrt)s)(?:%(NameChar)s)*" )
a( "QuoteSE" ,"\"[^\"]*\"|'[^']*'" )
# Pieces of a DOCTYPE declaration and of "<?...?>" items.
a( "DT_IdentSE" ,"%(S)s%(Name)s(?:%(S)s(?:%(Name)s|%(QuoteSE)s))*" )
a( "MarkupDeclCE" ,"(?:[^\\]\"'><]+|%(QuoteSE)s)*>" )
a( "S1" ,"[\\n\\r\\t ]" )
a( "UntilQMs" ,"[^?]*\\?+" )
a( "PI_Tail" ,"\\?>|%(S1)s%(UntilQMs)s(?:[^>?]%(UntilQMs)s)*>" )
# "%%" is the %-format escape for a literal '%' (the Perl version has
# "%$Name;" at this point).
a( "DT_ItemSE"
,"<(?:!(?:--%(Until2Hyphens)s>|[^-]%(MarkupDeclCE)s)|\\?%(Name)s(?:%(PI_Tail)s))|%%%(Name)s;|%(S)s"
)
a( "DocTypeCE"
,"%(DT_IdentSE)s(?:%(S)s)?(?:\\[(?:%(DT_ItemSE)s)*](?:%(S)s)?)?>?" )
# What may follow "<!", "<?", "</" and "<" respectively.
a( "DeclCE"
,"--(?:%(CommentCE)s)?|\\[CDATA\\[(?:%(CDATA_CE)s)?|DOCTYPE(?:%(DocTypeCE)s)?"
)
a( "PI_CE" ,"%(Name)s(?:%(PI_Tail)s)?" )
a( "EndTagCE" ,"%(Name)s(?:%(S)s)?>?" )
a( "AttValSE" ,"\"[^<\"]*\"|'[^<']*'" )
a( "ElemTagCE"
,"%(Name)s(?:%(S)s%(Name)s(?:%(S)s)?=(?:%(S)s)?(?:%(AttValSE)s))*(?:%(S)s)?/?>?"
)
a( "MarkupSPE"
,"<(?:!(?:%(DeclCE)s)?|\\?(?:%(PI_CE)s)?|/(?:%(EndTagCE)s)?|(?:%(ElemTagCE)s)?)"
)
# Top-level patterns: text-or-markup, and markup alone.
a( "XML_SPE" ,"%(TextSE)s|%(MarkupSPE)s" )
a( "XML_MARKUP_ONLY_SPE" ,"%(MarkupSPE)s" )
def lexxml(data, markuponly=0):
    """Split *data* into a list of shallow-parsed XML tokens.

    data       -- the XML text to tokenize.
    markuponly -- if true, match with the "XML_MARKUP_ONLY_SPE" pattern so
                  that only markup items are returned; otherwise (the
                  default) match with "XML_SPE", which also returns the
                  text runs between markup.

    Returns the list of matched substrings in document order.  The
    patterns contain no capturing groups, so ``findall`` yields whole
    matches.
    """
    # Previously `markuponly` was accepted but ignored; select the pattern
    # that was registered for it.
    if markuponly:
        pattern_name = "XML_MARKUP_ONLY_SPE"
    else:
        pattern_name = "XML_SPE"
    regex = re.compile(collector.res[pattern_name])
    return regex.findall(data)
(I stripped a few features out of the Python code to make it equivalent.
If it doesn't run as written above you can see the original here:
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/65125)
Paul Prescod
More information about the Python-list
mailing list