Perl vs. Python for text manipulation

Sat Jan 24 16:19:08 EST 2004

Cameron Laird wrote:
> ...
> The comparison with Perl in particular interests me.  I
> often encounter characterizations of Perl (or Ruby--I
> regard them as equivalent in this dimension) as the par-
> agon of text manipulation.  It's not, of course, as Icon
> conclusively demonstrates, at least for me; but I don't
> even see Perl as distinctly superior to Python in text
> mangling.  I recognize that Python REs can feel a bit
> cumbersome, in comparison to Perl, because they essenti-
> ally demand the extra step of explicit compilation.  Is
> that all that people mean, though, when they talk about
> Perl's superiority for text mangling?  Is there more to
> it?

Here's an anecdotal sample where I think that Perl and Python are 
roughly the same. I translated the Perl to Python. Note that the Python 
is a little more complicated, primarily because of Python string 
interpolation and not regular expressions.

http://www.cs.sfu.ca/~cameron/REX.html

# REX/Perl 1.0
# Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
# Technical Report TR 1998-17, School of Computing Science, Simon Fraser
# University, November, 1998.
# Copyright (c) 1998, Robert D. Cameron.
# The following code may be freely used and distributed provided that
# this copyright and citation notice remains intact and that modifications
# or additions are clearly identified.

# Regex fragments, built bottom-up as plain strings: each later fragment
# interpolates earlier ones, so the order of these assignments matters.
# The final result is $XML_SPE.

# One or more characters other than '<'.
$TextSE = "[^<]+";
# Fragments that scan forward to hyphens.
$UntilHyphen = "[^-]*-";
$Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
$CommentCE = "$Until2Hyphens>?";
# Fragments that scan forward to right square brackets.
$UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
$CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
# A run of one or more space, newline, tab or carriage-return characters.
$S = "[ \\n\\t\\r]+";
# Names: an ASCII letter, '_' or ':' (or any character outside \x00-\x7F),
# followed by any number of name characters.
$NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
$NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
$Name = "(?:$NameStrt)(?:$NameChar)*";
# A double-quoted or single-quoted string.
$QuoteSE = "\"[^\"]*\"|'[^']*'";
$DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
$MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
# Exactly one whitespace character (same character set as $S, no repetition).
$S1 = "[\\n\\r\\t ]";
# Fragments that scan forward to question marks.
$UntilQMs = "[^?]*\\?+";
$PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
$DT_ItemSE = 
"<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
$DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
$DeclCE = 
"--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
$PI_CE = "$Name(?:$PI_Tail)?";
$EndTagCE = "$Name(?:$S)?>?";
# A quoted string that contains no '<'.
$AttValSE = "\"[^<\"]*\"|'[^<']*'";
$ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
# Anything that starts with '<'; the tail after '<' is optional in every branch.
$MarkupSPE = 
"<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
# Top-level pattern: a run of text or one piece of markup.
$XML_SPE = "$TextSE|$MarkupSPE";

# Apply $XML_SPE globally to the document passed as the first argument
# and return the result of that match; in list context this is the list
# of every successive match.
sub ShallowParse {
   my $document = shift;
   return $document =~ /$XML_SPE/g;
}

===============

import re
class recollector:
    """Registry of named regex fragments that expands references on insert.

    Each fragment is %-formatted against the fragments registered so far,
    so a template may embed an earlier fragment as ``%(name)s``.
    """

    def __init__(self):
        # Maps fragment name -> fully expanded pattern text.
        self.res = dict()

    def add(self, name, reg):
        """Expand ``reg`` against the stored fragments and store it as ``name``.

        ``reg`` is a %-format template; a literal percent sign must be
        written as ``%%``. Referencing a name that has not been registered
        yet raises ``KeyError`` and leaves the registry unchanged.
        """
        expanded = reg % self.res
        self.res[name] = expanded

# Shared registry of expanded regex fragments; `a` is a short alias for
# registering one fragment.
collector = recollector()
a = collector.add

# (name, template) pairs. Order matters: each template is expanded when it
# is registered, so it may only reference names that appear earlier in
# this table. A literal percent sign is written as "%%".
_FRAGMENTS = (
    ("TextSE", "[^<]+"),
    ("UntilHyphen", "[^-]*-"),
    ("Until2Hyphens", "%(UntilHyphen)s(?:[^-]%(UntilHyphen)s)*-"),
    ("CommentCE", "%(Until2Hyphens)s>?"),
    ("UntilRSBs", "[^\\]]*](?:[^\\]]+])*]+"),
    ("CDATA_CE", "%(UntilRSBs)s(?:[^\\]>]%(UntilRSBs)s)*>"),
    ("S", "[ \\n\\t\\r]+"),
    ("NameStrt", "[A-Za-z_:]|[^\\x00-\\x7F]"),
    ("NameChar", "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"),
    ("Name", "(?:%(NameStrt)s)(?:%(NameChar)s)*"),
    ("QuoteSE", "\"[^\"]*\"|'[^']*'"),
    ("DT_IdentSE", "%(S)s%(Name)s(?:%(S)s(?:%(Name)s|%(QuoteSE)s))*"),
    ("MarkupDeclCE", "(?:[^\\]\"'><]+|%(QuoteSE)s)*>"),
    ("S1", "[\\n\\r\\t ]"),
    ("UntilQMs", "[^?]*\\?+"),
    ("PI_Tail", "\\?>|%(S1)s%(UntilQMs)s(?:[^>?]%(UntilQMs)s)*>"),
    ("DT_ItemSE", "<(?:!(?:--%(Until2Hyphens)s>|[^-]%(MarkupDeclCE)s)|\\?%(Name)s(?:%(PI_Tail)s))|%%%(Name)s;|%(S)s"),
    ("DocTypeCE", "%(DT_IdentSE)s(?:%(S)s)?(?:\\[(?:%(DT_ItemSE)s)*](?:%(S)s)?)?>?"),
    ("DeclCE", "--(?:%(CommentCE)s)?|\\[CDATA\\[(?:%(CDATA_CE)s)?|DOCTYPE(?:%(DocTypeCE)s)?"),
    ("PI_CE", "%(Name)s(?:%(PI_Tail)s)?"),
    ("EndTagCE", "%(Name)s(?:%(S)s)?>?"),
    ("AttValSE", "\"[^<\"]*\"|'[^<']*'"),
    ("ElemTagCE", "%(Name)s(?:%(S)s%(Name)s(?:%(S)s)?=(?:%(S)s)?(?:%(AttValSE)s))*(?:%(S)s)?/?>?"),
    ("MarkupSPE", "<(?:!(?:%(DeclCE)s)?|\\?(?:%(PI_CE)s)?|/(?:%(EndTagCE)s)?|(?:%(ElemTagCE)s)?)"),
    ("XML_SPE", "%(TextSE)s|%(MarkupSPE)s"),
    ("XML_MARKUP_ONLY_SPE", "%(MarkupSPE)s"),
)

for _name, _template in _FRAGMENTS:
    a(_name, _template)

def lexxml(data, markuponly=0):
    """Split ``data`` into the list of substrings matched by the XML pattern.

    Parameters:
        data: the text to scan.
        markuponly: when false (the default) the "XML_SPE" pattern is
            used, which matches text runs as well as markup; when true
            the "XML_MARKUP_ONLY_SPE" pattern is used, so text between
            markup is left out of the result.

    Returns:
        A list of matched substrings, in the order they occur in ``data``.
    """
    # Previously `markuponly` was accepted but ignored; select the
    # registered pattern that corresponds to the flag.
    if markuponly:
        pattern_name = "XML_MARKUP_ONLY_SPE"
    else:
        pattern_name = "XML_SPE"
    regex = re.compile(collector.res[pattern_name])
    return regex.findall(data)

(I stripped a few features out of the Python code to make it equivalent. 
If it doesn't run as written above you can see the original here:

http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/65125)

  Paul Prescod