A nice way to use regex for complicated parsing

aspineux aspineux at gmail.com
Thu Mar 29 10:34:47 EDT 2007


My goal is to write a parser for these imaginary strings from the SMTP
protocol, following RFC 821 and RFC 1869.
I'm a little flexible with the BNF from these RFCs :-)
Any comments?

# Sample SMTP command lines fed to the parser below.  They cover an address
# with and without the enclosing <>, with and without trailing ESMTP
# parameters, and with a quoted local part containing '@', '>', ' ' and '='.
#
# NOTE(review): the archived copy of this script had long entries hard-wrapped
# in the middle of the string literal (a syntax error) and appears to have had
# every "@" inside an address rewritten as " at " by the list archive.  Both
# are undone here; each entry is a single line and parameters are separated
# by a single space.
tests = [
    'MAIL FROM:<john.smith@address.com>',
    'MAIL FROM:john.smith@address.com',
    'MAIL FROM:<john.smith@address.com> SIZE=1234 OTHER=foo@bar.com',
    'MAIL FROM:john.smith@address.com SIZE=1234 OTHER=foo@bar.com',
    'MAIL FROM:<"this@is.a> legal=email"@address.com>',
    'MAIL FROM:"this@is.a> legal=email"@address.com',
    'MAIL FROM:<"this@is.a> legal=email"@address.com> SIZE=1234 OTHER=foo@bar.com',
    'MAIL FROM:"this@is.a> legal=email"@address.com SIZE=1234 OTHER=foo@bar.com',
]

def RN(name, regex):
    """Wrap *regex* in parentheses so it composes safely inside a larger pattern.

    A truthy *name* produces a named capturing group ``(?P<name>...)``;
    a falsy one (``None`` or ``''``) produces a non-capturing group
    ``(?:...)``.  The wrapped pattern is returned as a string.
    """
    if not name:
        # Anonymous fragment: group it without creating a capture.
        return r'(?:%s)' % regex
    return r'(?P<%s>%s)' % (name, regex)


# Table of regex fragments keyed by grammar rule name.  Each entry is wrapped
# by RN() and may interpolate earlier entries with %(key)s, so the order of
# the assignments matters.  The rules follow the BNF of RFC 821 / RFC 1869,
# deliberately loosened in places.
regex = {}

# <dotnum> ::= <snum> "." <snum> "." <snum> "." <snum>
regex['dotnum'] = RN(
    None,
    r'[012]?\d?\d\.[012]?\d?\d\.[012]?\d?\d\.[012]?\d?\d',
)
# <dot-string> ::= <string> | <string> "." <dot-string>
regex['dot_string'] = RN(None, r'[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)*')
# <domain> ::=  <element> | <element> "." <domain>
regex['domain'] = RN('domain', r'%(dotnum)s|%(dot_string)s' % regex)
# <q> ::= any one of the 128 ASCII characters except <CR>, <LF>, quote ("),
#         or backslash (\)
regex['q'] = RN(None, r'[^\n\r"\\]')
# <x> ::= any one of the 128 ASCII characters (no exceptions)
regex['x'] = RN(None, r'.')
# <qtext> ::=  "\" <x> | "\" <x> <qtext> | <q> | <q> <qtext>
regex['qtext'] = RN(None, r'(?:\\%(x)s|%(q)s)+' % regex)
# <quoted-string> ::=  """ <qtext> """
regex['quoted_string'] = RN('quoted_string', r'"%(qtext)s"' % regex)
# <local-part> ::= <dot-string> | <quoted-string>
regex['local_part'] = RN(
    'local_part',
    r'%(quoted_string)s|%(dot_string)s' % regex,
)
# <mailbox> ::= <local-part> "@" <domain>
regex['mailbox'] = RN('mailbox', r'%(local_part)s@%(domain)s' % regex)
# <path> ::= "<" [ <a-d-l> ":" ] <mailbox> ">"
# also accept address without <>: the closing ">" is required only when the
# optional opening "<" (group path_lt) was matched.
regex['path'] = RN(
    'path',
    r'(?P<path_lt><)?%(mailbox)s(?(path_lt)>)' % regex,
)
# esmtp-keyword    ::= (ALPHA / DIGIT) *(ALPHA / DIGIT / "-")
regex['esmtp_keyword'] = RN(None, r'[a-zA-Z0-9][-a-zA-Z0-9]*')
# esmtp-value      ::= 1*<any CHAR excluding "=", SP, and all
#                      control characters (US ASCII 0-31 inclusive)>
#                      ; syntax and values depend on esmtp-keyword
# NOTE: looser than the grammar -- an empty value is accepted and only the
# listed whitespace characters are excluded, not every control character.
regex['esmtp_value'] = RN(None, r'[^= \t\r\n\f\v]*')
# esmtp-parameter  ::= esmtp-keyword ["=" esmtp-value]
regex['esmtp_parameter'] = RN(
    None,
    r'%(esmtp_keyword)s(?:=%(esmtp_value)s)?' % regex,
)
# esmtp-parameters ::= esmtp-parameter | esmtp-parameter SP esmtp-parameters
# The repetition is "*" (zero or more further parameters).  With "+" a lone
# parameter such as "SIZE=1234" could never fill this group, and the
# enclosing optional group in esmtp_addr would be silently skipped.
regex['esmtp_parameters'] = RN(
    'esmtp_parameters',
    r'%(esmtp_parameter)s(?:\s+%(esmtp_parameter)s)*' % regex,
)
# esmtp-cmd        ::= inner-esmtp-cmd [SP esmtp-parameters] CR LF
regex['esmtp_addr'] = RN(
    'esmtp_addr',
    r'%(path)s(?:\s+%(esmtp_parameters)s)?' % regex,
)

# The script calls re.match below but the visible file never imports re.
import re

# Run every sample line through the esmtp_addr pattern and report the pieces.
for t in tests:
    # Drop a leading SMTP verb (compared case-insensitively) so that only the
    # address and any trailing parameters are handed to the regex.
    for keyword in ['MAIL FROM:', 'RCPT TO:']:
        keylen = len(keyword)
        if t[:keylen].upper() == keyword:
            t = t[keylen:]
            break

    # re.match anchors at the start only, so text after the matched address
    # and parameters is not rejected.
    match = re.match(regex['esmtp_addr'], t)
    if match:
        # Single-argument print(...) is valid under both Python 2 and 3.
        print('MATCH local_part=%(local_part)s domain=%(domain)s '
              'esmtp_parameters=%(esmtp_parameters)s' % match.groupdict())
    else:
        print('DONT match %s' % t)




More information about the Python-list mailing list