Regex for nested {}
Diez B.Roggisch
deets at web.de
Thu Jul 28 12:37:52 EDT 2005
Chris <c <at> cdot.de> writes:
>
> is something like that possible?
No. Not with "pure" regexes. The reason is that the theory behind them doesn't
allow them to detect syntactic
constructs like a**n b**n (n a's followed by exactly n b's), with a={ and b=} in your case.
What you need is a "real" parser - usually one
uses regexes to split the string into so-called tokens. Then either a handwritten
parser or a parser-generator is used.
The following is a simple implementation based on the spark parser generator:
import spark
class ParsingError(Exception):
    """Error raised for lexical or syntax problems in the input.

    Derives from ``Exception`` so that it can be raised and caught on any
    Python version; a plain class is only raisable as a Python 2 old-style
    class.

    Attributes:
        msg: Description of the problem, or None when none was given.
    """

    def __init__(self, msg=None):
        Exception.__init__(self, msg)
        self.msg = msg

    def __repr__(self):
        return "Parsing Error: %s" % self.msg

    # Exception brings its own __str__, which would print only the bare
    # message; alias it so printing the error keeps the "Parsing Error:" prefix.
    __str__ = __repr__
class Token:
    """A lexical token: a type tag plus optional matched text.

    A token compares equal to a value when its ``type`` equals that value,
    so it can be compared directly against a type string such as ``'word'``.

    Attributes:
        type: The token's type tag.
        attr: Optional attribute (e.g. the matched text), or None.
    """

    def __init__(self, type, attr=None):
        # `type` shadows the builtin, but it is part of the keyword
        # interface (Token(type=...)), so the name is kept.
        self.type = type
        self.attr = attr

    def __cmp__(self, o):
        # Python 2 comparison hook. Written without the cmp() builtin so a
        # direct call also works where cmp() no longer exists.
        return (self.type > o) - (self.type < o)

    # Python 3 never calls __cmp__, so equality on the type tag is also
    # provided through the rich-comparison methods.
    def __eq__(self, o):
        return self.type == o

    def __ne__(self, o):
        return not self.type == o

    def __hash__(self):
        # Keep hashing consistent with __eq__ (equal to its type tag).
        return hash(self.type)

    def __repr__(self):
        # Show the attribute when there is one, otherwise the type tag.
        return self.attr or self.type
class Scanner(spark.GenericScanner):
    """Tokenizer that turns the input text into a list of Token objects.

    Each ``t_*`` method carries its regular expression as its docstring
    (spark convention); those strings are patterns, not documentation, and
    must not be edited as prose. Tokens are accumulated in ``self.rv``.
    """

    def __init__(self):
        spark.GenericScanner.__init__(self)

    def error(self, s, pos):
        """Raise ParsingError reporting the position ``pos``."""
        message = "Lexical error at %i" % pos
        raise ParsingError(message)

    def tokenize(self, input):
        """Scan ``input`` and return the tokens collected in ``self.rv``."""
        # Start from an empty list on every call so results do not pile up.
        self.rv = []
        spark.GenericScanner.tokenize(self, input)
        return self.rv

    def t_whitespace(self, s):
        r' \s+ '
        # Whitespace produces no token.

    def t_word(self, s):
        r'[a-z]\w*'
        # Words keep their matched text as the token attribute.
        self.rv.append(Token(type='word', attr=s))

    def t_parentheses(self, s):
        r' \{| \}'
        # A brace token uses the brace character itself as its type.
        self.rv.append(Token(type=s))

    def t_comment(self, s):
        r'\#.*'
        # Comments produce no token.

    def t_default(self, s):
        # NOTE(review): unlike the other t_* rules this one has no pattern
        # docstring; confirm how spark treats that before adding one.
        pass
class Parser(spark.GenericParser):
    """Parser for a sequence of ``word { ... }`` blocks, possibly nested.

    Each ``p_*`` method carries its grammar production as its docstring
    (spark convention) and returns the value built for that production, so
    those strings are data rather than documentation.
    """

    def __init__(self, start='expr'):
        spark.GenericParser.__init__(self, start)

    def p_expr_1(self, args):
        '\nexpr ::= word block expr\n'
        # Pair the word with its block and prepend it to the remaining list.
        pair = (args[0], args[1])
        return [pair] + args[2]

    def p_expr_2(self, args):
        '\nexpr ::=\n'
        # Empty production: contributes an empty list.
        return list()

    def p_block(self, args):
        '\nblock ::= { expr }\n'
        # Drop the surrounding braces and keep the inner expression value.
        inner = args[1]
        return inner

    def error(self, token):
        """Raise ParsingError naming the offending ``token``."""
        message = "Syntax error at or near `%s' token" % token
        raise ParsingError(message)
def scan(input):
    """Tokenize ``input`` with a fresh Scanner.

    Args:
        input: The text to tokenize.

    Returns:
        The token list returned by ``Scanner.tokenize``, or None when a
        ParsingError was raised (the error is printed, not propagated).
    """
    scanner = Scanner()
    try:
        return scanner.tokenize(input)
    except ParsingError as e:  # `as` form parses on Python 2.6+ and 3.x
        # Single-argument print(...) behaves the same as a statement or a call.
        print(e)
        return None
def parse(tokens):
    """Run a default-configured Parser over ``tokens`` and return its result."""
    parser = Parser()
    return parser.parse(tokens)
# Sample input: named `{ }` blocks, one of them nested; main() scans and parses it.
text = """
outer {
inner1 { }
inner2 { }
}
simple { }
"""
def main():
    """Scan and parse the sample ``text`` and print the parse result."""
    tokens = scan(text)
    if tokens is None:
        # scan() prints the lexical error and yields None; don't hand that
        # to the parser.
        return
    # Single-argument print(...) works as both a statement and a function call.
    print(parse(tokens))
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
More information about the Python-list
mailing list