Regex for nested {}

Diez B.Roggisch deets at web.de
Thu Jul 28 12:37:52 EDT 2005


Chris <c <at> cdot.de> writes:
> 
> is something like that possible?

No. Not with "pure" regexes. The reason is that the theory behind them doesn't
allow them to recognize syntactic
constructs like a**n b**n (n copies of a followed by n copies of b), with a={ and b=} in your case.
What you need is a "real" parser - usually one
uses regexes to split the string into so-called tokens. Then either a handwritten
parser or a parser generator is used.

The following is a simple implementation based on the spark parser generator:


import spark

class ParsingError(Exception):
    """Error raised when scanning or parsing the input fails.

    Derives from ``Exception`` so it can be raised on every Python version
    and is caught by generic ``except Exception`` handlers.

    Args:
        msg: Human-readable description of the problem (may be ``None``).
    """

    def __init__(self, msg=None):
        # Explicit base-class call (rather than super()) keeps this usable
        # on old Python 2 releases as well.
        Exception.__init__(self, msg)
        self.msg = msg

    def __repr__(self):
        return "Parsing Error: %s" % self.msg

    # Printing the exception must keep showing the "Parsing Error: ..." text,
    # not just the bare message that Exception.__str__ would produce.
    __str__ = __repr__
        
class Token:
    """A lexical token: a ``type`` tag plus an optional ``attr`` payload."""

    def __init__(self, type, attr=None):
        # ``type`` shadows the builtin, but it is the keyword callers pass.
        self.attr = attr
        self.type = type

    def __repr__(self):
        # Show the payload when there is a non-empty one, else the type tag.
        if self.attr:
            return self.attr
        return self.type

    def __cmp__(self, o):
        # Comparison is delegated to the token's type; the other operand is
        # compared against it as-is.
        return cmp(self.type, o)

class Scanner(spark.GenericScanner):
    """Tokenizer that turns the input text into a list of ``Token`` objects.

    NOTE: following the spark GenericScanner convention (confirm against the
    spark documentation), the string at the top of each ``t_*`` method is the
    token's regular expression, not documentation. Those strings are part of
    the program's behavior and must be edited with care.
    """

    def __init__(self):
        spark.GenericScanner.__init__(self)

    def error(self, s, pos):
        message = "Lexical error at %i" % pos
        raise ParsingError(message)

    def tokenize(self, input):
        # The t_* callbacks below append to self.rv while the base class
        # processes the input; start from an empty list on every call.
        self.rv = []
        spark.GenericScanner.tokenize(self, input)
        return self.rv

    def t_whitespace(self, s):
        r' \s+ '
        # Whitespace produces no token.

    def t_word(self, s):
        r'[a-z]\w*'
        self.rv.append(Token(type='word', attr=s))

    def t_parentheses(self, s):
        r' \{| \}'
        # The matched brace itself is used as the token type.
        self.rv.append(Token(type=s))

    def t_comment(self, s):
        r'\#.*'
        # Comments produce no token.

    # NOTE: t_default has no docstring in this scanner. Do not add one:
    # it would change the method's __doc__, which spark is understood to use
    # as the token pattern.
    def t_default(self, s):
        pass

class Parser(spark.GenericParser):
    """Parser for a sequence of ``word { ... }`` blocks that may nest.

    NOTE: following the spark GenericParser convention (confirm against the
    spark documentation), the string at the top of each ``p_*`` method is a
    grammar rule, not documentation. Indentation in this class uses spaces
    only; the original mixed tabs and spaces, which Python 3 and
    ``python -tt`` reject.

    Args:
        start: Name of the grammar's start symbol.
    """

    def __init__(self, start='expr'):
        spark.GenericParser.__init__(self, start)

    def p_expr_1(self, args):
        '''
        expr ::= word block expr
        '''
        # args holds one value per right-hand-side symbol: the word, the
        # block's result, and the result for the remaining expr.
        return [(args[0], args[1])] + args[2]

    def p_expr_2(self, args):
        '''
        expr ::=
        '''
        # Empty production: ends the recursion in p_expr_1 with an empty list.
        return []

    def p_block(self, args):
        '''
        block ::= { expr }
        '''
        # Drop the surrounding braces and keep only the inner expr result.
        return args[1]

    def error(self, token):
        raise ParsingError("Syntax error at or near `%s' token" % token)

def scan(input):
    """Tokenize ``input`` with a fresh ``Scanner``.

    Args:
        input: The text to tokenize.

    Returns:
        The token list produced by ``Scanner.tokenize``, or ``None`` if a
        ``ParsingError`` was raised (the error is printed, not propagated).
    """
    scanner = Scanner()
    try:
        return scanner.tokenize(input)
    except ParsingError as e:
        # ``except ... as`` and the parenthesized print are valid on both
        # Python 2.6+ and Python 3; the output is the same as before.
        print(e)
        return None


def parse(tokens):
    """Run a new ``Parser`` over ``tokens`` and return whatever it produces."""
    parser = Parser()
    result = parser.parse(tokens)
    return result

# Sample input used by main(): an "outer" block containing two empty nested
# blocks, followed by an empty "simple" block.
text = """
        outer {
                inner1 {  }
                inner2 {  }
        }
        simple {  }
"""

def main():
    """Scan and parse the sample ``text`` and print the parse result."""
    tokens = scan(text)
    if tokens is None:
        # scan() returns None after printing a lexical error; passing that
        # on to the parser would only fail with an unrelated error.
        return
    # Parenthesized single-argument print behaves the same on Python 2 and 3.
    print(parse(tokens))


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()





More information about the Python-list mailing list