make sure entire string was parsed

Steven Bethard steven.bethard at gmail.com
Mon Sep 12 11:15:41 EDT 2005


Paul McGuire wrote:
>>>I have to differentiate between:
>>>  (NP -x-y)
>>>and:
>>>  (NP-x -y)
>>>I'm doing this now using Combine.  Does that seem right?
> 
> If your word char set is just alphanums+"-", then this will work
> without doing anything unnatural with leaveWhitespace:
> 
> from pyparsing import *
> 
> thing = Word(alphanums+"-")
> LPAREN = Literal("(").suppress()
> RPAREN = Literal(")").suppress()
> node = LPAREN + OneOrMore(thing) + RPAREN
> 
> print node.parseString("(NP -x-y)")
> print node.parseString("(NP-x -y)")
> 
> will print:
> 
> ['NP', '-x-y']
> ['NP-x', '-y']

I actually need to break these into:

['NP', '-x-y'] {'tag':'NP', 'word':'-x-y'}
['NP', 'x', 'y'] {'tag':'NP', 'functions':['x'], 'word':'y'}

I know the dict syntax afterwards isn't quite what pyparsing would 
output, but hopefully my intent is clear.  I need to use the dict-style 
results from setResultsName() calls because in the full grammar, I have 
a lot of optional elements.  For example:

(NP-1 -a)
   --> {'tag':'NP', 'id':'1', 'word':'-a'}
(NP-x-2 -B)
   --> {'tag':'NP', 'functions':['x'], 'id':'2', 'word':'-B'}
(NP-x-y=2-3 -4)
   --> {'tag':'NP', 'functions':['x', 'y'], 'coord':'2', 'id':'3', 
'word':'-4'}
(-NONE- x)
   --> {'tag':None, 'word':'x'}



STeVe

P.S.  In case you're curious, here's my current draft of the code:

# Character classes.
# NOTE(review): `_pp` is presumably the pyparsing module and `_id_trans`
# an identity translation table; both are defined outside this excerpt --
# confirm.  With the two-argument form of str.translate, the second
# argument lists the characters that are deleted from the string.
printables_trans = _pp.printables.translate

# character sets: printables minus the characters reserved at each level
word_chars = printables_trans(_id_trans, '()')
syn_chars = printables_trans(_id_trans, '()-=')
func_chars = printables_trans(_id_trans, '()-=0123456789')

# one Word element per character set, plus a digits-only word
word_elem = _pp.Word(word_chars)
syn_word = _pp.Word(syn_chars)
func_word = _pp.Word(func_chars)
num_word = _pp.Word(_pp.nums)

# Separators between the parts of a tag.  `dash` itself is kept as a
# visible literal because special tags such as -NONE- include their dashes.
dash = _pp.Literal('-')
tag_sep = dash.suppress()
coord_sep = _pp.Literal('=').suppress()

# Tag pieces, each built and given its result name in a single step.
# Combine is used so that no whitespace is allowed inside a piece.
special_tag = _pp.Combine(dash + syn_word + dash).setResultsName('tag')
syn_tag = syn_word.setResultsName('tag')
func_tags = _pp.ZeroOrMore(
    _pp.Combine(tag_sep + func_word)).setResultsName('funcs')
coord_tag = _pp.Optional(
    _pp.Combine(coord_sep + num_word)).setResultsName('coord')
id_tag = _pp.Optional(
    _pp.Combine(tag_sep + num_word)).setResultsName('id')

# A complete tag is either a special tag or a syntactic tag followed by
# its optional function, coordination and id parts; the outer Combine
# keeps the whole normal tag free of embedded whitespace.
normal_tags = syn_tag + func_tags + coord_tag + id_tag
tags = special_tag | _pp.Combine(normal_tags)
def get_tag(orig_string, tokens_start, tokens):
    """Turn the named results of a matched tag into one plain dict.

    ``orig_string`` and ``tokens_start`` are accepted but not used.
    ``tokens`` must be convertible with ``dict()`` and must contain a
    ``'tag'`` entry (``KeyError`` otherwise); ``'funcs'``, ``'coord'`` and
    ``'id'`` are optional.  The argument itself is not modified.

    Returns a one-element list holding a dict with the keys ``tag``
    (``None`` when the tag text is ``'-NONE-'``), ``functions`` (a list,
    empty when no ``'funcs'`` entry exists), ``coord`` and ``id`` (each
    ``None`` when absent).
    """
    named = dict(tokens)
    tag_text = named['tag']
    return [{
        'tag': None if tag_text == '-NONE-' else tag_text,
        'functions': list(named.get('funcs', [])),
        'coord': named.get('coord'),
        'id': named.get('id'),
    }]
# run get_tag on whatever the tags element matches
tags.setParseAction(get_tag)

# node parentheses (suppressed, so they are left out of the parsed tokens)
start = _pp.Literal('(').suppress()
end = _pp.Literal(')').suppress()

# words: a word_elem whose match is stored under the result name 'word'
word = word_elem.setResultsName('word')

# leaf nodes: the tags, optionally followed by a single word
leaf_node = tags + _pp.Optional(word)
def get_leaf_node(orig_string, tokens_start, tokens):
    """Build a leaf node from the tokens matched by ``leaf_node``.

    ``orig_string`` and ``tokens_start`` are accepted but not used.
    ``tokens`` holds either the tag dict followed by a word, or the tag
    dict alone.  ``cls`` comes from the enclosing scope, which is not
    part of this excerpt.

    Returns ``cls(word=..., **tag_dict)``; ``word`` is ``None`` when no
    word was matched, otherwise the result of ``cls._unescape(word)``.
    """
    try:
        tag_dict, word = tokens
    except ValueError:
        # unpacking into two names failed: expect the tag dict alone
        tag_dict, = tokens
        word = None
    else:
        # Kept outside the try body so that a ValueError raised by
        # _unescape propagates as-is instead of being mistaken for the
        # "no word" case and masked by a second unpacking error.
        word = cls._unescape(word)
    return cls(word=word, **tag_dict)
# run get_leaf_node on whatever the leaf_node element matches
leaf_node.setParseAction(get_leaf_node)

# node, recursive: created as a Forward placeholder here because
# branch_node refers to it; its contents are attached later with `<<`
node = _pp.Forward()

# branch nodes: the tags followed by one or more child nodes
branch_node = tags + _pp.OneOrMore(node)
def get_branch_node(orig_string, tokens_start, tokens):
    """Build a branch node from the tokens matched by ``branch_node``.

    ``orig_string`` and ``tokens_start`` are accepted but not used.  The
    first token is expanded into keyword arguments and every following
    token is passed on as ``children``.  ``cls`` comes from the enclosing
    scope, which is not part of this excerpt.
    """
    child_nodes = tokens[1:]
    tag_dict = tokens[0]
    return cls(children=child_nodes, **tag_dict)
# run get_branch_node on whatever the branch_node element matches
branch_node.setParseAction(get_branch_node)

# node, recursive: fill in the Forward placeholder.  `+` binds tighter
# than `<<`, so the whole parenthesised sequence is what gets attached.
# branch_node is tried before leaf_node.
node << start + (branch_node | leaf_node) + end

# root node may have additional parentheses
# (`+` binds tighter than `|`: a plain node, or a node wrapped in one
# more pair of parentheses)
root_node = node | start + node + end
root_nodes = _pp.OneOrMore(root_node)

# make sure nodes start and end string: anchoring with StringEnd makes
# the parse fail unless the entire input string is consumed
str_start = _pp.StringStart()
str_end = _pp.StringEnd()
# single-root and multi-root grammars, stored on cls
cls._root_node = str_start + root_node + str_end
cls._root_nodes = str_start + root_nodes + str_end



More information about the Python-list mailing list