Regular Expression question
bearophileHUGS at lycos.com
bearophileHUGS at lycos.com
Mon Aug 21 06:42:24 EDT 2006
I am not an expert on REs yet; this is my first possible solution:
import re
# Sample input: tag1 elements carry a name, tag2 elements carry an adjective
# wrapped as adj__<word>__.  Other tags (such as <br/>) are ignored.
txt = """
<tag1 name="john"/> <br/> <tag2 value="adj__tall__"/>
<tag1 name="joe"/>
<tag1 name="jack"/>
<tag2 value="adj__short__"/>"""

# Verbose (re.X) pattern: whitespace is ignored and "#" starts a comment that
# runs to the end of the line.  Every comment must therefore fit on ONE line;
# a wrapped continuation line without its own "#" is compiled as pattern text.
tfinder = r"""
    <               # The opening < of the tag to find
    \s*             # Possible space or newline
    (tag[12])       # First subgroup, the identifier: tag1 or tag2
    \s+             # There must be one or more spaces or newlines
    (?:name|value)  # Name or value, non-capturing
    \s*             # Possible space or newline
    =               # The =
    \s*             # Possible space or newline
    "               # Opening " of the string
    ([^"]*)         # Second subgroup, the tag string; it cannot contain "
    "               # Closing " of the string
    \s*             # Possible space or newline
    /?              # One optional ending /
    \s*             # Possible space or newline
    >?              # The closing > of the tag; the ? makes it optional
"""
patt = re.compile(tfinder, flags=re.I | re.X)


def iter_pairs(text):
    """Yield (name, adjective) pairs found in *text*.

    Scans *text* for tag1/tag2 elements with ``patt``.  Each time a tag2
    match comes directly after a tag1 match (non-matching markup in between
    is skipped), yields the tag1 string together with the tag2 string after
    removing "adj__" and stripping leading/trailing underscores.  A tag1
    that is followed by another tag1 is dropped.
    """
    prec_type = ""
    prec_string = ""
    for mobj in patt.finditer(text):
        curr_type, curr_string = mobj.groups()
        # The pattern is compiled with re.I, so normalise the captured tag
        # name before comparing it with the lower-case literals below.
        curr_type = curr_type.lower()
        if curr_type == "tag2" and prec_type == "tag1":
            yield prec_string, curr_string.replace("adj__", "").strip("_")
        prec_type = curr_type
        prec_string = curr_string


for name, adjective in iter_pairs(txt):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("%s %s" % (name, adjective))
Bye,
bearophile
More information about the Python-list
mailing list