Regular Expression question

Mon Aug 21 06:42:24 EDT 2006

I am not an expert in REs yet; this is my first possible solution:

import re

txt = """
<tag1 name="john"/>  <br/> <tag2 value="adj__tall__"/>
<tag1 name="joe"/>
<tag1 name="jack"/>
<tag2 value="adj__short__"/>"""

# Verbose (re.X) pattern: whitespace is ignored and "#" starts a comment that
# runs to the end of the line. Every comment must therefore stay on a single
# line -- a wrapped comment tail would be compiled as literal pattern text.
tfinder = r"""<                # The opening < of the tag to find
               \s*             # Possible space or newline
               (tag[12])       # First subgroup: the identifier, tag1 or tag2
               \s+             # One or more spaces or newlines
               (?:name|value)  # Name or value, non-capturing
               \s*             # Possible space or newline
               =               # The =
               \s*             # Possible space or newline
               "               # Opening double quote
               ([^"]*)         # Second subgroup: the value, no double quotes
               "               # Closing double quote of the string
               \s*             # Possible space or newline
               /?              # One optional ending /
               \s*             # Possible space or newline
               >               # The closing > of the tag (required)
               """
patt = re.compile(tfinder, flags=re.I | re.X)


def pair_names_with_adjectives(text):
    """Return (name, adjective) pairs found in *text*.

    The tags matched by ``patt`` are scanned in order; anything the pattern
    does not match (e.g. ``<br/>``) is skipped. Each tag2 whose previous
    matched tag is a tag1 yields one pair: the tag1 string and the tag2
    string with every "adj__" removed and leading/trailing underscores
    stripped.
    """
    pairs = []
    prec_type = ""
    prec_string = ""
    for mobj in patt.finditer(text):
        curr_type, curr_string = mobj.groups()
        # The pattern is case-insensitive (re.I), so normalise the tag name
        # before comparing it with the lowercase literals below.
        curr_type = curr_type.lower()
        if curr_type == "tag2" and prec_type == "tag1":
            adjective = curr_string.replace("adj__", "").strip("_")
            pairs.append((prec_string, adjective))
        prec_type = curr_type
        prec_string = curr_string
    return pairs


for name, adjective in pair_names_with_adjectives(txt):
    # A single pre-formatted argument prints the same on Python 2 and 3.
    print("%s %s" % (name, adjective))

Bye,
bearophile