pyparsing with nested table

astarocean astarocean at gawab.com
Thu Dec 8 04:31:54 EST 2005


using pyparsing to deal with nested tables , wanna keep table's
structure and propertys .
but program was chunked with the </td> tag of inner table.

have any ideas?

here's the program


from pyparsing import *

mytable = """
<table id="leftpage_table" width="156" border="0" cellspacing="0"
cellpadding="0">
  <tr id="trtd" height="24">
    <td width="153" background="images/bt_kind.gif" align="center"
class="left_menu">system</td>
  </tr>
  <tr id="trtd_down" height="20">
    <td id="trtd_down"><table id="inner_lefgpage_table" width="100%"
height="100%" border="0" cellspacing="0" cellpadding="0">
        <tr id="inner_trtd" height="20">
          <td background="images/bt_class.gif" align="center">art</td>
        </tr>
        <tr>
          <td background="images/bt_class.gif" align="center">art</td>
        </tr>
      </table></td>
  </tr>
</table>
"""

startTag = Literal("<")
endTag = Literal(">")
idPattern = CaselessLiteral("id").suppress() + Literal("=").suppress()
+ ( quotedString.copy().setParseAction( removeQuotes ) |
Word(srange("[a-zA-Z0-9_~]")))
attrPattern = Combine(Word(alphanums + "_") + Literal("=") + (
quotedString | Word(srange("[a-zA-Z0-9_~:&@#;?/\.]"))))

tablePattern = Forward()
def getItemCloseTag(x):
    itemCloseTag = Combine(startTag + Literal("/") + CaselessLiteral(x)
+ endTag).suppress()
    return itemCloseTag
def getItemStartTag(x):
    itemStartTag = startTag.suppress() +
Keyword(x,caseless=True).suppress() + Group(ZeroOrMore(idPattern)) +
Group(ZeroOrMore(attrPattern)) + endTag.suppress()
    return itemStartTag
def getItemPattern(x):
    tCloseTag = getItemCloseTag(x)
    itemPattern = getItemStartTag(x) + Group(ZeroOrMore(tablePattern))
+ Group(SkipTo(tCloseTag)) + tCloseTag
    return itemPattern
def getMultiLevelPattern(x,y):
    tCloseTag = getItemCloseTag(x)
    itemPattern = getItemStartTag(x) + Group(OneOrMore(y)) + tCloseTag
    return itemPattern

tdPattern = getItemPattern(x='td')
trPattern = getMultiLevelPattern('tr',tdPattern)
tablePattern = getMultiLevelPattern('table',trPattern)
t = tablePattern
for toks,strt,end in t.scanString(mytable):
    print toks.asList()


OutPut:
[['leftpage_table'], ['width="156"', 'border="0"', 'cellspacing="0"',
'cellpadding="0"'], [['trtd'], ['height="24"'], [[], ['width="153"',
'background="images/bt_kind.gif"', 'align="center"',
'class="left_menu"'], [], ['system']], ['trtd_down'], ['height="20"'],
[['trtd_down'], [], [], ['<table id="inner_lefgpage_table" width="100%"
height="100%" border="0" cellspacing="0" cellpadding="0">\n        <tr
id="inner_trtd" height="20">\n          <td
background="images/bt_class.gif" align="center">art']], [], [], [[],
['background="images/bt_class.gif"', 'align="center"'], [], ['art']]]]




More information about the Python-list mailing list