find all js/css/image pathnames in a HTML document

iMath redstone-cold at 163.com
Sat May 11 04:19:30 EDT 2019


To find all js/css/image pathnames in a HTML document, I used regular expression(in the last line of my code snippet) to do this as the following, are there any other shorter regular expressions or more efficient ways to do this ?


import re
translation='''<link rel="stylesheet" type="text/css" href="O8C.css">

<span eid="beetle_e" id="beetle_e" level="0" alpha_id="000003072" name="beetle" idm_id="000003072" backup-class="b" class="entry">

  <span level="1" class="h-g">

    <span level="2" class="top-g">

      <span level="3" class="h">bee·tle™</span>

      <span type="h_full_" level="3" class="ei-g">

        <span class="z_ei-g">/</span>

        <span file="{gb}/b/be/bee/beetle#_gb_1.spx" level="4" wd="beetle" recdate="070514" class="phon-gb">ˈbiːtl</span>

        <a type="sound" topic="b/bee/beetl/beetle__gb_1.spx" resource="uk_pron" backup-class="Media" class="fayin" href="sound://uk/beetle__gb_1.spx"><img src="uk_pron.png" class="fayin"/></a>

        <span class="z">;

          <span class="z_phon-us">NAmE</span>

        </span>

        <span file="{gb}/b/be/bee/beetle#_us_1.spx" sup="y" level="4" wd="beetle" recdate="070514" class="phon-us">ˈbiːtl</span>

        <a type="sound" topic="b/bee/beetl/beetle__us_1.spx" resource="us_pron" backup-class="Media" class="fayin" href="sound://us/beetle__us_1.spx"><img src="us_pron.png" class="fayin"/></a>

        <span class="z_ei-g">/</span>

      </span>

      <span level="3" display="inline" class="pos-g">

        <span topic="beetle_e" bookmark="beetle_pos_n" class="Ref">

          <a href="entry://#beetle_pos_n" level="4" pos="n" backup-class="pos">noun</a>

        </span>

        <span class="z">,</span>

        <span topic="beetle_e" bookmark="beetle_pos_v" class="Ref">

          <a href="entry://#beetle_pos_v" level="4" pos="v" backup-class="pos">verb</a>

        </span>

      </span>

    </span>

    <span level="2" class="infl">

      <span level="3" class="inflection">beetle</span>

      <span level="3" class="inflection">beetles</span>

      <span level="3" class="inflection">beetled</span>

      <span level="3" class="inflection">beetling</span>

    </span>

  </span>

  <a name="beetle_pos_n"></a>

  <span eid="beetle_pos_n" id="beetle_pos_n" level="1" class="p-g">

    <span level="2" class="block-g">

      <span level="3" class="pos-g">

        <span level="4" pos="n" class="pos">noun</span>

      </span>

      <img src="/pic/insects_comp.jpg" alt="/pic/insects_comp.jpg" height="620" width="720" style="display:none;" onclick="this.style.display='none';this.nextSibling.nextSibling.style.display='block';"/>

      <img type="image" topic="insects_comp.htm" thumb_resource="thumb" resource="pic" thumb="beetle.jpg" class="Media" backup-height="620" backup-width="720" src="/thumb/beetle.jpg" alt="/thumb/beetle.jpg" onclick="this.style.display='none';this.previousSibling.previousSibling.style.display='block';"/>

      <span class="clear"></span>

    </span>

    <span eid="beetle_ng_1" id="beetle_ng_1" level="2" n="1" class="n-g">

      <span class="z_n">1</span>

      <span level="3" class="def-g">

        <span status="6" level="4" tranidoupc="1" class="d">an insect, often large and black, with a hard case on its back, covering its wings. There are several types of

          <span level="5" class="dh">beetle.</span>

          <span localeuidoupc="201" status="6" level="5" class="chn">甲虫</span>

        </span>

      </span>

      <span xt="see" level="3" class="xr-g">

        <span class="symbols-xrsym">☞</span>see also

        <span eid="beetle_xr_1" id="beetle_xr_1" xt="see" href="deathwatchbeetle_e" level="4" pos="n" class="xr">

          <span topic="deathwatchbeetle_e" fk="XXX" class="Ref">

            <span level="5" class="xh">

              <a href="entry://death-watch beetle">death-watch beetle</a>

            </span>

          </span>

        </span>

      </span>

    </span>

    <span eid="beetle_ng_2" id="beetle_ng_2" new="seven" level="2" n="2" enc="y" class="n-g">

      <span class="z_n">2</span>

      <span level="3" class="alt">Beetle</span>

      <span level="3" class="vs-g">

        <span class="z">(</span>

        <span level="4" brackets="n" display="inline" class="label-g">

          <span level="5" g="amalso" class="g">NAmE also</span>

        </span>

        <span level="4" class="v">bug</span>

        <span class="z">)</span>

      </span>

      <span level="3" class="def-g">

        <span status="6" level="4" tranidoupc="7" class="d">the English names for the original Volkswagen small car with a round shape at the front and the back

          <span localeuidoupc="201" status="6" level="5" class="chn">“甲壳虫”(英国人用以指称一款圆头圆顶的原大众牌的小汽车)</span>

        </span>

      </span>

    </span>

  </span>

  <a name="beetle_pos_v"></a>

  <span eid="beetle_pos_v" id="beetle_pos_v" level="1" class="p-g">

    <span level="2" class="block-g">

      <span level="3" class="pos-g">

        <span level="4" pos="v" class="pos">verb</span>

      </span>

    </span>

    <span gr="i" level="2" class="gr">

      <span class="z_gr_br">[</span>intransitive

      <span class="z_gr_br">]</span>

    </span>

    <span eid="beetle_cf_1" id="beetle_cf_1" level="2" class="cf">+ adv./prep.</span>

    <span level="2" class="def-g">

      <span level="3" display="inline" class="label-g">(

        <span level="4" g="br" class="g">BrE</span>) (

        <span level="4" r="infml" class="r">informal</span>)

      </span>

      <span status="6" level="3" tranidoupc="3" class="d">to move somewhere quickly

        <span localeuidoupc="201" status="6" level="4" class="chn">快速移动</span>

      </span>

    </span>

    <span xt="syn" level="2" class="xr-g">

      <span class="symbols-synsym">SYN</span>

      <span eid="beetle_xr_2" id="beetle_xr_2" xt="syn" href="scurry_e" level="3" pos="v" class="xr">

        <span topic="scurry_e" fk="XXX" class="Ref">

          <span level="4" class="xh">

            <a href="entry://scurry">scurry</a>

          </span>

        </span>

      </span>

    </span>

    <span eid="beetle_xg_1" id="beetle_xg_1" level="2" class="x-g">

      <span class="symbols-xsym">◆</span>

      <span status="6" record="y" level="3" tranidoupc="4" class="x">I last saw him beetling off down the road.</span>

      <span localeuidoupc="201" status="6" level="3" class="tx">我上次见到他时,他正快步沿路而去。</span>

    </span>

  </span>

  <span class="pracpron">

    <span class="pron-g">

      <span type="h" class="wd">bee·tle™</span>

      <span type="h_full_" level="3" class="ei-g">

        <span class="z_ei-g">/</span>

        <span file="{gb}/b/be/bee/beetle#_gb_1.spx" level="4" wd="beetle" recdate="070514" class="phon-gb">ˈbiːtl</span>

        <a type="sound" topic="b/bee/beetl/beetle__gb_1.spx" resource="uk_pron" backup-class="Media" class="fayin" href="sound://uk/beetle__gb_1.spx"><img src="uk_pron.png" class="fayin"/></a>

        <span class="z">;

          <span class="z_phon-us">NAmE</span>

        </span>

        <span file="{gb}/b/be/bee/beetle#_us_1.spx" sup="y" level="4" wd="beetle" recdate="070514" class="phon-us">ˈbiːtl</span>

        <a type="sound" topic="b/bee/beetl/beetle__us_1.spx" resource="us_pron" backup-class="Media" class="fayin" href="sound://us/beetle__us_1.spx"><img src="us_pron.png" class="fayin"/></a>

        <span class="z_ei-g">/</span>

      </span>

    </span>

  </span>

</span>
'''
print(re.findall(r'(?:href|src)="([^"]+?\.(?:css|js|png|jpg))"', translation))



More information about the Python-list mailing list