python - firefox dom/xpath question/issue

bruce bedouglas at earthlink.net
Mon Aug 25 16:03:06 EDT 2008


Hi.

I have a test web page that basically has two "<html>" tags in it. Examining
the page via the Firefox DOM Inspector, I can create a test XPath query
"/html/body/form" which gets the target form for the test.

The issue comes when I examine the page's source html. It looks like:
<html>
<body>
</body>
</html>

<html>
<body>
.
.
.
</body>
</html>

I've simplified things a bit... but basically, the 1st "html/body" is empty,
with the 2nd containing the data/nodes I need.

When using xpath("/html/body/form"), the app returns nothing/crashes. I've
also tried something like xpath("/html[position()=0]"), with no luck. It's
as if xpath only looks at the 1st html element that it sees in a given
page. I can't seem to find any docs for xpath to work around this. I'm using
libxml2dom with Python 2.5.1.

Any thoughts/comments...

If I comment out the 1st html section, things work as they should. The test
code is below...

thanks

------------------------------------------
#!/usr/bin/python
#
# test.py
#
# scrapes/extracts the basic data for the college
#
#
# the app gets/stores
#  name
#  url
#  address (street/city/state)
#  phone
#
######################################################################3
#test python script
import re
import libxml2dom
import urllib
import urllib2
import sys, string
from  mechanize import Browser
import mechanize
#import tidy
import os.path
import cookielib
from libxml2dom import Node
from libxml2dom import NodeList
import subprocess
import time

########################
#
# Parse schedule.psu.edu
########################
##cj = "p"
##COOKIEFILE = 'cookies.lwp'
#cookielib = 1


# Page requested by the test driver below.
url = "http://schedule.psu.edu/"

# Short aliases for the urllib2 entry points.
Request = urllib2.Request
urlopen = urllib2.urlopen

# Two mechanize Browser instances (constructed in the order br, br2).
br, br2 = Browser(), Browser()

# Form values and request headers; not referenced by the code visible here.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
values1 = dict(name='Michael Foord',
               location='Northampton',
               language='Python')
#=======================================


if __name__ == "__main__":
# main app

	txdata = None

#----------------------------

	##br.set_cookiejar(cj)
	br.set_handle_redirect(True)
	br.set_handle_referer(True)
	br.set_handle_robots(False)
	br.addheaders = [('User-Agent', 'Firefox')]

	print "url =",url
	#br.open(url)
	##cj.save(COOKIEFILE)    # resave cookies

	#res = br.response()  # this is a copy of response
	#s = res.read()
	#print "slen=",len(s)
	tfile = open("/college/psu1.dat")
	s = tfile.read()
	print s


	# s contains HTML not XML text
	d=[]
	d = libxml2dom.parseString(s, html=1)
	print "d",d

	name_=[]
	len_=0

	br.open(url)
	##cj.save(COOKIEFILE)    # resave cookies

	#res = br.response()  # this is a copy of response
	#s = res.read()
	print "slen=",len(s)

	# s contains HTML not XML text
	#d=[]
	#d = libxml2dom.parseString(s, html=1)
	#print "d",d

	#name_ = d.xpath("//form")
	name_ = d.xpath("/html/body/form")
	len_ = len(name_)
	print "len=",len_

	print "name1",name_
	print "len",len(name_)
	#print "sdlfs"
	sys.exit()
#	else:
#		print "err in form_ID"


	print "here..."





More information about the Python-list mailing list