xpath questions...

bruce bedouglas at earthlink.net
Sat Aug 23 09:31:31 EDT 2008


valid point...!!

Here's the test Python script, ugly as it is:

#!/usr/bin/python
#
# test.py
#
# scrapes/extracts the basic data for the college
#
#
# the app gets/stores
#  name
#  url
#  address (street/city/state)
#  phone
#
######################################################################3
#test python script
import re
import libxml2dom
import urllib
import urllib2
import sys, string
from  mechanize import Browser
import mechanize
#import tidy
import os.path
import cookielib
from libxml2dom import Node
from libxml2dom import NodeList
import subprocess
import MySQLdb
#import mysql_config
import time


########################
#
# Shared setup for the schedule-page test
# (the URLs below point at class-schedule sites, not pricegrabber.com)
########################


# Short aliases for the two urllib2 entry points.
Request = urllib2.Request
urlopen = urllib2.urlopen
##cj = urllib2.cookielib.LWPCookieJar()

# Two separate mechanize Browser instances.
br, br2 = Browser(), Browser()

# Sample request data: a user-agent string, a header dict carrying it,
# and a dict of example form values.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
values1 = dict(name='Michael Foord',
               location='Northampton',
               language='Python')

# Candidate start pages; the later assignment overrides the earlier one,
# so the psu.edu address is the value left in `url`.
url = "http://schedule.berkeley.edu/"
url = "http://schedule.psu.edu/"
#=======================================


if __name__ == "__main__":
# main app

	txdata = None

#----------------------------
# get the kentucky test pages

	#br.set_cookiejar(cj)
	br.set_handle_redirect(True)
	br.set_handle_referer(True)
	br.set_handle_robots(False)
	br.addheaders = [('User-Agent', 'Firefox')]


	#cnt is the page count for the master url
	murl=url

	print "url =",murl
	br.open(murl)
	#cj.save(COOKIEFILE)    # resave cookies

	res = br.response()  # this is a copy of response
	s = res.read()

	# s contains HTML not XML text
	d = libxml2dom.parseString(s, html=1)


 	#get the input/text dialogs
 	#tn1 = "//div[@id='main_content']/form[1]/input[position()=1]/@name"
	q="//img/parent::*/attribute::href"
	q="//form[@name='cos_search1']/@action"

	t1=d.xpath(q)
	print "href = ",t1
	print "hnode =",t1[0].nodeValue
	print "htest =",t1[0].textContent
	print "htesttt =",t1[0].toString()

	sys.exit()

thanks!!


-----Original Message-----
From: python-list-bounces+bedouglas=earthlink.net at python.org
[mailto:python-list-bounces+bedouglas=earthlink.net at python.org]On Behalf
Of Fredrik Lundh
Sent: Saturday, August 23, 2008 5:58 AM
To: python-list at python.org
Subject: Re: xpath questions...


bruce wrote:

> Regarding the xpath question I've posed, some have said that it shouldn't be
> here on the mailing list. Given that I'm writing the test scripts/apps in
> python, using the python libs, where else should it be posted?
>
> I mean, I could post the entire sample script so you can see that it's using
> python, but I simplified the issue.

there was zero Python content left after the simplification.  maybe you
should at least mention what library you're using to "play around with
xpath and the html dom" ?

</F>

--
http://mail.python.org/mailman/listinfo/python-list




More information about the Python-list mailing list