A fun python CLI program for all to enjoy!

DFS nospam at dfs.com
Fri May 6 15:10:37 EDT 2016


getAddresses.py

Scrapes addresses from www.usdirectory.com and stores them in a SQLite 
database, or writes them to text files for mailing labels, etc

Now, just by typing 'fast food Taco Bell <city> <state> 10 db all' you can 
find out how many Taco Bells are within 10 miles of you, and store all the 
addresses in your own address database.

No more convoluted Googling, or hitting the 'Next Page' button, or 
fumbling with the Yellow Pages...

Note: the db structure is flat on purpose, and the .csv files aren't 
quote delimited.

Put the program in its own directory.  It creates the SQLite database 
there, and writes files there, too.

Reviews of code, bug reports, criticisms, suggestions for improvement, 
etc are all welcome.

Enjoy!



========================================================================
#getAddresses.py

import os, sys, requests, time, datetime
from lxml import html
import pyodbc, sqlite3, re


#show values of variables, HTML content, etc
#set it to False for short/concise program output
verbose = False
if verbose:  #no need to compare a bool to True
    print("The verbose setting is turned On.")
    print("")

	
#check if address is unique
#set (not list) so membership tests stay O(1) as results accumulate
addrCheck = set()


def addrUnique(addr):
    """Return True the first time addr is seen, False on repeats.

    Side effect: records addr in the module-level addrCheck set, so the
    check spans every results page of the current run.
    """
    if addr in addrCheck:
        return False
    addrCheck.add(addr)
    return True

	
#validate and parse command line
def showHelp():
    """Print CLI usage with examples, then exit with status 0."""
    print("")
    print(" Enter search word(s), city or zip, state, miles to search, txt or csv or db, # addresses to save (no commas)")
    print("")
    print(" eg: restaurant Knoxville TN 10 txt 50")
    print("     search for restaurants within 10 miles of Knoxville TN, and write")
    print("     the first 50 address to a txt file")
    print("")
    print(" eg: furniture 30303 GA 20 csv all")
    print("     search for furniture within 20 miles of zip 30303 GA,")
    print("     and write all results to a csv file")
    print("")
    print(" eg: boxing gyms Detroit MI 10 db 5")
    print("     search for boxing gyms within 10 miles of Detroit MI, and store")
    print("     the first 5 results in a database")
    print("")
    print(" All entries are case-insensitive (ie TX or tx are acceptable)")
    sys.exit(0)  #sys.exit instead of the site-injected exit() builtin

#validate and parse the command line
#layout: <keyword(s)...> <city-or-zip> <state> <miles> <store> <count|all>
argCnt = len(sys.argv)
if argCnt < 7:
    showHelp()  #too few arguments: show usage and exit
if verbose:
    print("")
    print(str(argCnt) + " arguments")

#keywords = everything before the last five positional arguments,
#joined with '+' for the usdirectory.com query string
keyw     = "+".join(sys.argv[1:argCnt - 5])  #eg restaurant, boxing+gym
cityzip  = sys.argv[argCnt - 5]   #eg Atlanta or 30339
state    = sys.argv[argCnt - 4]   #eg GA
miles    = sys.argv[argCnt - 3]   #eg 5,10,20,30,50 (website allows max 30)
store    = sys.argv[argCnt - 2]   #write address to file or database
addrWant = sys.argv[argCnt - 1]   #eg save All or number >0

#how many addresses to save: "all" (normalized) or a positive int
#(the original's elif/else after this check was unreachable dead code)
if addrWant.lower() == "all":
    addrWant = "all"
else:
    if not addrWant.isdigit() or addrWant == "0":
        showHelp()
    addrWant = int(addrWant)

if store not in ("csv", "txt", "db"):
    showHelp()


#begin timing the code
#NOTE(review): time.clock() is CPU time on Unix and was removed in Python
#3.8; kept because the matching endTime call at the bottom of the script
#also uses it — time.perf_counter() would be the modern replacement for both
startTime = time.clock()


#website, SQLite db, search string, current date/time for use with db
#'search' is the natural key used to find/replace prior identical searches
datasrc = "www.usdirectory.com"
dbName  = "addresses.sqlite"
search  = keyw + " " + str(cityzip) + " " + state + " " + str(miles) + " " + str(addrWant)
loaddt = datetime.datetime.now()


#write addresses to file
#each time the same search is done, the file is deleted and recreated
if store == "csv" or store == "txt":
    #csv will write in .csv format - header and 1 line per address
    #txt will write out 3 lines per address, then blank before next address
    webfile = "usdirectory.com_" + keyw + "_" + cityzip + "_" + state + "." + store
    #'with' guarantees the handle is closed (original called f.close
    #without parentheses, which never closed the file)
    with open(webfile, "w") as f:
        if store == "csv":
            f.write("Name,Address,CityStateZip\n")


#store addresses in database
cSQL = ""
if store == "db":
    #pyodbc with the SQLite3 ODBC driver; per the original author both this
    #and the plain sqlite3 module produce a db Access 2003 can't read
    conn = pyodbc.connect('Driver={SQLite3 ODBC Driver};Database=' + dbName)
    db   = conn.cursor()

    #flat table on purpose; the natural primary key rejects duplicate rows
    cSQL =  "CREATE TABLE If Not Exists ADDRESSES "
    cSQL += "(datasrc, search, category, name, street, city, state, zip, loaddt, "
    cSQL += "PRIMARY KEY (datasrc, search, name, street));"
    db.execute(cSQL)

    conn.commit()
    if verbose:
        print("connected to database: " + dbName)
        print(cSQL)
        print("created table: addresses")
        print("")
	
	
#echo the parsed search parameters when verbose is on
if verbose:
    print("Search summary")
    print("------------------------------")
    print("Keywords: " + keyw)
    print("City/Zip: " + cityzip)
    print("State   : " + state)
    print("Radius  : " + str(miles)    + " miles")
    print("Save    : " + str(addrWant) + " addresses to " + store)
    print("------------------------------")
    print("")
	
#build url query-string pieces (reassembled per page in the loop below)
wBase    = "http://www.usdirectory.com"
wForm    = "/ypr.aspx?fromform=qsearch"
wKeyw    = "&qhqn=" + keyw
wCityZip = "&qc="   + cityzip
wState   = "&qs="   + state
wDist    = "&rg="   + str(miles)
wSort    = "&sb=a2z"  #sort alpha
wPage    = "&ap="     #used with the results page number
webpage  = wBase + wForm + wKeyw + wCityZip + wState + wDist
if verbose:
    print("Search url: \n" + webpage)
    print("")


#delete previous results of identical search
if store == "db":
    #parameterized query: 'search' contains raw user input, so it must
    #never be spliced into the SQL text by string concatenation
    cSQL = "DELETE FROM addresses WHERE datasrc = ? AND search = ?;"
    if verbose:
        print(cSQL)
    db.execute(cSQL, (datasrc, search))
    conn.commit()

#query web server one results page at a time, saving as we go
print("searching...")
i = 0             #zero-based results-page counter
dupes = 0         #duplicate addresses skipped
addrReturned = 0  #addresses seen across all pages
addrSaved = 0     #addresses actually written/stored
while 1:
    webpage = wBase + wForm + wKeyw + wCityZip + wState + wDist + wSort + wPage + str(i + 1)
    page = requests.get(webpage)
    tree = html.fromstring(page.content)

    #no matches: remove the just-created empty output file, then quit
    #(the original called os.remove AFTER exit(0), so it never ran)
    matches = tree.xpath('//strong/text()')
    if i == 0 and "No results were found" in str(matches):
        print("No results found for that search")
        if store == "csv" or store == "txt":
            os.remove(webfile)
        exit(0)

    #parse number of addresses returned (first page only)
    #some searches return 2 items: ['Found N results', 'junk']
    #some searches return 3 items: ['Filter this search','Found N results','junk']
    if i == 0:
        match = tree.xpath('//div[@class="header_text"]/text()')
        if len(match) > 2:
            match.pop(0)  #drop the leading 'Filter this search' entry
        match = [int(s) for s in match[0].split() if s.isdigit()]
        addrFound = match[0]
        if addrWant != "all":
            addrWant = min(addrWant, addrFound)  #can't save more than exist
        print(str(addrFound) + " matches found (" + str(addrWant) + " will be saved)")
        print("")

    #split names, addresses into lists; an empty name list means the site
    #has no more pages for this search
    nms = tree.xpath('//span[@class="header_text3"]/text()')
    if len(nms) == 0:
        break
    addr   = tree.xpath('//span[@class="text3"]/text()')
    addr   = [t.replace("\r\n", "") for t in addr]
    addr   = [t.strip() for t in addr if t.strip()]   #drop blank fragments
    street = [s.split(',')[0] for s in addr]
    city   = [c.split(',')[1].strip() for c in addr]
    #renamed: original clobbered the scalar 'state' and shadowed builtin 'zip'
    #assumes each address ends "... ST 12345" — TODO confirm against the site
    states = [s[-8:][:2] for s in addr]
    zips   = [z[-5:] for z in addr]

    #get usdirectory.com categories (all-caps anchor text on the page)
    pattern  = re.compile(r"^[A-Z\s&,-]+$")
    category = [c.strip() for c in tree.xpath('//a/text()')]
    category = [c for c in category if c and pattern.match(c)]

    #screen feedback
    print("retrieving page " + str(i + 1) + ": " + str(len(nms)) + " addresses")
    if verbose:
        print("")
        print("Names: \n" + str(nms))
        print("")
        print("Addresses: \n" + str(addr))
        print("")
        print("Categories: \n" + str(category))
        print("----------------------------------------------------------------------------------------")
        print("")

    #data integrity check - every parsed list must have the same # of items
    lenData = [len(category), len(nms), len(addr), len(street), len(city), len(states), len(zips)]
    if len(set(lenData)) != 1:
        print("Data parsing issue.  One or more lists has an incorrect number of items.  Program will exit.")
        exit(0)
    if verbose:
        print("Verified: each list has " + str(len(nms)) + " items in it.")

    #write addresses to file
    if store == "txt" or store == "csv":
        addrList = []
        for j in range(len(nms)):
            if addrUnique(nms[j] + ' ' + street[j]):
                if store == "txt":
                    addrList.append(nms[j]+'\n'+street[j]+'\n'+city[j]+', '+states[j]+' '+zips[j]+'\n\n')
                if store == "csv":
                    addrList.append(nms[j]+',' +street[j]+',' +city[j]+' ' +states[j]+'  '+zips[j]+'\n')
                addrSaved += 1
            else:
                dupes += 1
                print(" * duplicate address found: " + nms[j] + ", " + street[j])
            addrReturned += 1
            if addrWant != "all" and addrSaved >= addrWant:
                break
        #'with' closes the handle (original f.close lacked parentheses)
        with open(webfile, "a") as f:
            for address in addrList:
                f.write(address)

    #write addresses to database
    if store == "db":
        cSQL = "INSERT INTO ADDRESSES VALUES (?,?,?,?,?,?,?,?,?)"  #(datasrc,search,category,name,street,city,state,zip,loaddt)
        for j in range(len(nms)):
            dupeRow = False
            Vals = datasrc, search, category[j], nms[j], street[j], city[j], states[j], zips[j], str(loaddt)
            if verbose:
                print(cSQL + ',' + str(Vals))
            try:
                db.execute(cSQL, Vals)
            except pyodbc.Error as programError:
                #any failed insert must not be counted as saved
                #(original silently counted non-UNIQUE failures as saved)
                dupeRow = True
                #'in' instead of .find() > 0, which missed a match at index 0
                if "UNIQUE constraint failed" in str(programError):
                    dupes += 1
                    print(" * duplicate address found: " + nms[j] + ", " + street[j])
                else:
                    print(" * insert failed: " + str(programError))
            addrReturned += 1
            if not dupeRow:
                addrSaved += 1
            if addrWant != "all" and addrSaved >= addrWant:
                break
        conn.commit()

    #stop when the site is exhausted or the requested count is reached
    #(split so no int-vs-"all" comparison is ever evaluated)
    if addrSaved >= addrFound:
        break
    if addrWant != "all" and addrSaved >= addrWant:
        break
    i += 1
    time.sleep(2)  #be polite to the server between page requests

	
#finish - report what was saved and where, and close db resources
if (store == "csv" or store == "txt"):
    print("\nFinished\nWrote " + str(addrSaved) + " addresses to file " + webfile)

elif store == "db":
    db.close()
    conn.close()
    print("\nFinished\nStored " + str(addrSaved) + " addresses in database: " + dbName)

if dupes > 0:
    print("(" + str(dupes) + " duplicate addresses ignored)")

#timer
#NOTE(review): time.clock() pairs with the startTime call above; it is CPU
#time on Unix and removed in Python 3.8 — time.perf_counter() is the
#modern replacement for both ends of this measurement
endTime = time.clock()
print("processing time: %.2g seconds" % (endTime - startTime))

#bug in www.usdirectory.com code: usually overreports matches by 1
if (addrWant == "all") and (addrReturned != addrFound):
    print("Note: " + datasrc + " reported " + str(addrFound) + " matches, but returned " + str(addrReturned))
========================================================================	





More information about the Python-list mailing list