[LARGO] Re: Problema con la codificación
Oscar Curero
oscar-listas en naiandei.net
Lun Oct 3 18:52:54 CEST 2005
El Dilluns, 3 de Octubre de 2005 18:35, Oscar Curero va escriure:
> Hola,
>
> El siguiente script me funciona bien si lo ejecuto desde la consola pero
> mal si lo ejecuto desde un programa de QT (mythtv, para más señas).
Vale, no envía adjuntos. Aquí va el script:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import getopt
# urlgrabber is a third-party package; without it nothing below can fetch a
# page, so stop immediately with a pointer to where it can be obtained.
try:
    from urlgrabber import urlopen
except ImportError:
    # Report on stderr: stdout is the channel the calling program parses
    # for results, so diagnostics must not be mixed into it.
    sys.stderr.write(
        "Error importing urlgrabber. Can't find urlgrabber.\n"
        " Get it from http://linux.duke.edu/projects/urlgrabber/\n")
    sys.exit(1)
import sgmllib # A simple SGML parser
import re # Regular Expressions module
class HtmlParser(sgmllib.SGMLParser):
"""A html parser."""
def __init__(self, action,verbose=0):
"""Initialise an object, passing 'verbose' to the superclass."""
sgmllib.SGMLParser.__init__(self, verbose)
self.action=action
self.entitydefs["ntilde"]="ñ"
self.entitydefs["oacute"]="ó"
self.__inside_b_element=0
self.__inside_a_element=0
self.__inside_font_element=0
self.__inside_td_element=0
if action=="fetch":
self.metadata={"title":"","year":"","plot":"","director":"","movierating":"","runtime":""}
self.__buffer={0:0,1:0,2:0}
self.__title=0
self.__plot=0
self.__year=0
self.__director=0
self.__rating=0
self.__runtime=0
elif action=="query":
self.__addResult=0
self.movieID=0
self.results={}
else:
self.poster=""
self.pattern="http://www.basecine.net/caratulas/"
self.__addPoster=0
def __title_handler(self,data):
if self.__title==1 and self.__inside_b_element and
self.__inside_font_element:
#Set title
self.metadata["title"]+=data
def __year_handler (self,data):
if self.__year==1:
#Set the year
self.metadata["year"]+=data
self.__buffer[0]=0
if self.__inside_b_element and (data=="A" or data=="ñ" or data=="o:"):
if data=="A":
self.__buffer[0]=1
else:
if data=="ñ" and self.__buffer[0]==1:
self.__buffer[0]+=1
if data=="o:" and self.__buffer[0]==2:
#Look for the year. Next data will be interesting...
self.__year=1
def __director_handler (self,data):
if self.__director==1:
#Set director
self.metadata["director"]+=data
self.__buffer[1]=0
if self.__inside_b_element and (data=="Direcci" or data=="ó" or
data=="n:"):
if data=="Direcci":
self.__buffer[1]=1
else:
if data=="ó" and self.__buffer[1]==1:
self.__buffer[1]+=1
if data=="n:" and self.__buffer[1]==2:
#Look for the director. Next data will be interesting...
self.__director=1
def __movierating_handler(self,data):
if self.__rating==1:
#Set rating
self.metadata["movierating"]+=data
if self.__inside_b_element and data=="Clasificacion:":
#Look for the director. Next data will be interesting...
self.__rating=1
def __runtime_handler(self,data):
if self.__runtime==1:
#Set runtime
self.metadata["runtime"]+=data
self.__buffer[2]=0
if self.__inside_b_element and (data=="Duraci" or data=="ó" or data=="n:"):
if data=="Duraci":
self.__buffer[2]=1
else:
if data=="ó" and self.__buffer[2]==1:
self.__buffer[2]+=1
if data=="n:" and self.__buffer[2]==2:
#Look for the runtime. Next data will be interesting...
self.__runtime=1
def __plot_handler(self,data):
if self.__plot==1:
#Set synopsis
self.metadata["plot"]+=data
if self.__inside_b_element and data=="Sinopsis:":
#Look for the plot text. Next data will be interesting...
self.__plot=1
def __query_handler(self,data):
"""unicode(data,'latin1').encode('utf8')"""
if self.__addResult==1 and self.movieID:
#Add result
self.results[self.movieID]=data
if self.__inside_td_element and self.__inside_font_element:
self.__addResult=1
def parse(self, input):
"""Parse the given data 'input'."""
self.feed(input)
self.close()
def start_b(self, attributes):
"""Process <b> html tags."""
self.__inside_b_element=1
def end_b(self):
"""Process <b> html tags."""
self.__inside_b_element = 0
def start_td(self, attributes):
"""Process <b> html tags."""
self.__inside_td_element=1
#In This part we ensure that the <td> tag is the one before the title <td
align="center">
for name,value in attributes:
if name=="align":
if value=="center":
self.__title=1
def end_td(self):
"""Process <b> html tags."""
self.__inside_td_element = 0
if self.action=="fetch":
if self.__title==1:
#We found the end of the title, close it
self.__title=0
def do_br(self, attributes):
"""Process <br> html tags."""
if self.action=="fetch":
if self.__year==1:
#We found the end of the year, close it
self.__year=0
if self.__rating==1:
#We found the end of the rating, close it
self.__rating=0
if self.__runtime==1:
#We found the end of the runtime, close it
self.__runtime=0
elif self.action=="poster":
self.__addPoster=1
def start_a(self, attributes):
"""Process <b> html tags."""
self.__inside_a_element=1
if self.action=="query":
if self.__addResult==1:
for name,value in attributes:
if name=="href":
self.movieID=value.replace("peli.php?id=","")
else:
pass
def end_a(self):
"""Process <b> html tags."""
self.__inside_a_element = 0
if self.action=="fetch":
if self.__director==1:
#We found the end of the director, close it
self.__director=0
elif self.action=="query":
if self.__addResult==1:
self.__addResult=0
self.movieID=0
def start_font(self,attributes):
"""Record the start of a <font> tag."""
self.__inside_font_element = 1
def end_font(self):
"""Record the end of a <font> tag."""
if self.action=="fetch":
if self.__plot==1:
#We found the end of the plot, close it
self.__plot=0
def do_img(self,attributes):
if self.action=="poster" and self.__addPoster==1 and self.poster=="":
for name,value in attributes:
if name=="src":
pattern=re.compile("^"+self.pattern)
if re.search(pattern,value):
#We got the poster
self.poster=value
def handle_data(self, stream):
"""Handler for the textual data.
Parameters:
- data: actual data passed to this method."""
if self.action=="fetch":
self.__title_handler(stream)
self.__year_handler(stream)
self.__director_handler(stream)
self.__movierating_handler(stream)
self.__runtime_handler(stream)
self.__plot_handler(stream)
elif self.action=="query":
self.__query_handler(stream)
class BaseCine:
    """Basecine query: retrieve information about movies, in Spanish,
    from the website www.basecine.net.

    ``action`` selects both the URL that is requested and what GetData()
    returns: "fetch" (movie metadata dict), "query" (dict of movie id ->
    title) or anything else (poster URL string).
    """

    def __init__(self, action="fetch", verbose=0):
        """Remember the action; ``verbose`` is accepted but not used."""
        self.__rawdata = ""
        self.action = action

    def __GetStream(self, args):
        """Fetch the page for ``args`` and return its raw content.

        For the "query" action ``args`` is a title (spaces become "+");
        for every other action it is a movie id.
        """
        if self.action == "query":
            baseUrl = "http://www.basecine.net/bilatu_peli1.php?titulo="
            # NOTE(review): only spaces are escaped; other reserved
            # characters in a title are sent as they are.
            args = args.replace(" ", "+")
        else:
            baseUrl = "http://www.basecine.net/peli.php?id="
        baseUrl = "%s%s" % (baseUrl, args)  # Base URL with parameters
        #print "Fetching " + baseUrl + " ..." #Enable for debug
        query = urlopen(baseUrl)
        try:
            rawData = query.read()
        finally:
            # Release the connection even when the read fails.
            query.close()
        return rawData

    def __RemoveBackslashes(self, input):
        """Private method to remove newline and tab characters."""
        return input.replace("\n", "").replace("\t", "")

    def __ParseData(self):
        """Private method to parse the stored raw data.

        Returns the parser attribute matching the action: ``metadata``
        for "fetch", ``results`` for "query", ``poster`` otherwise.
        """
        self.__rawdata = self.__RemoveBackslashes(self.__rawdata)
        parser = HtmlParser(self.action)
        parser.parse(self.__rawdata)
        if self.action == "fetch":
            return parser.metadata
        elif self.action == "query":
            return parser.results
        else:
            return parser.poster

    ####################### END PRIVATE METHODS ######################
    ######################## PUBLIC METHODS ########################

    def GetRawData(self, args):
        """Return the unparsed page for ``args``. Useful for debugging."""
        return self.__GetStream(args)

    def GetData(self, args):
        """GetData(args): retrieve the page for ``args`` from the web and
        parse it; the result type depends on the action."""
        self.__rawdata = self.__GetStream(args)
        return self.__ParseData()

    def GetTitle(self, data):
        """GetTitle(data): Get the title of the movie.

        The stored value is decoded as ISO-8859-1, so a unicode string is
        returned, stripped and capitalised.
        """
        return unicode(data["title"], "iso-8859-1").strip().capitalize()

    def GetDirector(self, data):
        """GetDirector(data): Get the director of the movie (stripped, not decoded)."""
        return data["director"].strip()

    def GetRuntime(self, data):
        """GetRuntime(data): Get the runtime of the movie.

        The stored value is decoded as ISO-8859-1, so a unicode string is
        returned, stripped.
        """
        return unicode(data["runtime"], "iso-8859-1").strip()

    def GetMovieRating(self, data):
        """GetMovieRating(data): Get the rating of the movie (stripped, not decoded)."""
        return data["movierating"].strip()

    def GetYear(self, data):
        """GetYear(data): Get the year of the movie (stripped, not decoded)."""
        return data["year"].strip()

    def GetPlot(self, data):
        """GetPlot(data): Get the plot of the movie (stripped, not decoded)."""
        return data["plot"].strip()

    def GetMovieList(self, movie):
        """GetMovieList(movie): Get the matches for ``movie``.

        Returns the dict produced by GetData() with every value
        capitalised.
        """
        result = self.GetData(movie)
        for movieID in list(result):
            result[movieID] = result[movieID].capitalize()
        return result

    def GetMoviePoster(self, movie):
        """GetMoviePoster(movie): Get the url image for the poster."""
        return self.GetData(movie)
# Program identification strings, shown by -v, -i and in the usage text.
ver = "v0.1"
version = "Basecine query " + ver + " by Oscar Curero <oscar en naiandei.net>"
info = "Performs queries using the www.basecine.net website."


def usage():
    """Print the version banner, the description and the option summary."""
    print(version + "\n" + info)
    # Adjacent literals are used instead of backslash continuations so the
    # emitted text does not depend on the indentation of this source.
    print("usage: ./basecine.py [OPTIONS]\n "
          "Available options are:\n "
          "\t-M --movie=QUERY\tGet movie list\n "
          "\t-P --poster=ID\t\tGet movie poster\n "
          "\t-D --data=ID\t\tGet movie data\n "
          "\t-h --help\t\tShow help\n "
          "\t-d \t\t\tShow debug info\n "
          "\t-r \t\t\tDump raw query data only\n "
          "\t-v \t\t\tShow version and exit\n "
          "\t-i \t\t\tShow info and exit")
def main(argv):
    """Command-line entry point.

    Parameters:
    - argv: the argument list without the program name.

    Parses the options, runs the selected action (-M/--movie query,
    -P/--poster, -D/--data fetch) and prints the result.  Exits with
    status 2 on an invalid option, 1 when no movie/id was given, and 0
    after -h, -i, -v or a raw dump (-r).
    """
    import sys

    global _debug

    def emit(text):
        """Print ``text``; unicode is encoded explicitly beforehand.

        When stdout is not a terminal (for instance when another program
        captures it) sys.stdout.encoding is None and printing unicode
        falls back to ASCII, failing on accented characters.  Encoding
        here keeps the console behaviour and uses UTF-8 otherwise.
        """
        if isinstance(text, unicode):
            encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
            text = text.encode(encoding, "replace")
        print(text)

    movie = ""
    rawquery = 0
    action = None
    try:
        parms, args = getopt.getopt(argv, "hdrviM:P:D:",
                                    ["help", "movie=", "poster=", "data="])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, value in parms:
        if opt in ("-h", "--help"):
            usage()
            sys.exit(0)
        elif opt == "-i":
            emit(info)
            sys.exit(0)
        elif opt == "-v":
            emit(version)
            sys.exit(0)
        elif opt == "-d":
            _debug = 1
        elif opt == "-r":
            rawquery = 1
        elif opt in ("-M", "--movie"):
            movie = value
            action = "query"
        elif opt in ("-P", "--poster"):
            movie = value
            action = "poster"
        elif opt in ("-D", "--data"):
            movie = value
            action = "fetch"
    if movie == "":
        # Nothing to look up: no -M/-P/-D given, or its value was empty.
        usage()
        sys.exit(1)
    query = BaseCine(action)
    if rawquery:
        emit(query.GetRawData(movie))
        sys.exit(0)
    if action == "fetch":
        data = query.GetData(movie)
        emit("Title:" + query.GetTitle(data))
        emit("Year:" + query.GetYear(data))
        emit("Runtime:" + query.GetRuntime(data))
        emit("Director:" + unicode(query.GetDirector(data), "iso-8859-15"))
        # The rating line must show the rating, not the director again.
        emit("MovieRating: " + query.GetMovieRating(data))
        emit("Plot:" + query.GetPlot(data))
    elif action == "query":
        data = query.GetMovieList(movie)
        if data:
            emit("\n".join(["%s:%s" % (movieID, movieName)
                            for movieID, movieName in data.items()]))
    elif action == "poster":
        emit(query.GetMoviePoster(movie))


if __name__ == "__main__":
    main(sys.argv[1:])
--
Oscar Curero - Linux user: 306877
-- GPG keyID: 0xE0EA0B24 --
--
Oscar Curero - Linux user: 306877
To send PERSONAL email, remove "-listas" from the address
-- GPG keyID: 0xE0EA0B24 --
Más información sobre la lista de distribución Python-es