[LARGO] Re: Problema con la codificación

Oscar Curero oscar-listas en naiandei.net
Lun Oct 3 18:52:54 CEST 2005


El Dilluns, 3 de Octubre de 2005 18:35, Oscar Curero va escriure:
> Hola,
>
> El siguiente script me funciona bien si lo ejecuto desde la consola pero
> mal si lo ejecuto desde un programa de QT (mythtv, para más señas).

Vale, no envía adjuntos. Aquí va el script:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import getopt
# urlgrabber is a third-party package: stop with a download hint when it is
# not installed instead of failing later with a NameError on urlopen.
try:
	from urlgrabber import urlopen
except ImportError:
	# The message is one logical string; it is split with implicit literal
	# concatenation so that no line-wrapping can break the literal again.
	print("Error importing urlgrabber. Can't find urlgrabber.\n Get it from "
		"http://linux.duke.edu/projects/urlgrabber/")
	sys.exit(1)
import sgmllib # A simple SGML parser 
import re # Regular Expressions module
class HtmlParser(sgmllib.SGMLParser):
	"""A html parser."""
	
	def __init__(self, action,verbose=0):
		"""Initialise an object, passing 'verbose' to the superclass."""
		sgmllib.SGMLParser.__init__(self, verbose)
		self.action=action
		self.entitydefs["ntilde"]="ñ"
		self.entitydefs["oacute"]="ó"
		self.__inside_b_element=0
		self.__inside_a_element=0
		self.__inside_font_element=0
		self.__inside_td_element=0
		if action=="fetch":
			
self.metadata={"title":"","year":"","plot":"","director":"","movierating":"","runtime":""}
			self.__buffer={0:0,1:0,2:0}
			self.__title=0
			self.__plot=0
			self.__year=0
			self.__director=0
			self.__rating=0
			self.__runtime=0
		elif action=="query":
			self.__addResult=0
			self.movieID=0
			self.results={}
		else:
			self.poster=""
			self.pattern="http://www.basecine.net/caratulas/"
			self.__addPoster=0
	def __title_handler(self,data):
		if self.__title==1 and self.__inside_b_element and 
self.__inside_font_element:
			#Set title
			self.metadata["title"]+=data
	def __year_handler (self,data):
		if self.__year==1:
			#Set the year
			self.metadata["year"]+=data
			self.__buffer[0]=0
		if self.__inside_b_element and (data=="A" or data=="ñ" or data=="o:"):
			if data=="A":
				self.__buffer[0]=1
			else:
				if data=="ñ" and self.__buffer[0]==1:
					self.__buffer[0]+=1
				if data=="o:" and self.__buffer[0]==2:
					#Look for the year. Next data will be interesting...
					self.__year=1
	def __director_handler (self,data):
		if self.__director==1:
			#Set director
			self.metadata["director"]+=data
			self.__buffer[1]=0
		if self.__inside_b_element and (data=="Direcci" or data=="ó" or  
data=="n:"):
			if data=="Direcci":
				self.__buffer[1]=1
			else:
				if data=="ó" and self.__buffer[1]==1:
					self.__buffer[1]+=1
				if data=="n:" and self.__buffer[1]==2:
					#Look for the director. Next data will be interesting...
					self.__director=1
	def __movierating_handler(self,data):
		if self.__rating==1:
			#Set rating
			self.metadata["movierating"]+=data
		if self.__inside_b_element and data=="Clasificacion:":
			#Look for the director. Next data will be interesting...
			self.__rating=1
	def __runtime_handler(self,data):
		if self.__runtime==1:
			#Set runtime
			self.metadata["runtime"]+=data
			self.__buffer[2]=0
		if self.__inside_b_element and (data=="Duraci" or data=="ó" or  data=="n:"):
			if data=="Duraci":
				self.__buffer[2]=1
			else:
				if data=="ó" and self.__buffer[2]==1:
					self.__buffer[2]+=1
				if data=="n:" and self.__buffer[2]==2:
					#Look for the runtime. Next data will be interesting...
					self.__runtime=1
	def __plot_handler(self,data):
		if self.__plot==1:
			#Set synopsis
			self.metadata["plot"]+=data
		if self.__inside_b_element and data=="Sinopsis:":
			#Look for the plot text. Next data will be interesting...
			self.__plot=1
	def __query_handler(self,data):
		"""unicode(data,'latin1').encode('utf8')"""
		if self.__addResult==1 and self.movieID:
			#Add result
			self.results[self.movieID]=data
		if self.__inside_td_element and self.__inside_font_element:
			self.__addResult=1
	def parse(self, input):
		"""Parse the given data 'input'."""
		self.feed(input)
		self.close()
	def start_b(self, attributes):
		"""Process <b> html tags."""
		self.__inside_b_element=1
	def end_b(self):
		"""Process <b> html tags."""
		self.__inside_b_element = 0
	def start_td(self, attributes):
		"""Process <b> html tags."""
		self.__inside_td_element=1
		#In This part we ensure that the <td> tag is the one before the title <td 
align="center"> 
		for name,value in attributes:
			if name=="align":
				if value=="center":
					self.__title=1
	def end_td(self):
		"""Process <b> html tags."""
		self.__inside_td_element = 0
                if self.action=="fetch":
                        if self.__title==1:
                                #We found the end of the title, close it
                                self.__title=0

	def do_br(self, attributes):
		"""Process <br> html tags."""
		if self.action=="fetch":
			if self.__year==1:
				#We found the end of the year, close it
				self.__year=0
			if self.__rating==1:
				#We found the end of the rating, close it
				self.__rating=0
			if self.__runtime==1:
				#We found the end of the runtime, close it
				self.__runtime=0
		elif self.action=="poster":
			self.__addPoster=1
	def start_a(self, attributes):
		"""Process <b> html tags."""
		self.__inside_a_element=1
		if self.action=="query":
			if self.__addResult==1:
				for name,value in attributes:
					if name=="href":
						self.movieID=value.replace("peli.php?id=","")
		else:
			pass
		
	def end_a(self):
		"""Process <b> html tags."""
		self.__inside_a_element = 0
		if self.action=="fetch":
			if self.__director==1:
				#We found the end of the director, close it
				self.__director=0
		elif self.action=="query":
			if self.__addResult==1:
				self.__addResult=0
				self.movieID=0		
	def start_font(self,attributes):
		"""Record the start of a <font> tag."""
		self.__inside_font_element = 1
	def end_font(self):
		"""Record the end of a <font> tag."""
		if self.action=="fetch":
			if self.__plot==1:
				#We found the end of the plot, close it
				self.__plot=0
	def do_img(self,attributes):
		if self.action=="poster" and self.__addPoster==1 and self.poster=="":
			for name,value in attributes:
				if name=="src":
					pattern=re.compile("^"+self.pattern)
					if re.search(pattern,value):
						#We got the poster
						self.poster=value
	def handle_data(self, stream):
		"""Handler for the textual data.
		Parameters:
		- data: actual data passed to this method."""
		if self.action=="fetch":
			self.__title_handler(stream)
			self.__year_handler(stream)
			self.__director_handler(stream)
			self.__movierating_handler(stream)
			self.__runtime_handler(stream)
			self.__plot_handler(stream)
		elif self.action=="query":
			self.__query_handler(stream)
class BaseCine:
	"""Basecine query. Retrieves information about movies in Spanish from the
	website www.basecine.net.

	The 'action' given to the constructor selects what GetData() returns:
	"fetch" gives the metadata dict of a movie id, "query" gives the dict of
	search results for a title, any other value gives the poster URL.
	"""
	def __init__(self,action="fetch",verbose=0):
		"""Remember 'action'; 'verbose' is accepted but not used."""
		self.__rawdata=""
		self.action=action
	def __GetStream(self,args):
		"""Fetch the page for 'args' from the site and return its raw data.

		In "query" mode 'args' is a title (spaces become "+"); otherwise it
		is a movie id."""
		if self.action=="query":
			baseUrl="http://www.basecine.net/bilatu_peli1.php?titulo="
			args=args.replace(" ","+")
		else:
			baseUrl="http://www.basecine.net/peli.php?id="
		baseUrl = "%s%s" % (baseUrl,args) #Base URL with parameters
		#print "Fetching " + baseUrl + "  ..." #Enable for debug
		query = urlopen(baseUrl) #Get http
		try:
			rawData = query.read()
		finally:
			# Release the connection even when reading fails.
			query.close()
		return rawData
	def __RemoveBackslashes(self,input):
		"""Private method to remove line feeds and tabs."""
		return input.replace("\n","").replace("\t","")
	def __ParseData(self):
		"""Private method to parse the stored raw data for the current action."""
		self.__rawdata=self.__RemoveBackslashes(self.__rawdata)
		parser = HtmlParser(self.action)
		parser.parse(self.__rawdata)
		if self.action=="fetch":
			return parser.metadata
		elif self.action=="query":
			return parser.results
		else:
			return parser.poster
	####################### END PRIVATE METHODS ######################
	########################  PUBLIC METHODS  ########################
	def GetRawData(self,args):
		"""Return the raw data of the operation. Useful for debugging."""
		return self.__GetStream(args)
	def GetData(self,args):
		"""GetData(args): Retrieve the data from the web and parse it.
		Use <args> as the title or movie id to look up."""
		self.__rawdata=self.__GetStream(args)
		return self.__ParseData()
	def GetTitle(self,data):
		"""GetTitle(data): Get the title of the movie, decoded from iso-8859-1."""
		return unicode(data["title"],"iso-8859-1").strip().capitalize()
	def GetDirector(self,data):
		"""GetDirector(data): Get the director of the movie (not decoded)."""
		return data["director"].strip()
	def GetRuntime(self,data):
		"""GetRuntime(data): Get the runtime of the movie, decoded from iso-8859-1."""
		return unicode(data["runtime"],"iso-8859-1").strip()
	def GetMovieRating(self,data):
		"""GetMovieRating(data): Get the rating of the movie (not decoded)."""
		return data["movierating"].strip()
	def GetYear(self,data):
		"""GetYear(data): Get the year of the movie (not decoded)."""
		return data["year"].strip()
	def GetPlot(self,data):
		"""GetPlot(data): Get the plot of the movie (not decoded)."""
		return data["plot"].strip()
	def GetMovieList(self,movie):
		"""GetMovieList(movie): Get the matches for a title, names capitalized."""
		result=self.GetData(movie)
		for movieID in result.keys():
			result[movieID]=result[movieID].capitalize()
		return result
	def GetMoviePoster(self,movie):
		"""GetMoviePoster(movie): Get the url image for the poster."""
		return self.GetData(movie)
# Release tag, one-line description and the banner built from the tag.
ver="v0.1"
info="Performs queries using the www.basecine.net website."
version="Basecine query %s by Oscar Curero <oscar en naiandei.net>" % ver

def usage():
	"""Print the version banner, the description and the command-line help."""
	print("%s\n%s" % (version, info))
	# Every line after the first is preceded by a single space in the output.
	help_lines=(
		"usage: ./basecine.py [OPTIONS]",
		"Available options are:",
		"\t-M  --movie=QUERY\tGet movie list",
		"\t-P  --poster=ID\t\tGet movie poster",
		"\t-D  --data=ID\t\tGet movie data",
		"\t-h  --help\t\tShow help",
		"\t-d \t\t\tShow debug info",
		"\t-r  \t\t\tDump raw query data only",
		"\t-v  \t\t\tShow version and exit",
		"\t-i  \t\t\tShow info and exit",
	)
	print("\n ".join(help_lines))
def _stdout_text(text):
	"""Return 'text' as something that can always be printed.

	Unicode values are encoded explicitly. When stdout is not a terminal
	(for instance when another program launches the script) its encoding
	is unset, and printing accented unicode text would raise
	UnicodeEncodeError. Byte strings are returned untouched.
	"""
	if isinstance(text,unicode):
		# NOTE(review): UTF-8 is assumed as the fallback - confirm that this
		# is what the calling program expects.
		encoding=getattr(sys.stdout,"encoding",None) or "utf-8"
		return text.encode(encoding,"replace")
	return text

def main(argv):
	"""Command-line entry point.

	Parses 'argv' (the arguments without the program name), then either
	lists the movies matching a title (-M), prints the poster URL of a
	movie id (-P) or prints the details of a movie id (-D). With -r the
	raw page is dumped instead. Exits with status 2 on bad options and 1
	when no movie/id was given.
	"""
	global _debug
	movie=""
	action=None
	rawquery=0
	try:
		parms,args = getopt.getopt(argv, "hdrviM:P:D:", ["help", "movie=","poster=","data="])
	except getopt.GetoptError:
		usage()
		sys.exit(2)
	for opt,value in parms:
		if opt in ("-h","--help"):
			usage()
			sys.exit(0)
		elif opt=="-i":
			print(info)
			sys.exit(0)
		elif opt=="-v":
			print(version)
			sys.exit(0)
		elif opt=="-d":
			_debug=1
		elif opt=="-r":
			rawquery=1
		elif opt in ("-M","--movie"):
			movie=value
			action="query"
		elif opt in ("-P","--poster"):
			movie=value
			action="poster"
		elif opt in ("-D","--data"):
			movie=value
			action="fetch"
	if movie=="":
		# No title or id to work on (this also covers a missing action).
		usage()
		sys.exit(1)
	query= BaseCine(action)
	if rawquery:
		print(query.GetRawData(movie))
		sys.exit(0)
	if action=="fetch":
		data=query.GetData(movie)
		# Title, runtime and director are unicode; the other fields are the
		# byte strings taken from the page and are printed unchanged.
		print(_stdout_text("Title:" + query.GetTitle(data)))
		print(_stdout_text("Year:" + query.GetYear(data)))
		print(_stdout_text("Runtime:" + query.GetRuntime(data)))
		print(_stdout_text("Director:" + unicode(query.GetDirector(data),"iso-8859-15")))
		# This line used to print the director a second time.
		print(_stdout_text("MovieRating: " + query.GetMovieRating(data)))
		print(_stdout_text("Plot:" + query.GetPlot(data)))
	elif action=="query":
		data=query.GetMovieList(movie)
		if data:
			print("\n".join(["%s:%s" % (movieID, movieName) for movieID, movieName in data.items()]))
	elif action=="poster":
		print(query.GetMoviePoster(movie))
if __name__ =="__main__":
	main(sys.argv[1:])



-- 
Oscar Curero - Linux user: 306877
--    GPG keyID: 0xE0EA0B24    -- 

-- 
Oscar Curero - Linux user: 306877
To send  PERSONAL email, remove "-listas" from the address
--    GPG keyID: 0xE0EA0B24    -- 




Más información sobre la lista de distribución Python-es