how to detect the character encoding in a web page ?

iMath redstone-cold at 163.com
Sun Jun 9 07:47:02 EDT 2013


在 2012年12月24日星期一UTC+8上午8时34分47秒,iMath写道:
> how to detect the character encoding  in a web page ?
> 
> such as this page 
> 
> 
> 
> http://python.org/

Finally, I found that by using PyQt's QTextStream, QTextCodec and chardet, we can get a web page's source code more reliably,
even for this bad page:
http://www.qnwz.cn/html/yinlegushihui/magazine/2013/0524/425731.html 

this script 
http://www.flvxz.com/getFlv.php?url=aHR0cDojI3d3dy41Ni5jb20vdTk1L3ZfT1RFM05UYzBNakEuaHRtbA==

and this page, which has no charset declaration in its source code:
http://msdn.microsoft.com/en-us/library/bb802962(v=office.12).aspx


from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtNetwork  import *
import sys
import chardet

def slotSourceDownloaded(reply):
    """Decode and print the body of a finished network reply, then quit.

    Connected to ``QNetworkAccessManager.finished``.  The encoding is chosen
    in two steps: chardet guesses from the raw bytes, and that guess is
    handed to ``QTextCodec.codecForHtml`` as the default codec to use when
    the page itself does not yield one.

    Args:
        reply: the finished ``QNetworkReply``.

    The reply is always scheduled for deletion and the application is always
    asked to quit, including on a network error or an empty body; otherwise
    the event loop would keep running with nothing left to do.

    NOTE: HTTP redirects are not followed here; a redirect response with an
    empty body is reported as 'cannot find any resource !'.
    """
    try:
        if reply.error() != QNetworkReply.NoError:
            print('11111111', reply.errorString())
            return

        pageCode = reply.readAll()

        # chardet reports {'encoding': None} when it cannot make a guess,
        # and Qt may not know every name chardet can return, so guard both
        # cases and fall back to UTF-8 as the default codec.
        guessedName = chardet.detect(pageCode.data())['encoding']
        defaultCodec = QTextCodec.codecForName(guessedName) if guessedName else None
        if defaultCodec is None:
            defaultCodec = QTextCodec.codecForName('UTF-8')

        textStream = QTextStream(pageCode)
        textStream.setCodec(QTextCodec.codecForHtml(pageCode, defaultCodec))
        content = textStream.readAll()
        print(content)

        if content == '':
            print('---------', 'cannot find any resource !')
    finally:
        # Runs on every path so the script terminates instead of hanging.
        reply.deleteLater()
        qApp.quit()


if __name__ == '__main__':
    # Script entry point: ask for a URL, download it, and let
    # slotSourceDownloaded print the decoded page and stop the event loop.
    app = QCoreApplication(sys.argv)
    manager = QNetworkAccessManager()
    # Connect the handler before issuing the request so the finished signal
    # cannot be emitted while nothing is listening.
    manager.finished.connect(slotSourceDownloaded)

    url = input('input url :')
    # fromUserInput tolerates input such as "python.org" without a scheme.
    request = QNetworkRequest(QUrl.fromEncoded(QUrl.fromUserInput(url).toEncoded()))
    # Send a browser-like User-Agent with the request.
    request.setRawHeader("User-Agent" ,'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17 SE 2.X MetaSr 1.0')
    manager.get(request)

    # Must stay inside the guard: `app` only exists when run as a script,
    # so running the event loop at module level breaks `import`.
    sys.exit(app.exec_())



More information about the Python-list mailing list