[XML-SIG] HTML parse error

sharifah ummu kulthum kulthum91 at gmail.com
Tue Feb 23 04:45:36 CET 2010


On Mon, Feb 22, 2010 at 10:46 PM, Stefan Behnel <stefan_ml at behnel.de> wrote:

> sharifah ummu kulthum, 22.02.2010 14:24:
> >   File "grabmy.py", line 63, in get_html
> >     return BeautifulSoup(content)
> >   File "build/bdist.linux-i686/egg/BeautifulSoup.py", line 1499, in
> __init__
> >   File "build/bdist.linux-i686/egg/BeautifulSoup.py", line 1230, in
> __init__
> >   File "build/bdist.linux-i686/egg/BeautifulSoup.py", line 1263, in _feed
> >   File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed
> >     self.goahead(0)
> >   File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead
> >     k = self.parse_starttag(i)
> >   File "/usr/lib/python2.6/HTMLParser.py", line 226, in parse_starttag
> >     endpos = self.check_for_whole_start_tag(i)
> >   File "/usr/lib/python2.6/HTMLParser.py", line 301, in
> > check_for_whole_start_tag
> >     self.error("malformed start tag")
> >   File "/usr/lib/python2.6/HTMLParser.py", line 115, in error
> >     raise HTMLParseError(message, self.getpos())
> > HTMLParser.HTMLParseError: malformed start tag, at line 830, column 36
>
> Just noticed this now - you seem to be using BeautifulSoup, likely version
> 3.1. This version does not support parsing broken HTML very well, so use
> version 3.0.8 instead, or switch to the tools I indicated.
>
> Note that switching tools means that you need to change your code to use
> them. Just installing them is not enough.
>
> Stefan
>
>
I am so sorry, but I really don't know how to change the code, as I have only
just learned Python. How do I switch the version or change the code? I don't
really understand the code.

Here is the code:

'''
Copyright (c) 2008  Yap Sok Ann <sayap at sayap.com>

This module contains xmltv grabbers for Malaysia channels.
'''

__author__ = 'Yap Sok Ann <sayap at sayap.com>'
__license__ = 'PSF License'

import logging

from datetime import date as dt
from datetime import datetime, time, timedelta
from dateutil.tz import tzlocal
from httplib2 import Http
from lxml import etree
from urllib import urlencode
from BeautifulSoup import BeautifulSoup

# Channel identifiers to grab; also used as the xmltv 'channel' attribute.
channels = ['rtm1', 'rtm2', 'tv3', 'ntv7', '8tv', 'tv9']

# strftime format for the xmltv programme start/stop attributes.
datetime_format = '%Y%m%d%H%M%S %z'

# Shared httplib2 client; with this flag set, transport errors come back as
# status codes on the response instead of raised exceptions.
h = Http()
h.force_exception_to_status_code = True
#h.timeout = 15

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)-8s %(process)d %(message)s',
)
log = logging.getLogger(__name__)

def strclean(s):
    '''Normalize a scraped table-cell string.

    Trims surrounding whitespace and converts the HTML curly-quote
    entities to plain apostrophes.  Returns the cleaned string, or
    None when the cell held only a '&nbsp;' placeholder.
    '''
    cleaned = s.strip()
    for entity in ('&lsquo;', '&rsquo;'):
        cleaned = cleaned.replace(entity, '\'')
    if cleaned == '&nbsp;':
        return None
    return cleaned

class Grabber(object):
    '''Base class for a single-channel xmltv grabber.

    Subclasses provide ``base_url`` and implement ``qs_params`` and
    ``_parse_html``.  The public entry point is ``grab``, which fetches,
    parses and converts one day's schedule into xmltv <programme> elements.
    '''

    # Subclasses set this to the channel listing URL (or a URL template).
    base_url = None

    def __init__(self, channel):
        self.channel = channel
        self.url = self.base_url

    def qs_params(self, date, **kwargs):
        '''Returns a dict of params to form the url's query string
        '''
        raise NotImplementedError

    def _parse_html(self, date, html):
        '''Returns a list of dicts with the following keys:
        - mandatory: title, start
        - optional: stop, sub_title, desc, episode_number, episode_system
        '''
        raise NotImplementedError

    def get_html(self, date, **kwargs):
        '''Fetch the listing page for `date` and return it parsed with
        BeautifulSoup, or None (with the failure logged) on a non-200
        response.
        '''
        params = self.qs_params(date, **kwargs)
        response, content = h.request(self.url + '?' + urlencode(params))
        if response.status == 200:
            return BeautifulSoup(content)
        log.error('Status: %s\nContent: %s' % (response.status, content))

    def parse_html(self, date, html):
        '''Yield schedule dicts that have both `start` and `stop`.

        A schedule without a `stop` time borrows the next schedule's
        `start`; consequently a trailing schedule with no `stop` of its
        own is dropped.
        '''
        prev_schedule = None
        try:
            for schedule in self._parse_html(date, html):
                if 'stop' in schedule:
                    yield schedule
                elif prev_schedule:
                    prev_schedule['stop'] = schedule['start']
                    yield prev_schedule
                prev_schedule = schedule
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception, still logged.
        except Exception:
            log.exception('Cannot parse html for date %s' % date)

    def to_xml(self, schedules):
        '''Convert schedule dicts into xmltv <programme> etree elements.'''
        for schedule in schedules:
            program = etree.Element('programme', channel=self.channel,
                start=schedule['start'].strftime(datetime_format),
                stop=schedule['stop'].strftime(datetime_format))

            title = etree.SubElement(program, 'title')
            title.text = schedule['title']

            if schedule.get('episode_num'):
                episode_num = etree.SubElement(program, 'episode-num')
                episode_num.set('system', schedule.get('episode_system'))
                episode_num.text = schedule['episode_num']

            # xmltv uses dashes in element names where the dicts use
            # underscores.
            for field in ['sub_title', 'desc']:
                if schedule.get(field):
                    elem = etree.SubElement(program, field.replace('_', '-'))
                    elem.text = schedule[field]

            yield program

    def grab(self, date, **kwargs):
        '''Fetch and parse one day's schedule.

        Returns a generator of <programme> elements, or None when the
        page could not be fetched.
        '''
        html = self.get_html(date, **kwargs)
        if html:
            return self.to_xml(self.parse_html(date, html))

class Astro(Grabber):
    '''Grabber for the Astro channel listing pages.'''

    base_url = 'http://www.astro.com.my/channels/%(channel)s/Default.asp'
    # Each day's listing is served in two batches.
    params_dicts = [dict(batch=1),
                    dict(batch=2)]
    # Placeholder rows that are not real programmes.
    ignores = ['No Transmission', 'Transmission Ends']

    def __init__(self, channel):
        self.channel = channel
        self.url = self.base_url % dict(channel=channel)

    def qs_params(self, date, **kwargs):
        '''Add the site's date query parameter, e.g. '23-Feb-2010'.'''
        kwargs['sDate'] = date.strftime('%d-%b-%Y')
        return kwargs

    def _parse_html(self, date, html):
        '''Yield title/start/stop dicts from the listing table rows.'''
        header_row = html.find('tr', bgcolor='#29487F')
        for row in header_row.fetchNextSiblings('tr'):
            cells = row.findChildren('td')

            title = strclean(cells[1].find('a').string)
            if title in self.ignores:
                continue

            # First cell is the start time, '21:00' -> 9 PM local time.
            start_hour, start_minute = map(int, cells[0].string.split(':'))
            start = datetime.combine(
                date, time(start_hour, start_minute, tzinfo=tzlocal()))

            # Third cell is the duration, '00:30' -> 30 minutes.
            dur_hours, dur_minutes = map(int, cells[2].string.split(':'))
            stop = start + timedelta(hours=dur_hours, minutes=dur_minutes)

            yield dict(title=title, start=start, stop=stop)

class TheStar(Grabber):
    '''Grabber for the Star eCentral TV guide pages.'''

    base_url = 'http://star-ecentral.com/tvnradio/tvguide/guide.asp'
    params_dicts = [dict(db='live')]

    def qs_params(self, date, **kwargs):
        '''Add the site's date and channel query parameters.

        The site names the RTM channels with a 'tv' prefix, so 'rtm' is
        mapped to 'tv' here.
        '''
        kwargs['pdate'] = date.strftime('%m/%d/%Y')
        kwargs['chn'] = self.channel.replace('rtm', 'tv')
        return kwargs

    def _parse_html(self, date, html):
        '''Yield schedule dicts (title/desc/episode/start) from the table.

        Rows carry no stop time; Grabber.parse_html infers each stop from
        the following row's start.
        '''
        last_ampm = None
        header_row = html.find('tr', bgcolor='#5e789c')
        for tr in header_row.fetchNextSiblings('tr'):
            tds = tr.findChildren('td')

            schedule = {}
            # This assignment was split across two lines (with no
            # continuation) by mail wrapping in the original paste,
            # which is a SyntaxError; rejoined here.
            schedule['title'] = strclean(tds[1].find('b').find('font').string)
            schedule['desc'] = strclean(tds[2].find('font').string)

            episode_num = strclean(tds[3].find('font').string)
            if episode_num:
                try:
                    # Numeric episodes become zero-based xmltv_ns numbers;
                    # anything non-numeric is passed through as 'onscreen'.
                    episode_num = int(episode_num) - 1
                    episode_num = '.' + str(episode_num) + '.'
                    episode_system = 'xmltv_ns'
                except ValueError:
                    episode_system = 'onscreen'
                schedule['episode_num'] = episode_num
                schedule['episode_system'] = episode_system

            # start time, '9.00pm' -> 9 PM
            time_str = tds[0].find('font').string
            ampm = time_str[-2:]
            hour, minute = [int(x) for x in time_str[:-2].split('.')]
            if ampm == 'pm' and hour < 12:
                hour += 12
            elif ampm == 'am' and hour == 12:
                hour = 0

            # Crossing from pm back to am means the listing rolled past
            # midnight into the next day.
            if last_ampm == 'pm' and ampm == 'am':
                date = date + timedelta(1)
            schedule['start'] = datetime.combine(
                date, time(hour, minute, tzinfo=tzlocal()))
            last_ampm = ampm

            yield schedule

def main():
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-s', '--source', dest='source',
        help='SOURCE to grab from: Astro, TheStar. Default: TheStar')
    parser.add_option('-d', '--date', dest='date',
        help='Start DATE to grab schedules for (YYYY-MM-DD). Default:
today')
    parser.add_option('-n', '--days', dest='days',
        help='Number of DAYS to grab schedules for. Default: 1')
    parser.add_option('-f', '--file', dest='filename', metavar='FILE',
        help='Output FILE to write to. Default: stdout')

    options, args = parser.parse_args()

    if options.source is None:
        cls = TheStar
    else:
        cls = globals()[options.source]

    if options.date is None:
        date = dt.today()
    else:
        date = dt(*[int(x) for x in options.date.split('-')])

    if options.days is None:
        days = 1
    else:
        days = int(options.days)

    root = etree.Element('tv')

    for channel in channels:
        grabber = cls(channel)
        for i in range(days):
            for params_dict in cls.params_dicts:
                for elem in grabber.grab(date + timedelta(i),
**params_dict):
                    root.append(elem)

    xml = etree.tostring(root, encoding='UTF-8', xml_declaration=True,
                         pretty_print=True)
    if options.filename is None:
        print xml
    else:
        open(options.filename, 'w').write(xml)

if __name__ == '__main__':
    main()
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/xml-sig/attachments/20100223/d3aa2fd4/attachment-0001.html>


More information about the XML-SIG mailing list