A quick program to download TV listings

Tue Sep 16 00:52:43 EDT 2003

> Is there anyone out there that has written anything in python to
> download tv listings (no XML)? ... I wrote the test
> program below and it works.  I am just curious if anyone has a more
> robust python implementation before I take the time to add all the
> bells and whistles.

Here's a version that uses an HTMLParser and does its own filtering
by channel, rather than setting a cookie.

Some sample output:

% tvsearch "college football" news

College Football                          ABC  7     12:30 PM Sat Sep 13
College Football                          CBS  2     12:30 PM Sat Sep 13
Eyewitness News                           ABC  7      4:00 PM Sat Sep 13
CBS 2 News at 5:00                        CBS  2      5:00 PM Sat Sep 13
College Football                          ABC  7      5:00 PM Sat Sep 13
Channel 4 News                            NBC  4      5:00 PM Sat Sep 13
CBS Evening News                          CBS  2      5:30 PM Sat Sep 13

#!/usr/bin/env python

import formatter, re, time, urllib
from htmllib import HTMLParser

# Channel numbers to report on.  Leave the list empty to search every
# channel instead of filtering.
CHANNELS = [
    2, 3, 4, 5, 7, 9, 11, 13, 18, 22, 30, 32, 34, 35, 36, 39,
    40, 41, 42, 43, 44, 46, 50, 57, 62,
]

# Yahoo lineup code identifying the viewer's location.
LOCATION = 'us_CA57315'

# Template for a Yahoo TV grid page.  The %(epoch)d placeholder is filled in
# later with the grid's start time.
YAHOO_TV_URL = ''.join([
    'http://tv.yahoo.com/grid?lineup=',
    LOCATION,
    '&starttime=%(epoch)d&.intl=us',
])

class Show(object):

    '''Plain record describing one scheduled program.

    Fields are supplied as keyword arguments; only the names listed in
    __slots__ are accepted.  ``start`` is handed to time.localtime(), so it
    is a timestamp in seconds since the epoch.
    '''

    __slots__ = ('name', 'channel', 'station', 'start', 'end')

    def __init__(self, **kwargs):
        for field in kwargs:
            setattr(self, field, kwargs[field])

    def __str__(self):
        '''Return one listing line: name, station, channel, start time.'''
        stamp = time.strftime('%I:%M %p %a %b %d',
            time.localtime(self.start))
        # Swap a leading zero in the hour for a space ("07:00" -> " 7:00").
        if stamp.startswith('0'):
            stamp = ' ' + stamp[1:]
        columns = (self.name, self.station, self.channel, stamp)
        return '%-35s  %8s  %-4d  %-s' % columns

class YahooTVParser(HTMLParser):

    '''Minimal HTML parser for Yahoo TV listings.

    Feed it a grid page.  Every <A> element whose HREF links to a program
    page and carries the expected query parameters becomes a Show appended
    to ``self.shows``; the link text becomes the show's name.
    '''

    # A program link: its HREF points at the tvpdb program page.
    showRE = re.compile(r'/tvpdb\?d=tvp&id=(.*)')

    # Pulls the show details out of a program link.  Groups:
    #   1 program id, 2 station id, 3 station name, 4 channel number,
    #   5 start time (used as seconds since the epoch).
    # NOTE(review): the tail of this pattern was missing from the file (the
    # compile() call was never closed, yet start_a() reads groups 3-5).  The
    # chname/chnum/progutn parameter names are a reconstruction -- confirm
    # them against a real listing URL before relying on this.
    showInfoRE = re.compile(r'(\d*)&cf.*channels=us_([^&]*).*'
        r'chname=([^&]*).*chnum=(\d+).*progutn=(\d+)')

    def __init__(self):
        HTMLParser.__init__(self, formatter.NullFormatter())
        self.shows = []       # completed Show objects, in document order
        self.inShow = 0       # true while inside a recognised program link
        self.newShow = None   # Show being built for the current link
        self.showName = ''    # link text collected so far

    def start_a(self, attrs):  # <A> handler
        '''If the tag's HREF matches showRE, start recording a new show.'''
        self.newShow = None
        self.showName = ''
        self.inShow = 0

        # Check if the HREF matches a show.
        for k, v in attrs:
            if k != 'href':
                continue
            # The HREF value may contain newlines; remove them first.
            url = ''.join(v.split('\n'))
            if not self.showRE.search(url):
                continue
            m = self.showInfoRE.search(url)
            if m:
                # Create a new Show--its name isn't known yet.
                self.newShow = Show(start=float(m.group(5)),
                    channel=int(m.group(4)), station=m.group(3))
                self.inShow = 1

    def end_a(self):  # </A> handler
        '''If done with a show, record its name and add it to the list.'''
        if self.inShow and self.showName:
            self.newShow.name = self.showName
            self.shows.append(self.newShow)
        self.inShow = 0

    def handle_data(self, text):
        '''Handle the data between, e.g., <A> and </A> tags.'''
        if self.inShow:
            self.showName += text

def getGrid(epoch):
    '''Download one Yahoo TV grid page and return the shows found on it.

    epoch -- integer substituted into YAHOO_TV_URL as the grid start time.

    Returns the list of Show objects collected by YahooTVParser.  Errors
    raised while fetching or parsing the page propagate to the caller.
    '''
    url = YAHOO_TV_URL % {'epoch': epoch}
    parser = YahooTVParser()
    page = urllib.urlopen(url)
    try:
        parser.feed(page.read())
    finally:
        # Release the connection even if reading or parsing fails.
        page.close()
    parser.close()
    return parser.shows

def findShows(patterns):
    '''Print the coming week's shows that pass the active filters.

    patterns -- sequence of regular-expression strings, matched
        case-insensitively against show names; a show matches when any
        one of them is found in its name.  May be empty.

    When CHANNELS is non-empty, only shows on those channels are printed.
    With no patterns and an empty CHANNELS, every show is printed.
    Listings are fetched with getGrid() in three-hour steps covering one
    week from now, and each grid's matches are printed in order of start
    time.
    '''
    isMatchingShow = None
    if patterns:
        nameRE = re.compile('|'.join(['(%s)' % n for n in patterns]), re.I)
        if CHANNELS:
            def isMatchingShow(show):
                return (show.channel in CHANNELS
                    and nameRE.search(show.name) is not None)
        else:
            def isMatchingShow(show):
                return nameRE.search(show.name) is not None
    elif CHANNELS:
        def isMatchingShow(show):
            return show.channel in CHANNELS

    THREE_HOURS = 3 * 60 * 60
    ONE_WEEK = THREE_HOURS * 8 * 7  # eight three-hour steps a day, 7 days
    startTime = int(time.time())
    endTime = startTime + ONE_WEEK

    for h in range(startTime, endTime, THREE_HOURS):
        allShows = getGrid(h)
        if isMatchingShow is not None:
            shows = [s for s in allShows if isMatchingShow(s)]
        else:
            shows = list(allShows)
        # Print matching shows sorted by starting time.  Sorting on the
        # start time alone avoids ever comparing two Show objects.
        shows.sort(key=lambda s: s.start)
        for s in shows:
            # Parenthesised single argument prints the same on Python 2.
            print(s)

def main():
    '''Command-line entry point: search listings for the given patterns.

    Every command-line argument is passed to findShows() as a show-name
    pattern.  If -h is among the arguments, a usage message is written to
    stderr and the function returns without searching.  A
    KeyboardInterrupt during the search is swallowed so Ctrl-C exits
    quietly.
    '''
    import os.path, sys

    args = sys.argv[1:]
    if '-h' in args:
        sys.stderr.write("Usage: %s [PATTERN]...\n"
            % os.path.basename(sys.argv[0]))
        # Asking for help must not also start a search for "-h".
        return

    try:
        findShows(args)
    except KeyboardInterrupt:
        pass

# Run the search only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

