x=something, y=somethingelse and z=crud all likely to fail - how do i wrap them up

Sun Jan 31 04:21:29 EST 2016

On Sun, 31 Jan 2016 05:29 pm, Veek. M wrote:

> I'm using lxml.html

Hmmm. Well, I've never used lxml, but the first obvious problem I see is
that your lines:

description = li_item.find_class('vip')[0].text_content()

link = li_item.find_class('vip')[0].get('href')

price_dollar = li_item.find_class('lvprice prc')[0].xpath('span')[0].text

bids = li_item.find_class('lvformat')[0].xpath('span')[0].text

look suspiciously like a violation of the Law of Demeter.
("Talk to your dog, not to the dog's legs!") A long series of chained dot
accesses (or equivalent getitem, call, getitem, dot, etc) is a code-smell
suggesting that you are trying to control your dog's individual legs,
instead of just calling the dog.

But, I'll assume that this is part of the design of lxml, and so allowed. So
let's refactor by adding some helper methods and tidying the parse_page
method. This will also make it easier to test, refactor and maintain the
code, especially if the format of the HTML page changes.

    def extract(self, item, clsname, extractor, default="unknown"):
        """Apply *extractor* to the elements of *item* with class *clsname*.

        Args:
            item: Element exposing ``find_class(clsname)``.
            clsname: CSS class name to look for beneath *item*.
            extractor: Callable that receives the list of matching elements
                and returns the value of interest.
            default: Value returned when nothing can be extracted.

        Returns:
            The extractor's result, or *default* when no element carries
            *clsname* or the extractor indexes past the end of a result
            list.
        """
        # lxml.html's find_class() reports "no match" as an empty list (per
        # the lxml.html API) rather than by raising, so there is no
        # exception to catch around this call.
        matches = item.find_class(clsname)
        if not matches:
            return default
        try:
            return extractor(matches)
        except IndexError:
            # A nested lookup inside the extractor, such as
            # xpath('span')[0], came back empty.
            return default

    def get_time(self, clsname, default='No time found', item=None):
        """Return the time offset stored under class *clsname* of *item*.

        Args:
            clsname: CSS class name of the element wrapping the time span.
            default: Value returned when no time can be read.
            item: Element to search. It is passed explicitly because this
                method cannot see the loop variable of ``parse_page``.

        Returns:
            ``int(timems) / 1000 - time.time()`` for the ``timems``
            attribute of the first ``span`` child, or *default* when *item*
            is None or the attribute cannot be found.  Presumably
            ``timems`` is an end time in epoch milliseconds, making the
            result the seconds remaining -- TODO confirm against the page.
        """
        if item is None:
            return default

        def timems_of(matches):
            return matches[0].xpath('span')[0].get('timems')

        t = self.extract(item, clsname, timems_of, None)
        if t is None:
            return default
        return int(t)/1000 - time.time()

    def parse_page(self, root):
        """Print a one-line summary for every listing item under *root*.

        An item is any ``<li>`` whose ``id`` matches ``^item[a-z0-9]+$``.
        Each summary line is followed by a separator of 70 dashes.
        """
        for li_item in root.xpath(
                '//li[re:test(@id, "^item[a-z0-9]+$")]',
                namespaces={'re': "http://exslt.org/regular-expressions"}
                ):
            description = self.extract(li_item, 'vip',
                    lambda obj: obj[0].text_content(), "no description")
            link = self.extract(li_item, 'vip',
                    lambda obj: obj[0].get('href'))
            price_dollar = self.extract(li_item, 'lvprice prc',
                    lambda obj: obj[0].xpath('span')[0].text)
            bids = self.extract(li_item, 'lvformat',
                    lambda obj: obj[0].xpath('span')[0].text)
            # The current item must be handed over explicitly; get_time
            # has no access to this loop's variables.
            time_hrs = self.get_time('tme', item=li_item)
            shipping = self.extract(li_item, 'lvshipping',
                    lambda obj: obj[0].xpath(
                    'span/span/span')[0].text_content()
                    )
            print('{} {} {} {} {}'.format(
                    link, price_dollar, time_hrs, shipping, bids))
            print('-'*70)

#######################

If you prefer a more Java-style object-oriented solution:

    def get_class(self, item, clsname):
        """Return the elements of *item* with class *clsname*, or None.

        Args:
            item: Element exposing ``find_class(clsname)``.
            clsname: CSS class name to look for beneath *item*.

        Returns:
            The non-empty list of matching elements, or None when nothing
            matches.
        """
        # lxml.html's find_class() reports "no match" as an empty list (per
        # the lxml.html API) rather than by raising; map that to the None
        # this method documents.
        matches = item.find_class(clsname)
        return matches if matches else None

    def get_description(self, maybe_cls, default="unknown"):
        """Return the text content of the first element in *maybe_cls*.

        Args:
            maybe_cls: List of matching elements, or None.
            default: Value returned when there is no element to read.

        Returns:
            ``maybe_cls[0].text_content()``, or *default* when *maybe_cls*
            is None or empty.
        """
        # An empty match list is as much "not found" as None is.
        if not maybe_cls:
            return default
        return maybe_cls[0].text_content()

    def get_link(self, maybe_cls, tag='href', default='none'):
        """Return an attribute of the first element in *maybe_cls*.

        Args:
            maybe_cls: List of matching elements, or None.
            tag: Name of the attribute to read (despite the parameter
                name, it is passed to the element's ``get``).
            default: Value returned when there is no element to read.

        Returns:
            ``maybe_cls[0].get(tag)``, or *default* when *maybe_cls* is
            None or empty.
        """
        # An empty match list is as much "not found" as None is.
        if not maybe_cls:
            return default
        return maybe_cls[0].get(tag)

    def get_text(self, maybe_cls, default='unknown'):
        """Return the text of the first ``span`` child of the first match.

        Args:
            maybe_cls: List of matching elements, or None.
            default: Value returned when there is nothing to read.

        Returns:
            The ``text`` of the first ``span`` under ``maybe_cls[0]``, or
            *default* when *maybe_cls* is None or empty, or when that
            element has no ``span`` child.
        """
        if not maybe_cls:
            return default
        spans = maybe_cls[0].xpath('span')
        if not spans:
            return default
        return spans[0].text

    def get_time(self, maybe_cls, default='No time found'):
        """Return the time offset stored on the first match's ``span``.

        Args:
            maybe_cls: List of matching elements, or None.
            default: Value returned when no time can be read.

        Returns:
            ``int(timems) / 1000 - time.time()`` for the ``timems``
            attribute of the first ``span`` under ``maybe_cls[0]``, or
            *default* when *maybe_cls* is None or empty, there is no
            ``span`` child, or the attribute is absent.  Presumably
            ``timems`` is an end time in epoch milliseconds, making the
            result the seconds remaining -- TODO confirm against the page.
        """
        if not maybe_cls:
            return default
        spans = maybe_cls[0].xpath('span')
        if not spans:
            return default
        t = spans[0].get('timems')
        if t is None:
            # int(None) would raise TypeError; treat it as "no time".
            return default
        return int(t)/1000 - time.time()

    def get_shipping(self, maybe_cls, default='unknown shipping'):
        """Return the shipping text nested three spans below the first match.

        Args:
            maybe_cls: List of matching elements, or None.
            default: Value returned when there is nothing to read.

        Returns:
            The text content of the first ``span/span/span`` node under
            ``maybe_cls[0]``, or *default* when *maybe_cls* is None or
            empty, or when no such node exists.
        """
        if not maybe_cls:
            return default
        nodes = maybe_cls[0].xpath('span/span/span')
        if not nodes:
            return default
        return nodes[0].text_content()

    def parse_page(self, root):
        """Print a one-line summary for every listing item under *root*.

        An item is any ``<li>`` whose ``id`` matches ``^item[a-z0-9]+$``.
        Each field is looked up with ``get_class`` and converted by the
        matching ``get_*`` helper; each summary line is followed by a
        separator of 70 dashes.
        """
        for li_item in root.xpath(
                '//li[re:test(@id, "^item[a-z0-9]+$")]',
                namespaces={'re': "http://exslt.org/regular-expressions"}
                ):
            description = self.get_description(
                    self.get_class(li_item, 'vip'), "no description")
            link = self.get_link(self.get_class(li_item, 'vip'))
            price_dollar = self.get_text(
                    self.get_class(li_item, 'lvprice prc'))
            bids = self.get_text(
                    self.get_class(li_item, 'lvformat'))
            time_hrs = self.get_time(self.get_class(li_item, 'tme'))
            shipping = self.get_shipping(
                    self.get_class(li_item, 'lvshipping'))
            print('{} {} {} {} {}'.format(
                    link, price_dollar, time_hrs, shipping, bids))
            print('-'*70)

Obviously I haven't tested this code.

-- 
Steven