Using XML w/ Python...

Steve Holden steve at holdenweb.com
Sun Dec 11 06:17:23 EST 2005


Jay wrote:
> Yes, I know. I did check out a couple, but I could never understand them.
> They were confusing for me, and I wasn't hoping for a full typed
> tutorial, just some help with exactly what I'm trying to do, not
> the whole module... but whatever. Thanks a lot for the feedback.
> 
Well, I don't want to hold this up as an example of best practice (it was
a quick hack to get some book graphics for my web site), but this
example shows how you can extract data from XML, in this case the
XML returned by Amazon's web services.

Sorry about any wrapping that mangles the code.

regards
  Steve

#!/usr/bin/python
#
# getbooks.py: download book details from Amazon.com
#
# hwBuild: database-driven web content management system
# Copyright (C) 2005 Steve Holden - steve at holdenweb.com
#
# This program is free software; you can redistribute it
# and/or modify it under the terms of the GNU General
# Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
# PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
#
import urllib
import urlparse
import os
import re
from xml.parsers import expat
from config import Config
# Directory that receives the downloaded pictures (and the saved XML reply).
picindir = os.path.join(Config['datadir'], "pybooks")
# Empty the directory before doing anything else: every entry currently in
# it is deleted as soon as this module is loaded.
for stale in os.listdir(picindir):
    os.remove(os.path.join(picindir, stale))

# Matches the run of digits at the start of an image file name.
filpat = re.compile(r"\d+")

class myParser:
    """Expat-based parser that downloads every <MediumImage><URL> it finds.

    The text of each <URL> element nested inside a <MediumImage> element is
    treated as an image address; the image is fetched and stored in
    ``picindir`` as "<digits>.jpg", where <digits> is the leading run of
    digits in the image's file name.  ``count`` holds the number of images
    stored so far.
    """

    def __init__(self):
        self.parser = expat.ParserCreate()
        self.parser.StartElementHandler = self.start_element
        self.parser.EndElementHandler = self.end_element
        self.parser.CharacterDataHandler = self.character_data
        # Parser state: 0 = outside <MediumImage>, 1 = inside <MediumImage>,
        # 2 = inside the <URL> child of a <MediumImage>.
        self.processing = 0
        # Text collected so far for the <URL> currently being read.
        self.imgname = ""
        self.count = 0

    def parse(self, f):
        """Parse the open file object *f* and return the images saved so far."""
        self.parser.ParseFile(f)
        return self.count

    def start_element(self, name, attrs):
        """Expat start-tag handler: track entry into <MediumImage> and <URL>."""
        if name == "MediumImage":
            self.processing = 1
            self.imgname = ""
        if self.processing == 1 and name == "URL":
            self.processing = 2

    def end_element(self, name):
        """Expat end-tag handler: fetch the image once its <URL> is complete."""
        if self.processing == 2 and name == "URL":
            self.processing = 1
            if self._save_image(self.imgname):
                self.count += 1
        if self.processing == 1 and name == "MediumImage":
            self.processing = 0

    def character_data(self, data):
        """Expat text handler: accumulate the URL text.

        Expat may deliver one text node in several pieces, hence the append.
        """
        if self.processing == 2:
            self.imgname += data

    def _save_image(self, url):
        """Download *url* into ``picindir``; return True if a file was written.

        Returns False, without touching the network, when the file name in
        *url* does not start with a digit: there is then nothing to name the
        output file after.
        """
        # A single pre-formatted argument prints the same text as the
        # two-argument print statement would.
        print("Getting: %s" % (url,))
        path = urlparse.urlparse(url)[2]
        itemno = filpat.match(os.path.basename(path))
        if itemno is None:
            # Previously this crashed with AttributeError on itemno.group().
            print("Skipping (no item number): %s" % (url,))
            return False
        u = urllib.urlopen(url)
        try:
            img = u.read()
        finally:
            u.close()
        outfile = open(os.path.join(picindir, "%s.jpg" % itemno.group()), "wb")
        try:
            outfile.write(img)
        finally:
            outfile.close()
        return True

def main(search=None):
    """Download the pictures for up to four pages of Amazon search results.

    search -- keywords to look for; when empty or None, the value of
              Config['book-search'] is used instead.

    Each page's XML reply is saved as "bookdata.txt" in ``picindir``
    (overwriting the previous page's copy) and then fed to a fresh myParser.
    Paging stops at the first page that produces no pictures.

    Returns the total number of pictures saved.
    """
    # A single pre-formatted argument prints the same text as the
    # two-argument print statement would.
    print("Search: %s" % (search,))
    url_template = ("http://webservices.amazon.com/onca/xml"
                    "?Service=AWSECommerceService"
                    "&AWSAccessKeyId=XXXXXXXXXXXXXXXXXXXX"
                    "&t=steveholden-20"
                    "&SearchIndex=Books"
                    "&Operation=ItemSearch"
                    "&Keywords=%s"
                    "&ItemPage=%d"
                    "&ResponseGroup=Images"
                    "&type=lite"
                    "&Version=2004-11-10"
                    "&f=xml")
    # The quoted keywords are the same for every page, so build them once.
    keywords = urllib.quote(search or Config['book-search'])
    fnam = os.path.join(picindir, "bookdata.txt")
    count = 0
    for pageNum in range(1, 5):
        reply = urllib.urlopen(url_template % (keywords, pageNum))
        try:
            xml = reply.read()
        finally:
            reply.close()
        # Keep a copy of the reply on disk, then parse that copy.
        out = open(fnam, "w")
        try:
            out.write(xml)
        finally:
            out.close()
        f = open(fnam, "r")
        try:
            n = myParser().parse(f)
        finally:
            f.close()
        if n == 0:
            break
        count += n
    return count


if __name__ == "__main__":
     import sys
     search = None
     if len(sys.argv) > 1:
         search = sys.argv[1]
     n = main(search)
     print "Pictures found:", n
-- 
Steve Holden       +44 150 684 7255  +1 800 494 3119
Holden Web LLC                     www.holdenweb.com
PyCon TX 2006                  www.python.org/pycon/




More information about the Python-list mailing list