[XML-SIG] Parse MULTIPLE XML files in a directory
Stefan Behnel
stefan_ml at behnel.de
Fri Aug 10 11:35:38 CEST 2007
Hi,
why don't you use the code snippet I posted? It already solves most of your
problems. The rest should be easy for you to add.
Stefan
amitesh kumar wrote:
> I've updated my code to this extent:
>
> import sys
> import xml.parsers.expat
> import dircache
>
> rec = {}
> rec2 = {}
> oli = {}
> ordtagname = '*'
> recList = {}
> cnt = 0
> cnt2 = 0
>
> ordtags =
> set(['orrfnbr','orrfnbr','afidlog','orprtot','ortxtot','orshtot','orcpcur','orpstmp','orustmp','orappstat','orappdt'])
>
> shptags =
> set(['strfnbr','stprnbr','stvdnbr','stprice','stquant','stpstmp','stustmp','starwbll','stdspchstat','stlogistics','stentrydt','stcpprice','stlstprice'])
>
> omptags = set(['ompaymthd','ommaxaamt'])
>
> def start_element(name, attrs):
> global ordtagname, rec,recList,cnt,ordtags,rec2,cnt2
> if name in ordtags or name in shptags or name in omptags:
> ordtagname = name
> if name == 'shipto':
> rec[cnt2] = rec2
> if name == 'order':
> recList[cnt] = rec
> sys.stdout.flush()
>
> def end_element(name):
> global ordtagname, rec,recList,cnt,ordtags,rec2,cnt2
> if name in ordtags or name in shptags or name in omptags:
> ordtagname = ''
> if name == 'shipto':
> cnt2 = cnt2+1
> rec2 = {}
> #rec[cnt2] = rec2
> if name == 'order':
> cnt2 = 0
> #recList[cnt] = rec
> rec = {}
> cnt = cnt+1
> sys.stdout.flush()
>
> def char_data(data):
> global ordtagname, rec,recList,cnt,ordtags,rec2,cnt2
> if None != data:
> if ordtagname in ordtags or ordtagname in shptags or ordtagname
> in omptags:
> if ordtagname in shptags :
> rec2[repr(ordtagname).strip('u\'')] =
> repr(data).strip('u\'')
> else:
> rec[repr(ordtagname).strip('u\'')] = repr(data).strip('u\'')
> sys.stdout.flush()
>
> for f in iter(dircache.listdir('./xmls/')):
> #print f
> g=open('./xmls/'+f, 'r')
> p = xml.parsers.expat.ParserCreate()
> p.StartElementHandler = start_element
> p.CharacterDataHandler = char_data
> p.EndElementHandler = end_element
> p.ParseFile(g)
> g.close()
> print recList
>
> -----------
>
>
> Now I have to access the recList elements in an iterative manner and do
> further processing. Could you please help me with this?
>
> Current output is:
>
> {0: {0: {'stentrydt': 'null', 'stustmp': '2007-07-18 14:49:43.0',
> 'stlogistics': '7', 'stprnbr': '10197436', 'stlstprice': '284',
> 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4491691', 'stquant':
> '1', 'stprice': ' 284.0', 'stpstmp': '2007-07-18 14:49:43.0', 'stvdnbr':
> '4143', 'stcpprice': '221.52'}, 'orustmp': '2007-07-19 18:29:23.0', 2:
> {'stentrydt': 'null', 'stustmp': '2007-07-18 14:49: 44.0',
> 'stlogistics': '7', 'stprnbr': '10158532', 'stlstprice': '325',
> 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4491693', 'stquant':
> '1', 'stprice': ' 325.0', 'stpstmp': '2007-07-18 14:49:44.0', 'stvdnbr':
> '4285', 'stcpprice': '276.25'}, 'orappstat': '1', 4: {'stentrydt':
> 'null', 'stustmp': '2007-07-18 14:49: 44.0', 'stlogistics': '0',
> 'stprnbr': '10193438', 'stlstprice': '199', 'stdspchstat': '0',
> 'starwbll': 'null', 'strfnbr': '4491695', 'stquant': '1', 'stprice': '
> 129.0', 'stpstmp': '2007-07-18 14:49:44.0', 'stvdnbr': '956',
> 'stcpprice': '90.3'}, 3: {'stentrydt': 'null', 'stustmp': '2007-07-18
> 14:49:44.0 ', 'stlogistics': '7', 'stprnbr': '10092402', 'stlstprice':
> '199', 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4491694',
> 'stquant': '1', 'stprice': ' 189.0', 'stpstmp': '2007-07-18 14:49:44.0',
> 'stvdnbr': '4094', 'stcpprice': '151.2'}, 1: {'stentrydt': 'null',
> 'stustmp': '2007-07-18 14:49:43.0 ', 'stlogistics': '7', 'stprnbr':
> '10188562', 'stlstprice': '1299', 'stdspchstat': '0', 'starwbll': 'll',
> 'strfnbr': '4491692', 'stquant': '1', 'stprice': ' 909.0', 'stpstmp':
> '2007-07-18 14:49:43.0', 'stvdnbr': '3557', 'stcpprice': '727.2'},
> 'orpstmp': '2007-07-18 14:49:44.0', 'ompaymthd': 'ICI ', 'orappdt':
> '2007-07-19 18:29: 23.0', 'orshtot': '241.0', 'orcpcur': 'INR',
> 'ommaxaamt': '2077.0', 'orrfnbr': '3992187', 'orprtot': '1836.0',
> 'ortxtot': '0.0 '}, 1: {0: {'stentrydt': 'null', 'stustmp': '2007-07-19
> 22:52:14.0', 'stlogistics': '0', 'stprnbr': '1030470', 'stlstprice':
> '2475', 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4494126',
> 'stquant': '1', 'stprice': ' 2475.0', 'stpstmp': '2007-07-19
> 22:52:14.0', 'stvdnbr': '2179', 'stcpprice': '1750.0'}, 'orustmp':
> '2007-07-19 22:52:16.0', 'orappstat': '-1', 1: {'stentrydt': 'null',
> 'stustmp': '2007-07-19 22:52: 14.0', 'stlogistics': '0', 'stprnbr':
> '1048790', 'stlstprice': '2475', 'stdspchstat': '0', 'starwbll': 'null',
> 'strfnbr': '4494127', 'stquant': '1', 'stprice': ' 2475.0', 'stpstmp':
> '2007-07-19 22:52:14.0', 'stvdnbr': '2179', 'stcpprice': '0.0'},
> 'orpstmp': '2007-07-19 22:52:14.0', 'ompaymthd': 'MAST ', 'orappdt':
> 'null', 'orshtot': ' 0.0', 'orcpcur': 'INR', 'ommaxaamt': '4950.0',
> 'orrfnbr': '3994456', 'orprtot': '4950.0', 'ortxtot': '0.0'}, 2: {0:
> {'stentrydt': 'null', 'stustmp': '2007-07-19 23:05: 05.0',
> 'stlogistics': '0', 'stprnbr': '3539177', 'stlstprice': '1',
> 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4494139', 'stquant':
> '1', 'stprice': ' 500.0', 'stpstmp': '2007-07-19 23:05:05.0', 'stvdnbr':
> '4370', 'stcpprice': '465.0'}, 'orustmp': '2007-07-20 00:20:06.0',
> 'orappstat': '5', 'orpstmp': '2007-07-19 23:05: 05.0', 'ompaymthd':
> 'ICI ', 'orappdt': 'null', 'afidlog': 'Auction', 'orshtot': '0.0',
> 'orcpcur': 'INR', 'ommaxaamt': '500.0 ', 'orrfnbr': '3994466',
> 'orprtot': '500.0', 'ortxtot': '0.0'}, 3: {0: {'stentrydt': 'null',
> 'stustmp': '2007-07-19 23:38:56.0', 'stlogistics': '0', 'stprnbr':
> '2771831', 'stlstprice': '843', 'stdspchstat': '0', 'starwbll': 'null',
> 'strfnbr': '4494158', 'stquant': '1', 'stprice': ' 900.0', 'stpstmp':
> '2007-07-19 23:38:56.0', 'stvdnbr': '3991', 'stcpprice': '543.0'},
> 'orustmp': '2007-07-19 23:38:57.0', 'orappstat': '-1', 'orpstmp':
> '2007-07-19 23:38: 56.0', 'ompaymthd': 'AMEX ', 'orappdt': 'null',
> 'orshtot': '0.0', 'orcpcur': 'INR', 'ommaxaamt': '900.0', 'orrfnbr':
> '3994481', 'orprtot': ' 900.0', 'ortxtot': '0.0'}}
>
>
> Thanks,
>
> Amitesh
>
>
>
>
> On 8/10/07, *Stefan Behnel* < stefan_ml at behnel.de
> <mailto:stefan_ml at behnel.de>> wrote:
>
> Hi,
>
> first thing: don't use expat directly. Use (c)ElementTree's
> iterparse. It's in
> Python 2.5, but is also available as an external package for older
> Python
> versions. There's also lxml (which is mostly compatible with
> ElementTree), in
> case you ever need features like XPath, XSLT or whatever.
>
>
> amitesh kumar wrote:
> > Please review the following code and help me.
> >
> > Here I'm trying to :
> > 1. Read each XML file in a folder.
> > 2. Parse file.
> > 3. Store some of the tags values as key-value pair in a map
> > 4. Similarly maintain another collection that'll store one list
> > per file.
> >
> ------------------------------------------------------------------------
> >
> > ordtags = set()
> > shptags = set()
> > omptags = set()
> >
> > ordtags.add('orrfnbr')
> > ordtags.add('afidlog')
> [...]
>
> Better:
>
> ordtags = set(['orrfnbr', 'afidlog', ...])
>
> from xml.etree.cElementTree import iterparse
>
> for onefile in allfiles:
> for event, element in iterparse(onefile):
> if element.tag in ordtags:
> # do something like
> values[ element.tag] = element.text
> elif element.tag in shptags:
> # do something else
> else:
> # don't do anything?
> element.clear()
>
> Stefan
>
>
>
>
> --
> With Regards
> Amitesh K.
> 9850638640
More information about the XML-SIG
mailing list