[XML-SIG] Parse MULTIPLE XML files in a directory

Stefan Behnel stefan_ml at behnel.de
Fri Aug 10 11:35:38 CEST 2007


Hi,

Why don't you use the code snippet I posted? It already solves most of your
problems. The rest should be easy for you to add.

Stefan


amitesh kumar wrote:
> I've updated my code to this extent:
> 
> import sys
> import xml.parsers.expat
> import dircache
> 
> rec = {}
> rec2 = {}
> oli = {}
> ordtagname = '*'
> recList = {}
> cnt = 0
> cnt2 = 0
> 
> ordtags =
> set(['orrfnbr','orrfnbr','afidlog','orprtot','ortxtot','orshtot','orcpcur','orpstmp','orustmp','orappstat','orappdt'])
> 
> shptags =
> set(['strfnbr','stprnbr','stvdnbr','stprice','stquant','stpstmp','stustmp','starwbll','stdspchstat','stlogistics','stentrydt','stcpprice','stlstprice'])
> 
> omptags = set(['ompaymthd','ommaxaamt'])
> 
> def start_element(name, attrs):
>     global ordtagname, rec,recList,cnt,ordtags,rec2,cnt2
>     if name in ordtags or name in shptags or name in omptags:
>         ordtagname = name
>     if name == 'shipto':
>         rec[cnt2] = rec2
>     if name == 'order':
>         recList[cnt] = rec
>     sys.stdout.flush()
> 
> def end_element(name):
>     global ordtagname, rec,recList,cnt,ordtags,rec2,cnt2
>     if name in ordtags or name in shptags or name in omptags:
>         ordtagname = ''
>     if name == 'shipto':
>         cnt2 = cnt2+1
>         rec2 = {}
>         #rec[cnt2] = rec2
>     if name == 'order':
>         cnt2 = 0
>         #recList[cnt] = rec
>         rec = {}
>         cnt = cnt+1
>     sys.stdout.flush()
>    
> def char_data(data):
>     global ordtagname, rec,recList,cnt,ordtags,rec2,cnt2
>     if None != data:
>         if ordtagname in ordtags or ordtagname in shptags or ordtagname
> in omptags:
>             if ordtagname in shptags :
>                 rec2[repr(ordtagname).strip('u\'')] =
> repr(data).strip('u\'')
>             else:
>                 rec[repr(ordtagname).strip('u\'')] = repr(data).strip('u\'')
>     sys.stdout.flush()
> 
> for f in iter(dircache.listdir('./xmls/')):
>     #print f
>     g=open('./xmls/'+f, 'r')
>     p = xml.parsers.expat.ParserCreate()
>     p.StartElementHandler  = start_element
>     p.CharacterDataHandler = char_data
>     p.EndElementHandler    = end_element
>     p.ParseFile(g)
>     g.close()
> print recList           
>        
> -----------
> 
> 
> Now, I have to access the recList elements in an iterative manner and do
> further processing. Will you please help me in this effort?
> 
> Current output is:
> 
> {0: {0: {'stentrydt': 'null', 'stustmp': '2007-07-18 14:49:43.0',
> 'stlogistics': '7', 'stprnbr': '10197436', 'stlstprice': '284',
> 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4491691', 'stquant':
> '1', 'stprice': ' 284.0', 'stpstmp': '2007-07-18 14:49:43.0', 'stvdnbr':
> '4143', 'stcpprice': '221.52'}, 'orustmp': '2007-07-19 18:29:23.0', 2:
> {'stentrydt': 'null', 'stustmp': '2007-07-18 14:49: 44.0',
> 'stlogistics': '7', 'stprnbr': '10158532', 'stlstprice': '325',
> 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4491693', 'stquant':
> '1', 'stprice': ' 325.0', 'stpstmp': '2007-07-18 14:49:44.0', 'stvdnbr':
> '4285', 'stcpprice': '276.25'}, 'orappstat': '1', 4: {'stentrydt':
> 'null', 'stustmp': '2007-07-18 14:49: 44.0', 'stlogistics': '0',
> 'stprnbr': '10193438', 'stlstprice': '199', 'stdspchstat': '0',
> 'starwbll': 'null', 'strfnbr': '4491695', 'stquant': '1', 'stprice': '
> 129.0', 'stpstmp': '2007-07-18 14:49:44.0', 'stvdnbr': '956',
> 'stcpprice': '90.3'}, 3: {'stentrydt': 'null', 'stustmp': '2007-07-18
> 14:49:44.0 ', 'stlogistics': '7', 'stprnbr': '10092402', 'stlstprice':
> '199', 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4491694',
> 'stquant': '1', 'stprice': ' 189.0', 'stpstmp': '2007-07-18 14:49:44.0',
> 'stvdnbr': '4094', 'stcpprice': '151.2'}, 1: {'stentrydt': 'null',
> 'stustmp': '2007-07-18 14:49:43.0 ', 'stlogistics': '7', 'stprnbr':
> '10188562', 'stlstprice': '1299', 'stdspchstat': '0', 'starwbll': 'll',
> 'strfnbr': '4491692', 'stquant': '1', 'stprice': ' 909.0', 'stpstmp':
> '2007-07-18 14:49:43.0', 'stvdnbr': '3557', 'stcpprice': '727.2'},
> 'orpstmp': '2007-07-18 14:49:44.0', 'ompaymthd': 'ICI  ', 'orappdt':
> '2007-07-19 18:29: 23.0', 'orshtot': '241.0', 'orcpcur': 'INR',
> 'ommaxaamt': '2077.0', 'orrfnbr': '3992187', 'orprtot': '1836.0',
> 'ortxtot': '0.0 '}, 1: {0: {'stentrydt': 'null', 'stustmp': '2007-07-19
> 22:52:14.0', 'stlogistics': '0', 'stprnbr': '1030470', 'stlstprice':
> '2475', 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4494126',
> 'stquant': '1', 'stprice': ' 2475.0', 'stpstmp': '2007-07-19
> 22:52:14.0', 'stvdnbr': '2179', 'stcpprice': '1750.0'}, 'orustmp':
> '2007-07-19 22:52:16.0', 'orappstat': '-1', 1: {'stentrydt': 'null',
> 'stustmp': '2007-07-19 22:52: 14.0', 'stlogistics': '0', 'stprnbr':
> '1048790', 'stlstprice': '2475', 'stdspchstat': '0', 'starwbll': 'null',
> 'strfnbr': '4494127', 'stquant': '1', 'stprice': ' 2475.0', 'stpstmp':
> '2007-07-19 22:52:14.0', 'stvdnbr': '2179', 'stcpprice': '0.0'},
> 'orpstmp': '2007-07-19 22:52:14.0', 'ompaymthd': 'MAST ', 'orappdt':
> 'null', 'orshtot': ' 0.0', 'orcpcur': 'INR', 'ommaxaamt': '4950.0',
> 'orrfnbr': '3994456', 'orprtot': '4950.0', 'ortxtot': '0.0'}, 2: {0:
> {'stentrydt': 'null', 'stustmp': '2007-07-19 23:05: 05.0',
> 'stlogistics': '0', 'stprnbr': '3539177', 'stlstprice': '1',
> 'stdspchstat': '0', 'starwbll': 'null', 'strfnbr': '4494139', 'stquant':
> '1', 'stprice': ' 500.0', 'stpstmp': '2007-07-19 23:05:05.0', 'stvdnbr':
> '4370', 'stcpprice': '465.0'}, 'orustmp': '2007-07-20 00:20:06.0',
> 'orappstat': '5', 'orpstmp': '2007-07-19 23:05: 05.0', 'ompaymthd':
> 'ICI  ', 'orappdt': 'null', 'afidlog': 'Auction', 'orshtot': '0.0',
> 'orcpcur': 'INR', 'ommaxaamt': '500.0 ', 'orrfnbr': '3994466',
> 'orprtot': '500.0', 'ortxtot': '0.0'}, 3: {0: {'stentrydt': 'null',
> 'stustmp': '2007-07-19 23:38:56.0', 'stlogistics': '0', 'stprnbr':
> '2771831', 'stlstprice': '843', 'stdspchstat': '0', 'starwbll': 'null',
> 'strfnbr': '4494158', 'stquant': '1', 'stprice': ' 900.0', 'stpstmp':
> '2007-07-19 23:38:56.0', 'stvdnbr': '3991', 'stcpprice': '543.0'},
> 'orustmp': '2007-07-19 23:38:57.0', 'orappstat': '-1', 'orpstmp':
> '2007-07-19 23:38: 56.0', 'ompaymthd': 'AMEX ', 'orappdt': 'null',
> 'orshtot': '0.0', 'orcpcur': 'INR', 'ommaxaamt': '900.0', 'orrfnbr':
> '3994481', 'orprtot': ' 900.0', 'ortxtot': '0.0'}}
> 
> 
> Thanks,
> 
> Amitesh
>        
>    
> 
> 
> On 8/10/07, *Stefan Behnel* < stefan_ml at behnel.de
> <mailto:stefan_ml at behnel.de>> wrote:
> 
>     Hi,
> 
>     First thing: don't use expat directly. Use (c)ElementTree's
>     iterparse. It's in
>     Python 2.5, but is also available as an external package for older
>     Python
>     versions. There's also lxml (which is mostly compatible with
>     ElementTree), in
>     case you ever need features like XPath, XSLT or whatever.
> 
> 
>     amitesh kumar wrote:
>     > Please review the following code and help me.
>     >
>     > Here I'm trying to :
>     > 1. Read each XML file in a folder.
>     > 2. Parse file.
>     > 3. Store some of the tags values as key-value pair in a map
>     > 4. Similarly maintain another collection that'll store one list
>     per file.
>     >
>     ------------------------------------------------------------------------
>     >
>     > ordtags = set()
>     > shptags = set()
>     > omptags = set()
>     >
>     > ordtags.add('orrfnbr')
>     > ordtags.add('afidlog')
>     [...]
> 
>     Better:
> 
>         ordtags = set(['offfnbr', 'afidlog', ...])
> 
>         from xml.etree.cElementTree import iterparse
> 
>         for onefile in allfiles:
>             for event, element in iterparse(onefile):
>                 if element.tag in ordtags:
>                      # do something like
>                      values[ element.tag] = element.text
>                 elif element.tag in shptags:
>                      # do something else
>                 else:
>                      # don't do anything?
>                 element.clear()
> 
>     Stefan
> 
> 
> 
> 
> -- 
> With Regards
> Amitesh K.
> 9850638640


More information about the XML-SIG mailing list