searching and storing large quantities of xml!

dads wayne.dads.bell at gmail.com
Mon Jan 18 16:39:14 EST 2010


Thanks all, I took your advice and have been playing all weekend, which
has been great fun. ElementTree is awesome. I created a script that
organises the xml, as the files are in year blocks and I didn't realise
the required xml is mixed up with other xml. Plus the volumes are much
greater than I realised: I checked now that I'm back at work and it is
something like 600,000 files in a year, just over a gig for each
year.

This week I'm hoping to add zipping up of the files, plus extracting the
required info and putting it in a db. The script has been completely
overhauled: originally I used the modified date, now it gets the date
from the parsed xml, which is safer. The code is below, but a word of
caution: it's hobbyist code so it'll probably make your eyes bleed =).
Thanks again:

There was one thing that I forgot to ask about: when ElementTree fails
to parse because an element is not closed, why doesn't it close the
file-like object? Because of this, later on I would get 'WindowsError:
[Error 32] ...file being used by other process' when using
shutil.move(). I got round this by using a 'try except' block.

from __future__ import print_function
import xml.etree.cElementTree as ET
import calendar
import zipfile
import os.path
import shutil
import zlib
import os


class Xmlorg(object):

    ''' Sorts the xml files of one year's zip archive into day dirs.

        Expected layout (the directory name is the year):

            .../(year)/(year).zip - example .../2008/2008.zip

        "Order" documents are moved to (year)/(year)(month)/(day)
        according to their OrderHeader/OrderDate element; every other
        parsable xml document is moved to (year)/otherFileType.       '''

    def __init__(self):

        ''' Records the current working directory and takes the year
            from its last path component.                            '''

        self.cwd = os.getcwd()
        self.year = os.path.basename(self.cwd)

    @staticmethod
    def _ensureDir(path):

        ''' Creates the directory path unless it already exists, so the
            script can be run again on a partly organised year.       '''

        if not os.path.isdir(path):
            os.mkdir(path)

    @staticmethod
    def _movef(fp1, fp2):

        ''' Moves fp1 to fp2; a failed move is printed, not raised, so
            one bad file does not stop the whole run.                 '''

        try:
            shutil.move(fp1, fp2)
        except (IOError, OSError, shutil.Error) as e:
            # WindowsError is a subclass of OSError, so the "file being
            # used by other process" case is still reported here, and the
            # handler also works on platforms without WindowsError.
            print(e)

    def _mkMonthAndDaysDirs(self):

        ''' Creates dirs for every month and day of the specified year.
            Works for leap years as well.

            (specified)year/(year)month/day

            ...2010/201001/01
            ...2010/201001/02
            ...2010/201001/03

            Raises ValueError if self.year is not an integer string.  '''

        year = int(self.year)
        for month in range(1, 13):
            ym = os.path.join(self.cwd, self.year + '%02d' % month)
            self._ensureDir(ym)
            # monthrange() returns (weekday of first day, days in month).
            days_in_month = calendar.monthrange(year, month)[1]
            for day in range(1, days_in_month + 1):
                self._ensureDir(os.path.join(ym, '%02d' % day))

    def ParseAndOrg(self):

        ''' Extracts (year).zip into the year directory and files every
            extracted .xml document.

            requires dir and zip struct:

            .../(year)/(year).zip - example .../2008/2008.zip

            Files that cannot be parsed, and Orders without a usable
            dd/mm/yyyy OrderDate, are reported and left where they are. '''

        self._mkMonthAndDaysDirs()
        other_dir = os.path.join(self.cwd, 'otherFileType')
        self._ensureDir(other_dir)

        # dir struct .../(year)/(year).zip - ex. .../2008/2008.zip
        zf = zipfile.ZipFile(os.path.join(self.cwd, self.year + '.zip'))
        try:
            # Extract next to the archive rather than into whatever the
            # process working directory happens to be by now.
            zf.extractall(self.cwd)
        finally:
            zf.close()

        for name in os.listdir(self.cwd):
            src = os.path.join(self.cwd, name)
            if not (name.endswith('.xml') and os.path.isfile(src)):
                continue

            try:
                # Open the file ourselves so the handle is closed even
                # when parsing fails; an open handle is what made the
                # later shutil.move() fail on Windows.
                with open(src, 'rb') as fh:
                    root = ET.parse(fh).getroot()
            except Exception:
                # Deliberately broad: any failure to read or parse means
                # the file is skipped, whatever exception type is raised.
                print('%s np' % name)  # not parsed
                continue

            if root.findtext('Summary/FileType') != 'Order':
                self._movef(src, other_dir)
                continue

            raw_date = root.findtext('OrderHeader/OrderDate')
            # Only the first 10 characters are the date: dd/mm/yyyy
            dc = raw_date[:10].split('/') if raw_date else []
            if len(dc) != 3:
                print('%s bad OrderDate: %r' % (name, raw_date))
                continue
            self._movef(src, os.path.join(self.cwd, dc[2] + dc[1], dc[0]))


if __name__ == '__main__':
    import sys

    # Directory to organise: the first command-line argument when given,
    # otherwise the original development directory.  Its last path
    # component must be the year and it must contain (year).zip.
    target = sys.argv[1] if len(sys.argv) > 1 else 'c:/sv_zip_test/2010/'
    os.chdir(target)
    xo = Xmlorg()
    xo.ParseAndOrg()



More information about the Python-list mailing list