optomizations

Rodrick Brown rodrick.brown at gmail.com
Mon Apr 22 21:19:23 EDT 2013


I would like some feedback on possible solutions to make this script run
faster.
The system is pegged at 100% CPU and it takes a long time to complete.


#!/usr/bin/env python

import gzip
import re
import os
import sys
from datetime import datetime
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', dest='inputfile', type=str, help='data file
to parse')
    parser.add_argument('-o', dest='outputdir', type=str,
default=os.getcwd(), help='Output directory')
    args = parser.parse_args()

    if len(sys.argv[1:]) < 1:
        parser.print_usage()
        sys.exit(-1)

    print(args)
    if args.inputfile and os.path.exists(args.inputfile):
        try:
            with gzip.open(args.inputfile) as datafile:
                for line in datafile:
                    line = line.replace('mediacdn.xxx.com', 'media.xxx.com')
                    line = line.replace('staticcdn.xxx.co.uk', '
static.xxx.co.uk')
                    line = line.replace('cdn.xxx', 'www.xxx')
                    line = line.replace('cdn.xxx', 'www.xxx')
                    line = line.replace('cdn.xx', 'www.xx')
                    siteurl = line.split()[6].split('/')[2]
                    line = re.sub(r'\bhttps?://%s\b' % siteurl, "", line, 1)

                    (day, month, year, hour, minute, second) =
(line.split()[3]).replace('[','').replace(':','/').split('/')
                    datelog = '{} {} {}'.format(month, day, year)
                    dateobj = datetime.strptime(datelog, '%b %d %Y')

                    outfile = '{}{}{}_combined.log'.format(dateobj.year,
dateobj.month, dateobj.day)
                    outdir = (args.outputdir + os.sep + siteurl)

                    if not os.path.exists(outdir):
                        os.makedirs(outdir)

                    with open(outdir + os.sep + outfile, 'w+') as outf:
                        outf.write(line)

        except IOError, err:
            sys.stderr.write("Error unable to read or extract inputfile: {}
{}\n".format(args.inputfile, err))
            sys.exit(-1)
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/python-list/attachments/20130422/0726db1d/attachment.html>


More information about the Python-list mailing list