[Tutor] Troubles with lists and control flow

Thu Oct 8 22:42:08 CEST 2009

Hello I'm developing a script to compare two files, finding duplicate
entries and matching which id of one csv file corresponds to the id of
another csv file.
The first version was working nice, but I wanted to postpone the
writing to a file till the end and also make a correct csv file. The
code is not so great and I expect to work with no more than 3000 lines
of data in either file:
So here is the inicial code. I hope it's not too long or complicated:
import csv
import re
import addrnormalize
import difflib
import time

started = time.time()

nobv = open('regnobv.csv', 'wb')
yesbv = open('reginbv.csv', 'wb')

bv = open(r'\\albertapdc\ESP Data\ESP Main
Files\BV_Customersa.csv').read().upper()

site = open(r'C:\myscripts\latestregistrants.csv').read().upper()

site = re.sub(r'([0-9]{3})-([0-9]{3})-([0-9]{4})', r'\1\2\3', site)

bvreader = csv.DictReader(bv.splitlines())

sitelist = csv.DictReader(site.splitlines())

def inbv(yesbv):
    yesbv.write(item['USER_ID'] + ',')
    yesbv.write(row['CUS_NO'] + ',')
    yesbv.write(item['COMPANY'] + ',')
    yesbv.write(row['BVADDR1'] + ',')
    yesbv.write(item['ADDRESSLINEONE']+ ',')
    yesbv.write(row['BVADDRTELNO1'] + ',')
    yesbv.write(item['PHONE'] + '\n')

bvreader = list(bvreader)

#  or (row['NAME'] in item['COMPANY']) or (row['BVADDREMAIL'] in item['EMAIL'])
for item in sitelist:

    for row in bvreader:
        if ((row['BVADDRTELNO1'] == item['PHONE'] and
row['BVADDRTELNO1']) or (row['BVADDREMAIL'] == item['EMAIL'] and
row['BVADDREMAIL'])):
            inbv(yesbv)
            break

## this module just makes a few string transformations to standardize
both strings. Like STREET -> ST
        elif addrnormalize.format_address(row['BVADDR1']) ==
addrnormalize.format_address(item['ADDRESSLINEONE']) and
row['BVADDR1'] and row['BVPROVSTATE'] == item['STATE'] and
row['BVPROVSTATE']:
            inbv(yesbv)
            break
## trying some fuzzy matching here
        elif (difflib.SequenceMatcher(lambda x: x in " ,.-#" ,
row['BVADDR1'], item['ADDRESSLINEONE']).quick_ratio() > 0.87) \
        and (difflib.SequenceMatcher(lambda x: x in " .-" ,
row['BVCITY'], item['CITY']).quick_ratio() > 0.87):
            inbv(yesbv)
            break

    else:
        nobv.write(item['USER_ID']+ ',')
        nobv.write(item['FIRSTNAME']+ ',')
        nobv.write(item['LASTNAME']+ ',')
        nobv.write(item['COMPANY']+ ',')
        nobv.write(item['EMAIL'].lower()+ ',')
        nobv.write(item['PHONE']+ ',')
        nobv.write(item['FAX']+ ',')
        nobv.write(item['ADDRESSLINEONE']+ ',')
        nobv.write(item['ADDRESSLINETWO']+ ',')
        nobv.write(item['CITY']+ ',')
        nobv.write(item['STATE']+ ',')
        nobv.write(item['POSTALCODE']+ ',')
        nobv.write(item['COUNTRY']+ ',')
        nobv.write('\n')

nobv.close()
yesbv.close()

finished = time.time()

print finished - started

---- End of code ---

#### When I try with list it does not even print the "print linha" test
#### If I uncomment all the conditionals except the first if than I
get that written to the final file: reginbv.
### How is the new function with list affecting the results?

import csv
import re
import addrnormalize
import difflib
import time

started = time.time()

nobv = open('regnobv.csv', 'wb')

bv = open(r'\\albertapdc\ESP Data\ESP Main
Files\BV_Customersa.csv').read().upper()

site = open(r'C:\myscripts\latestregistrants.csv').read().upper()

site = re.sub(r'([0-9]{3})-([0-9]{3})-([0-9]{4})', r'\1\2\3', site)

bvreader = csv.DictReader(bv.splitlines())

sitelist = csv.DictReader(site.splitlines())

list2csv = []

list_not_in_bv = []

yesbv = csv.writer(open('reginbv.csv', 'wb'), dialect="excel")
nobv = csv.writer(open('regnobv.csv', 'wb'), dialect="excel")

def inbv(currentline = None):
    """writes a line of data when a date is found in BV"""
    if currentline is None:
        currentline = []
    else:
        currentline.append(item['USER_ID'])
        currentline.append(row['CUS_NO'])
        currentline.append(item['COMPANY'])
        currentline.append(row['BVADDR1'])
        currentline.append(item['ADDRESSLINEONE'])
        currentline.append(row['BVADDRTELNO1'])
        currentline.append(item['PHONE'])
        currentline.append(row['BVCITY'])
        currentline.append(item['CITY'])

    return currentline

def notinbv(currentline):
    if currentline is None:
        currentline = []
    else:
        currentline.append(item['USER_ID'])
        currentline.append(item['FIRSTNAME'])
        currentline.append(item['LASTNAME'])
        currentline.append(item['COMPANY'])
        currentline.append(item['EMAIL'])
        currentline.append(item['PHONE'])
        currentline.append(item['FAX'])
        currentline.append(item['ADDRESSLINEONE'])
        currentline.append(item['ADDRESSLINETWO'])
        currentline.append(item['CITY'])
        currentline.append(item['STATE'])
        currentline.append(item['POSTALCODE'])
        currentline.append(item['COUNTRY'])

    return currentline

bvreader = list(bvreader)

#  or (row['NAME'] in item['COMPANY']) or (row['BVADDREMAIL'] in item['EMAIL'])
for item in sitelist:

    for row in bvreader:
        if ((row['BVADDRTELNO1'] == item['PHONE'] and
row['BVADDRTELNO1']) or (row['BVADDREMAIL'] == item['EMAIL'] and
row['BVADDREMAIL'])):
            lin = []
            linha = inbv(lin)
            list2csv.append(linha)
            print linha
            break

        elif addrnormalize.format_address(row['BVADDR1']) ==
addrnormalize.format_address(item['ADDRESSLINEONE']) and
row['BVADDR1'] and row['BVPROVSTATE'] == item['STATE'] and
row['BVPROVSTATE']:
            lin = []
            linha = inbv(lin)
            list2csv.append(linha)
            break
##
        elif (difflib.SequenceMatcher(lambda x: x in " ,.-#" ,
row['BVADDR1'], item['ADDRESSLINEONE']).quick_ratio() > 0.87) \
        and (difflib.SequenceMatcher(lambda x: x in " .-" ,
row['BVCITY'], item['CITY']).quick_ratio() > 0.87):
            lin = []
            linha = inbv(lin)
            list2csv.append(linha)
            break
##
##
    else:
       le = []
       linha = notinbv(le)
       list_not_in_bv.append(linha)
       break

print "now printing list2csv"
print list2csv
print list_not_in_bv
for customer in list2csv:
    yesbv.writerow(customer)

for customer in list_not_in_bv:
    nobv.writerow(customer)

finished = time.time()

print finished - started