Fixing a horrifically formatted CSV file.
flebber
flebber.crue at gmail.com
Fri Jul 4 06:28:04 EDT 2014
On Friday, 4 July 2014 14:12:15 UTC+10, flebber wrote:
> I have taken the code and gone a little further, but I need to be able to protect myself against commas and single quotes in names.
>
>
>
> What is the best way to do this?
>
>
>
> In my file, line 44 has this trainer name:
>
>
>
> "Michael, Wayne & John Hawkes"
>
>
>
> and in line 95 this horse name.
>
> Inz'n'out
>
>
>
> This throws off my capturing of the correct item 9. How do I protect against this?
>
>
>
> Here is the current code.
>
>
>
> import re
>
> from sys import argv
>
> SCRIPT, FILENAME = argv
>
>
>
>
>
> def out_file_name(file_name):
>
> """take an input file and keep the name with appended _clean"""
>
> file_parts = file_name.split(".",)
>
> output_file = file_parts[0] + '_clean.' + file_parts[1]
>
> return output_file
>
>
>
>
>
> def race_table(text_file):
>
> """utility to reorganise poorly made csv entry"""
>
> input_table = [[item.strip(' "') for item in record.split(',')]
>
> for record in text_file.splitlines()]
>
> # At this point look at input_table to find the record indices
>
> output_table = []
>
> for record in input_table:
>
> if record[0] == 'Meeting':
>
> meeting = record[3]
>
> elif record[0] == 'Race':
>
> date = record[13]
>
> race = record[1]
>
> elif record[0] == 'Horse':
>
> number = record[1]
>
> name = record[2]
>
> results = record[9]
>
> res_split = re.split('[- ]', results)
>
> starts = res_split[0]
>
> wins = res_split[1]
>
> seconds = res_split[2]
>
> thirds = res_split[3]
>
> prizemoney = res_split[4]
>
> trainer = record[4]
>
> location = record[5]
>
> print(name, wins, seconds)
>
> output_table.append((meeting, date, race, number, name,
>
> starts, wins, seconds, thirds, prizemoney,
>
> trainer, location))
>
> return output_table
>
>
>
> MY_FILE = out_file_name(FILENAME)
>
>
>
> # with open(FILENAME, 'r') as f_in, open(MY_FILE, 'w') as f_out:
>
> # for line in race_table(f_in.readline()):
>
> # new_row = line
>
> with open(FILENAME, 'r') as f_in, open(MY_FILE, 'w') as f_out:
>
> CONTENT = f_in.read()
>
> # print(content)
>
> FILE_CONTENTS = race_table(CONTENT)
>
> # print new_name
>
> f_out.write(str(FILE_CONTENTS))
>
>
>
>
>
> if __name__ == '__main__':
>
> pass
So I found this on Stack Overflow:
In [2]: import string
In [3]: identity = string.maketrans("", "")
In [4]: x = ['+5556', '-1539', '-99', '+1500']
In [5]: x = [s.translate(identity, "+-") for s in x]
In [6]: x
Out[6]: ['5556', '1539', '99', '1500']
But it fails on my file, which I believe is because mine is a list of lists. Is there an easy way to iterate over the sublists without flattening?
Current code.
input_table = [[item.strip(' "') for item in record.split(',')]
for record in text_file.splitlines()]
# At this point look at input_table to find the record indices
identity = string.maketrans("", "")
print(input_table)
input_table = [s.translate(identity, ",'") for s
in input_table]
Sayth
More information about the Python-list
mailing list