Reading csv file

Thu Dec 19 05:39:06 EST 2013

Igor Korot wrote:

> Hi, Peter,
> Thank you for the great suggestion.
> 
> I tried to implement your code but failed.
> 
> Here's what I have:
> 
> class FileReader:
>         def __init__(self, filename, isSkip):
>                 self.path = filename
>                 self.isSkip = isSkip
> 
>         @contextmanager
>         def open(*args):

Selfless OO? Not in Python.

>                 from StringIO import StringIO
>                 lines = range(10)
>                 if self.isSkip:
>                         lines[0] = "skipped"
>                         lines[6] = "field1-from-line6,field2-from-line6"
>                 else:
>                         lines[0] = "field1-from-line1,field2-from-line1"
>                 yield StringIO("\r\n".join(map(str, lines)))
> 
>         def is_arbitrary_text(self,fieldnames):
>                 return "skipped" in fieldnames
> 
>         def readData(self):
>                 with self.open(self.path, "r") as f:
>                         reader = csv.DictReader(f)
>                         if self.is_arbitrary_text(reader.fieldnames):
>                                 for _ in range(5):
>                                         next(reader, None)
>                                 reader._fieldnames = None

Here you introduced another bug, ignoring my helpful comments.

>>         reader._fieldnames = None # underscore necessary,
>>                                   # fieldnames setter doesn't work
>>         reader.fieldnames # used for its side-effect

>                         for row in reader:
>                                 print row
> 
> Unfortunately this does not work as "def open()" does not belong to my
> class and if I comment the "@contextmanager" line
> I will get an exception: "AttributeError: __exit__"
> 
> Any idea what to do?

Keeping comments is not an option? But please read and try to understand the 
comments before you excise them ;)

As I mentioned in the comment on the open() function, you are not supposed to 
use it, since you have real data -- use Python's built-in open() function 
instead. Anyway, if you insist on doing everything the OO way, at least add a 
self in all the right places and don't introduce bugs that could be avoided 
with copy-and-paste. 

A working script with mock data and following the OO fashion would be:

$ cat csv_skip_header_oo.py
import csv
from contextlib import contextmanager

class FileReader:
    """Demo reader that parses made-up CSV text and prints every data row.

    When ``isSkip`` is true the mock data starts with a line of arbitrary
    text and the real column names sit at index 6; otherwise the column
    names are on the first line.
    """

    def __init__(self, filename, isSkip):
        """Store the (unused by the mock) file name and the variant flag."""
        self.path = filename
        self.isSkip = isSkip

    @contextmanager
    def open(self, *args):
        """Yield a file-like object holding ten lines of mock CSV text.

        Stand-in for the built-in open(); ``args`` are accepted but ignored.
        """
        from StringIO import StringIO
        # range() gives a mutable list here (Python 2), so individual
        # entries can be overwritten with header text below.
        lines = range(10)
        if self.isSkip:
            lines[0] = "skipped"
            lines[6] = "field1-from-line6,field2-from-line6"
        else:
            lines[0] = "field1-from-line1,field2-from-line1"
        # The remaining entries are still ints, hence map(str, ...).
        yield StringIO("\r\n".join(map(str, lines)))

    def is_arbitrary_text(self,fieldnames):
        """Return True if the parsed first line contains the cell "skipped"."""
        return "skipped" in fieldnames

    def readData(self):
        """Print each data row of the mock CSV as a dict.

        If the first line is arbitrary text, the next five lines are
        discarded and the line after them is re-read as the header.
        """
        with self.open(self.path, "r") as f:
            reader = csv.DictReader(f)
            # Accessing reader.fieldnames consumes the first line.
            if self.is_arbitrary_text(reader.fieldnames):
                # Throw away the five lines between the arbitrary text and
                # the real header.
                for _ in range(5):
                    next(reader, None)

                reader._fieldnames = None # underscore necessary,
                                          # fieldnames setter doesn't work
                reader.fieldnames # used for its side-effect

            for row in reader:
                print row

if __name__ == "__main__":
    import sys
    print "Demo with made-up data"
    skip = len(sys.argv) > 1 and sys.argv[1] == "--skip"
    if skip:
        print "Variant 2, header is skipped"
    else:
        print "Variant 1, no header"
    FileReader("whatever.csv", skip).readData()

$ python csv_skip_header_oo.py
Demo with made-up data
Variant 1, no header
{'field2-from-line1': None, 'field1-from-line1': '1'}
{'field2-from-line1': None, 'field1-from-line1': '2'}
{'field2-from-line1': None, 'field1-from-line1': '3'}
{'field2-from-line1': None, 'field1-from-line1': '4'}
{'field2-from-line1': None, 'field1-from-line1': '5'}
{'field2-from-line1': None, 'field1-from-line1': '6'}
{'field2-from-line1': None, 'field1-from-line1': '7'}
{'field2-from-line1': None, 'field1-from-line1': '8'}
{'field2-from-line1': None, 'field1-from-line1': '9'}
$ python csv_skip_header_oo.py --skip
Demo with made-up data
Variant 2, header is skipped
{'field1-from-line6': '7', 'field2-from-line6': None}
{'field1-from-line6': '8', 'field2-from-line6': None}
{'field1-from-line6': '9', 'field2-from-line6': None}

A script using real data would be:

$ cat csv_skip_header_oo.py
import csv

class FileReader:
    """Print the rows of a CSV file whose header may follow arbitrary text.

    If the first line of the file is recognised as arbitrary text, that
    line and the five lines after it are skipped, and the next line is
    used as the header.
    """

    def __init__(self, filename):
        """Remember the path of the CSV file to read."""
        self.path = filename

    def is_arbitrary_text(self, fieldnames):
        """Return True if *fieldnames* is arbitrary text, not a real header.

        *fieldnames* is the parsed first line of the file, or None when the
        file is empty; an empty file has nothing to skip, so None yields
        False instead of raising TypeError.
        """
        # XXX replace with a test suitable for your actual data
        return fieldnames is not None and "skipped" in fieldnames

    def read_data(self):
        """Print every data row of the file as a dict, one per line."""
        # "rb" is the mode Python 2's csv module expects; on Python 3 the
        # file would have to be opened in text mode with newline="".
        with open(self.path, "rb") as f:
            reader = csv.DictReader(f)
            # Accessing reader.fieldnames consumes the first line.
            if self.is_arbitrary_text(reader.fieldnames):
                # Discard the five lines between the arbitrary text and the
                # real header.
                for _ in range(5):
                    next(reader, None)
                # A second DictReader on the same file takes the next line
                # as its header, with no need to touch reader internals.
                reader = csv.DictReader(f)
            for row in reader:
                # Single-argument form: same output as the print statement.
                print(row)

if __name__ == "__main__":
    # Command-line entry point: the single positional argument "file" is
    # the path of the CSV file to print.
    import argparse
    cli = argparse.ArgumentParser()
    cli.add_argument("file")
    options = cli.parse_args()
    csv_path = options.file

    FileReader(csv_path).read_data()

$ cat skipped_header.csv 
skipped
1
2
3
4
5
field1-from-line6,field2-from-line6
7
8
9
$ python csv_skip_header_oo.py skipped_header.csv
{'field1-from-line6': '7', 'field2-from-line6': None}
{'field1-from-line6': '8', 'field2-from-line6': None}
{'field1-from-line6': '9', 'field2-from-line6': None}
$ cat no_header.csv 
field1-from-line1,field2-from-line1
1
2
3
4
5
6
7
8
9
$ python csv_skip_header_oo.py no_header.csv
{'field2-from-line1': None, 'field1-from-line1': '1'}
{'field2-from-line1': None, 'field1-from-line1': '2'}
{'field2-from-line1': None, 'field1-from-line1': '3'}
{'field2-from-line1': None, 'field1-from-line1': '4'}
{'field2-from-line1': None, 'field1-from-line1': '5'}
{'field2-from-line1': None, 'field1-from-line1': '6'}
{'field2-from-line1': None, 'field1-from-line1': '7'}
{'field2-from-line1': None, 'field1-from-line1': '8'}
{'field2-from-line1': None, 'field1-from-line1': '9'}

Please have a look at the cleaned-up implementation of the read_data() 
method of this last example. As a result of the discussion on the bug 
tracker <http://bugs.python.org/issue20004> I am now convinced that you 
should use two `DictReader`s rather than hack internal attributes or broken 
properties.

See also <http://www.python.org/dev/peps/pep-0008/> for naming conventions.