Finding Blank Columns in CSV

Peter Otten __peter__ at web.de
Wed Oct 7 03:49:39 EDT 2015


Peter Otten wrote:

> I really meant it when I asked you to post the code you actually ran, and
> the traceback it produces.

Anyway, here's a complete script that should work. It uses indices instead of
names, but the underlying logic is the same.

#!/usr/bin/env python
import csv
import sys
from contextlib import contextmanager


# The csv module is fed text-mode files on Python 3 and binary-mode files
# on older versions, so the open() modes are chosen once, up front.
READ_MODE, WRITE_MODE = (
    ("r", "w") if sys.version_info.major == 3 else ("rb", "wb"))


@contextmanager
def ext_open(file, mode):
    """Open a file if passed a path, pass on everything else unchanged.

    `file` may be:

    * None, "-" or b"-": yields sys.stdout if `mode` contains "w",
      otherwise sys.stdin. The standard stream is not closed.
    * a text or byte string, or an object implementing ``__fspath__``
      (such as pathlib.Path): the file is opened with `mode` and closed
      when the ``with`` block exits.
    * anything else (an already open stream, a list of lines, ...):
      yielded unchanged and not closed.

    >>> with ext_open(None, "w") as f:
    ...     print("foo", file=f)
    foo
    >>> with ext_open(["one\\n", "two\\n"], "r") as f:
    ...     for line in f: print(line, end="")
    one
    two
    """
    if file is None or file == "-" or file == b"-":
        yield sys.stdout if "w" in mode else sys.stdin
    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so text
    # paths are recognised on both; the __fspath__ check covers path
    # objects, which would otherwise be handed back unopened.
    elif (isinstance(file, (str, bytes, type(u"")))
            or hasattr(file, "__fspath__")):
        with open(file, mode) as stream:
            yield stream
    else:
        yield file


def non_empty_columns(infile):
    """Find indices of columns that contain data.

    The first row is assumed to contain the fieldnames: it determines the
    number of columns and is not itself checked for data.

    `infile` is anything accepted by ext_open().

    Returns the indices, in ascending order, of all columns that have a
    non-empty cell in at least one data row. Input without any rows gives
    an empty list.

    Raises ValueError if a data row has a different number of fields than
    the first row. Scanning stops as soon as every column is known to
    contain data, so rows after that point are not validated.
    """
    with ext_open(infile, READ_MODE) as instream:
        reader = csv.reader(instream)
        header = next(reader, None)
        if header is None:
            # Completely empty input: there are no columns at all.
            return []
        colcount = len(header)
        empty_columns = set(range(colcount))
        for row in reader:
            # An explicit check instead of `assert`, which is stripped
            # when Python runs with -O.
            if len(row) != colcount:
                raise ValueError(
                    "line %d: expected %d fields, found %d"
                    % (reader.line_num, colcount, len(row)))
            # Keep only the columns that are still empty after this row.
            empty_columns = {i for i in empty_columns if not row[i]}
            if not empty_columns:
                break
        return [i for i in range(colcount) if i not in empty_columns]


def copy_csv(infile, outfile, columns):
    """Write the selected columns of every row of infile to outfile.

    `infile` and `outfile` are handed to ext_open(). `columns` holds the
    zero-based indices of the cells to keep; each output row contains
    them in the order given. Every row, including the first, is treated
    the same way.
    """
    # NOTE(review): the csv docs recommend opening files with newline=""
    # on Python 3; ext_open() calls open(file, mode) only -- confirm
    # whether that matters for the data being processed.
    with ext_open(infile, READ_MODE) as source, \
            ext_open(outfile, WRITE_MODE) as sink:
        records = csv.reader(source)
        target = csv.writer(sink)
        target.writerows(
            [record[index] for index in columns] for record in records)


def main():
    """Command-line entry point: drop the blank columns of a CSV file.

    Accepts two optional positional arguments, `infile` and `outfile`,
    which default to standard input and standard output.
    """
    import argparse

    cli = argparse.ArgumentParser()
    for name, fallback in (("infile", sys.stdin), ("outfile", sys.stdout)):
        cli.add_argument(name, nargs="?", default=fallback)
    args = cli.parse_args()

    source = args.infile
    if source is sys.stdin or source == "-":
        # The input is traversed twice (once to find the non-empty
        # columns, once to copy them), so standard input is read into a
        # list of lines first.
        source = list(sys.stdin)

    keep = non_empty_columns(source)
    copy_csv(source, args.outfile, columns=keep)


# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()





More information about the Python-list mailing list