[pypy-svn] r61405 - pypy/trunk/pypy/lib
afa at codespeak.net
afa at codespeak.net
Tue Jan 27 20:28:47 CET 2009
Author: afa
Date: Tue Jan 27 20:28:45 2009
New Revision: 61405
Added:
pypy/trunk/pypy/lib/_csv.py
Log:
A pure python implementation of the _csv module.
All tests pass ("bin/py.py -m test.test_csv")
except one which should be marked as impl_detail IMO
Added: pypy/trunk/pypy/lib/_csv.py
==============================================================================
--- (empty file)
+++ pypy/trunk/pypy/lib/_csv.py Tue Jan 27 20:28:45 2009
@@ -0,0 +1,531 @@
+"""CSV parsing and writing.
+
+This module provides classes that assist in the reading and writing
+of Comma Separated Value (CSV) files, and implements the interface
+described by PEP 305. Although many CSV files are simple to parse,
+the format is not formally defined by a stable specification and
+is subtle enough that parsing lines of a CSV file with something
+like line.split(\",\") is bound to fail. The module supports three
+basic APIs: reading, writing, and registration of dialects.
+
+
+DIALECT REGISTRATION:
+
+Readers and writers support a dialect argument, which is a convenient
+handle on a group of settings. When the dialect argument is a string,
+it identifies one of the dialects previously registered with the module.
+If it is a class or instance, the attributes of the argument are used as
+the settings for the reader or writer:
+
+ class excel:
+ delimiter = ','
+ quotechar = '\"'
+ escapechar = None
+ doublequote = True
+ skipinitialspace = False
+ lineterminator = '\\r\\n'
+ quoting = QUOTE_MINIMAL
+
+SETTINGS:
+
+ * quotechar - specifies a one-character string to use as the
+ quoting character. It defaults to '\"'.
+ * delimiter - specifies a one-character string to use as the
+ field separator. It defaults to ','.
+ * skipinitialspace - specifies how to interpret whitespace which
+ immediately follows a delimiter. It defaults to False, which
+ means that whitespace immediately following a delimiter is part
+ of the following field.
+ * lineterminator - specifies the character sequence which should
+ terminate rows.
+ * quoting - controls when quotes should be generated by the writer.
+ It can take on any of the following module constants:
+
+ csv.QUOTE_MINIMAL means only when required, for example, when a
+ field contains either the quotechar or the delimiter
+ csv.QUOTE_ALL means that quotes are always placed around fields.
+ csv.QUOTE_NONNUMERIC means that quotes are always placed around
+ fields which do not parse as integers or floating point
+ numbers.
+ csv.QUOTE_NONE means that quotes are never placed around fields.
+ * escapechar - specifies a one-character string used to escape
+ the delimiter when quoting is set to QUOTE_NONE.
+ * doublequote - controls the handling of quotes inside fields. When
+ True, two consecutive quotes are interpreted as one during read,
+ and when writing, each quote character embedded in the data is
+ written as two quotes.
+"""
+
+# Module version string.
+__version__ = "1.0"
+
+# Quoting policies accepted as the "quoting" setting of a dialect.
+QUOTE_MINIMAL = 0
+QUOTE_ALL = 1
+QUOTE_NONNUMERIC = 2
+QUOTE_NONE = 3
+
+# Registry filled by register_dialect(): dialect name -> Dialect instance.
+_dialects = dict()
+# Upper bound on the number of characters in one parsed field.
+_field_limit = 128 * 1024
+
+class Error(Exception):
+    """Exception raised by this module for CSV-specific failures."""
+
+class Dialect(object):
+    """CSV dialect
+
+    The Dialect type records CSV parsing and generation options.
+
+    Dialect(dialect, **kwargs) takes each setting from the first source
+    that provides it: an explicit keyword argument, then an attribute of
+    `dialect`, then the built-in default.  `dialect` may be None, the
+    name of a registered dialect, or any class or instance carrying the
+    settings as attributes.
+
+    Raises TypeError for an unknown keyword argument or an invalid
+    setting, and Error (from get_dialect) for an unregistered name.
+    """
+
+    __slots__ = ["delimiter", "doublequote", "escapechar", "lineterminator",
+                 "quotechar", "quoting", "skipinitialspace", "strict"]
+
+    def __new__(cls, dialect, **kwargs):
+        for name in kwargs:
+            if name not in Dialect.__slots__:
+                raise TypeError("unexpected keyword argument '%s'" %
+                                (name,))
+
+        if dialect is not None:
+            if isinstance(dialect, basestring):
+                dialect = get_dialect(dialect)
+
+            # An existing Dialect may only be handed back as-is when the
+            # caller overrides nothing.  An explicit None is a real
+            # override (e.g. quotechar=None means "no quote character"),
+            # so testing the values for None would silently drop it.
+            if isinstance(dialect, Dialect) and not kwargs:
+                return dialect
+
+        # Converters receive the attribute name explicitly so that their
+        # error messages never depend on an enclosing loop variable.
+        def check_char(name, x):
+            if x is None:
+                return None
+            if isinstance(x, str) and len(x) <= 1:
+                return x
+            raise TypeError("%r must be a 1-character string" % (name,))
+
+        def check_str(name, x):
+            if isinstance(x, str):
+                return x
+            raise TypeError("%r must be a string" % (name,))
+
+        def check_quoting(name, x):
+            if x in (QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE):
+                return x
+            raise TypeError("bad 'quoting' value")
+
+        def check_bool(name, x):
+            return bool(x)
+
+        # mapping by name: (default, converter)
+        attributes = {"delimiter": (',', check_char),
+                      "doublequote": (True, check_bool),
+                      "escapechar": (None, check_char),
+                      "lineterminator": ("\r\n", check_str),
+                      "quotechar": ('"', check_char),
+                      "quoting": (QUOTE_MINIMAL, check_quoting),
+                      "skipinitialspace": (False, check_bool),
+                      "strict": (False, check_bool),
+                      }
+
+        self = object.__new__(cls)
+
+        # Copy attributes.  The order of __slots__ matters: "quotechar"
+        # is assigned before "quoting", whose default depends on it.
+        notset = object()
+        for name in Dialect.__slots__:
+            default, converter = attributes[name]
+
+            if name in kwargs:
+                value = kwargs[name]
+            elif dialect is not None:
+                value = getattr(dialect, name, notset)
+            else:
+                value = notset
+
+            if value is notset:
+                value = default
+                # Without a quote character nothing can be quoted.
+                if name == 'quoting' and not self.quotechar:
+                    value = QUOTE_NONE
+            else:
+                value = converter(name, value)
+
+            setattr(self, name, value)
+
+        if not self.delimiter:
+            raise TypeError("delimiter must be set")
+
+        if self.quoting != QUOTE_NONE and not self.quotechar:
+            raise TypeError("quotechar must be set if quoting enabled")
+
+        if not self.lineterminator:
+            raise TypeError("lineterminator must be set")
+
+        return self
+
+
+def _call_dialect(dialect_inst, kwargs):
+    """Build a Dialect from `dialect_inst`, overridden by the `kwargs` dict."""
+    overrides = kwargs
+    return Dialect(dialect_inst, **overrides)
+
+def register_dialect(name, dialect=None, **kwargs):
+    """Register a dialect under a string name.
+        csv.register_dialect(name [, dialect] [optional keyword args])
+
+    The settings are validated by building a Dialect from `dialect` and
+    the keyword arguments; a TypeError is raised if `name` is not a
+    string."""
+    if isinstance(name, basestring):
+        _dialects[name] = _call_dialect(dialect, kwargs)
+        return
+    raise TypeError("dialect name must be a string or unicode")
+
+def unregister_dialect(name):
+    """Remove the dialect registered under a string name.
+        csv.unregister_dialect(name)
+
+    Raises Error if no dialect is registered under that name."""
+    if name not in _dialects:
+        raise Error("unknown dialect")
+    del _dialects[name]
+
+def get_dialect(name):
+    """Look up the dialect instance registered under a name.
+        dialect = csv.get_dialect(name)
+
+    Raises Error if no dialect is registered under that name."""
+    if name in _dialects:
+        return _dialects[name]
+    raise Error("unknown dialect")
+
+def list_dialects():
+    """Return a list of all known dialect names.
+        names = csv.list_dialects()"""
+    return [name for name in _dialects]
+
+class Reader:
+    """CSV reader
+
+    Reader objects are responsible for reading and parsing tabular data
+    in CSV format.
+
+    A Reader is an iterator: each call to next() pulls as many lines
+    from the underlying iterator as one record needs and returns the
+    record as a list of fields.  `line_num` counts the lines pulled so
+    far.
+    """
+
+    # States of the character-level parser.
+    (START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
+     IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
+     EAT_CRNL) = range(8)
+
+    def __init__(self, iterator, dialect=None, **kwargs):
+        """Wrap `iterator`, which must yield one line of input per step.
+
+        `dialect` and the keyword arguments are combined into a Dialect.
+        """
+        self.dialect = _call_dialect(dialect, kwargs)
+        self.input_iter = iter(iterator)
+        self.line_num = 0
+
+        self._parse_reset()
+
+    def _parse_reset(self):
+        """Forget any partially parsed record and restart the parser."""
+        self.field = []             # characters of the field being read
+        self.fields = []            # completed fields of the record
+        self.state = self.START_RECORD
+        self.numeric_field = False  # convert the current field to float?
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        """Parse and return the next record as a list of fields.
+
+        Raises StopIteration when the input is exhausted.  Raises Error
+        if the input ends while characters of an unfinished field are
+        pending, if a line contains a NUL character, or if the parser
+        rejects a character.
+        """
+        self._parse_reset()
+        while True:
+            try:
+                line = self.input_iter.next()
+            except StopIteration:
+                # End of input OR exception
+                if len(self.field) > 0:
+                    raise Error("newline inside string")
+                raise
+
+            self.line_num += 1
+
+            for c in line:
+                # '\0' is reserved as the end-of-line marker fed below.
+                if c == '\0':
+                    raise Error("line contains NULL byte")
+                self._parse_process_char(c)
+            self._parse_process_char('\0')
+
+            # Any other state means the record continues on the next
+            # line (for instance inside a quoted field).
+            if self.state == self.START_RECORD:
+                break
+
+        fields = self.fields
+        self.fields = []
+        return fields
+
+    def _parse_end_of_line(self, c):
+        """Save the current field and leave it at the end-of-line `c`."""
+        self._parse_save_field()
+        if c == '\0':
+            self.state = self.START_RECORD
+        else:
+            self.state = self.EAT_CRNL
+
+    def _parse_process_char(self, c):
+        """Advance the state machine by one character.
+
+        `c` is a character of the current line, or '\\0', which next()
+        feeds after every line as the end-of-line marker.
+        """
+        dialect = self.dialect
+        state = self.state
+
+        if state == self.IN_FIELD:
+            # in unquoted field
+            if c in ('\n', '\r', '\0'):
+                # end of line - return [fields]
+                self._parse_end_of_line(c)
+            elif c == dialect.escapechar:
+                # possible escaped character
+                self.state = self.ESCAPED_CHAR
+            elif c == dialect.delimiter:
+                # save field - wait for new field
+                self._parse_save_field()
+                self.state = self.START_FIELD
+            else:
+                # normal character - save in field
+                self._parse_add_char(c)
+
+        elif state == self.START_RECORD:
+            if c == '\0':
+                # empty line - return []
+                pass
+            elif c in ('\n', '\r'):
+                self.state = self.EAT_CRNL
+            else:
+                self.state = self.START_FIELD
+                # restart process
+                self._parse_process_char(c)
+
+        elif state == self.START_FIELD:
+            if c in ('\n', '\r', '\0'):
+                # save empty field - return [fields]
+                self._parse_end_of_line(c)
+            elif (c == dialect.quotechar
+                  and dialect.quoting != QUOTE_NONE):
+                # start quoted field
+                self.state = self.IN_QUOTED_FIELD
+            elif c == dialect.escapechar:
+                # possible escaped character
+                self.state = self.ESCAPED_CHAR
+            elif c == ' ' and dialect.skipinitialspace:
+                # ignore space at start of field
+                pass
+            elif c == dialect.delimiter:
+                # save empty field
+                self._parse_save_field()
+            else:
+                # begin new unquoted field
+                if dialect.quoting == QUOTE_NONNUMERIC:
+                    self.numeric_field = True
+                self._parse_add_char(c)
+                self.state = self.IN_FIELD
+
+        elif state == self.ESCAPED_CHAR:
+            # an escape at the very end of a line escapes the newline
+            if c == '\0':
+                c = '\n'
+            self._parse_add_char(c)
+            self.state = self.IN_FIELD
+
+        elif state == self.IN_QUOTED_FIELD:
+            if c == '\0':
+                # end of line inside quotes: the field continues
+                pass
+            elif c == dialect.escapechar:
+                # possible escape character
+                self.state = self.ESCAPE_IN_QUOTED_FIELD
+            elif (c == dialect.quotechar
+                  and dialect.quoting != QUOTE_NONE):
+                if dialect.doublequote:
+                    # doublequote; " represented by ""
+                    self.state = self.QUOTE_IN_QUOTED_FIELD
+                else:
+                    # end of quote part of field
+                    self.state = self.IN_FIELD
+            else:
+                # normal character - save in field
+                self._parse_add_char(c)
+
+        elif state == self.ESCAPE_IN_QUOTED_FIELD:
+            if c == '\0':
+                c = '\n'
+            self._parse_add_char(c)
+            self.state = self.IN_QUOTED_FIELD
+
+        elif state == self.QUOTE_IN_QUOTED_FIELD:
+            # doublequote - seen a quote in a quoted field
+            if (c == dialect.quotechar
+                    and dialect.quoting != QUOTE_NONE):
+                # save "" as "
+                self._parse_add_char(c)
+                self.state = self.IN_QUOTED_FIELD
+            elif c == dialect.delimiter:
+                # save field - wait for new field
+                self._parse_save_field()
+                self.state = self.START_FIELD
+            elif c in ('\r', '\n', '\0'):
+                # end of line - return [fields]
+                self._parse_end_of_line(c)
+            elif not dialect.strict:
+                # lenient mode: keep the character, continue unquoted
+                self._parse_add_char(c)
+                self.state = self.IN_FIELD
+            else:
+                raise Error("'%c' expected after '%c'" %
+                            (dialect.delimiter, dialect.quotechar))
+
+        elif state == self.EAT_CRNL:
+            if c in ('\r', '\n'):
+                pass
+            elif c == '\0':
+                self.state = self.START_RECORD
+            else:
+                raise Error("new-line character seen in unquoted field - "
+                            "do you need to open the file "
+                            "in universal-newline mode?")
+
+        else:
+            raise RuntimeError("unknown state: %r" % (self.state,))
+
+    def _parse_save_field(self):
+        """Append the pending field to the record and start a new one.
+
+        A field flagged as numeric is converted with float(), which
+        raises ValueError if the text is not a number.
+        """
+        field, self.field = self.field, []
+        field = ''.join(field)
+        if self.numeric_field:
+            self.numeric_field = False
+            field = float(field)
+        self.fields.append(field)
+
+    def _parse_add_char(self, c):
+        """Add `c` to the pending field, enforcing the field size limit."""
+        if len(self.field) >= _field_limit:
+            raise Error("field larger than field limit (%d)" % (_field_limit,))
+        self.field.append(c)
+
+
+class Writer:
+    """CSV writer
+
+    Writer objects are responsible for generating tabular data
+    in CSV format from sequence input."""
+
+    def __init__(self, file, dialect=None, **kwargs):
+        """Prepare to write rows through `file.write`.
+
+        `file` must have a callable `write` attribute, otherwise a
+        TypeError is raised.  `dialect` and the keyword arguments are
+        combined into a Dialect.
+        """
+        if not (hasattr(file, 'write') and callable(file.write)):
+            raise TypeError("argument 1 must have a 'write' method")
+        self.writeline = file.write
+        self.dialect = _call_dialect(dialect, kwargs)
+
+    def _join_reset(self):
+        """Start a new, empty output record."""
+        self.rec = []
+        self.num_fields = 0
+
+    def _join_escape(self, field, special):
+        """Return `field` with the escapechar put before each special char.
+
+        The text is scanned once, so every special character receives
+        exactly one escapechar, even when the escapechar itself is one
+        of the special characters.  Raises Error if a character has to
+        be escaped but the dialect has no escapechar.
+        """
+        escapechar = self.dialect.escapechar
+        pieces = []
+        for c in field:
+            if c in special:
+                if not escapechar:
+                    raise Error("need to escape, but no escapechar set")
+                pieces.append(escapechar)
+            pieces.append(c)
+        return ''.join(pieces)
+
+    def _join_append(self, field, quoted, quote_empty):
+        """Append one field, already converted to a string, to the record.
+
+        `quoted` asks for the field to be quoted regardless of content;
+        `quote_empty` is true when the field is the only one in its row,
+        in which case an empty field must be quoted.
+        """
+        dialect = self.dialect
+        # If this is not the first field we need a field separator
+        if self.num_fields > 0:
+            self.rec.append(dialect.delimiter)
+
+        if dialect.quoting == QUOTE_NONE:
+            # Nothing can be quoted, so every special character must be
+            # escaped, the escapechar included.
+            need_escape = tuple(dialect.lineterminator) + (
+                dialect.escapechar, dialect.delimiter, dialect.quotechar)
+        else:
+            # Special characters other than the quotechar are protected
+            # by quoting the whole field.
+            if any(c and c in field
+                   for c in tuple(dialect.lineterminator) + (
+                       dialect.delimiter, dialect.escapechar)):
+                quoted = True
+
+            need_escape = ()
+            if dialect.quotechar in field:
+                if dialect.doublequote:
+                    field = field.replace(dialect.quotechar,
+                                          dialect.quotechar * 2)
+                    quoted = True
+                else:
+                    need_escape = (dialect.quotechar,)
+
+        # Unset settings (None or '') never need escaping.
+        special = [c for c in need_escape if c]
+        if special:
+            field = self._join_escape(field, special)
+
+        # If field is empty check if it needs to be quoted
+        if field == '' and quote_empty:
+            if dialect.quoting == QUOTE_NONE:
+                raise Error("single empty field record must be quoted")
+            quoted = True
+
+        if quoted:
+            field = dialect.quotechar + field + dialect.quotechar
+
+        self.rec.append(field)
+        self.num_fields += 1
+
+    def writerow(self, row):
+        """Format `row` and write it, line terminator included.
+
+        `row` must support len(), otherwise Error is raised.  None is
+        written as an empty field; other values are converted with str().
+        The whole row is passed to the file's write method in one call.
+        """
+        dialect = self.dialect
+        try:
+            rowlen = len(row)
+        except TypeError:
+            raise Error("sequence expected")
+
+        # join all fields in internal buffer
+        self._join_reset()
+
+        for field in row:
+            quoted = False
+            if dialect.quoting == QUOTE_NONNUMERIC:
+                # Anything float() rejects counts as non-numeric.
+                # "except Exception" lets KeyboardInterrupt and
+                # SystemExit propagate instead of being swallowed.
+                try:
+                    float(field)
+                except Exception:
+                    quoted = True
+                # This changed since 2.5:
+                # quoted = not isinstance(field, (int, long, float))
+            elif dialect.quoting == QUOTE_ALL:
+                quoted = True
+
+            if field is None:
+                self._join_append("", quoted, rowlen == 1)
+            else:
+                self._join_append(str(field), quoted, rowlen == 1)
+
+        # add line terminator
+        self.rec.append(dialect.lineterminator)
+
+        self.writeline(''.join(self.rec))
+
+    def writerows(self, rows):
+        """Write every row of the iterable `rows` with writerow()."""
+        for row in rows:
+            self.writerow(row)
+
+ def writerows(self, rows):
+ for row in rows:
+ self.writerow(row)
+
+def reader(*args, **kwargs):
+    """
+        csv_reader = reader(iterable [, dialect='excel']
+                            [optional keyword args])
+        for row in csv_reader:
+            process(row)
+
+    "iterable" is any object that yields one line of input per
+    iteration, such as a file object or a list.  The optional "dialect"
+    argument selects a group of settings, and the optional keyword
+    arguments override individual settings of that dialect.
+
+    The result is an iterator.  Each iteration returns one row of the
+    CSV file, which can span multiple input lines."""
+    csv_reader = Reader(*args, **kwargs)
+    return csv_reader
+
+def writer(*args, **kwargs):
+    """
+        csv_writer = csv.writer(fileobj [, dialect='excel']
+                                [optional keyword args])
+        for row in sequence:
+            csv_writer.writerow(row)
+
+        [or]
+
+        csv_writer = csv.writer(fileobj [, dialect='excel']
+                                [optional keyword args])
+        csv_writer.writerows(rows)
+
+    "fileobj" is any object with a callable write method."""
+    csv_writer = Writer(*args, **kwargs)
+    return csv_writer
+
+def field_size_limit(limit=None):
+    """Get or set the upper limit on the size of parsed fields.
+        csv.field_size_limit([limit])
+
+    The previous limit is always returned.  When `limit` is given it
+    becomes the new limit; it must be an integer, otherwise a TypeError
+    is raised and the limit is left unchanged."""
+
+    global _field_limit
+
+    previous = _field_limit
+    if limit is None:
+        return previous
+
+    if isinstance(limit, (int, long)):
+        _field_limit = limit
+        return previous
+
+    raise TypeError("int expected, got %s" %
+                    (limit.__class__.__name__,))
More information about the Pypy-commit
mailing list