[Python-checkins] python/nondist/sandbox/csv/util sniffer.py,1.3,1.4
cliffwells18@users.sourceforge.net
cliffwells18@users.sourceforge.net
Fri, 14 Mar 2003 16:42:49 -0800
Update of /cvsroot/python/python/nondist/sandbox/csv/util
In directory sc8-pr-cvs1:/tmp/cvs-serv16961
Modified Files:
sniffer.py
Log Message:
Fixed return value of delim when there's only a single column (return '' rather than None).
Index: sniffer.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/csv/util/sniffer.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** sniffer.py 14 Mar 2003 21:10:27 -0000 1.3
--- sniffer.py 15 Mar 2003 00:42:46 -0000 1.4
***************
*** 9,12 ****
--- 9,13 ----
import re
+ # ------------------------------------------------------------------------------
class Sniffer:
"""
***************
*** 57,60 ****
--- 58,62 ----
"""
+ matches = []
for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
***************
*** 99,103 ****
skipinitialspace = delims[delim] == spaces
if delim == '\n': # most likely a file with a single column
! delim = None
else:
# there is *no* delimiter, it's a single column of quoted data
--- 101,105 ----
skipinitialspace = delims[delim] == spaces
if delim == '\n': # most likely a file with a single column
! delim = ''
else:
# there is *no* delimiter, it's a single column of quoted data
***************
*** 198,201 ****
--- 200,274 ----
+ # ------------------------------------------------------------------------------
def hasHeaders(data, columns = 0):
    """
    PROTOTYPE:
      hasHeaders(data, columns = 0)
    DESCRIPTION:
      Decides whether row 0 is a header row
    ARGUMENTS:
      - data is a list of lists of data (as returned by importDSV)
      - columns is either the expected number of columns in each row or 0
        (when 0, the value of modeOfLengths(data) is used instead)
    RETURNS:
      - true if data has header row
    RAISES:
      - InvalidData if data is not a list
    SECURITY:
      - cell text is passed to eval() to guess its type, so this function
        must not be used on data from an untrusted source
    """

    # Algorithm: creates a dictionary of types of data in each column. If any column
    # is of a single type (say, integers), *except* for the first row, then the first
    # row is presumed to be labels. If the type can't be determined, it is assumed to
    # be a string in which case the length of the string is the determining factor: if
    # all of the rows except for the first are the same length, it's a header.
    # Finally, a 'vote' is taken at the end for each column, adding or subtracting from
    # the likelihood of the first row being a header.

    if not isinstance(data, list):
        raise InvalidData("list expected.")
    if len(data) < 2:
        return 0  # nothing to compare the first row against

    if not columns:
        columns = modeOfLengths(data)

    # column index -> None (nothing seen yet), a type object, or an int
    # (the common string length when the cells are not evaluable)
    columnTypes = {}
    for i in range(columns):
        columnTypes[i] = None

    for row in data[1:]:
        if len(row) != columns:
            continue # skip rows that have irregular number of columns
        # Iterate over a snapshot of the keys: inconsistent columns are
        # deleted from columnTypes inside this loop.
        for col in list(columnTypes.keys()):
            try:
                try:
                    # is it a built-in type (besides string)?
                    # WARNING: eval() executes the cell text as an expression
                    # and can see this function's local names.
                    thisType = type(eval(row[col]))
                except OverflowError:
                    # a long int? (the eval only checks that the text is
                    # acceptable as a long literal)
                    eval(row[col] + 'L')
                    thisType = type(0) # treat long ints as int
            except (Exception, SystemExit):
                # Evaluating arbitrary text can raise nearly anything,
                # including SystemExit from a cell such as "exit()";
                # KeyboardInterrupt is deliberately left to propagate.
                # fallback to length of string
                thisType = len(row[col])

            if thisType != columnTypes[col]:
                if columnTypes[col] is None: # add new column type
                    columnTypes[col] = thisType
                else: # type is inconsistent, remove column from consideration
                    del columnTypes[col]

    # finally, compare results against first row and vote on whether it's a header
    header = data[0]
    hasHeader = 0
    for col, colType in columnTypes.items():
        if col >= len(header):
            # The first row has no cell for this column, so it cannot match
            # the column's data; count it as a vote for "header" (the
            # typecast branch below has always treated this case that way).
            hasHeader += 1
            continue
        if isinstance(colType, int): # it's a length
            if len(header[col]) != colType:
                hasHeader += 1
            else:
                hasHeader -= 1
        else: # attempt typecast
            try:
                # WARNING: eval() on cell text, see SECURITY above.
                eval("%s(%s)" % (colType.__name__, header[col]))
            except (Exception, SystemExit):
                hasHeader += 1
            else:
                hasHeader -= 1

    return hasHeader > 0