[Python-checkins] python/nondist/sandbox/csv/util sniffer.py,1.3,1.4

Fri, 14 Mar 2003 16:42:49 -0800

Update of /cvsroot/python/python/nondist/sandbox/csv/util
In directory sc8-pr-cvs1:/tmp/cvs-serv16961

Modified Files:
	sniffer.py 
Log Message:
Fixed return value of delim when there's only a single column (return '' rather than None).


Index: sniffer.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/csv/util/sniffer.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** sniffer.py	14 Mar 2003 21:10:27 -0000	1.3
--- sniffer.py	15 Mar 2003 00:42:46 -0000	1.4
***************
*** 9,12 ****
--- 9,13 ----
  import re
  
+ # ------------------------------------------------------------------------------
  class Sniffer:
      """
***************
*** 57,60 ****
--- 58,62 ----
          """
  
+         matches = []
          for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                        '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
***************
*** 99,103 ****
              skipinitialspace = delims[delim] == spaces
              if delim == '\n': # most likely a file with a single column
!                 delim = None
          else:
              # there is *no* delimiter, it's a single column of quoted data
--- 101,105 ----
              skipinitialspace = delims[delim] == spaces
              if delim == '\n': # most likely a file with a single column
!                 delim = ''
          else:
              # there is *no* delimiter, it's a single column of quoted data
***************
*** 198,201 ****
--- 200,274 ----
  
  
+ # ------------------------------------------------------------------------------
+ def hasHeaders(data, columns = 0):
+     """
+     PROTOTYPE:
+       hasHeaders(data, columns = 0)
+     DESCRIPTION:
+       Decides whether row 0 is a header row
+     ARGUMENTS:
+       - data is a list of lists of data (as returned by importDSV)
+       - columns is either the expected number of columns in each row or 0
+     RETURNS:
+       - true if data has header row
+     """
+     
+     # Algorithm: record one "type" per column from every row except the first.
+     # A cell's type is type(eval(cell)); if eval fails, the cell's string length
+     # is used instead. A column whose rows disagree is dropped from consideration.
+     # Each surviving column then votes: +1 if row 0 does not fit the column (its
+     # length differs, or casting it to the column's type fails), -1 if it does.
+     # Row 0 is reported as a header when the votes sum to more than zero.
+     # NOTE(review): eval() runs cell text as Python code -- unsafe on untrusted data.
+ 
+     if type(data) != type([]):  # exact type check against the built-in list type
+         raise InvalidData, "list expected."  # InvalidData is not defined in this hunk; presumably declared elsewhere in sniffer.py
+     if len(data) < 2: return 0  # need row 0 plus at least one more row to compare against
+ 
+     if not columns:
+         columns = modeOfLengths(data)  # helper not shown here; presumably the most common row length -- TODO confirm
+         
+     columnTypes = {}
+     for i in range(columns): columnTypes[i] = None  # None = no type recorded yet for this column
+     
+     for row in data[1:]:
+         if len(row) != columns:
+             continue # skip rows that have irregular number of columns
+         for col in columnTypes.keys():  # Python 2: keys() returns a list copy, so the del below is safe mid-loop
+             try:
+                 try:
+                     # is it a built-in type (besides string)?
+                     thisType = type(eval(row[col]))
+                 except OverflowError:
+                     # a long int?
+                     thisType = type(eval(row[col] + 'L'))  # result is overwritten below; a failure here falls to the outer except
+                     thisType = type(0) # treat long ints as int
+             except:  # bare except: any failure to evaluate the cell means "treat it as a string"
+                 # fallback to length of string
+                 thisType = len(row[col])
+ 
+             if thisType != columnTypes[col]:
+                 if columnTypes[col] is None: # add new column type
+                     columnTypes[col] = thisType
+                 else: # type is inconsistent, remove column from consideration
+                     del columnTypes[col]
+                     
+     # finally, compare results against first row and vote on whether it's a header
+     hasHeader = 0
+     for col, colType in columnTypes.items():
+         if type(colType) == type(0): # an int value here is a recorded string length, not a type object
+             if len(data[0][col]) != colType:
+                 hasHeader += 1
+             else:
+                 hasHeader -= 1
+         else: # attempt typecast
+             try:
+                 eval("%s(%s)" % (colType.__name__, data[0][col]))  # e.g. evaluates "int(<cell text>)" for an int column
+             except:
+                 hasHeader += 1
+             else:
+                 hasHeader -= 1
+ 
+     return hasHeader > 0  # a tied vote (0) counts as "no header"