[Spambayes-checkins] spambayes/spambayes compatcsv.py,1.2,1.3
Skip Montanaro
montanaro at users.sourceforge.net
Sun May 30 12:54:03 EDT 2004
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv29361
Modified Files:
compatcsv.py
Log Message:
Abandon regexes as a bad way to parse CSV files - add a simple unit test as well
Index: compatcsv.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/compatcsv.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** compatcsv.py 21 May 2004 13:34:42 -0000 1.2
--- compatcsv.py 30 May 2004 16:53:59 -0000 1.3
***************
*** 20,25 ****
return self
def next(self):
! return self.parse_line(self.fp.readline())
def parse_line(self, line):
--- 20,35 ----
return self
+ def _readline(self):
+ line = self.fp.readline()
+ # strip any EOL detritus
+ while line[-1:] in ("\r", "\n"):
+ line = line[:-1]
+ return line
+
def next(self):
! line = self._readline()
! if not line:
! raise StopIteration
! return self.parse_line(line)
def parse_line(self, line):
***************
*** 35,68 ****
result = []
while line:
if line[0] == '"':
! # search for ending quotation mark
! match = re.match('"(.*?)"[^"]', line)
! if match is None:
! # embedded newline
! line = line + self.fp.readline()
! continue
! else:
! field = match.group(1)
! field = field.replace('""', '"')
! try:
! dummy = unicode(field, "ascii")
! except UnicodeError:
! field = unicode(field, "utf-8")
! result.append(field)
! line = line[len(field)+3:]
!
else:
! # field is terminated by a comma or EOL
! match = re.match("(.*?)(,|%s)"%EOL, line)
! if match is None:
! print "parse error:", line
! raise
! field = match.group(1)
! try:
! dummy = unicode(field, "ascii")
! except UnicodeError:
! field = unicode(field, "utf-8")
! result.append(field)
! line = line[len(field)+len(match.group(2)):]
return result
--- 45,83 ----
result = []
while line:
+ # quoted field
if line[0] == '"':
! line = line[1:]
! field = []
! while True:
! if line[0:2] == '""':
! field.append('"')
! line = line[2:]
! elif line[0] == '"':
! # end of field - skip quote and possible comma
! line = line[1:]
! if line[0:1] == ',':
! line = line[1:]
! break
! else:
! field.append(line[0])
! line = line[1:]
! # ran out of line before field
! if not line:
! field.append("\n")
! line = self._readline()
! if not line:
! raise IOError, "end-of-file during parsing"
else:
! # unquoted field
! field = []
! while line:
! if line[0] == ',':
! # end of field
! line = line[1:]
! break
! else:
! field.append(line[0])
! line = line[1:]
! result.append("".join(field))
return result
***************
*** 84,85 ****
--- 99,120 ----
result = ",".join(result)
self.fp.write(result+EOL)
+
+ if __name__ == "__main__":
+ import unittest
+ import StringIO
+
+ class TestCase(unittest.TestCase):
+ def test_reader(self):
+ f = StringIO.StringIO('''\
+ """rare""",1,0
+ "beginning;
+ end=""itinhh.txt""",1,0
+ ''')
+ f.seek(0)
+ rdr = reader(f)
+ self.assertEqual(rdr.next(), ['"rare"', '1', '0'])
+ self.assertEqual(rdr.next(),
+ ['beginning;\n\tend="itinhh.txt"','1', '0'])
+ self.assertRaises(StopIteration, rdr.next)
+
+ unittest.main()
More information about the Spambayes-checkins
mailing list