[pypy-commit] pypy py3.6: speed up utf8-handling of csv module
cfbolz
pypy.commits at gmail.com
Wed Oct 9 09:59:40 EDT 2019
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: py3.6
Changeset: r97745:1b9016af40bd
Date: 2019-10-09 13:39 +0200
http://bitbucket.org/pypy/pypy/changeset/1b9016af40bd/
Log: speed up utf8-handling of csv module
diff --git a/pypy/module/_csv/interp_reader.py b/pypy/module/_csv/interp_reader.py
--- a/pypy/module/_csv/interp_reader.py
+++ b/pypy/module/_csv/interp_reader.py
@@ -1,5 +1,5 @@
from rpython.rlib.rstring import UnicodeBuilder
-from rpython.rlib.rutf8 import Utf8StringIterator
+from rpython.rlib.rutf8 import Utf8StringIterator, Utf8StringBuilder
from rpython.rlib import objectmodel
from pypy.interpreter.baseobjspace import W_Root
from pypy.interpreter.error import OperationError
@@ -38,16 +38,15 @@
assert field_builder is not None
if field_builder.getlength() >= field_limit.limit:
raise self.error(u"field larger than field limit")
- field_builder.append(c)
+ field_builder.append_code(c)
def save_field(self, field_builder):
space = self.space
field = field_builder.build()
+ w_obj = space.newutf8(field, field_builder.getlength())
if self.numeric_field:
self.numeric_field = False
- w_obj = space.call_function(space.w_float, space.newtext(field))
- else:
- w_obj = space.newtext(field)
+ w_obj = space.call_function(space.w_float, w_obj)
self.fields_w.append(w_obj)
def next_w(self):
@@ -79,13 +78,11 @@
u"(did you open the file in text mode?")
line = space.utf8_w(w_line)
for c in Utf8StringIterator(line):
- # XXX rewrite this to use c (as int) not unichr(c)
- c = unichr(c)
- if c == '\0':
+ if c == 0:
raise self.error(u"line contains NULL byte")
if state == START_RECORD:
- if c == b'\n' or c == b'\r':
+ if c == ord(u'\n') or c == ord(u'\r'):
state = EAT_CRNL
continue
# normal character - handle as START_FIELD
@@ -93,23 +90,23 @@
# fall-through to the next case
if state == START_FIELD:
- field_builder = UnicodeBuilder(64)
+ field_builder = Utf8StringBuilder(64)
# expecting field
- if c == u'\n' or c == u'\r':
+ if c == ord(u'\n') or c == ord(u'\r'):
# save empty field
self.save_field(field_builder)
state = EAT_CRNL
- elif (c == dialect.quotechar and
+ elif (c == ord(dialect.quotechar) and
dialect.quoting != QUOTE_NONE):
# start quoted field
state = IN_QUOTED_FIELD
- elif c == dialect.escapechar:
+ elif c == ord(dialect.escapechar):
# possible escaped character
state = ESCAPED_CHAR
- elif c == u' ' and dialect.skipinitialspace:
+ elif c == ord(u' ') and dialect.skipinitialspace:
# ignore space at start of field
pass
- elif c == dialect.delimiter:
+ elif c == ord(dialect.delimiter):
# save empty field
self.save_field(field_builder)
else:
@@ -120,7 +117,7 @@
state = IN_FIELD
elif state == ESCAPED_CHAR:
- if c in '\n\r':
+ if c == ord(u'\n') or c == ord(u'\r'):
self.add_char(field_builder, c)
state = AFTER_ESCAPED_CRNL
else:
@@ -129,14 +126,14 @@
elif state == IN_FIELD or state == AFTER_ESCAPED_CRNL:
# in unquoted field
- if c == u'\n' or c == u'\r':
+ if c == ord(u'\n') or c == ord(u'\r'):
# end of line
self.save_field(field_builder)
state = EAT_CRNL
- elif c == dialect.escapechar:
+ elif c == ord(dialect.escapechar):
# possible escaped character
state = ESCAPED_CHAR
- elif c == dialect.delimiter:
+ elif c == ord(dialect.delimiter):
# save field - wait for new field
self.save_field(field_builder)
state = START_FIELD
@@ -146,10 +143,10 @@
elif state == IN_QUOTED_FIELD:
# in quoted field
- if c == dialect.escapechar:
+ if c == ord(dialect.escapechar):
# Possible escape character
state = ESCAPE_IN_QUOTED_FIELD
- elif (c == dialect.quotechar and
+ elif (c == ord(dialect.quotechar) and
dialect.quoting != QUOTE_NONE):
if dialect.doublequote:
# doublequote; " represented by ""
@@ -168,15 +165,15 @@
elif state == QUOTE_IN_QUOTED_FIELD:
# doublequote - seen a quote in an quoted field
if (dialect.quoting != QUOTE_NONE and
- c == dialect.quotechar):
+ c == ord(dialect.quotechar)):
# save "" as "
self.add_char(field_builder, c)
state = IN_QUOTED_FIELD
- elif c == dialect.delimiter:
+ elif c == ord(dialect.delimiter):
# save field - wait for new field
self.save_field(field_builder)
state = START_FIELD
- elif c == u'\n' or c == u'\r':
+ elif c == ord(u'\n') or c == ord(u'\r'):
# end of line
self.save_field(field_builder)
state = EAT_CRNL
@@ -189,7 +186,7 @@
dialect.delimiter, dialect.quotechar))
elif state == EAT_CRNL:
- if not (c == u'\n' or c == u'\r'):
+ if not (c == ord(u'\n') or c == ord(u'\r')):
raise self.error(u"new-line character seen in unquoted "
u"field - do you need to open the file "
u"in universal-newline mode?")
@@ -198,16 +195,16 @@
self.save_field(field_builder)
break
elif state == ESCAPED_CHAR:
- self.add_char(field_builder, u'\n')
+ self.add_char(field_builder, ord(u'\n'))
state = IN_FIELD
elif state == IN_QUOTED_FIELD:
pass
elif state == ESCAPE_IN_QUOTED_FIELD:
- self.add_char(field_builder, u'\n')
+ self.add_char(field_builder, ord(u'\n'))
state = IN_QUOTED_FIELD
elif state == START_FIELD:
# save empty field
- field_builder = UnicodeBuilder(1)
+ field_builder = Utf8StringBuilder()
self.save_field(field_builder)
break
elif state == AFTER_ESCAPED_CRNL:
More information about the pypy-commit mailing list