[Python-checkins] cpython: Issue #24870: Optimize the ASCII decoder for error handlers: surrogateescape,

Tue Sep 22 00:11:16 CEST 2015

https://hg.python.org/cpython/rev/3c430259873e
changeset:   98150:3c430259873e
user:        Victor Stinner <victor.stinner at gmail.com>
date:        Mon Sep 21 23:06:27 2015 +0200
summary:
  Issue #24870: Optimize the ASCII decoder for error handlers: surrogateescape,
ignore and replace. Initial patch written by Naoki Inada.

The decoder is now up to 60 times as fast for these error handlers.

Also add unit tests for the ASCII decoder.

files:
  Doc/whatsnew/3.6.rst    |   3 +-
  Lib/test/test_codecs.py |  32 +++++++++++++
  Objects/unicodeobject.c |  68 ++++++++++++++++++++++++++--
  3 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -106,7 +106,8 @@
 Optimizations
 =============
 
-* None yet.
+* The ASCII decoder is now up to 60 times as fast for error handlers:
+  ``surrogateescape``, ``ignore`` and ``replace``.
 
 
 Build and C API Changes
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -27,6 +27,7 @@
         self.assertEqual(coder(input), (expect, len(input)))
     return check
 
+
 class Queue(object):
     """
     queue: write bytes at one end, read bytes from the other end
@@ -47,6 +48,7 @@
             self._buffer = self._buffer[size:]
             return s
 
+
 class MixInCheckStateHandling:
     def check_state_handling_decode(self, encoding, u, s):
         for i in range(len(s)+1):
@@ -80,6 +82,7 @@
             part2 = d.encode(u[i:], True)
             self.assertEqual(s, part1+part2)
 
+
 class ReadTest(MixInCheckStateHandling):
     def check_partial(self, input, partialresults):
         # get a StreamReader for the encoding and feed the bytestring version
@@ -383,6 +386,7 @@
             self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                              before + backslashreplace + after)
 
+
 class UTF32Test(ReadTest, unittest.TestCase):
     encoding = "utf-32"
     if sys.byteorder == 'little':
@@ -478,6 +482,7 @@
         self.assertEqual('\U00010000' * 1024,
                          codecs.utf_32_decode(encoded_be)[0])
 
+
 class UTF32LETest(ReadTest, unittest.TestCase):
     encoding = "utf-32-le"
     ill_formed_sequence = b"\x80\xdc\x00\x00"
@@ -523,6 +528,7 @@
         self.assertEqual('\U00010000' * 1024,
                          codecs.utf_32_le_decode(encoded)[0])
 
+
 class UTF32BETest(ReadTest, unittest.TestCase):
     encoding = "utf-32-be"
     ill_formed_sequence = b"\x00\x00\xdc\x80"
@@ -797,6 +803,7 @@
         with self.assertRaises(UnicodeDecodeError):
             b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
 
+
 @unittest.skipUnless(sys.platform == 'win32',
                      'cp65001 is a Windows-only codec')
 class CP65001Test(ReadTest, unittest.TestCase):
@@ -1136,6 +1143,7 @@
         self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
         self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
 
+
 class RecodingTest(unittest.TestCase):
     def test_recoding(self):
         f = io.BytesIO()
@@ -1255,6 +1263,7 @@
     if len(i)!=2:
         print(repr(i))
 
+
 class PunycodeTest(unittest.TestCase):
     def test_encode(self):
         for uni, puny in punycode_testcases:
@@ -1274,6 +1283,7 @@
             puny = puny.decode("ascii").encode("ascii")
             self.assertEqual(uni, puny.decode("punycode"))
 
+
 class UnicodeInternalTest(unittest.TestCase):
     @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
     def test_bug1251300(self):
@@ -1528,6 +1538,7 @@
                 except Exception as e:
                     raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
 
+
 class IDNACodecTest(unittest.TestCase):
     def test_builtin_decode(self):
         self.assertEqual(str(b"python.org", "idna"), "python.org")
@@ -1614,6 +1625,7 @@
             self.assertRaises(Exception,
                 b"python.org".decode, "idna", errors)
 
+
 class CodecsModuleTest(unittest.TestCase):
 
     def test_decode(self):
@@ -1722,6 +1734,7 @@
             self.assertRaises(UnicodeError,
                 codecs.decode, b'abc', 'undefined', errors)
 
+
 class StreamReaderTest(unittest.TestCase):
 
     def setUp(self):
@@ -1732,6 +1745,7 @@
         f = self.reader(self.stream)
         self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
 
+
 class EncodedFileTest(unittest.TestCase):
 
     def test_basic(self):
@@ -1862,6 +1876,7 @@
     "unicode_internal"
 ]
 
+
 class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
     def test_basics(self):
         s = "abc123"  # all codecs should be able to encode these
@@ -2024,6 +2039,7 @@
                 self.check_state_handling_decode(encoding, u, u.encode(encoding))
                 self.check_state_handling_encode(encoding, u, u.encode(encoding))
 
+
 class CharmapTest(unittest.TestCase):
     def test_decode_with_string_map(self):
         self.assertEqual(
@@ -2274,6 +2290,7 @@
                                        info.streamwriter, 'strict') as srw:
             self.assertEqual(srw.read(), "\xfc")
 
+
 class TypesTest(unittest.TestCase):
     def test_decode_unicode(self):
         # Most decoders don't accept unicode input
@@ -2564,6 +2581,7 @@
     bytes_transform_encodings.append("bz2_codec")
     transform_aliases["bz2_codec"] = ["bz2"]
 
+
 class TransformCodecTest(unittest.TestCase):
 
     def test_basics(self):
@@ -3041,5 +3059,19 @@
         self.assertEqual(decoded, ('abc', 3))
 
 
+class ASCIITest(unittest.TestCase):
+    def test_decode(self):
+        for data, error_handler, expected in (
+            (b'[\x80\xff]', 'ignore', '[]'),
+            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
+            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
+            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
+        ):
+            with self.subTest(data=data, error_handler=error_handler,
+                              expected=expected):
+                self.assertEqual(data.decode('ascii', error_handler),
+                                 expected)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6644,6 +6644,28 @@
 
 /* --- 7-bit ASCII Codec -------------------------------------------------- */
 
+typedef enum {
+    _Py_ERROR_UNKNOWN=0,
+    _Py_ERROR_SURROGATEESCAPE,
+    _Py_ERROR_REPLACE,
+    _Py_ERROR_IGNORE,
+    _Py_ERROR_OTHER
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+    if (errors == NULL)
+        return _Py_ERROR_OTHER;
+    if (strcmp(errors, "surrogateescape") == 0)
+        return _Py_ERROR_SURROGATEESCAPE;
+    if (strcmp(errors, "ignore") == 0)
+        return _Py_ERROR_IGNORE;
+    if (strcmp(errors, "replace") == 0)
+        return _Py_ERROR_REPLACE;
+    return _Py_ERROR_OTHER;
+}
+
 PyObject *
 PyUnicode_DecodeASCII(const char *s,
                       Py_ssize_t size,
@@ -6657,8 +6679,9 @@
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
     const char *e;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
@@ -6687,12 +6710,45 @@
             PyUnicode_WRITE(kind, data, writer.pos, c);
             writer.pos++;
             ++s;
-        }
-        else {
+            continue;
+        }
+
+        /* byte outside range 0x00..0x7f: call the error handler */
+
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler)
+        {
+        case _Py_ERROR_REPLACE:
+        case _Py_ERROR_SURROGATEESCAPE:
+            /* Fast-path: the error handler only writes one character,
+               but we must switch to UCS2 at the first write */
+            if (kind < PyUnicode_2BYTE_KIND) {
+                if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos,
+                                             0xffff) < 0)
+                    return NULL;
+                kind = writer.kind;
+                data = writer.data;
+            }
+
+            if (error_handler == _Py_ERROR_REPLACE)
+                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+            else
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+            writer.pos++;
+            ++s;
+            break;
+
+        case _Py_ERROR_IGNORE:
+            ++s;
+            break;
+
+        default:
             startinpos = s-starts;
             endinpos = startinpos + 1;
             if (unicode_decode_call_errorhandler_writer(
-                    errors, &errorHandler,
+                    errors, &error_handler_obj,
                     "ascii", "ordinal not in range(128)",
                     &starts, &e, &startinpos, &endinpos, &exc, &s,
                     &writer))
@@ -6701,13 +6757,13 @@
             data = writer.data;
         }
     }
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
   onError:
     _PyUnicodeWriter_Dealloc(&writer);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return NULL;
 }

-- 
Repository URL: https://hg.python.org/cpython