[Python-checkins] cpython: Close #13093: PyUnicode_EncodeDecimal() doesn't support error handlers

victor.stinner python-checkins at python.org
Fri Nov 25 20:06:50 CET 2011


http://hg.python.org/cpython/rev/a20fae95618c
changeset:   73741:a20fae95618c
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Fri Nov 25 20:09:01 2011 +0100
summary:
  Close #13093: PyUnicode_EncodeDecimal() no longer supports error handlers
other than "strict". The caller was unable to compute the size of the
output buffer because that size depends on the error handler.

files:
  Lib/test/test_unicode.py |   18 +--
  Misc/NEWS                |    4 +
  Objects/unicodeobject.c  |  131 +++-----------------------
  3 files changed, 26 insertions(+), 127 deletions(-)


diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1816,20 +1816,10 @@
                          b' 3.14 ')
         self.assertRaises(UnicodeEncodeError,
                           unicode_encodedecimal, "123\u20ac", "strict")
-        self.assertEqual(unicode_encodedecimal("123\u20ac", "replace"),
-                         b'123?')
-        self.assertEqual(unicode_encodedecimal("123\u20ac", "ignore"),
-                         b'123')
-        self.assertEqual(unicode_encodedecimal("123\u20ac", "xmlcharrefreplace"),
-                         b'123&#8364;')
-        self.assertEqual(unicode_encodedecimal("123\u20ac", "backslashreplace"),
-                         b'123\\u20ac')
-        self.assertEqual(unicode_encodedecimal("123\u20ac\N{EM SPACE}", "replace"),
-                         b'123? ')
-        self.assertEqual(unicode_encodedecimal("123\u20ac\u20ac", "replace"),
-                         b'123??')
-        self.assertEqual(unicode_encodedecimal("123\u20ac\u0660", "replace"),
-                         b'123?0')
+        self.assertRaisesRegex(
+            ValueError,
+            "^'decimal' codec can't encode character",
+            unicode_encodedecimal, "123\u20ac", "replace")
 
     def test_transform_decimal(self):
         from _testcapi import unicode_transformdecimaltoascii as transform_decimal
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@
 Core and Builtins
 -----------------
 
+- Issue #13093: PyUnicode_EncodeDecimal() doesn't support error handlers
+  different than "strict" anymore. The caller was unable to compute the
+  size of the output buffer: it depends on the error handler.
+
 - PEP 3155 / issue #13448: Qualified name for classes and functions.
 
 - Issue #13436: Fix a bogus error message when an AST object was passed
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -8839,15 +8839,8 @@
                         char *output,
                         const char *errors)
 {
-    PyObject *errorHandler = NULL;
-    PyObject *exc = NULL;
     PyObject *unicode;
-    const char *encoding = "decimal";
-    const char *reason = "invalid decimal Unicode string";
-    /* the following variable is used for caching string comparisons
-     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
-    int known_errorHandler = -1;
-    Py_ssize_t i, j;
+    Py_ssize_t i;
     enum PyUnicode_Kind kind;
     void *data;
 
@@ -8860,15 +8853,20 @@
     if (unicode == NULL)
         return -1;
 
-    if (PyUnicode_READY(unicode) < 0)
-        goto onError;
+    if (PyUnicode_READY(unicode) < 0) {
+        Py_DECREF(unicode);
+        return -1;
+    }
     kind = PyUnicode_KIND(unicode);
     data = PyUnicode_DATA(unicode);
 
     for (i=0; i < length; ) {
-        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        PyObject *exc;
+        Py_UCS4 ch;
         int decimal;
-        Py_ssize_t startpos, endpos;
+        Py_ssize_t startpos;
+
+        ch = PyUnicode_READ(kind, data, i);
 
         if (Py_UNICODE_ISSPACE(ch)) {
             *output++ = ' ';
@@ -8886,113 +8884,20 @@
             i++;
             continue;
         }
-        /* All other characters are considered unencodable */
+
         startpos = i;
-        endpos = i+1;
-        for (; endpos < length; endpos++) {
-            ch = PyUnicode_READ(kind, data, endpos);
-            if ((0 < ch && ch < 256) ||
-                Py_UNICODE_ISSPACE(ch) ||
-                0 <= Py_UNICODE_TODECIMAL(ch))
-                break;
-        }
-        /* cache callback name lookup
-         * (if not done yet, i.e. it's the first error) */
-        if (known_errorHandler==-1) {
-            if ((errors==NULL) || (!strcmp(errors, "strict")))
-                known_errorHandler = 1;
-            else if (!strcmp(errors, "replace"))
-                known_errorHandler = 2;
-            else if (!strcmp(errors, "ignore"))
-                known_errorHandler = 3;
-            else if (!strcmp(errors, "xmlcharrefreplace"))
-                known_errorHandler = 4;
-            else
-                known_errorHandler = 0;
-        }
-        switch (known_errorHandler) {
-        case 1: /* strict */
-            raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
-            goto onError;
-        case 2: /* replace */
-            for (j=startpos; j < endpos; j++)
-                *output++ = '?';
-            i = endpos;
-            break;
-        case 3: /* ignore */
-            i = endpos;
-            break;
-        case 4: /* xmlcharrefreplace */
-            /* generate replacement */
-            for (j=startpos; j < endpos; j++) {
-                ch = PyUnicode_READ(kind, data, i);
-                output += sprintf(output, "&#%d;", (int)ch);
-                i++;
-            }
-            break;
-        default:
-        {
-            PyObject *repunicode;
-            Py_ssize_t repsize, newpos, k;
-            enum PyUnicode_Kind repkind;
-            void *repdata;
-
-            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
-                                                          encoding, reason, unicode, &exc,
-                                                          startpos, endpos, &newpos);
-            if (repunicode == NULL)
-                goto onError;
-            if (!PyUnicode_Check(repunicode)) {
-                /* Byte results not supported, since they have no decimal property. */
-                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
-                Py_DECREF(repunicode);
-                goto onError;
-            }
-            if (PyUnicode_READY(repunicode) < 0) {
-                Py_DECREF(repunicode);
-                goto onError;
-            }
-            repkind = PyUnicode_KIND(repunicode);
-            repdata = PyUnicode_DATA(repunicode);
-
-            /* generate replacement  */
-            repsize = PyUnicode_GET_SIZE(repunicode);
-            for (k=0; k<repsize; k++) {
-                ch = PyUnicode_READ(repkind, repdata, k);
-                if (Py_UNICODE_ISSPACE(ch))
-                    *output++ = ' ';
-                else {
-                    decimal = Py_UNICODE_TODECIMAL(ch);
-                    if (decimal >= 0)
-                        *output++ = '0' + decimal;
-                    else if (0 < ch && ch < 256)
-                        *output++ = (char)ch;
-                    else {
-                        Py_DECREF(repunicode);
-                        raise_encode_exception(&exc, encoding,
-                                               unicode, startpos, endpos,
-                                               reason);
-                        goto onError;
-                    }
-                }
-            }
-            i = newpos;
-            Py_DECREF(repunicode);
-        }
-        }
+        exc = NULL;
+        raise_encode_exception(&exc, "decimal", unicode,
+                               startpos, startpos+1,
+                               "invalid decimal Unicode string");
+        Py_XDECREF(exc);
+        Py_DECREF(unicode);
+        return -1;
     }
     /* 0-terminate the output string */
     *output++ = '\0';
-    Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
     Py_DECREF(unicode);
     return 0;
-
-  onError:
-    Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
-    Py_DECREF(unicode);
-    return -1;
 }
 
 /* --- Helpers ------------------------------------------------------------ */

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list