[Python-checkins] bpo-35365: Use a wchar_t* buffer in the code page decoder. (GH-10837)

Serhiy Storchaka webhook-mailer at python.org
Tue Dec 4 03:25:54 EST 2018


https://github.com/python/cpython/commit/eeb719eac6347f5b6e85389aa13a386024766806
commit: eeb719eac6347f5b6e85389aa13a386024766806
branch: master
author: Serhiy Storchaka <storchaka at gmail.com>
committer: GitHub <noreply at github.com>
date: 2018-12-04T10:25:50+02:00
summary:

bpo-35365: Use a wchar_t* buffer in the code page decoder. (GH-10837)

files:
M Objects/unicodeobject.c

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 1351eece8e92..d0f0358cfc69 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4059,6 +4059,21 @@ make_decode_exception(PyObject **exceptionObject,
 }
 
 #ifdef MS_WINDOWS
+static int
+widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
+{
+    if (newsize > *size) {
+        wchar_t *newbuf = *buf;
+        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
+            PyErr_NoMemory();
+            return -1;
+        }
+        *buf = newbuf;
+    }
+    *size = newsize;
+    return 0;
+}
+
 /* error handling callback helper:
    build arguments, call the callback and check the arguments,
    if no exception occurred, copy the replacement to the output
@@ -4072,7 +4087,7 @@ unicode_decode_call_errorhandler_wchar(
     const char *encoding, const char *reason,
     const char **input, const char **inend, Py_ssize_t *startinpos,
     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
-    PyObject **output, Py_ssize_t *outpos)
+    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
 {
     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
 
@@ -4086,9 +4101,6 @@ unicode_decode_call_errorhandler_wchar(
     wchar_t *repwstr;
     Py_ssize_t repwlen;
 
-    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
-    outsize = _PyUnicode_WSTR_LENGTH(*output);
-
     if (*errorHandler == NULL) {
         *errorHandler = PyCodec_LookupError(errors);
         if (*errorHandler == NULL)
@@ -4146,13 +4158,15 @@ unicode_decode_call_errorhandler_wchar(
     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
         goto overflow;
     requiredsize += insize - newpos;
+    outsize = *bufsize;
     if (requiredsize > outsize) {
         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
             requiredsize = 2*outsize;
-        if (unicode_resize(output, requiredsize) < 0)
+        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
             goto onError;
+        }
     }
-    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
+    wcsncpy(*buf + *outpos, repwstr, repwlen);
     *outpos += repwlen;
     *endinpos = newpos;
     *inptr = *input + newpos;
@@ -7146,7 +7160,8 @@ decode_code_page_flags(UINT code_page)
  */
 static int
 decode_code_page_strict(UINT code_page,
-                        PyObject **v,
+                        wchar_t **buf,
+                        Py_ssize_t *bufsize,
                         const char *in,
                         int insize)
 {
@@ -7160,21 +7175,12 @@ decode_code_page_strict(UINT code_page,
     if (outsize <= 0)
         goto error;
 
-    if (*v == NULL) {
-        /* Create unicode object */
-        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
-        *v = (PyObject*)_PyUnicode_New(outsize);
-        if (*v == NULL)
-            return -1;
-        out = PyUnicode_AS_UNICODE(*v);
-    }
-    else {
-        /* Extend unicode object */
-        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
-        if (unicode_resize(v, n + outsize) < 0)
-            return -1;
-        out = PyUnicode_AS_UNICODE(*v) + n;
+    /* Extend a wchar_t* buffer */
+    Py_ssize_t n = *bufsize;   /* Get the current length */
+    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
+        return -1;
     }
+    out = *buf + n;
 
     /* Do the conversion */
     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
@@ -7198,7 +7204,8 @@ decode_code_page_strict(UINT code_page,
  */
 static int
 decode_code_page_errors(UINT code_page,
-                        PyObject **v,
+                        wchar_t **buf,
+                        Py_ssize_t *bufsize,
                         const char *in, const int size,
                         const char *errors, int final)
 {
@@ -7238,29 +7245,16 @@ decode_code_page_errors(UINT code_page,
         goto error;
     }
 
-    if (*v == NULL) {
-        /* Create unicode object */
-        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
-            PyErr_NoMemory();
-            goto error;
-        }
-        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
-        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
-        if (*v == NULL)
-            goto error;
-        out = PyUnicode_AS_UNICODE(*v);
+    /* Extend a wchar_t* buffer */
+    Py_ssize_t n = *bufsize;   /* Get the current length */
+    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+        PyErr_NoMemory();
+        goto error;
     }
-    else {
-        /* Extend unicode object */
-        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
-        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
-            PyErr_NoMemory();
-            goto error;
-        }
-        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
-            goto error;
-        out = PyUnicode_AS_UNICODE(*v) + n;
+    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
+        goto error;
     }
+    out = *buf + n;
 
     /* Decode the byte string character per character */
     while (in < endin)
@@ -7295,16 +7289,16 @@ decode_code_page_errors(UINT code_page,
 
             startinpos = in - startin;
             endinpos = startinpos + 1;
-            outpos = out - PyUnicode_AS_UNICODE(*v);
+            outpos = out - *buf;
             if (unicode_decode_call_errorhandler_wchar(
                     errors, &errorHandler,
                     encoding, reason,
                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
-                    v, &outpos))
+                    buf, bufsize, &outpos))
             {
                 goto error;
             }
-            out = PyUnicode_AS_UNICODE(*v) + outpos;
+            out = *buf + outpos;
         }
         else {
             in += insize;
@@ -7313,14 +7307,9 @@ decode_code_page_errors(UINT code_page,
         }
     }
 
-    /* write a NUL character at the end */
-    *out = 0;
-
-    /* Extend unicode object */
-    outsize = out - PyUnicode_AS_UNICODE(*v);
-    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
-    if (unicode_resize(v, outsize) < 0)
-        goto error;
+    /* Shrink the buffer */
+    assert(out - *buf <= *bufsize);
+    *bufsize = out - *buf;
     /* (in - startin) <= size and size is an int */
     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
 
@@ -7336,7 +7325,8 @@ decode_code_page_stateful(int code_page,
                           const char *s, Py_ssize_t size,
                           const char *errors, Py_ssize_t *consumed)
 {
-    PyObject *v = NULL;
+    wchar_t *buf = NULL;
+    Py_ssize_t bufsize = 0;
     int chunk_size, final, converted, done;
 
     if (code_page < 0) {
@@ -7368,21 +7358,21 @@ decode_code_page_stateful(int code_page,
         }
 
         if (chunk_size == 0 && done) {
-            if (v != NULL)
+            if (buf != NULL)
                 break;
             _Py_RETURN_UNICODE_EMPTY();
         }
 
-        converted = decode_code_page_strict(code_page, &v,
+        converted = decode_code_page_strict(code_page, &buf, &bufsize,
                                             s, chunk_size);
         if (converted == -2)
-            converted = decode_code_page_errors(code_page, &v,
+            converted = decode_code_page_errors(code_page, &buf, &bufsize,
                                                 s, chunk_size,
                                                 errors, final);
         assert(converted != 0 || done);
 
         if (converted < 0) {
-            Py_XDECREF(v);
+            PyMem_Free(buf);
             return NULL;
         }
 
@@ -7393,7 +7383,9 @@ decode_code_page_stateful(int code_page,
         size -= converted;
     } while (!done);
 
-    return unicode_result(v);
+    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
+    PyMem_Free(buf);
+    return v;
 }
 
 PyObject *



More information about the Python-checkins mailing list