[Python-checkins] bpo-34454: Clean up datetime.fromisoformat surrogate handling (GH-8959)

Miss Islington (bot) webhook-mailer at python.org
Mon Oct 22 18:35:20 EDT 2018


https://github.com/python/cpython/commit/18450be94d74b5c7e05a08f4aaa4792749ecda18
commit: 18450be94d74b5c7e05a08f4aaa4792749ecda18
branch: 3.7
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: GitHub <noreply at github.com>
date: 2018-10-22T15:35:15-07:00
summary:

bpo-34454: Clean up datetime.fromisoformat surrogate handling (GH-8959)


* Use _PyUnicode_Copy in sanitize_isoformat_str

* Use repr in fromisoformat error message

This reverses commit 67b74a98b2 per Serhiy Storchaka's suggestion:

     I suggested to use %R in the error message because including the raw
     string can be confusing in the case of empty string, or string
     containing trailing whitespaces, invisible or unprintable characters.

We agree that it is better to change both the C and pure Python versions
to use repr.

* Retain non-sanitized dtstr for error printing

This does not create an extra string, it just holds on to a reference to
the original input string for purposes of creating the error message.

* PEP 7 fixes to from_isoformat

* Separate handling of Unicode and other errors

In the initial implementation, errors other than encoding errors would
both raise an error indicating an invalid format, which would not be
true for errors like MemoryError.

* Drop needs_decref from _sanitize_isoformat_str

Instead _sanitize_isoformat_str returns a new reference, even to the
original string.
(cherry picked from commit 3df85404d4bf420db3362eeae1345f2cad948a71)

Co-authored-by: Paul Ganssle <pganssle at users.noreply.github.com>

files:
M Lib/datetime.py
M Lib/test/datetimetester.py
M Modules/_datetimemodule.c

diff --git a/Lib/datetime.py b/Lib/datetime.py
index 12a0f1489feb..8bffbef9ac32 100644
--- a/Lib/datetime.py
+++ b/Lib/datetime.py
@@ -857,7 +857,7 @@ def fromisoformat(cls, date_string):
             assert len(date_string) == 10
             return cls(*_parse_isoformat_date(date_string))
         except Exception:
-            raise ValueError('Invalid isoformat string: %s' % date_string)
+            raise ValueError(f'Invalid isoformat string: {date_string!r}')
 
 
     # Conversions to string
@@ -1369,7 +1369,7 @@ def fromisoformat(cls, time_string):
         try:
             return cls(*_parse_isoformat_time(time_string))
         except Exception:
-            raise ValueError('Invalid isoformat string: %s' % time_string)
+            raise ValueError(f'Invalid isoformat string: {time_string!r}')
 
 
     def strftime(self, fmt):
@@ -1646,13 +1646,13 @@ def fromisoformat(cls, date_string):
         try:
             date_components = _parse_isoformat_date(dstr)
         except ValueError:
-            raise ValueError('Invalid isoformat string: %s' % date_string)
+            raise ValueError(f'Invalid isoformat string: {date_string!r}')
 
         if tstr:
             try:
                 time_components = _parse_isoformat_time(tstr)
             except ValueError:
-                raise ValueError('Invalid isoformat string: %s' % date_string)
+                raise ValueError(f'Invalid isoformat string: {date_string!r}')
         else:
             time_components = [0, 0, 0, 0, None]
 
diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py
index 9c6e71c54d79..122f6b55ba33 100644
--- a/Lib/test/datetimetester.py
+++ b/Lib/test/datetimetester.py
@@ -13,6 +13,7 @@
 import os
 import pickle
 import random
+import re
 import struct
 import unittest
 
@@ -2676,6 +2677,14 @@ def test_fromisoformat_fails_datetime(self):
                 with self.assertRaises(ValueError):
                     self.theclass.fromisoformat(bad_str)
 
+    def test_fromisoformat_fails_surrogate(self):
+        # Test that when fromisoformat() fails with a surrogate character as
+        # the separator, the error message contains the original string
+        dtstr = "2018-01-03\ud80001:0113"
+
+        with self.assertRaisesRegex(ValueError, re.escape(repr(dtstr))):
+            self.theclass.fromisoformat(dtstr)
+
     def test_fromisoformat_utc(self):
         dt_str = '2014-04-19T13:21:13+00:00'
         dt = self.theclass.fromisoformat(dt_str)
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 81a0b1f16811..f53880808d21 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -667,8 +667,8 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d)
  * String parsing utilities and helper functions
  */
 
-static const char*
-parse_digits(const char* ptr, int* var, size_t num_digits)
+static const char *
+parse_digits(const char *ptr, int *var, size_t num_digits)
 {
     for (size_t i = 0; i < num_digits; ++i) {
         unsigned int tmp = (unsigned int)(*(ptr++) - '0');
@@ -682,15 +682,16 @@ parse_digits(const char* ptr, int* var, size_t num_digits)
     return ptr;
 }
 
-static int parse_isoformat_date(const char *dtstr,
-                                int* year, int *month, int* day) {
+static int
+parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
+{
     /* Parse the date components of the result of date.isoformat()
-    *
-    *  Return codes:
-    *       0:  Success
-    *      -1:  Failed to parse date component
-    *      -2:  Failed to parse dateseparator
-    */
+     *
+     *  Return codes:
+     *       0:  Success
+     *      -1:  Failed to parse date component
+     *      -2:  Failed to parse dateseparator
+     */
     const char *p = dtstr;
     p = parse_digits(p, year, 4);
     if (NULL == p) {
@@ -719,8 +720,9 @@ static int parse_isoformat_date(const char *dtstr,
 }
 
 static int
-parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end,
-                  int* hour, int* minute, int *second, int *microsecond) {
+parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
+                  int *minute, int *second, int *microsecond)
+{
     const char *p = tstr;
     const char *p_end = tstr_end;
     int *vals[3] = {hour, minute, second};
@@ -735,12 +737,15 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end,
         char c = *(p++);
         if (p >= p_end) {
             return c != '\0';
-        } else if (c == ':') {
+        }
+        else if (c == ':') {
             continue;
-        } else if (c == '.') {
+        }
+        else if (c == '.') {
             break;
-        } else {
-            return -4;      // Malformed time separator
+        }
+        else {
+            return -4;  // Malformed time separator
         }
     }
 
@@ -764,9 +769,10 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end,
 }
 
 static int
-parse_isoformat_time(const char *dtstr, size_t dtlen,
-                     int* hour, int *minute, int *second, int *microsecond,
-                     int* tzoffset, int *tzmicrosecond) {
+parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
+                     int *second, int *microsecond, int *tzoffset,
+                     int *tzmicrosecond)
+{
     // Parse the time portion of a datetime.isoformat() string
     //
     // Return codes:
@@ -784,19 +790,21 @@ parse_isoformat_time(const char *dtstr, size_t dtlen,
         if (*tzinfo_pos == '+' || *tzinfo_pos == '-') {
             break;
         }
-    } while(++tzinfo_pos < p_end);
+    } while (++tzinfo_pos < p_end);
 
-    int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos,
-                               hour, minute, second, microsecond);
+    int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos, hour, minute, second,
+                               microsecond);
 
     if (rv < 0) {
         return rv;
-    } else if (tzinfo_pos == p_end) {
+    }
+    else if (tzinfo_pos == p_end) {
         // We know that there's no time zone, so if there's stuff at the
         // end of the string it's an error.
         if (rv == 1) {
             return -5;
-        } else {
+        }
+        else {
             return 0;
         }
     }
@@ -811,19 +819,18 @@ parse_isoformat_time(const char *dtstr, size_t dtlen,
         return -5;
     }
 
-    int tzsign = (*tzinfo_pos == '-')?-1:1;
+    int tzsign = (*tzinfo_pos == '-') ? -1 : 1;
     tzinfo_pos++;
     int tzhour = 0, tzminute = 0, tzsecond = 0;
-    rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end,
-                           &tzhour, &tzminute, &tzsecond, tzmicrosecond);
+    rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end, &tzhour, &tzminute, &tzsecond,
+                           tzmicrosecond);
 
     *tzoffset = tzsign * ((tzhour * 3600) + (tzminute * 60) + tzsecond);
     *tzmicrosecond *= tzsign;
 
-    return rv?-5:1;
+    return rv ? -5 : 1;
 }
 
-
 /* ---------------------------------------------------------------------------
  * Create various objects, mostly without range checking.
  */
@@ -838,30 +845,33 @@ new_date_ex(int year, int month, int day, PyTypeObject *type)
         return NULL;
     }
 
-    self = (PyDateTime_Date *) (type->tp_alloc(type, 0));
+    self = (PyDateTime_Date *)(type->tp_alloc(type, 0));
     if (self != NULL)
         set_date_fields(self, year, month, day);
-    return (PyObject *) self;
+    return (PyObject *)self;
 }
 
 #define new_date(year, month, day) \
     new_date_ex(year, month, day, &PyDateTime_DateType)
 
 // Forward declaration
-static PyObject * new_datetime_ex(int, int, int, int, int, int, int,
-                                  PyObject*, PyTypeObject*);
+static PyObject *
+new_datetime_ex(int, int, int, int, int, int, int, PyObject *, PyTypeObject *);
 
 /* Create date instance with no range checking, or call subclass constructor */
 static PyObject *
-new_date_subclass_ex(int year, int month, int day, PyObject *cls) {
+new_date_subclass_ex(int year, int month, int day, PyObject *cls)
+{
     PyObject *result;
     // We have "fast path" constructors for two subclasses: date and datetime
     if ((PyTypeObject *)cls == &PyDateTime_DateType) {
         result = new_date_ex(year, month, day, (PyTypeObject *)cls);
-    } else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) {
+    }
+    else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) {
         result = new_datetime_ex(year, month, day, 0, 0, 0, 0, Py_None,
                                  (PyTypeObject *)cls);
-    } else {
+    }
+    else {
         result = PyObject_CallFunction(cls, "iii", year, month, day);
     }
 
@@ -1280,7 +1290,8 @@ append_keyword_fold(PyObject *repr, int fold)
 }
 
 static inline PyObject *
-tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) {
+tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds)
+{
     PyObject *tzinfo;
     if (rv == 1) {
         // Create a timezone from offset in seconds (0 returns UTC)
@@ -1295,7 +1306,8 @@ tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) {
         }
         tzinfo = new_timezone(delta, NULL);
         Py_DECREF(delta);
-    } else {
+    }
+    else {
         tzinfo = Py_None;
         Py_INCREF(Py_None);
     }
@@ -2879,17 +2891,19 @@ date_fromordinal(PyObject *cls, PyObject *args)
 
 /* Return the new date from a string as generated by date.isoformat() */
 static PyObject *
-date_fromisoformat(PyObject *cls, PyObject *dtstr) {
+date_fromisoformat(PyObject *cls, PyObject *dtstr)
+{
     assert(dtstr != NULL);
 
     if (!PyUnicode_Check(dtstr)) {
-        PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str");
+        PyErr_SetString(PyExc_TypeError,
+                        "fromisoformat: argument must be str");
         return NULL;
     }
 
     Py_ssize_t len;
 
-    const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
+    const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
     if (dt_ptr == NULL) {
         goto invalid_string_error;
     }
@@ -2899,7 +2913,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
     int rv;
     if (len == 10) {
         rv = parse_isoformat_date(dt_ptr, &year, &month, &day);
-    } else {
+    }
+    else {
         rv = -1;
     }
 
@@ -2910,12 +2925,10 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
     return new_date_subclass_ex(year, month, day, cls);
 
 invalid_string_error:
-    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R",
-                 dtstr);
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
     return NULL;
 }
 
-
 /*
  * Date arithmetic.
  */
@@ -4860,52 +4873,65 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
 }
 
 static PyObject *
-_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
+_sanitize_isoformat_str(PyObject *dtstr)
+{
     // `fromisoformat` allows surrogate characters in exactly one position,
     // the separator; to allow datetime_fromisoformat to make the simplifying
     // assumption that all valid strings can be encoded in UTF-8, this function
     // replaces any surrogate character separators with `T`.
+    //
+    // The result of this, if not NULL, returns a new reference
     Py_ssize_t len = PyUnicode_GetLength(dtstr);
-    *needs_decref = 0;
-    if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
+    if (len < 0) {
+        return NULL;
+    }
+
+    if (len <= 10 ||
+        !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
+        Py_INCREF(dtstr);
         return dtstr;
     }
 
-    PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr));
+    PyObject *str_out = _PyUnicode_Copy(dtstr);
     if (str_out == NULL) {
         return NULL;
     }
 
-    if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 ||
-            PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
+    if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
         Py_DECREF(str_out);
         return NULL;
     }
 
-    *needs_decref = 1;
     return str_out;
 }
 
 static PyObject *
-datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
+datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
+{
     assert(dtstr != NULL);
 
     if (!PyUnicode_Check(dtstr)) {
-        PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str");
+        PyErr_SetString(PyExc_TypeError,
+                        "fromisoformat: argument must be str");
         return NULL;
     }
 
-    int needs_decref = 0;
-    dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
-    if (dtstr == NULL) {
+    PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr);
+    if (dtstr_clean == NULL) {
         goto error;
     }
 
     Py_ssize_t len;
-    const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
+    const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr_clean, &len);
 
     if (dt_ptr == NULL) {
-        goto invalid_string_error;
+        if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) {
+            // Encoding errors are invalid string errors at this point
+            goto invalid_string_error;
+        }
+        else {
+            goto error;
+        }
     }
 
     const char *p = dt_ptr;
@@ -4921,8 +4947,9 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
         // In UTF-8, the length of multi-byte characters is encoded in the MSB
         if ((p[10] & 0x80) == 0) {
             p += 11;
-        } else {
-            switch(p[10] & 0xf0) {
+        }
+        else {
+            switch (p[10] & 0xf0) {
                 case 0xe0:
                     p += 13;
                     break;
@@ -4936,15 +4963,14 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
         }
 
         len -= (p - dt_ptr);
-        rv = parse_isoformat_time(p, len,
-                                  &hour, &minute, &second, &microsecond,
-                                  &tzoffset, &tzusec);
+        rv = parse_isoformat_time(p, len, &hour, &minute, &second,
+                                  &microsecond, &tzoffset, &tzusec);
     }
     if (rv < 0) {
         goto invalid_string_error;
     }
 
-    PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
+    PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
     if (tzinfo == NULL) {
         goto error;
     }
@@ -4953,23 +4979,18 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
                                             second, microsecond, tzinfo, cls);
 
     Py_DECREF(tzinfo);
-    if (needs_decref) {
-        Py_DECREF(dtstr);
-    }
+    Py_DECREF(dtstr_clean);
     return dt;
 
 invalid_string_error:
     PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
 
 error:
-    if (needs_decref) {
-        Py_DECREF(dtstr);
-    }
+    Py_XDECREF(dtstr_clean);
 
     return NULL;
 }
 
-
 /*
  * Destructor.
  */



More information about the Python-checkins mailing list