[Python-checkins] r75928 - in python/branches/py3k: Lib/test/test_pep263.py Misc/NEWS Python/ast.c
benjamin.peterson
python-checkins at python.org
Wed Oct 28 22:59:39 CET 2009
Author: benjamin.peterson
Date: Wed Oct 28 22:59:39 2009
New Revision: 75928
Log:
In wide builds, avoid storing high Unicode characters from source code as surrogate pairs.
This is accomplished by decoding with UTF-32 instead of UTF-16 on all builds.
The patch is by Adam Olsen.
Modified:
python/branches/py3k/Lib/test/test_pep263.py
python/branches/py3k/Misc/NEWS
python/branches/py3k/Python/ast.c
Modified: python/branches/py3k/Lib/test/test_pep263.py
==============================================================================
--- python/branches/py3k/Lib/test/test_pep263.py (original)
+++ python/branches/py3k/Lib/test/test_pep263.py Wed Oct 28 22:59:39 2009
@@ -36,6 +36,14 @@
exec(c, d)
self.assertEquals(d['\xc6'], '\xc6')
+ def test_issue3297(self):
+ c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
+ d = {}
+ exec(c, d)
+ self.assertEqual(d['a'], d['b'])
+ self.assertEqual(len(d['a']), len(d['b']))
+ self.assertEqual(ascii(d['a']), ascii(d['b']))
+
def test_main():
support.run_unittest(PEP263Test)
Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS (original)
+++ python/branches/py3k/Misc/NEWS Wed Oct 28 22:59:39 2009
@@ -12,6 +12,9 @@
Core and Builtins
-----------------
+- Issue #3297: On wide unicode builds, do not split unicode characters into
+ surrogates.
+
- Remove length limitation when constructing a complex number from a string.
- Issue #1087418: Boost performance of bitwise operations for longs.
Modified: python/branches/py3k/Python/ast.c
==============================================================================
--- python/branches/py3k/Python/ast.c (original)
+++ python/branches/py3k/Python/ast.c Wed Oct 28 22:59:39 2009
@@ -3246,10 +3246,11 @@
u = NULL;
} else {
/* check for integer overflow */
- if (len > PY_SIZE_MAX / 4)
+ if (len > PY_SIZE_MAX / 6)
return NULL;
- /* "\XX" may become "\u005c\uHHLL" (12 bytes) */
- u = PyBytes_FromStringAndSize((char *)NULL, len * 4);
+ /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
+ "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
+ u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
if (u == NULL)
return NULL;
p = buf = PyBytes_AsString(u);
@@ -3266,20 +3267,24 @@
PyObject *w;
char *r;
Py_ssize_t rn, i;
- w = decode_utf8(c, &s, end, "utf-16-be");
+ w = decode_utf8(c, &s, end, "utf-32-be");
if (w == NULL) {
Py_DECREF(u);
return NULL;
}
r = PyBytes_AS_STRING(w);
rn = Py_SIZE(w);
- assert(rn % 2 == 0);
- for (i = 0; i < rn; i += 2) {
- sprintf(p, "\\u%02x%02x",
+ assert(rn % 4 == 0);
+ for (i = 0; i < rn; i += 4) {
+ sprintf(p, "\\U%02x%02x%02x%02x",
r[i + 0] & 0xFF,
- r[i + 1] & 0xFF);
- p += 6;
+ r[i + 1] & 0xFF,
+ r[i + 2] & 0xFF,
+ r[i + 3] & 0xFF);
+ p += 10;
}
+ /* Should be impossible to overflow */
+ assert(p - buf <= Py_SIZE(u));
Py_DECREF(w);
} else {
*p++ = *s++;
More information about the Python-checkins mailing list