[Python-checkins] r51450 - in python/trunk: Lib/test/test_unicode.py Misc/ACKS Objects/unicodeobject.c
neal.norwitz
python-checkins at python.org
Tue Aug 22 00:21:20 CEST 2006
Author: neal.norwitz
Date: Tue Aug 22 00:21:19 2006
New Revision: 51450
Modified:
python/trunk/Lib/test/test_unicode.py
python/trunk/Misc/ACKS
python/trunk/Objects/unicodeobject.c
Log:
Patch #1541585: fix buffer overrun when performing repr() on
a unicode string in a build with wide unicode (UCS-4) support.
The fix over-allocates the result buffer up front (10 chars per unichr in wide builds), which could be improved, so add an XXX comment.
Modified: python/trunk/Lib/test/test_unicode.py
==============================================================================
--- python/trunk/Lib/test/test_unicode.py (original)
+++ python/trunk/Lib/test/test_unicode.py Tue Aug 22 00:21:19 2006
@@ -92,6 +92,10 @@
"\\xfe\\xff'")
testrepr = repr(u''.join(map(unichr, xrange(256))))
self.assertEqual(testrepr, latin1repr)
+ # Test repr works on wide unicode escapes without overflow.
+ self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
+ repr(u"\U00010000" * 39 + u"\uffff" * 4096))
+
def test_count(self):
string_tests.CommonTest.test_count(self)
Modified: python/trunk/Misc/ACKS
==============================================================================
--- python/trunk/Misc/ACKS (original)
+++ python/trunk/Misc/ACKS Tue Aug 22 00:21:19 2006
@@ -365,6 +365,7 @@
Soren Larsen
Piers Lauder
Ben Laurie
+Simon Law
Chris Lawrence
Christopher Lee
Inyeol Lee
Modified: python/trunk/Objects/unicodeobject.c
==============================================================================
--- python/trunk/Objects/unicodeobject.c (original)
+++ python/trunk/Objects/unicodeobject.c Tue Aug 22 00:21:19 2006
@@ -2040,7 +2040,32 @@
static const char *hexdigit = "0123456789abcdef";
- repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
+ /* XXX(nnorwitz): rather than over-allocating, it would be
+ better to choose a different scheme. Perhaps scan the
+ first N-chars of the string and allocate based on that size.
+ */
+ /* Initial allocation is based on the longest-possible unichr
+ escape.
+
+ In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
+ unichr, so in this case it's the longest unichr escape. In
+ narrow (UTF-16) builds this is five chars per source unichr
+ since there are two unichrs in the surrogate pair, so in narrow
+ (UTF-16) builds it's not the longest unichr escape.
+
+ In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
+ so in the narrow (UTF-16) build case it's the longest unichr
+ escape.
+ */
+
+ repr = PyString_FromStringAndSize(NULL,
+ 2
+#ifdef Py_UNICODE_WIDE
+ + 10*size
+#else
+ + 6*size
+#endif
+ + 1);
if (repr == NULL)
return NULL;
@@ -2065,15 +2090,6 @@
#ifdef Py_UNICODE_WIDE
/* Map 21-bit characters to '\U00xxxxxx' */
else if (ch >= 0x10000) {
- Py_ssize_t offset = p - PyString_AS_STRING(repr);
-
- /* Resize the string if necessary */
- if (offset + 12 > PyString_GET_SIZE(repr)) {
- if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
- return NULL;
- p = PyString_AS_STRING(repr) + offset;
- }
-
*p++ = '\\';
*p++ = 'U';
*p++ = hexdigit[(ch >> 28) & 0x0000000F];
@@ -2086,8 +2102,8 @@
*p++ = hexdigit[ch & 0x0000000F];
continue;
}
-#endif
- /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
+#else
+ /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
else if (ch >= 0xD800 && ch < 0xDC00) {
Py_UNICODE ch2;
Py_UCS4 ucs;
@@ -2112,6 +2128,7 @@
s--;
size++;
}
+#endif
/* Map 16-bit characters to '\uxxxx' */
if (ch >= 256) {
More information about the Python-checkins
mailing list