[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.102,2.103
M.-A. Lemburg
lemburg@users.sourceforge.net
Fri, 20 Jul 2001 10:39:13 -0700
Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv2536/Objects
Modified Files:
unicodeobject.c
Log Message:
Make the unicode-escape and the UTF-16 codecs handle surrogates
correctly, so that both are roundtrip-safe.
Some minor cleanups of the code.
Added tests for the roundtrip-safety.
Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.102
retrieving revision 2.103
diff -C2 -r2.102 -r2.103
*** unicodeobject.c 2001/07/20 16:36:21 2.102
--- unicodeobject.c 2001/07/20 17:39:11 2.103
***************
*** 105,109 ****
Py_UNICODE
! PyUnicode_GetMax()
{
#ifdef Py_UNICODE_WIDE
--- 105,109 ----
Py_UNICODE
! PyUnicode_GetMax(void)
{
#ifdef Py_UNICODE_WIDE
***************
*** 1082,1096 ****
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
! /* This is valid data (a UTF-16 surrogate pair), but
! we are not able to store this information since our
! Py_UNICODE type only has 16 bits... this might
! change someday, even though it's unlikely. */
! errmsg = "code pairs are not supported";
! goto utf16Error;
#else
*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
- continue;
#endif
!
}
else {
--- 1082,1091 ----
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
! *p++ = ch;
! *p++ = ch2;
#else
*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
! continue;
}
else {
***************
*** 1326,1330 ****
*p++ = (Py_UNICODE) chr;
else if (chr <= 0x10ffff) {
! /* UCS-4 character. Either store directly, or as surrogate pair. */
#ifdef Py_UNICODE_WIDE
*p++ = chr;
--- 1321,1326 ----
*p++ = (Py_UNICODE) chr;
else if (chr <= 0x10ffff) {
! /* UCS-4 character. Either store directly, or as
! surrogate pair. */
#ifdef Py_UNICODE_WIDE
*p++ = chr;
***************
*** 1447,1468 ****
*p++ = '\\';
*p++ = 'U';
! *p++ = hexdigit[(ch >> 28) & 0xf];
! *p++ = hexdigit[(ch >> 24) & 0xf];
! *p++ = hexdigit[(ch >> 20) & 0xf];
! *p++ = hexdigit[(ch >> 16) & 0xf];
! *p++ = hexdigit[(ch >> 12) & 0xf];
! *p++ = hexdigit[(ch >> 8) & 0xf];
! *p++ = hexdigit[(ch >> 4) & 0xf];
*p++ = hexdigit[ch & 15];
}
#endif
/* Map 16-bit characters to '\uxxxx' */
! else if (ch >= 256) {
*p++ = '\\';
*p++ = 'u';
! *p++ = hexdigit[(ch >> 12) & 0xf];
! *p++ = hexdigit[(ch >> 8) & 0xf];
! *p++ = hexdigit[(ch >> 4) & 0xf];
! *p++ = hexdigit[ch & 15];
}
/* Map special whitespace to '\t', \n', '\r' */
--- 1443,1490 ----
*p++ = '\\';
*p++ = 'U';
! *p++ = hexdigit[(ch >> 28) & 0x0000000F];
! *p++ = hexdigit[(ch >> 24) & 0x0000000F];
! *p++ = hexdigit[(ch >> 20) & 0x0000000F];
! *p++ = hexdigit[(ch >> 16) & 0x0000000F];
! *p++ = hexdigit[(ch >> 12) & 0x0000000F];
! *p++ = hexdigit[(ch >> 8) & 0x0000000F];
! *p++ = hexdigit[(ch >> 4) & 0x0000000F];
*p++ = hexdigit[ch & 15];
}
#endif
+ /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
+ else if (ch >= 0xD800 && ch < 0xDC00) {
+ Py_UNICODE ch2;
+ Py_UCS4 ucs;
+
+ ch2 = *s++;
+ size--;
+ if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+ ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+ *p++ = '\\';
+ *p++ = 'U';
+ *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
+ *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
+ *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
+ *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
+ *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
+ *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
+ *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
+ *p++ = hexdigit[ucs & 0x0000000F];
+ continue;
+ }
+ /* Fall through: isolated surrogates are copied as-is */
+ s--;
+ size++;
+ }
+
/* Map 16-bit characters to '\uxxxx' */
! if (ch >= 256) {
*p++ = '\\';
*p++ = 'u';
! *p++ = hexdigit[(ch >> 12) & 0x000F];
! *p++ = hexdigit[(ch >> 8) & 0x000F];
! *p++ = hexdigit[(ch >> 4) & 0x000F];
! *p++ = hexdigit[ch & 0x000F];
}
/* Map special whitespace to '\t', \n', '\r' */
***************
*** 1483,1488 ****
*p++ = '\\';
*p++ = 'x';
! *p++ = hexdigit[(ch >> 4) & 0xf];
! *p++ = hexdigit[ch & 15];
}
/* Copy everything else as-is */
--- 1505,1510 ----
*p++ = '\\';
*p++ = 'x';
! *p++ = hexdigit[(ch >> 4) & 0x000F];
! *p++ = hexdigit[ch & 0x000F];
}
/* Copy everything else as-is */