[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.102,2.103

M.-A. Lemburg lemburg@users.sourceforge.net
Fri, 20 Jul 2001 10:39:13 -0700


Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv2536/Objects

Modified Files:
	unicodeobject.c 
Log Message:
Make the unicode-escape and UTF-16 codecs handle surrogates
correctly, and thus make them roundtrip-safe.

Some minor cleanups of the code.

Added tests for roundtrip safety.



Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.102
retrieving revision 2.103
diff -C2 -r2.102 -r2.103
*** unicodeobject.c	2001/07/20 16:36:21	2.102
--- unicodeobject.c	2001/07/20 17:39:11	2.103
***************
*** 105,109 ****
  
  Py_UNICODE
! PyUnicode_GetMax()
  {
  #ifdef Py_UNICODE_WIDE
--- 105,109 ----
  
  Py_UNICODE
! PyUnicode_GetMax(void)
  {
  #ifdef Py_UNICODE_WIDE
***************
*** 1082,1096 ****
  	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
  #ifndef Py_UNICODE_WIDE
! 		/* This is valid data (a UTF-16 surrogate pair), but
! 		   we are not able to store this information since our
! 		   Py_UNICODE type only has 16 bits... this might
! 		   change someday, even though it's unlikely. */
! 		errmsg = "code pairs are not supported";
! 		goto utf16Error;
  #else
  		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
- 		continue;
  #endif
! 		
  	    }
  	    else {
--- 1082,1091 ----
  	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
  #ifndef Py_UNICODE_WIDE
! 		*p++ = ch;
! 		*p++ = ch2;
  #else
  		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
  #endif
! 		continue;
  	    }
  	    else {
***************
*** 1326,1330 ****
                  *p++ = (Py_UNICODE) chr;
              else if (chr <= 0x10ffff) {
!                 /* UCS-4 character. Either store directly, or as surrogate pair. */
  #ifdef Py_UNICODE_WIDE
                  *p++ = chr;
--- 1321,1326 ----
                  *p++ = (Py_UNICODE) chr;
              else if (chr <= 0x10ffff) {
!                 /* UCS-4 character. Either store directly, or as
! 		   surrogate pair. */
  #ifdef Py_UNICODE_WIDE
                  *p++ = chr;
***************
*** 1447,1468 ****
              *p++ = '\\';
              *p++ = 'U';
!             *p++ = hexdigit[(ch >> 28) & 0xf];
!             *p++ = hexdigit[(ch >> 24) & 0xf];
!             *p++ = hexdigit[(ch >> 20) & 0xf];
!             *p++ = hexdigit[(ch >> 16) & 0xf];
!             *p++ = hexdigit[(ch >> 12) & 0xf];
!             *p++ = hexdigit[(ch >> 8) & 0xf];
!             *p++ = hexdigit[(ch >> 4) & 0xf];
              *p++ = hexdigit[ch & 15];
          }
  #endif
          /* Map 16-bit characters to '\uxxxx' */
!         else if (ch >= 256) {
              *p++ = '\\';
              *p++ = 'u';
!             *p++ = hexdigit[(ch >> 12) & 0xf];
!             *p++ = hexdigit[(ch >> 8) & 0xf];
!             *p++ = hexdigit[(ch >> 4) & 0xf];
!             *p++ = hexdigit[ch & 15];
          }
          /* Map special whitespace to '\t', \n', '\r' */
--- 1443,1490 ----
              *p++ = '\\';
              *p++ = 'U';
!             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
!             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
!             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
!             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
!             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
!             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
!             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
              *p++ = hexdigit[ch & 15];
          }
  #endif
+ 	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
+ 	else if (ch >= 0xD800 && ch < 0xDC00) {
+ 	    Py_UNICODE ch2;
+ 	    Py_UCS4 ucs;
+ 	    
+ 	    ch2 = *s++;
+ 	    size--;
+ 	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+ 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+ 		*p++ = '\\';
+ 		*p++ = 'U';
+ 		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
+ 		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
+ 		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
+ 		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
+ 		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
+ 		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
+ 		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
+ 		*p++ = hexdigit[ucs & 0x0000000F];
+ 		continue;
+ 	    }
+ 	    /* Fall through: isolated surrogates are copied as-is */
+ 	    s--;
+ 	    size++;
+ 	}
+ 
          /* Map 16-bit characters to '\uxxxx' */
!         if (ch >= 256) {
              *p++ = '\\';
              *p++ = 'u';
!             *p++ = hexdigit[(ch >> 12) & 0x000F];
!             *p++ = hexdigit[(ch >> 8) & 0x000F];
!             *p++ = hexdigit[(ch >> 4) & 0x000F];
!             *p++ = hexdigit[ch & 0x000F];
          }
          /* Map special whitespace to '\t', \n', '\r' */
***************
*** 1483,1488 ****
              *p++ = '\\';
              *p++ = 'x';
!             *p++ = hexdigit[(ch >> 4) & 0xf];
!             *p++ = hexdigit[ch & 15];
          } 
          /* Copy everything else as-is */
--- 1505,1510 ----
              *p++ = '\\';
              *p++ = 'x';
!             *p++ = hexdigit[(ch >> 4) & 0x000F];
!             *p++ = hexdigit[ch & 0x000F];
          } 
          /* Copy everything else as-is */