[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.106,2.107

Tim Peters tim_one@users.sourceforge.net
Thu, 09 Aug 2001 15:21:58 -0700


Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv28410/python/dist/src/Objects

Modified Files:
	unicodeobject.c 
Log Message:
SF patch #438013: Remove 2-byte Py_UCS2 assumptions.
Removed all instances of Py_UCS2 from the codebase, and with them (I hope)
the last remaining reliance on the platform having an integral type
with exactly 16 bits.
PyUnicode_DecodeUTF16() and PyUnicode_EncodeUTF16() now read and write
one byte at a time.


Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.106
retrieving revision 2.107
diff -C2 -d -r2.106 -r2.107
*** unicodeobject.c	2001/08/02 04:15:00	2.106
--- unicodeobject.c	2001/08/09 22:21:55	2.107
***************
*** 945,950 ****
  
  static
! int utf16_decoding_error(const Py_UCS2 **source,
! 			 Py_UNICODE **dest,
  			 const char *errors,
  			 const char *details) 
--- 945,949 ----
  
  static
! int utf16_decoding_error(Py_UNICODE **dest,
  			 const char *errors,
  			 const char *details) 
***************
*** 976,996 ****
  }
  
! PyObject *PyUnicode_DecodeUTF16(const char *s,
! 				int size,
! 				const char *errors,
! 				int *byteorder)
  {
      PyUnicodeObject *unicode;
      Py_UNICODE *p;
!     const Py_UCS2 *q, *e;
!     int bo = 0;
      const char *errmsg = "";
  
      /* size should be an even number */
!     if (size % sizeof(Py_UCS2) != 0) {
! 	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
! 	    return NULL;
! 	/* The remaining input chars are ignored if we fall through
!            here... */
      }
  
--- 975,1001 ----
  }
  
! PyObject *
! PyUnicode_DecodeUTF16(const char *s,
! 		      int size,
! 		      const char *errors,
! 		      int *byteorder)
  {
      PyUnicodeObject *unicode;
      Py_UNICODE *p;
!     const unsigned char *q, *e;
!     int bo = 0;       /* assume native ordering by default */
      const char *errmsg = "";
+     /* Offsets from q for retrieving byte pairs in the right order. */
+ #ifdef BYTEORDER_IS_LITTLE_ENDIAN
+     int ihi = 1, ilo = 0;
+ #else
+     int ihi = 0, ilo = 1;
+ #endif
  
      /* size should be an even number */
!     if (size & 1) {
!         if (utf16_decoding_error(NULL, errors, "truncated data"))
!             return NULL;
!         --size;  /* else ignore the oddball byte */
      }
  
***************
*** 1005,1013 ****
      /* Unpack UTF-16 encoded data */
      p = unicode->str;
!     q = (Py_UCS2 *)s;
!     e = q + (size / sizeof(Py_UCS2));
  
      if (byteorder)
! 	bo = *byteorder;
  
      /* Check for BOM marks (U+FEFF) in the input and adjust current
--- 1010,1018 ----
      /* Unpack UTF-16 encoded data */
      p = unicode->str;
!     q = (unsigned char *)s;
!     e = q + size;
  
      if (byteorder)
!         bo = *byteorder;
  
      /* Check for BOM marks (U+FEFF) in the input and adjust current
***************
*** 1016,1050 ****
         stream as-is (giving a ZWNBSP character). */
      if (bo == 0) {
  #ifdef BYTEORDER_IS_LITTLE_ENDIAN
! 	if (*q == 0xFEFF) {
! 	    q++;
  	    bo = -1;
! 	} else if (*q == 0xFFFE) {
! 	    q++;
  	    bo = 1;
  	}
  #else    
! 	if (*q == 0xFEFF) {
! 	    q++;
  	    bo = 1;
! 	} else if (*q == 0xFFFE) {
! 	    q++;
  	    bo = -1;
  	}
  #endif
      }
!     
      while (q < e) {
! 	register Py_UCS2 ch = *q++;
  
- 	/* Swap input bytes if needed. (This assumes
- 	   sizeof(Py_UNICODE) == 2 !) */
- #ifdef BYTEORDER_IS_LITTLE_ENDIAN
- 	if (bo == 1)
- 	    ch = (ch >> 8) | (ch << 8);
- #else    
- 	if (bo == -1)
- 	    ch = (ch >> 8) | (ch << 8);
- #endif
  	if (ch < 0xD800 || ch > 0xDFFF) {
  	    *p++ = ch;
--- 1021,1061 ----
         stream as-is (giving a ZWNBSP character). */
      if (bo == 0) {
+         const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
  #ifdef BYTEORDER_IS_LITTLE_ENDIAN
! 	if (bom == 0xFEFF) {
! 	    q += 2;
  	    bo = -1;
! 	}
!         else if (bom == 0xFFFE) {
! 	    q += 2;
  	    bo = 1;
  	}
  #else    
! 	if (bom == 0xFEFF) {
! 	    q += 2;
  	    bo = 1;
! 	}
!         else if (bom == 0xFFFE) {
! 	    q += 2;
  	    bo = -1;
  	}
  #endif
      }
! 
!     if (bo == -1) {
!         /* force LE */
!         ihi = 1;
!         ilo = 0;
!     }
!     else if (bo == 1) {
!         /* force BE */
!         ihi = 0;
!         ilo = 1;
!     }
! 
      while (q < e) {
! 	Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
! 	q += 2;
  
  	if (ch < 0xD800 || ch > 0xDFFF) {
  	    *p++ = ch;
***************
*** 1058,1069 ****
  	}
  	if (0xD800 <= ch && ch <= 0xDBFF) {
! 	    Py_UCS2 ch2 = *q++;
! #ifdef BYTEORDER_IS_LITTLE_ENDIAN
! 	    if (bo == 1)
! 		    ch2 = (ch2 >> 8) | (ch2 << 8);
! #else    
! 	    if (bo == -1)
! 		    ch2 = (ch2 >> 8) | (ch2 << 8);
! #endif
  	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
  #ifndef Py_UNICODE_WIDE
--- 1069,1074 ----
  	}
  	if (0xD800 <= ch && ch <= 0xDBFF) {
! 	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
! 	    q += 2;
  	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
  #ifndef Py_UNICODE_WIDE
***************
*** 1085,1089 ****
  
      utf16Error:
! 	if (utf16_decoding_error(&q, &p, errors, errmsg))
  	    goto onError;
      }
--- 1090,1094 ----
  
      utf16Error:
! 	if (utf16_decoding_error(&p, errors, errmsg))
  	    goto onError;
      }
***************
*** 1103,1158 ****
  }
  
! #undef UTF16_ERROR
! 
! PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
! 				int size,
! 				const char *errors,
! 				int byteorder)
  {
      PyObject *v;
!     Py_UCS2 *p;
!     char *q;
!     int i, pairs, doswap = 1;
  
      for (i = pairs = 0; i < size; i++)
  	if (s[i] >= 0x10000)
  	    pairs++;
      v = PyString_FromStringAndSize(NULL, 
! 		  sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
      if (v == NULL)
          return NULL;
  
!     q = PyString_AS_STRING(v);
!     p = (Py_UCS2 *)q;
      if (byteorder == 0)
! 	*p++ = 0xFEFF;
      if (size == 0)
          return v;
!     if (byteorder == 0 ||
! #ifdef BYTEORDER_IS_LITTLE_ENDIAN	
! 	byteorder == -1
! #else
! 	byteorder == 1
! #endif
! 	)
! 	doswap = 0;
      while (size-- > 0) {
  	Py_UNICODE ch = *s++;
  	Py_UNICODE ch2 = 0;
  	if (ch >= 0x10000) {
! 	    ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
! 	    ch  = 0xD800|((ch-0x10000)>>10);
! 	}
! 	if (doswap){
! 	    *p++ = (ch >> 8) | (ch << 8);
! 	    if (ch2)
! 		*p++ = (ch2 >> 8) | (ch2 << 8);
! 	}else{
! 	    *p++ = ch;
! 	    if(ch2)
! 		*p++ = ch2;
  	}
      }
      return v;
  }
  
--- 1108,1172 ----
  }
  
! PyObject *
! PyUnicode_EncodeUTF16(const Py_UNICODE *s,
! 		      int size,
! 		      const char *errors,
! 		      int byteorder)
  {
      PyObject *v;
!     unsigned char *p;
!     int i, pairs;
!     /* Offsets from p for storing byte pairs in the right order. */
! #ifdef BYTEORDER_IS_LITTLE_ENDIAN
!     int ihi = 1, ilo = 0;
! #else
!     int ihi = 0, ilo = 1;
! #endif
  
+ #define STORECHAR(CH)                   \
+     do {                                \
+         p[ihi] = ((CH) >> 8) & 0xff;    \
+         p[ilo] = (CH) & 0xff;           \
+         p += 2;                         \
+     } while(0)
+ 
      for (i = pairs = 0; i < size; i++)
  	if (s[i] >= 0x10000)
  	    pairs++;
      v = PyString_FromStringAndSize(NULL, 
! 		  2 * (size + pairs + (byteorder == 0)));
      if (v == NULL)
          return NULL;
  
!     p = (unsigned char *)PyString_AS_STRING(v);
      if (byteorder == 0)
! 	STORECHAR(0xFEFF);
      if (size == 0)
          return v;
! 
!     if (byteorder == -1) {
!         /* force LE */
!         ihi = 1;
!         ilo = 0;
!     }
!     else if (byteorder == 1) {
!         /* force BE */
!         ihi = 0;
!         ilo = 1;
!     }
! 
      while (size-- > 0) {
  	Py_UNICODE ch = *s++;
  	Py_UNICODE ch2 = 0;
  	if (ch >= 0x10000) {
! 	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
! 	    ch  = 0xD800 | ((ch-0x10000) >> 10);
  	}
+         STORECHAR(ch);
+         if (ch2)
+             STORECHAR(ch2);
      }
      return v;
+ #undef STORECHAR
  }