[Python-checkins] python/dist/src/Objects unicodeobject.c,2.141,2.142

Sun, 21 Apr 2002 02:59:48 -0700

Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv22440

Modified Files:
	unicodeobject.c 
Log Message:
Back out 2.140.


Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.141
retrieving revision 2.142
diff -C2 -d -r2.141 -r2.142
*** unicodeobject.c	21 Apr 2002 03:26:37 -0000	2.141
--- unicodeobject.c	21 Apr 2002 09:59:45 -0000	2.142
***************
*** 1173,1176 ****
--- 1173,1182 ----
  #endif
  
+ /* Allocation strategy: we default to Latin-1, then do one resize
+    whenever we hit an order boundary. The assumption is that
+    characters from higher orders usually occur often enough to warrant
+    this.
+ */
+ 
  PyObject *
  PyUnicode_EncodeUTF8(const Py_UNICODE *s,
***************
*** 1180,1222 ****
      PyObject *v;
      char *p;
!     int allocated = 0;
!     int i;
! 
      /* Short-cut for empty strings */
      if (size == 0)
  	return PyString_FromStringAndSize(NULL, 0);
  
!     for (i = 0; i < size; ) {
!         Py_UCS4 ch = s[i++];
!         if (ch < 0x80)
! 	    allocated += 1;
!         else if (ch < 0x0800)
!             allocated += 2;
!         else if (ch < 0x10000) {
!             /* Check for high surrogate */
!             if (0xD800 <= ch && ch <= 0xDBFF &&
!                 i != size && 
! 		0xDC00 <= s[i] && s[i] <= 0xDFFF) {
! 		allocated += 1;
! 		i++;
! 	    }
! 	    allocated += 3;
!         } else
!             allocated += 4;
!     }
! 
!     v = PyString_FromStringAndSize(NULL, allocated);
      if (v == NULL)
          return NULL;
  
      p = PyString_AS_STRING(v);
!     for (i = 0; i < size; ) {
          Py_UCS4 ch = s[i++];
  
!         if (ch < 0x80) {
              *p++ = (char) ch;
-         }
  
          else if (ch < 0x0800) {
              *p++ = (char)(0xc0 | (ch >> 6));
              *p++ = (char)(0x80 | (ch & 0x3f));
--- 1186,1212 ----
      PyObject *v;
      char *p;
!     int i = 0;
!     int overalloc = 2;
!     int len;
!     
      /* Short-cut for empty strings */
      if (size == 0)
  	return PyString_FromStringAndSize(NULL, 0);
  
!     v = PyString_FromStringAndSize(NULL, overalloc * size);
      if (v == NULL)
          return NULL;
  
      p = PyString_AS_STRING(v);
! 
!     while (i < size) {
          Py_UCS4 ch = s[i++];
  
!         if (ch < 0x80)
! 	    /* Encode ASCII */
              *p++ = (char) ch;
  
          else if (ch < 0x0800) {
+ 	    /* Encode Latin-1 */
              *p++ = (char)(0xc0 | (ch >> 6));
              *p++ = (char)(0x80 | (ch & 0x3f));
***************
*** 1224,1258 ****
                          
          else {
! 	    
  	    if (ch < 0x10000) {
! 		/* Check for high surrogate */
  		if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
  		    Py_UCS4 ch2 = s[i];
! 		    /* Check for low surrogate */
  		    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
!                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
!                         *p++ = (char)((ch >> 18) | 0xf0);
!                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
! 			*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
! 			*p++ = (char)(0x80 | (ch & 0x3f));
!                         i++;
! 			continue;
                      }
  		    /* Fall through: handles isolated high surrogates */
                  }
                  *p++ = (char)(0xe0 | (ch >> 12));
  		*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  		*p++ = (char)(0x80 | (ch & 0x3f));
!     
! 	    } else {
! 		*p++ = (char)(0xf0 | (ch>>18));
! 		*p++ = (char)(0x80 | ((ch>>12) & 0x3f));
! 		*p++ = (char)(0x80 | ((ch>>6) & 0x3f));
! 		*p++ = (char)(0x80 | (ch & 0x3f));
  	    }
  	}
      }
!     assert(p - PyString_AS_STRING(v) == allocated);
      return v;
  }
  
--- 1214,1270 ----
                          
          else {
! 	    /* Encode UCS2 Unicode ordinals */
  	    if (ch < 0x10000) {
! 
! 		/* Special case: check for high surrogate */
  		if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
  		    Py_UCS4 ch2 = s[i];
! 		    /* Check for low surrogate and combine the two to
! 		       form a UCS4 value */
  		    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
!                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
! 			i++;
! 			goto encodeUCS4;
                      }
  		    /* Fall through: handles isolated high surrogates */
                  }
+ 
+ 		if (overalloc < 3) {
+ 		    len = (int)(p - PyString_AS_STRING(v));
+ 		    overalloc = 3;
+ 		    if (_PyString_Resize(&v, overalloc * size))
+ 			goto onError;
+ 		    p = PyString_AS_STRING(v) + len;
+ 		}
                  *p++ = (char)(0xe0 | (ch >> 12));
  		*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  		*p++ = (char)(0x80 | (ch & 0x3f));
! 		continue;
! 	    }
! 
! 	    /* Encode UCS4 Unicode ordinals */
! 	encodeUCS4:
! 	    if (overalloc < 4) {
! 		len = (int)(p - PyString_AS_STRING(v));
! 		overalloc = 4;
! 		if (_PyString_Resize(&v, overalloc * size))
! 		    goto onError;
! 		p = PyString_AS_STRING(v) + len;
  	    }
+ 	    *p++ = (char)(0xf0 | (ch >> 18));
+ 	    *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+ 	    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+ 	    *p++ = (char)(0x80 | (ch & 0x3f));
  	}
      }
!     *p = '\0';
!     assert((p - PyString_AS_STRING(v)) <= overalloc*size);
!     if (_PyString_Resize(&v, (int)(p - PyString_AS_STRING(v))))
! 	goto onError;
      return v;
+ 
+  onError:
+     Py_DECREF(v);
+     return NULL;
  }