[Python-checkins] python/dist/src/Objects unicodeobject.c,2.139,2.140

Sat, 20 Apr 2002 06:44:03 -0700

Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv30961

Modified Files:
	unicodeobject.c 
Log Message:
Patch #495401: Count the number of bytes required for the UTF-8 encoding
before allocating the target buffer.


Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.139
retrieving revision 2.140
diff -C2 -d -r2.139 -r2.140
*** unicodeobject.c	15 Apr 2002 18:42:15 -0000	2.139
--- unicodeobject.c	20 Apr 2002 13:44:01 -0000	2.140
***************
*** 1173,1182 ****
  #endif
  
- /* Allocation strategy: we default to Latin-1, then do one resize
-    whenever we hit an order boundary. The assumption is that
-    characters from higher orders usually occur often enough to warrant
-    this.
- */
- 
  PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
  			       int size,
--- 1173,1176 ----
***************
*** 1185,1211 ****
      PyObject *v;
      char *p;
!     int i = 0;
!     int overalloc = 2;
!     int len;
!     
      /* Short-cut for emtpy strings */
      if (size == 0)
  	return PyString_FromStringAndSize(NULL, 0);
  
!     v = PyString_FromStringAndSize(NULL, overalloc * size);
      if (v == NULL)
          return NULL;
  
      p = PyString_AS_STRING(v);
! 
!     while (i < size) {
          Py_UCS4 ch = s[i++];
  
!         if (ch < 0x80)
! 	    /* Encode ASCII */
              *p++ = (char) ch;
  
          else if (ch < 0x0800) {
- 	    /* Encode Latin-1 */
              *p++ = (char)(0xc0 | (ch >> 6));
              *p++ = (char)(0x80 | (ch & 0x3f));
--- 1179,1221 ----
      PyObject *v;
      char *p;
!     unsigned int allocated = 0;
!     int i;
! 
      /* Short-cut for emtpy strings */
      if (size == 0)
  	return PyString_FromStringAndSize(NULL, 0);
  
!     for (i = 0; i < size; ) {
!         Py_UCS4 ch = s[i++];
!         if (ch < 0x80)
! 	    allocated += 1;
!         else if (ch < 0x0800)
!             allocated += 2;
!         else if (ch < 0x10000) {
!             /* Check for high surrogate */
!             if (0xD800 <= ch && ch <= 0xDBFF &&
!                 i != size && 
! 		0xDC00 <= s[i] && s[i] <= 0xDFFF) {
! 		allocated += 1;
! 		i++;
! 	    }
! 	    allocated += 3;
!         } else
!             allocated += 4;
!     }
! 
!     v = PyString_FromStringAndSize(NULL, allocated);
      if (v == NULL)
          return NULL;
  
      p = PyString_AS_STRING(v);
!     for (i = 0; i < size; ) {
          Py_UCS4 ch = s[i++];
  
!         if (ch < 0x80) {
              *p++ = (char) ch;
+         }
  
          else if (ch < 0x0800) {
              *p++ = (char)(0xc0 | (ch >> 6));
              *p++ = (char)(0x80 | (ch & 0x3f));
***************
*** 1213,1268 ****
                          
          else {
! 	    /* Encode UCS2 Unicode ordinals */
  	    if (ch < 0x10000) {
! 
! 		/* Special case: check for high surrogate */
  		if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
  		    Py_UCS4 ch2 = s[i];
! 		    /* Check for low surrogate and combine the two to
! 		       form a UCS4 value */
  		    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
!                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
! 			i++;
! 			goto encodeUCS4;
                      }
  		    /* Fall through: handles isolated high surrogates */
                  }
- 
- 		if (overalloc < 3) {
- 		    len = (int)(p - PyString_AS_STRING(v));
- 		    overalloc = 3;
- 		    if (_PyString_Resize(&v, overalloc * size))
- 			goto onError;
- 		    p = PyString_AS_STRING(v) + len;
- 		}
                  *p++ = (char)(0xe0 | (ch >> 12));
  		*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  		*p++ = (char)(0x80 | (ch & 0x3f));
! 		continue;
! 	    }
! 
! 	    /* Encode UCS4 Unicode ordinals */
! 	encodeUCS4:
! 	    if (overalloc < 4) {
! 		len = (int)(p - PyString_AS_STRING(v));
! 		overalloc = 4;
! 		if (_PyString_Resize(&v, overalloc * size))
! 		    goto onError;
! 		p = PyString_AS_STRING(v) + len;
  	    }
- 	    *p++ = (char)(0xf0 | (ch >> 18));
- 	    *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
- 	    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
- 	    *p++ = (char)(0x80 | (ch & 0x3f));
  	}
      }
!     *p = '\0';
!     if (_PyString_Resize(&v, (int)(p - PyString_AS_STRING(v))))
! 	goto onError;
      return v;
- 
-  onError:
-     Py_DECREF(v);
-     return NULL;
  }
  
--- 1223,1257 ----
                          
          else {
! 	    
  	    if (ch < 0x10000) {
! 		/* Check for high surrogate */
  		if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
  		    Py_UCS4 ch2 = s[i];
! 		    /* Check for low surrogate */
  		    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
!                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
!                         *p++ = (char)((ch >> 18) | 0xf0);
!                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
! 			*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
! 			*p++ = (char)(0x80 | (ch & 0x3f));
!                         i++;
! 			continue;
                      }
  		    /* Fall through: handles isolated high surrogates */
                  }
                  *p++ = (char)(0xe0 | (ch >> 12));
  		*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  		*p++ = (char)(0x80 | (ch & 0x3f));
!     
! 	    } else {
! 		*p++ = (char)(0xf0 | (ch>>18));
! 		*p++ = (char)(0x80 | ((ch>>12) & 0x3f));
! 		*p++ = (char)(0x80 | ((ch>>6) & 0x3f));
! 		*p++ = (char)(0x80 | (ch & 0x3f));
  	    }
  	}
      }
!     assert(p - PyString_AS_STRING(v) == allocated);
      return v;
  }