[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.40,2.41

Fri, 7 Jul 2000 10:51:10 -0700

Update of /cvsroot/python/python/dist/src/Objects
In directory slayer.i.sourceforge.net:/tmp/cvs-serv27530/Objects

Modified Files:
	unicodeobject.c 
Log Message:
New surrogate support in the UTF-8 codec. By Bill Tutt.

Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.40
retrieving revision 2.41
diff -C2 -r2.40 -r2.41
*** unicodeobject.c	2000/07/07 13:46:42	2.40
--- unicodeobject.c	2000/07/07 17:51:08	2.41
***************
*** 658,665 ****
  
      while (s < e) {
!         register Py_UNICODE ch = (unsigned char)*s;
  
          if (ch < 0x80) {
!             *p++ = ch;
              s++;
              continue;
--- 658,665 ----
  
      while (s < e) {
!         Py_UCS4 ch = (unsigned char)*s;
  
          if (ch < 0x80) {
!             *p++ = (Py_UNICODE)ch;
              s++;
              continue;
***************
*** 688,692 ****
                  UTF8_ERROR("illegal encoding");
  	    else
! 		*p++ = ch;
              break;
  
--- 688,692 ----
                  UTF8_ERROR("illegal encoding");
  	    else
! 				*p++ = (Py_UNICODE)ch;
              break;
  
***************
*** 699,703 ****
                  UTF8_ERROR("illegal encoding");
  	    else
! 		*p++ = ch;
              break;
  
--- 699,726 ----
                  UTF8_ERROR("illegal encoding");
  	    else
! 				*p++ = (Py_UNICODE)ch;
!             break;
! 
!         case 4:
!             if ((s[1] & 0xc0) != 0x80 ||
!                 (s[2] & 0xc0) != 0x80 ||
!                 (s[3] & 0xc0) != 0x80)
!                 UTF8_ERROR("invalid data");
!             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
!                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
!             /* validate and convert to UTF-16 */
!             if ((ch < 0x10000) ||                  /* minimum value allowed for 4 byte encoding */
!                 (ch > 0x10ffff))                   /* maximum value allowed for UTF-16 */
!                 UTF8_ERROR("illegal encoding");
!             /*  compute and append the two surrogates: */
!             
!             /*  translate from 10000..10FFFF to 0..FFFFF */
!             ch -= 0x10000;
!                     
!             /*  high surrogate = top 10 bits added to D800 */
!             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
!                     
!             /*  low surrogate = bottom 10 bits added to DC00 */
!             *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
              break;
  
***************
*** 759,764 ****
      char *p;
      char *q;
  
!     v = PyString_FromStringAndSize(NULL, 3 * size);
      if (v == NULL)
          return NULL;
--- 782,791 ----
      char *p;
      char *q;
+     Py_UCS4 ch2;
+     unsigned int cbAllocated = 3 * size;
+     unsigned int cbWritten = 0;
+     int i = 0;
  
!     v = PyString_FromStringAndSize(NULL, cbAllocated);
      if (v == NULL)
          return NULL;
***************
*** 767,788 ****
  
      p = q = PyString_AS_STRING(v);
!     while (size-- > 0) {
!         Py_UNICODE ch = *s++;
!         if (ch < 0x80)
              *p++ = (char) ch;
          else if (ch < 0x0800) {
              *p++ = 0xc0 | (ch >> 6);
              *p++ = 0x80 | (ch & 0x3f);
! 	} else if (0xD800 <= ch && ch <= 0xDFFF) {
! 	    /* These byte ranges are reserved for UTF-16 surrogate
! 	       bytes which the Python implementation currently does
! 	       not support. */
! 	    if (utf8_encoding_error(&s, &p, errors, 
! 				    "unsupported code range"))
  		goto onError;
!         } else {
!             *p++ = 0xe0 | (ch >> 12);
!             *p++ = 0x80 | ((ch >> 6) & 0x3f);
!             *p++ = 0x80 | (ch & 0x3f);
          }
      }
--- 794,839 ----
  
      p = q = PyString_AS_STRING(v);
!     while (i < size) {
!         Py_UCS4 ch = s[i++];
!         if (ch < 0x80) {
              *p++ = (char) ch;
+             cbWritten++;
+         }
          else if (ch < 0x0800) {
              *p++ = 0xc0 | (ch >> 6);
              *p++ = 0x80 | (ch & 0x3f);
!             cbWritten += 2;
!         }
!         else {
!             /* Check for high surrogate */
!             if (0xD800 <= ch && ch <= 0xDBFF) {
!                 if (i != size) {
!                     ch2 = s[i];
!                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
!                         
!                         if (cbWritten >= (cbAllocated - 4)) {
! 			    /* Provide enough room for some more
! 			       surrogates */
! 			    cbAllocated += 4*10;
!                             if (_PyString_Resize(&v, cbAllocated))
  		goto onError;
!                         }
! 
!                         /* combine the two values */
!                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
!                     
!                         *p++ = (char)((ch >> 18) | 0xf0);
!                         *p++ = (char)(0x80 | (ch >> 12) & 0x3f);
!                         i++;
!                         cbWritten += 4;
!                     }
!                 }
!             }
!             else {
!                 *p++ = (char)(0xe0 | (ch >> 12));
!                 cbWritten += 3;
!             }
!             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
!             *p++ = (char)(0x80 | (ch & 0x3f));
          }
      }
***************
*** 1218,1222 ****
                  const char *start = s + 1;
                  const char *endBrace = start;
!                 unsigned int uiValue;
                  unsigned long j;
  
--- 1269,1273 ----
                  const char *start = s + 1;
                  const char *endBrace = start;
!                 Py_UCS4 value;
                  unsigned long j;
  
***************
*** 1249,1258 ****
                          goto ucnFallthrough;
                      }
!                     uiValue = ((_Py_UnicodeCharacterName *)
!                                (pucnHash->getValue(j)))->uiValue;
!                     if (uiValue < 1<<16)
                      {
                          /* In UCS-2 range, easy solution.. */
!                         *p++ = uiValue;
                      }
                      else
--- 1300,1309 ----
                          goto ucnFallthrough;
                      }
!                     value = ((_Py_UnicodeCharacterName *)
!                                (pucnHash->getValue(j)))->value;
!                     if (value < 1<<16)
                      {
                          /* In UCS-2 range, easy solution.. */
!                         *p++ = value;
                      }
                      else
***************
*** 1261,1271 ****
                          /*  compute and append the two surrogates: */
                          /*  translate from 10000..10FFFF to 0..FFFFF */
!                         uiValue -= 0x10000;
                      
                          /* high surrogate = top 10 bits added to D800 */
!                         *p++ = 0xD800 + (uiValue >> 10);
                          
                          /* low surrogate  = bottom 10 bits added to DC00 */
!                         *p++ = 0xDC00 + (uiValue & ~0xFC00);
                      }
                      s = endBrace + 1;
--- 1312,1322 ----
                          /*  compute and append the two surrogates: */
                          /*  translate from 10000..10FFFF to 0..FFFFF */
!                         value -= 0x10000;
                      
                          /* high surrogate = top 10 bits added to D800 */
!                         *p++ = 0xD800 + (value >> 10);
                          
                          /* low surrogate  = bottom 10 bits added to DC00 */
!                         *p++ = 0xDC00 + (value & ~0xFC00);
                      }
                      s = endBrace + 1;
***************
*** 3092,3101 ****
  /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
  
! static unsigned long utf16Fixup[32] =
  {
      0, 0, 0, 0, 0, 0, 0, 0, 
      0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 
!     0, 0, 0, 0x2000, 0xf800, 0xf800, 0xf800, 0xf800
  };
  
--- 3143,3152 ----
  /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
  
! static short utf16Fixup[32] =
  {
      0, 0, 0, 0, 0, 0, 0, 0, 
      0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 
!     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
  };
  
***************
*** 3112,3116 ****
      
      while (len1 > 0 && len2 > 0) {
! 	unsigned long c1, c2;
  	long diff;
  
--- 3163,3167 ----
      
      while (len1 > 0 && len2 > 0) {
!         Py_UNICODE c1, c2;     
  	long diff;