[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.40,2.41
M.-A. Lemburg
python-dev@python.org
Fri, 7 Jul 2000 10:51:10 -0700
Update of /cvsroot/python/python/dist/src/Objects
In directory slayer.i.sourceforge.net:/tmp/cvs-serv27530/Objects
Modified Files:
unicodeobject.c
Log Message:
New surrogate support in the UTF-8 codec. By Bill Tutt.
Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.40
retrieving revision 2.41
diff -C2 -r2.40 -r2.41
*** unicodeobject.c 2000/07/07 13:46:42 2.40
--- unicodeobject.c 2000/07/07 17:51:08 2.41
***************
*** 658,665 ****
while (s < e) {
! register Py_UNICODE ch = (unsigned char)*s;
if (ch < 0x80) {
! *p++ = ch;
s++;
continue;
--- 658,665 ----
while (s < e) {
! Py_UCS4 ch = (unsigned char)*s;
if (ch < 0x80) {
! *p++ = (Py_UNICODE)ch;
s++;
continue;
***************
*** 688,692 ****
UTF8_ERROR("illegal encoding");
else
! *p++ = ch;
break;
--- 688,692 ----
UTF8_ERROR("illegal encoding");
else
! *p++ = (Py_UNICODE)ch;
break;
***************
*** 699,703 ****
UTF8_ERROR("illegal encoding");
else
! *p++ = ch;
break;
--- 699,726 ----
UTF8_ERROR("illegal encoding");
else
! *p++ = (Py_UNICODE)ch;
! break;
!
! case 4:
! if ((s[1] & 0xc0) != 0x80 ||
! (s[2] & 0xc0) != 0x80 ||
! (s[3] & 0xc0) != 0x80)
! UTF8_ERROR("invalid data");
! ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
! ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
! /* validate and convert to UTF-16 */
! if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */
! (ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
! UTF8_ERROR("illegal encoding");
! /* compute and append the two surrogates: */
!
! /* translate from 10000..10FFFF to 0..FFFFF */
! ch -= 0x10000;
!
! /* high surrogate = top 10 bits added to D800 */
! *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
!
! /* low surrogate = bottom 10 bits added to DC00 */
! *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
break;
***************
*** 759,764 ****
char *p;
char *q;
! v = PyString_FromStringAndSize(NULL, 3 * size);
if (v == NULL)
return NULL;
--- 782,791 ----
char *p;
char *q;
+ Py_UCS4 ch2;
+ unsigned int cbAllocated = 3 * size;
+ unsigned int cbWritten = 0;
+ int i = 0;
! v = PyString_FromStringAndSize(NULL, cbAllocated);
if (v == NULL)
return NULL;
***************
*** 767,788 ****
p = q = PyString_AS_STRING(v);
! while (size-- > 0) {
! Py_UNICODE ch = *s++;
! if (ch < 0x80)
*p++ = (char) ch;
else if (ch < 0x0800) {
*p++ = 0xc0 | (ch >> 6);
*p++ = 0x80 | (ch & 0x3f);
! } else if (0xD800 <= ch && ch <= 0xDFFF) {
! /* These byte ranges are reserved for UTF-16 surrogate
! bytes which the Python implementation currently does
! not support. */
! if (utf8_encoding_error(&s, &p, errors,
! "unsupported code range"))
goto onError;
! } else {
! *p++ = 0xe0 | (ch >> 12);
! *p++ = 0x80 | ((ch >> 6) & 0x3f);
! *p++ = 0x80 | (ch & 0x3f);
}
}
--- 794,839 ----
p = q = PyString_AS_STRING(v);
! while (i < size) {
! Py_UCS4 ch = s[i++];
! if (ch < 0x80) {
*p++ = (char) ch;
+ cbWritten++;
+ }
else if (ch < 0x0800) {
*p++ = 0xc0 | (ch >> 6);
*p++ = 0x80 | (ch & 0x3f);
! cbWritten += 2;
! }
! else {
! /* Check for high surrogate */
! if (0xD800 <= ch && ch <= 0xDBFF) {
! if (i != size) {
! ch2 = s[i];
! if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
!
! if (cbWritten >= (cbAllocated - 4)) {
! /* Provide enough room for some more
! surrogates */
! cbAllocated += 4*10;
! if (_PyString_Resize(&v, cbAllocated))
goto onError;
! }
!
! /* combine the two values */
! ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
!
! *p++ = (char)((ch >> 18) | 0xf0);
! *p++ = (char)(0x80 | (ch >> 12) & 0x3f);
! i++;
! cbWritten += 4;
! }
! }
! }
! else {
! *p++ = (char)(0xe0 | (ch >> 12));
! cbWritten += 3;
! }
! *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
! *p++ = (char)(0x80 | (ch & 0x3f));
}
}
***************
*** 1218,1222 ****
const char *start = s + 1;
const char *endBrace = start;
! unsigned int uiValue;
unsigned long j;
--- 1269,1273 ----
const char *start = s + 1;
const char *endBrace = start;
! Py_UCS4 value;
unsigned long j;
***************
*** 1249,1258 ****
goto ucnFallthrough;
}
! uiValue = ((_Py_UnicodeCharacterName *)
! (pucnHash->getValue(j)))->uiValue;
! if (uiValue < 1<<16)
{
/* In UCS-2 range, easy solution.. */
! *p++ = uiValue;
}
else
--- 1300,1309 ----
goto ucnFallthrough;
}
! value = ((_Py_UnicodeCharacterName *)
! (pucnHash->getValue(j)))->value;
! if (value < 1<<16)
{
/* In UCS-2 range, easy solution.. */
! *p++ = value;
}
else
***************
*** 1261,1271 ****
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFFF */
! uiValue -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
! *p++ = 0xD800 + (uiValue >> 10);
/* low surrogate = bottom 10 bits added to DC00 */
! *p++ = 0xDC00 + (uiValue & ~0xFC00);
}
s = endBrace + 1;
--- 1312,1322 ----
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFFF */
! value -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
! *p++ = 0xD800 + (value >> 10);
/* low surrogate = bottom 10 bits added to DC00 */
! *p++ = 0xDC00 + (value & ~0xFC00);
}
s = endBrace + 1;
***************
*** 3092,3101 ****
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
! static unsigned long utf16Fixup[32] =
{
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
! 0, 0, 0, 0x2000, 0xf800, 0xf800, 0xf800, 0xf800
};
--- 3143,3152 ----
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
! static short utf16Fixup[32] =
{
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
! 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
***************
*** 3112,3116 ****
while (len1 > 0 && len2 > 0) {
! unsigned long c1, c2;
long diff;
--- 3163,3167 ----
while (len1 > 0 && len2 > 0) {
! Py_UNICODE c1, c2;
long diff;