[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.60,2.61
Fredrik Lundh
python-dev@python.org
Sun, 3 Sep 2000 04:29:55 -0700
Update of /cvsroot/python/python/dist/src/Objects
In directory slayer.i.sourceforge.net:/tmp/cvs-serv17379/Objects
Modified Files:
unicodeobject.c
Log Message:
Changed \x to consume exactly two hex digits, also for Unicode
strings. Closes PEP 223.
Also added the \U escape (eight hex digits).
Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.60
retrieving revision 2.61
diff -C2 -r2.60 -r2.61
*** unicodeobject.c 2000/08/18 19:30:40 2.60
--- unicodeobject.c 2000/09/03 11:29:49 2.61
***************
*** 1164,1167 ****
--- 1164,1168 ----
Py_UNICODE *p = NULL, *buf = NULL;
const char *end;
+ Py_UCS4 chr;
/* Escaped strings will always be longer than the resulting
***************
*** 1215,1240 ****
break;
! /* \xXXXX escape with 1-n hex digits. for compatibility
! with 8-bit strings, this code ignores all but the last
! two digits */
case 'x':
! x = 0;
! c = (unsigned char)*s;
! if (isxdigit(c)) {
! do {
! x = (x<<4) & 0xF0;
! if ('0' <= c && c <= '9')
! x += c - '0';
! else if ('a' <= c && c <= 'f')
! x += 10 + c - 'a';
! else
! x += 10 + c - 'A';
! c = (unsigned char)*++s;
! } while (isxdigit(c));
! *p++ = (unsigned char) x;
! } else {
! *p++ = '\\';
! *p++ = (unsigned char)s[-1];
}
break;
--- 1216,1240 ----
break;
! /* \xXX with two hex digits */
case 'x':
! for (x = 0, i = 0; i < 2; i++) {
! c = (unsigned char)s[i];
! if (!isxdigit(c)) {
! if (unicodeescape_decoding_error(&s, &x, errors,
! "truncated \\xXX"))
! goto onError;
! i++;
! break;
! }
! x = (x<<4) & ~0xF;
! if (c >= '0' && c <= '9')
! x += c - '0';
! else if (c >= 'a' && c <= 'f')
! x += 10 + c - 'a';
! else
! x += 10 + c - 'A';
}
+ s += i;
+ *p++ = x;
break;
***************
*** 1262,1273 ****
break;
case 'N':
/* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data...
*/
! if (pucnHash == NULL)
! {
PyObject *mod = 0, *v = 0;
-
mod = PyImport_ImportModule("ucnhash");
if (mod == NULL)
--- 1262,1293 ----
break;
+ /* \UXXXXXXXX with 8 hex digits */
+ case 'U':
+ for (chr = 0, i = 0; i < 8; i++) {
+ c = (unsigned char)s[i];
+ if (!isxdigit(c)) {
+ if (unicodeescape_decoding_error(&s, &x, errors,
+ "truncated \\uXXXX"))
+ goto onError;
+ i++;
+ break;
+ }
+ chr = (chr<<4) & ~0xF;
+ if (c >= '0' && c <= '9')
+ chr += c - '0';
+ else if (c >= 'a' && c <= 'f')
+ chr += 10 + c - 'a';
+ else
+ chr += 10 + c - 'A';
+ }
+ s += i;
+ goto store;
+
case 'N':
/* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data...
*/
! if (pucnHash == NULL) {
PyObject *mod = 0, *v = 0;
mod = PyImport_ImportModule("ucnhash");
if (mod == NULL)
***************
*** 1276,1295 ****
Py_DECREF(mod);
if (v == NULL)
- {
goto onError;
- }
pucnHash = PyCObject_AsVoidPtr(v);
Py_DECREF(v);
if (pucnHash == NULL)
- {
goto onError;
- }
}
! if (*s == '{')
! {
const char *start = s + 1;
const char *endBrace = start;
- Py_UCS4 value;
unsigned long j;
--- 1296,1309 ----
Py_DECREF(mod);
if (v == NULL)
goto onError;
pucnHash = PyCObject_AsVoidPtr(v);
Py_DECREF(v);
if (pucnHash == NULL)
goto onError;
}
! if (*s == '{') {
const char *start = s + 1;
const char *endBrace = start;
unsigned long j;
***************
*** 1304,1309 ****
endBrace++;
}
! if (endBrace != end && *endBrace == '}')
! {
j = pucnHash->hash(start, endBrace - start);
if (j > pucnHash->cKeys ||
--- 1318,1322 ----
endBrace++;
}
! if (endBrace != end && *endBrace == '}') {
j = pucnHash->hash(start, endBrace - start);
if (j > pucnHash->cKeys ||
***************
*** 1321,1349 ****
}
goto ucnFallthrough;
- }
- value = ((_Py_UnicodeCharacterName *)
- (pucnHash->getValue(j)))->value;
- if (value < 1<<16)
- {
- /* In UCS-2 range, easy solution.. */
- *p++ = value;
}
! else
! {
! /* Oops, its in UCS-4 space, */
! /* compute and append the two surrogates: */
! /* translate from 10000..10FFFF to 0..FFFFF */
! value -= 0x10000;
!
! /* high surrogate = top 10 bits added to D800 */
! *p++ = 0xD800 + (value >> 10);
!
! /* low surrogate = bottom 10 bits added to DC00 */
! *p++ = 0xDC00 + (value & ~0xFC00);
! }
s = endBrace + 1;
! }
! else
! {
if (unicodeescape_decoding_error(
&s, &x, errors,
--- 1334,1343 ----
}
goto ucnFallthrough;
}
! chr = ((_Py_UnicodeCharacterName *)
! (pucnHash->getValue(j)))->value;
s = endBrace + 1;
! goto store;
! } else {
if (unicodeescape_decoding_error(
&s, &x, errors,
***************
*** 1364,1367 ****
--- 1358,1378 ----
*p++ = (unsigned char)s[-1];
break;
+ store:
+ /* when we get here, chr is a 32-bit unicode character */
+ if (chr <= 0xffff)
+ /* UCS-2 character */
+ *p++ = (Py_UNICODE) chr;
+ else if (chr <= 0x10ffff) {
+ /* UCS-4 character. store as two surrogate characters */
+ chr -= 0x10000L;
+ *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
+ *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
+ } else {
+ if (unicodeescape_decoding_error(
+ &s, &x, errors,
+ "Illegal Unicode character")
+ )
+ goto onError;
+ }
}
}