[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.60,2.61

Fredrik Lundh python-dev@python.org
Sun, 3 Sep 2000 04:29:55 -0700


Update of /cvsroot/python/python/dist/src/Objects
In directory slayer.i.sourceforge.net:/tmp/cvs-serv17379/Objects

Modified Files:
	unicodeobject.c 
Log Message:


Changed \x to consume exactly two hex digits, also for Unicode
strings.  Closes PEP 223.

Also added the \U escape (eight hex digits).


Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.60
retrieving revision 2.61
diff -C2 -r2.60 -r2.61
*** unicodeobject.c	2000/08/18 19:30:40	2.60
--- unicodeobject.c	2000/09/03 11:29:49	2.61
***************
*** 1164,1167 ****
--- 1164,1168 ----
      Py_UNICODE *p = NULL, *buf = NULL;
      const char *end;
+     Py_UCS4 chr;
      
      /* Escaped strings will always be longer than the resulting
***************
*** 1215,1240 ****
              break;
  
!         /* \xXXXX escape with 1-n hex digits.  for compatibility
!            with 8-bit strings, this code ignores all but the last
!            two digits */
          case 'x':
!             x = 0;
!             c = (unsigned char)*s;
!             if (isxdigit(c)) {
!                 do {
!                     x = (x<<4) & 0xF0;
!                     if ('0' <= c && c <= '9')
!                         x += c - '0';
!                     else if ('a' <= c && c <= 'f')
!                         x += 10 + c - 'a';
!                     else
!                         x += 10 + c - 'A';
!                     c = (unsigned char)*++s;
!                 } while (isxdigit(c));
!                 *p++ = (unsigned char) x;
!             } else {
!                 *p++ = '\\';
!                 *p++ = (unsigned char)s[-1];
              }
              break;
  
--- 1216,1240 ----
              break;
  
!         /* \xXX with two hex digits */
          case 'x':
!             for (x = 0, i = 0; i < 2; i++) {
!                 c = (unsigned char)s[i];
!                 if (!isxdigit(c)) {
!                     if (unicodeescape_decoding_error(&s, &x, errors,
!                                                      "truncated \\xXX"))
!                         goto onError;
!                     i++;
!                     break;
!                 }
!                 x = (x<<4) & ~0xF;
!                 if (c >= '0' && c <= '9')
!                     x += c - '0';
!                 else if (c >= 'a' && c <= 'f')
!                     x += 10 + c - 'a';
!                 else
!                     x += 10 + c - 'A';
              }
+             s += i;
+             *p++ = x;
              break;
  
***************
*** 1262,1273 ****
              break;
  
          case 'N':
              /* Ok, we need to deal with Unicode Character Names now,
               * make sure we've imported the hash table data...
               */
!             if (pucnHash == NULL)
!             {
                  PyObject *mod = 0, *v = 0;
-     
                  mod = PyImport_ImportModule("ucnhash");
                  if (mod == NULL)
--- 1262,1293 ----
              break;
  
+         /* \UXXXXXXXX with 8 hex digits */
+         case 'U':
+             for (chr = 0, i = 0; i < 8; i++) {
+                 c = (unsigned char)s[i];
+                 if (!isxdigit(c)) {
+                     if (unicodeescape_decoding_error(&s, &x, errors,
+                                                      "truncated \\uXXXX"))
+                         goto onError;
+                     i++;
+                     break;
+                 }
+                 chr = (chr<<4) & ~0xF;
+                 if (c >= '0' && c <= '9')
+                     chr += c - '0';
+                 else if (c >= 'a' && c <= 'f')
+                     chr += 10 + c - 'a';
+                 else
+                     chr += 10 + c - 'A';
+             }
+             s += i;
+             goto store;
+ 
          case 'N':
              /* Ok, we need to deal with Unicode Character Names now,
               * make sure we've imported the hash table data...
               */
!             if (pucnHash == NULL) {
                  PyObject *mod = 0, *v = 0;
                  mod = PyImport_ImportModule("ucnhash");
                  if (mod == NULL)
***************
*** 1276,1295 ****
                  Py_DECREF(mod);
                  if (v == NULL)
-                 {
                      goto onError;
-                 }
                  pucnHash = PyCObject_AsVoidPtr(v);
                  Py_DECREF(v);
                  if (pucnHash == NULL)
-                 {
                      goto onError;
-                 }
              }
                  
!             if (*s == '{')
!             {
                  const char *start = s + 1;
                  const char *endBrace = start;
-                 Py_UCS4 value;
                  unsigned long j;
  
--- 1296,1309 ----
                  Py_DECREF(mod);
                  if (v == NULL)
                      goto onError;
                  pucnHash = PyCObject_AsVoidPtr(v);
                  Py_DECREF(v);
                  if (pucnHash == NULL)
                      goto onError;
              }
                  
!             if (*s == '{') {
                  const char *start = s + 1;
                  const char *endBrace = start;
                  unsigned long j;
  
***************
*** 1304,1309 ****
                      endBrace++;
                  }
!                 if (endBrace != end && *endBrace == '}')
!                 {
                      j = pucnHash->hash(start, endBrace - start);
                      if (j > pucnHash->cKeys ||
--- 1318,1322 ----
                      endBrace++;
                  }
!                 if (endBrace != end && *endBrace == '}') {
                      j = pucnHash->hash(start, endBrace - start);
                      if (j > pucnHash->cKeys ||
***************
*** 1321,1349 ****
                          }
                          goto ucnFallthrough;
-                     }
-                     value = ((_Py_UnicodeCharacterName *)
-                                (pucnHash->getValue(j)))->value;
-                     if (value < 1<<16)
-                     {
-                         /* In UCS-2 range, easy solution.. */
-                         *p++ = value;
                      }
!                     else
!                     {
!                         /* Oops, its in UCS-4 space, */
!                         /*  compute and append the two surrogates: */
!                         /*  translate from 10000..10FFFF to 0..FFFFF */
!                         value -= 0x10000;
!                     
!                         /* high surrogate = top 10 bits added to D800 */
!                         *p++ = 0xD800 + (value >> 10);
!                         
!                         /* low surrogate  = bottom 10 bits added to DC00 */
!                         *p++ = 0xDC00 + (value & ~0xFC00);
!                     }
                      s = endBrace + 1;
!                 }
!                 else
!                 {
                      if (unicodeescape_decoding_error(
                              &s, &x, errors,
--- 1334,1343 ----
                          }
                          goto ucnFallthrough;
                      }
!                     chr = ((_Py_UnicodeCharacterName *)
!                            (pucnHash->getValue(j)))->value;
                      s = endBrace + 1;
!                     goto store;
!                 } else {
                      if (unicodeescape_decoding_error(
                              &s, &x, errors,
***************
*** 1364,1367 ****
--- 1358,1378 ----
              *p++ = (unsigned char)s[-1];
              break;
+ store:
+             /* when we get here, chr is a 32-bit unicode character */
+             if (chr <= 0xffff)
+                 /* UCS-2 character */
+                 *p++ = (Py_UNICODE) chr;
+             else if (chr <= 0x10ffff) {
+                 /* UCS-4 character.  store as two surrogate characters */
+                 chr -= 0x10000L;
+                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
+                 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
+             } else {
+                 if (unicodeescape_decoding_error(
+                     &s, &x, errors,
+                     "Illegal Unicode character")
+                     )
+                     goto onError;
+             }
          }
      }