[I18n-sig] [PATCH] UTF-8 decoding: Fix handling of invalid byte sequences

05 May 2000 18:35:18 +0200

--=-=-=

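Could you have a look at the following patch?  It fixes a rather
funny scoping problem with the continue statement in the UTF8_ERROR
macro: because the macro body is wrapped in do { ... } while (0), the
continue binds to that inner do-while rather than to the enclosing
decoding loop, so it merely leaves the macro instead of moving on to
the next character.  Replacing it with an explicit goto results in
more deterministic handling of invalid sequences.  In addition, the
treatment of invalid characters in "replace" mode is improved: now,
an incomplete or otherwise invalid UTF-8 sequence generates exactly
one replacement character.  As a result, the Python UTF-8 decoder now
passes Markus Kuhn's UTF-8 stress test.  (Shall I make a Python test
out of it?)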

If there aren't any objections, I'll forward this patch through the
official channels (if it's still necessary).


--=-=-=
Content-Type: text/x-patch
Content-Disposition: attachment; filename=python-utf8.diff

Index: unicodeobject.c
===================================================================
RCS file: /projects/cvsroot/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.18
diff -u -r2.18 unicodeobject.c

--- unicodeobject.c	2000/05/04 15:52:20	2.18
+++ unicodeobject.c	2000/05/05 15:57:53
@@ -534,7 +534,8 @@
 #define UTF8_ERROR(details)  do {                       \
     if (utf8_decoding_error(&s, &p, errors, details))   \
         goto onError;                                   \
-    continue;                                           \
+    else                                                \
+        goto nextCharacter;                             \
 } while (0)
 
 PyObject *PyUnicode_DecodeUTF8(const char *s,
@@ -559,7 +560,10 @@
     e = s + size;
 
     while (s < e) {
-        register Py_UNICODE ch = (unsigned char)*s;
+        register Py_UNICODE ch;
+
+    nextCharacter:
+	ch = (unsigned char)*s;
 
         if (ch < 0x80) {
             *p++ = ch;
@@ -583,29 +587,44 @@
             break;
 
         case 2:
-            if ((s[1] & 0xc0) != 0x80) 
+	    if ((s[1] & 0xc0) != 0x80) {
                 UTF8_ERROR("invalid data");
+	    }
             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
-            if (ch < 0x80)
+            if (ch < 0x80) {
+		/* Skip rest of this sequence. */
+		s++;
                 UTF8_ERROR("illegal encoding");
-	    else
+	    } else
 		*p++ = ch;
             break;
 
         case 3:
             if ((s[1] & 0xc0) != 0x80 || 
-                (s[2] & 0xc0) != 0x80) 
+                (s[2] & 0xc0) != 0x80) {
+		/* Skip character which likely belongs to this sequence. */
+		if ((s[1] & 0xc0) == 0x80) {
+		    s++;
+		}
                 UTF8_ERROR("invalid data");
+	    }
             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
-            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
+            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
+		/* Skip rest of this sequence. */
+		s += 2;
                 UTF8_ERROR("illegal encoding");
-	    else
+	    } else
 		*p++ = ch;
             break;
 
         default:
             /* Other sizes are only needed for UCS-4 */
-            UTF8_ERROR("unsupported Unicode code range");
+	    /* Skip over these characters. */
+	    s++;
+	    while (s < e && ((*s & 0xc0) == 0x80)) s++;
+	    /* UTF8_ERROR will skip one character. */
+	    s--;
+	    UTF8_ERROR("unsupported Unicode code range");
         }
         s += n;
     }

--=-=-=--