[issue14654] More fast utf-8 decoding

Tue Apr 24 19:46:16 CEST 2012

Serhiy Storchaka <storchaka at gmail.com> added the comment:

Here are two new patches. The first one takes into account Martin's
wishes about comments. The second one also drops the optimization for ASCII.

On the Intel Atom, the last patch wipes out the speedup in some cases
(mostly-ASCII with UCS2 data):

                                          vanilla     patch1      patch3

utf-8         'A'*9999+'\u0100'           124 (+8%)   288 (-53%)  134
utf-8         'A'*9999+'\u8000'           124 (+8%)   291 (-54%)  134
utf-8       '\u0100'+'A'*9999             78 (+5%)    123 (-33%)  82
utf-8       '\u8000'+'A'*9999             78 (+5%)    124 (-34%)  82

On the AMD Athlon there is no noticeable effect.

----------
Added file: http://bugs.python.org/file25342/decode_utf8_2.patch
Added file: http://bugs.python.org/file25343/decode_utf8_3.patch

_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue14654>
_______________________________________
-------------- next part --------------
diff -r c820aa9c0c00 Objects/stringlib/codecs.h

--- a/Objects/stringlib/codecs.h	Fri Apr 20 18:04:03 2012 -0400
+++ b/Objects/stringlib/codecs.h	Tue Apr 24 19:51:31 2012 +0300
@@ -21,7 +21,6 @@
                            const char **src_pos, Py_ssize_t *dest_index)
 {
     int ret;
-    Py_ssize_t n;
     const char *s = start;
     const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
     STRINGLIB_CHAR *p = dest;
@@ -48,15 +47,33 @@
                     unsigned long value = *(unsigned long *) _s;
                     if (value & ASCII_CHAR_MASK)
                         break;
-                    _p[0] = _s[0];
-                    _p[1] = _s[1];
-                    _p[2] = _s[2];
-                    _p[3] = _s[3];
-#if (SIZEOF_LONG == 8)
-                    _p[4] = _s[4];
-                    _p[5] = _s[5];
-                    _p[6] = _s[6];
-                    _p[7] = _s[7];
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+                    _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
+                    _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
+                    _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
+                    _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
+#if SIZEOF_LONG == 8
+                    _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
+                    _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
+                    _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
+                    _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
+#endif
+#else
+#if SIZEOF_LONG == 8
+                    _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
+                    _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
+                    _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
+                    _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
+                    _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
+                    _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
+                    _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
+                    _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
+#else
+                    _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
+                    _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
+                    _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
+                    _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
+#endif
 #endif
                     _s += SIZEOF_LONG;
                     _p += SIZEOF_LONG;
@@ -67,78 +84,114 @@
                     break;
                 ch = (unsigned char)*s;
             }
+            if (ch < 0x80) {
+                s++;
+                *p++ = ch;
+                continue;
+            }
         }
 
-        if (ch < 0x80) {
-            s++;
+        if (ch < 0xC2) {
+            /* invalid sequence
+               \x80-\xBF -- continuation byte
+               \xC0-\xC1 -- fake 0000-007F */
+            goto _error;
+        }
+
+        if (ch < 0xE0) {
+            /* \xC2\x80-\xDF\xBF -- 0080-07FF */
+            Py_UCS4 ch2;
+            if (end - s < 2) {
+                /* unexpected end of data: the caller will decide whether
+                   it's an error or not */
+                goto _error;
+            }
+            ch2 = (unsigned char)s[1];
+            if ((ch2 & 0xc0) != 0x80)
+                /* invalid continuation byte */
+                goto _error;
+            ch = (ch << 6) + ch2 - 030200;
+            assert ((ch > 0x007F) && (ch <= 0x07FF));
+            s += 2;
             *p++ = ch;
             continue;
         }
 
-        n = utf8_code_length[ch];
-
-        if (s + n > end) {
-            /* unexpected end of data: the caller will decide whether
-               it's an error or not */
-            goto _error;
-        }
-
-        switch (n) {
-        case 0:
-            /* invalid start byte */
-            goto _error;
-        case 1:
-            /* internal error */
-            goto _error;
-        case 2:
-            if ((s[1] & 0xc0) != 0x80)
-                /* invalid continuation byte */
+#if STRINGLIB_SIZEOF_CHAR >= 2
+        if (ch < 0xF0) {
+            /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
+            Py_UCS4 ch2, ch3;
+            if (end - s < 3) {
+                /* unexpected end of data: the caller will decide whether
+                   it's an error or not */
                 goto _error;
-            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
-            assert ((ch > 0x007F) && (ch <= 0x07FF));
-            s += 2;
-            *p++ = ch;
-            break;
-
-        case 3:
-            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
-               will result in surrogates in range d800-dfff. Surrogates are
-               not valid UTF-8 so they are rejected.
-               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
-               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
-            if ((s[1] & 0xc0) != 0x80 ||
-                (s[2] & 0xc0) != 0x80 ||
-                ((unsigned char)s[0] == 0xE0 &&
-                 (unsigned char)s[1] < 0xA0) ||
-                ((unsigned char)s[0] == 0xED &&
-                 (unsigned char)s[1] > 0x9F)) {
+            }
+            ch2 = (unsigned char)s[1];
+            ch3 = (unsigned char)s[2];
+            if ((ch2 & 0xc0) != 0x80 ||
+                (ch3 & 0xc0) != 0x80) {
                 /* invalid continuation byte */
                 goto _error;
             }
-            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            if (ch == 0xE0) {
+                if (ch2 < 0xA0)
+                    /* invalid sequence
+                       \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
+                    goto _error;
+            }
+            else if (ch == 0xED && ch2 > 0x9F) {
+                /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
+                   will result in surrogates in range D800-DFFF. Surrogates are
+                   not valid UTF-8 so they are rejected.
+                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
+                goto _error;
+            }
+            ch = (ch << 12) + (ch2 << 6) + ch3 - 03420200;
             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
             s += 3;
             *p++ = ch;
-            break;
+            continue;
+        }
 
-        case 4:
-            if ((s[1] & 0xc0) != 0x80 ||
-                (s[2] & 0xc0) != 0x80 ||
-                (s[3] & 0xc0) != 0x80 ||
-                ((unsigned char)s[0] == 0xF0 &&
-                 (unsigned char)s[1] < 0x90) ||
-                ((unsigned char)s[0] == 0xF4 &&
-                 (unsigned char)s[1] > 0x8F)) {
+#if STRINGLIB_SIZEOF_CHAR >= 4
+        if (ch < 0xF5) {
+            /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
+            Py_UCS4 ch2, ch3, ch4;
+            if (end - s < 4) {
+                /* unexpected end of data: the caller will decide whether
+                   it's an error or not */
+                goto _error;
+            }
+            ch2 = (unsigned char)s[1];
+            ch3 = (unsigned char)s[2];
+            ch4 = (unsigned char)s[3];
+            if ((ch2 & 0xc0) != 0x80 ||
+                (ch3 & 0xc0) != 0x80 ||
+                (ch4 & 0xc0) != 0x80) {
                 /* invalid continuation byte */
                 goto _error;
             }
-            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
-                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            if (ch == 0xF0) {
+                if (ch2 < 0x90)
+                    /* invalid sequence
+                       \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
+                    goto _error;
+            }
+            else if (ch == 0xF4 && ch2 > 0x8F) {
+                /* invalid sequence
+                   \xF4\x90\x80\x80- -- 110000- overflow */
+                goto _error;
+            }
+            ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - 0362020200;
             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
             s += 4;
             *p++ = ch;
-            break;
+            continue;
         }
+#endif
+#endif
+        goto _error;
     }
     ret = 0;
     goto _ok;
-------------- next part --------------
diff -r c820aa9c0c00 Objects/stringlib/codecs.h
--- a/Objects/stringlib/codecs.h	Fri Apr 20 18:04:03 2012 -0400
+++ b/Objects/stringlib/codecs.h	Tue Apr 24 19:12:40 2012 +0300
@@ -21,7 +21,6 @@
                            const char **src_pos, Py_ssize_t *dest_index)
 {
     int ret;
-    Py_ssize_t n;
     const char *s = start;
     const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
     STRINGLIB_CHAR *p = dest;
@@ -67,78 +66,114 @@
                     break;
                 ch = (unsigned char)*s;
             }
+            if (ch < 0x80) {
+                s++;
+                *p++ = ch;
+                continue;
+            }
         }
 
-        if (ch < 0x80) {
-            s++;
+        if (ch < 0xC2) {
+            /* invalid sequence
+               \x80-\xBF -- continuation byte
+               \xC0-\xC1 -- fake 0000-007F */
+            goto _error;
+        }
+
+        if (ch < 0xE0) {
+            /* \xC2\x80-\xDF\xBF -- 0080-07FF */
+            Py_UCS4 ch2;
+            if (end - s < 2) {
+                /* unexpected end of data: the caller will decide whether
+                   it's an error or not */
+                goto _error;
+            }
+            ch2 = (unsigned char)s[1];
+            if ((ch2 & 0xc0) != 0x80)
+                /* invalid continuation byte */
+                goto _error;
+            ch = (ch << 6) + ch2 - 030200;
+            assert ((ch > 0x007F) && (ch <= 0x07FF));
+            s += 2;
             *p++ = ch;
             continue;
         }
 
-        n = utf8_code_length[ch];
-
-        if (s + n > end) {
-            /* unexpected end of data: the caller will decide whether
-               it's an error or not */
-            goto _error;
-        }
-
-        switch (n) {
-        case 0:
-            /* invalid start byte */
-            goto _error;
-        case 1:
-            /* internal error */
-            goto _error;
-        case 2:
-            if ((s[1] & 0xc0) != 0x80)
-                /* invalid continuation byte */
+#if STRINGLIB_SIZEOF_CHAR >= 2
+        if (ch < 0xF0) {
+            /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
+            Py_UCS4 ch2, ch3;
+            if (end - s < 3) {
+                /* unexpected end of data: the caller will decide whether
+                   it's an error or not */
                 goto _error;
-            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
-            assert ((ch > 0x007F) && (ch <= 0x07FF));
-            s += 2;
-            *p++ = ch;
-            break;
-
-        case 3:
-            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
-               will result in surrogates in range d800-dfff. Surrogates are
-               not valid UTF-8 so they are rejected.
-               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
-               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
-            if ((s[1] & 0xc0) != 0x80 ||
-                (s[2] & 0xc0) != 0x80 ||
-                ((unsigned char)s[0] == 0xE0 &&
-                 (unsigned char)s[1] < 0xA0) ||
-                ((unsigned char)s[0] == 0xED &&
-                 (unsigned char)s[1] > 0x9F)) {
+            }
+            ch2 = (unsigned char)s[1];
+            ch3 = (unsigned char)s[2];
+            if ((ch2 & 0xc0) != 0x80 ||
+                (ch3 & 0xc0) != 0x80) {
                 /* invalid continuation byte */
                 goto _error;
             }
-            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            if (ch == 0xE0) {
+                if (ch2 < 0xA0)
+                    /* invalid sequence
+                       \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
+                    goto _error;
+            }
+            else if (ch == 0xED && ch2 > 0x9F) {
+                /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
+                   will result in surrogates in range D800-DFFF. Surrogates are
+                   not valid UTF-8 so they are rejected.
+                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
+                goto _error;
+            }
+            ch = (ch << 12) + (ch2 << 6) + ch3 - 03420200;
             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
             s += 3;
             *p++ = ch;
-            break;
+            continue;
+        }
 
-        case 4:
-            if ((s[1] & 0xc0) != 0x80 ||
-                (s[2] & 0xc0) != 0x80 ||
-                (s[3] & 0xc0) != 0x80 ||
-                ((unsigned char)s[0] == 0xF0 &&
-                 (unsigned char)s[1] < 0x90) ||
-                ((unsigned char)s[0] == 0xF4 &&
-                 (unsigned char)s[1] > 0x8F)) {
+#if STRINGLIB_SIZEOF_CHAR >= 4
+        if (ch < 0xF5) {
+            /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
+            Py_UCS4 ch2, ch3, ch4;
+            if (end - s < 4) {
+                /* unexpected end of data: the caller will decide whether
+                   it's an error or not */
+                goto _error;
+            }
+            ch2 = (unsigned char)s[1];
+            ch3 = (unsigned char)s[2];
+            ch4 = (unsigned char)s[3];
+            if ((ch2 & 0xc0) != 0x80 ||
+                (ch3 & 0xc0) != 0x80 ||
+                (ch4 & 0xc0) != 0x80) {
                 /* invalid continuation byte */
                 goto _error;
             }
-            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
-                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            if (ch == 0xF0) {
+                if (ch2 < 0x90)
+                    /* invalid sequence
+                       \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
+                    goto _error;
+            }
+            else if (ch == 0xF4 && ch2 > 0x8F) {
+                /* invalid sequence
+                   \xF4\x90\x80\x80- -- 110000- overflow */
+                goto _error;
+            }
+            ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - 0362020200;
             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
             s += 4;
             *p++ = ch;
-            break;
+            continue;
         }
+#endif
+#endif
+        goto _error;
     }
     ret = 0;
     goto _ok;