[pypy-commit] pypy unicode-utf8: general progress towards moving more of the infrastructure from runicode towards unicodehelper, which helps us to deal with surrogates nicely

Wed Nov 15 11:44:17 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r93044:1d6d78e72d50
Date: 2017-11-15 17:43 +0100
http://bitbucket.org/pypy/pypy/changeset/1d6d78e72d50/

Log:	general progress towards moving more of the infrastructure from
	runicode towards unicodehelper, which helps us to deal with
	surrogates nicely

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,6 +1,7 @@
 from pypy.interpreter.error import OperationError
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib import runicode, rutf8
+from rpython.rlib.rarithmetic import r_uint
 from rpython.rlib.rstring import StringBuilder
 from pypy.module._codecs import interp_codecs
 
@@ -43,6 +44,15 @@
     from pypy.objspace.std.unicodeobject import encode_object
     return encode_object(space, w_data, encoding, errors)
 
+def combine_flags(one, two):
+    if one == rutf8.FLAG_ASCII and two == rutf8.FLAG_ASCII:
+        return rutf8.FLAG_ASCII
+    elif (one == rutf8.FLAG_HAS_SURROGATES or
+          two == rutf8.FLAG_HAS_SURROGATES):
+        return rutf8.FLAG_HAS_SURROGATES
+    return rutf8.FLAG_REGULAR
+
+
 def _has_surrogate(u):
     for c in u:
         if 0xD800 <= ord(c) <= 0xDFFF:
@@ -58,25 +68,221 @@
             flag = rutf8.FLAG_REGULAR
     return flag
 
+def hexescape(builder, s, pos, digits,
+              encoding, errorhandler, message, errors):
+    chr = 0
+    if pos + digits > len(s):
+        endinpos = pos
+        while endinpos < len(s) and s[endinpos] in hexdigits:
+            endinpos += 1
+        uuu
+        res, size, pos = errorhandler(errors, encoding,
+                                message, s, pos-2, endinpos)
+        builder.append(res)
+    else:
+        try:
+            chr = r_uint(int(s[pos:pos+digits], 16))
+        except ValueError:
+            aaaa
+            endinpos = pos
+            while s[endinpos] in hexdigits:
+                endinpos += 1
+            res, pos = errorhandler(errors, encoding,
+                                    message, s, pos-2, endinpos)
+            builder.append(res)
+        else:
+            # when we get here, chr is a 32-bit unicode character
+            if chr > 0x10ffff:
+                UUU
+                message = "illegal Unicode character"
+                res, pos = errorhandler(errors, encoding,
+                                        message, s, pos-2, pos+digits)
+                builder.append(res)
+            else:
+                rutf8.unichr_as_utf8_append(builder, chr, True)
+                if chr <= 0x7f:
+                    flag = rutf8.FLAG_ASCII
+                elif 0xd800 <= chr <= 0xdfff:
+                    flag = rutf8.FLAG_HAS_SURROGATES
+                else:
+                    flag = rutf8.FLAG_REGULAR
+                pos += digits
+                size = 1
+                
+    return pos, size, flag
+
+def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
+    size = len(s)
+    if size == 0:
+        return '', 0, 0, rutf8.FLAG_ASCII
+
+    flag = rutf8.FLAG_ASCII
+    builder = StringBuilder(size)
+    pos = 0
+    outsize = 0
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            if ord(ch) > 0x7F:
+                rutf8.unichr_as_utf8_append(builder, ord(ch))
+                flag = combine_flags(rutf8.FLAG_REGULAR, flag)
+            else:
+                builder.append(ch)
+            pos += 1
+            outsize += 1
+            continue
+
+        # - Escapes
+        pos += 1
+        if pos >= size:
+            message = "\\ at end of string"
+            res, pos = errorhandler(errors, "unicodeescape",
+                                    message, s, pos-1, size)
+            newsize, newflag = rutf8.check_utf8(res, True)
+            outsize + newsize
+            flag = combine_flags(flag, newflag)
+            builder.append(res)
+            continue
+
+        ch = s[pos]
+        pos += 1
+        # \x escapes
+        if ch == '\n': pass
+        elif ch == '\\':
+            builder.append('\\')
+            outsize += 1
+        elif ch == '\'':
+            builder.append('\'')
+            outsize += 1
+        elif ch == '\"':
+            builder.append('\"')
+            outsize += 1
+        elif ch == 'b' :
+            builder.append('\b')
+            outsize += 1
+        elif ch == 'f' :
+            builder.append('\f')
+            outsize += 1
+        elif ch == 't' :
+            builder.append('\t')
+            outsize += 1
+        elif ch == 'n' :
+            builder.append('\n')
+            outsize += 1
+        elif ch == 'r' :
+            builder.append('\r')
+            outsize += 1
+        elif ch == 'v' :
+            builder.append('\v')
+            outsize += 1
+        elif ch == 'a' :
+            builder.append('\a')
+            outsize += 1
+        elif '0' <= ch <= '7':
+            x = ord(ch) - ord('0')
+            if pos < size:
+                ch = s[pos]
+                if '0' <= ch <= '7':
+                    pos += 1
+                    x = (x<<3) + ord(ch) - ord('0')
+                    if pos < size:
+                        ch = s[pos]
+                        if '0' <= ch <= '7':
+                            pos += 1
+                            x = (x<<3) + ord(ch) - ord('0')
+            outsize += 1
+            if x >= 0x7F:
+                rutf8.unichr_as_utf8_append(builder, x)
+                flag = combine_flags(rutf8.FLAG_REGULAR, flag)
+            else:
+                builder.append(chr(x))
+        # hex escapes
+        # \xXX
+        elif ch == 'x':
+            digits = 2
+            message = "truncated \\xXX escape"
+            pos, newsize, newflag = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+            flag = combine_flags(flag, newflag)
+            outsize += newsize
+
+        # \uXXXX
+        elif ch == 'u':
+            digits = 4
+            message = "truncated \\uXXXX escape"
+            pos, newsize, newflag = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+            flag = combine_flags(flag, newflag)
+            outsize += newsize
+
+        #  \UXXXXXXXX
+        elif ch == 'U':
+            digits = 8
+            message = "truncated \\UXXXXXXXX escape"
+            pos, newsize, newflag = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+            flag = combine_flags(flag, newflag)
+            outsize += newsize
+
+        # \N{name}
+        elif ch == 'N' and ud_handler is not None:
+            message = "malformed \\N character escape"
+            look = pos
+
+            if look < size and s[look] == '{':
+                # look for the closing brace
+                while look < size and s[look] != '}':
+                    look += 1
+                if look < size and s[look] == '}':
+                    # found a name.  look it up in the unicode database
+                    message = "unknown Unicode character name"
+                    name = s[pos+1:look]
+                    code = ud_handler.call(name)
+                    if code < 0:
+                        res, pos = errorhandler(errors, "unicodeescape",
+                                                message, s, pos-1, look+1)
+                        newsize, newflag = rutf8.check_utf8(res, True)
+                        flag = combine_flags(flag, newflag)
+                        outsize += newsize
+                        builder.append(res)
+                        continue
+                    pos = look + 1
+                    XXX
+                    if code <= MAXUNICODE:
+                        builder.append(UNICHR(code))
+                    else:
+                        code -= 0x10000L
+                        builder.append(unichr(0xD800 + (code >> 10)))
+                        builder.append(unichr(0xDC00 + (code & 0x03FF)))
+                else:
+                    YYY
+                    res, pos = errorhandler(errors, "unicodeescape",
+                                            message, s, pos-1, look+1)
+                    builder.append(res)
+            else:
+                AAA
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-1, look+1)
+                builder.append(res)
+        else:
+            builder.append('\\')
+            builder.append(ch)
+            outsize += 2
+
+    return builder.build(), pos, outsize, flag
+
 # These functions take and return unwrapped rpython strings and unicodes
 def decode_unicode_escape(space, string):
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
-    # XXX pick better length, maybe
-    # XXX that guy does not belong in runicode (nor in rutf8)
-    result_u, consumed = runicode.str_decode_unicode_escape(
-        string, len(string), "strict",
-        final=True, errorhandler=DecodeWrapper(decode_error_handler(space)).handle,
-        unicodedata_handler=unicodedata_handler)
-    # XXX argh.  we want each surrogate to be encoded separately
-    utf8 = result_u.encode('utf8')
-    if rutf8.first_non_ascii_char(utf8) == -1:
-        flag = rutf8.FLAG_ASCII
-    elif _has_surrogate(result_u):
-        flag = rutf8.FLAG_HAS_SURROGATES
-    else:
-        flag = rutf8.FLAG_REGULAR
-    return utf8, len(result_u), flag
+    result_utf8, consumed, length, flag = str_decode_unicode_escape(
+        string, "strict",
+        final=True,
+        errorhandler=decode_error_handler(space),
+        ud_handler=unicodedata_handler)
+    return result_utf8, length, flag
 
 def decode_raw_unicode_escape(space, string):
     # XXX pick better length, maybe
@@ -111,8 +317,10 @@
     try:
         length, flag = rutf8.check_utf8(string, allow_surrogates=True)
     except rutf8.CheckError as e:
+        # convert position into unicode position
+        lgt, flags = rutf8.check_utf8(string, True, stop=e.pos)
         decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string,
-                                    e.pos, e.pos + 1)
+                                    lgt, lgt + 1)
         assert False, "unreachable"
     return length, flag
 
@@ -131,23 +339,28 @@
     # DEPRECATED
     return (s, check_utf8_or_raise(space, s))
 
-def utf8_encode_ascii(utf8, utf8len, errors, errorhandler):
-    if len(utf8) == utf8len:
-        return utf8
-    # No Way At All to emulate the calls to the error handler in
-    # less than three pages, so better not.
-    u = utf8.decode("utf8")
-    w = EncodeWrapper(errorhandler)
-    return runicode.unicode_encode_ascii(u, len(u), errors, w.handle)
-
-def str_decode_ascii(s, slen, errors, final, errorhandler):
+def str_decode_ascii(s, errors, final, errorhandler):
     try:
         rutf8.check_ascii(s)
-        return s, slen, len(s), rutf8.FLAG_ASCII
+        return s, len(s), len(s), rutf8.FLAG_ASCII
     except rutf8.CheckError:
-        w = DecodeWrapper((errorhandler))
-        u, pos = runicode.str_decode_ascii(s, slen, errors, final, w.handle)
-        return u.encode('utf8'), pos, len(u), _get_flag(u)
+        return _str_decode_ascii_slowpath(s, errors, final, errorhandler)
+
+def _str_decode_ascii_slowpath(s, errors, final, errorhandler):
+    i = 0
+    res = StringBuilder()
+    while i < len(s):
+        ch = s[i]
+        if ord(ch) > 0x7F:
+            r, i = errorhandler(errors, 'ascii', 'ordinal not in range(128)',
+                s, i, i + 1)
+            res.append(r)
+        else:
+            res.append(ch)
+            i += 1
+    ress = res.build()
+    lgt, flag = rutf8.check_utf8(ress, True)
+    return ress, len(s), lgt, flag
 
 # XXX wrappers, think about speed
 
@@ -165,21 +378,14 @@
     def handle(self, errors, encoding, msg, s, pos, endpos):
         return self.orig(errors, encoding, msg, s.encode("utf8"), pos, endpos)
 
-# some irregular interfaces
-def str_decode_utf8(s, slen, errors, final, errorhandler):
-    w = DecodeWrapper(errorhandler)
-    u, pos = runicode.str_decode_utf_8_impl(s, slen, errors, final, w.handle,
-        runicode.allow_surrogate_by_default)
-    return u.encode('utf8'), pos, len(u), _get_flag(u)
+#def str_decode_unicode_escape(s, slen, errors, final, errorhandler, ud_handler):
+#    w = DecodeWrapper(errorhandler)
+#    u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final,
+#                                                w.handle,
+#                                                ud_handler)
+#    return u.encode('utf8'), pos, len(u), _get_flag(u)
 
-def str_decode_unicode_escape(s, slen, errors, final, errorhandler, ud_handler):
-    w = DecodeWrapper(errorhandler)
-    u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final,
-                                                w.handle,
-                                                ud_handler)
-    return u.encode('utf8'), pos, len(u), _get_flag(u)
-
-def setup_new_encoders(encoding):
+def setup_new_encoders_legacy(encoding):
     encoder_name = 'utf8_encode_' + encoding
     encoder_call_name = 'unicode_encode_' + encoding
     decoder_name = 'str_decode_' + encoding
@@ -200,9 +406,322 @@
         globals()[decoder_name] = decoder
 
 def setup():
-    for encoding in ['utf_7', 'unicode_escape', 'raw_unicode_escape',
+    for encoding in ['raw_unicode_escape',
                      'utf_16', 'utf_16_le', 'utf_16_be', 'utf_32_le', 'utf_32',
                      'utf_32_be', 'latin_1', 'unicode_internal']:
-        setup_new_encoders(encoding)
+        setup_new_encoders_legacy(encoding)
 
 setup()
+
+def utf8_encode_ascii(utf8, errors, errorhandler):
+    """ Don't be confused - this is a slowpath for errors e.g. "ignore"
+    or an obscure errorhandler
+    """
+    res = StringBuilder()
+    i = 0
+    pos = 0
+    while i < len(utf8):
+        ch = rutf8.codepoint_at_pos(utf8, i)
+        if ch >= 0x7F:
+            msg = "ordinal not in range(128)"
+            r, newpos = errorhandler(errors, 'ascii', msg, utf8,
+                pos, pos + 1)
+            for _ in range(newpos - pos):
+                i = rutf8.next_codepoint_pos(utf8, i)
+            pos = newpos
+            res.append(r)
+        else:
+            res.append(chr(ch))
+            i = rutf8.next_codepoint_pos(utf8, i)    
+            pos += 1
+
+    s = res.build()
+    return s
+
+# some irregular interfaces
+def str_decode_utf8(s, slen, errors, final, errorhandler):
+    xxxx
+
+    u, pos = runicode.str_decode_utf_8_impl(s, slen, errors, final, w.handle,
+        runicode.allow_surrogate_by_default)
+    return u.encode('utf8'), pos, len(u), _get_flag(u)
+
+# ____________________________________________________________
+# utf-7
+
+# Three simple macros defining base-64
+
+def _utf7_IS_BASE64(oc):
+    "Is c a base-64 character?"
+    c = chr(oc)
+    return c.isalnum() or c == '+' or c == '/'
+def _utf7_TO_BASE64(n):
+    "Returns the base-64 character of the bottom 6 bits of n"
+    return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
+def _utf7_FROM_BASE64(c):
+    "given that c is a base-64 character, what is its base-64 value?"
+    if c >= 'a':
+        return ord(c) - 71
+    elif c >= 'A':
+        return ord(c) - 65
+    elif c >= '0':
+        return ord(c) + 4
+    elif c == '+':
+        return 62
+    else: # c == '/'
+        return 63
+
+def _utf7_DECODE_DIRECT(oc):
+    return oc <= 127 and oc != ord('+')
+
+# The UTF-7 encoder treats ASCII characters differently according to
+# whether they are Set D, Set O, Whitespace, or special (i.e. none of
+# the above).  See RFC2152.  This array identifies these different
+# sets:
+# 0 : "Set D"
+#      alphanumeric and '(),-./:?
+# 1 : "Set O"
+#     !"#$%&*;<=>@[]^_`{|}
+# 2 : "whitespace"
+#     ht nl cr sp
+# 3 : special (must be base64 encoded)
+#     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
+
+utf7_category = [
+#  nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
+#  dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+#  sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
+    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
+#   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
+#   @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
+    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+#   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
+#   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
+    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+#   p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
+]
+
+# ENCODE_DIRECT: this character should be encoded as itself.  The
+# answer depends on whether we are encoding set O as itself, and also
+# on whether we are encoding whitespace as itself.  RFC2152 makes it
+# clear that the answers to these questions vary between
+# applications, so this code needs to be flexible.
+
+def _utf7_ENCODE_DIRECT(oc, directO, directWS):
+    return(oc < 128 and oc > 0 and
+           (utf7_category[oc] == 0 or
+            (directWS and utf7_category[oc] == 2) or
+            (directO and utf7_category[oc] == 1)))
+
+def _utf7_ENCODE_CHAR(result, oc, base64bits, base64buffer):
+    if oc >= 0x10000:
+        # code first surrogate
+        base64bits += 16
+        base64buffer = (base64buffer << 16) | 0xd800 | ((oc-0x10000) >> 10)
+        while base64bits >= 6:
+            result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+            base64bits -= 6
+        # prepare second surrogate
+        oc = 0xDC00 | ((oc-0x10000) & 0x3FF)
+    base64bits += 16
+    base64buffer = (base64buffer << 16) | oc
+    while base64bits >= 6:
+        result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+        base64bits -= 6
+    return base64bits, base64buffer
+
+def str_decode_utf_7(s, errors, final=False,
+                     errorhandler=None):
+    size = len(s)
+    if size == 0:
+        return '', 0, 0, rutf8.FLAG_ASCII
+
+    inShift = False
+    base64bits = 0
+    base64buffer = 0
+    surrogate = 0
+    outsize = 0
+
+    result = StringBuilder(size)
+    pos = 0
+    shiftOutStartPos = 0
+    flag = rutf8.FLAG_ASCII
+    startinpos = 0
+    while pos < size:
+        ch = s[pos]
+
+        if inShift: # in a base-64 section
+            if _utf7_IS_BASE64(ord(ch)): #consume a base-64 character
+                base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch)
+                base64bits += 6
+                pos += 1
+
+                if base64bits >= 16:
+                    # enough bits for a UTF-16 value
+                    outCh = base64buffer >> (base64bits - 16)
+                    base64bits -= 16
+                    base64buffer &= (1 << base64bits) - 1 # clear high bits
+                    assert outCh <= 0xffff
+                    if surrogate:
+                        # expecting a second surrogate
+                        if outCh >= 0xDC00 and outCh <= 0xDFFF:
+                            xxxx
+                            result.append(
+                                UNICHR((((surrogate & 0x3FF)<<10) |
+                                        (outCh & 0x3FF)) + 0x10000))
+                            surrogate = 0
+                            continue
+                        else:
+                            YYYY
+                            result.append(unichr(surrogate))
+                            surrogate = 0
+                            # Not done with outCh: falls back to next line
+                    if outCh >= 0xD800 and outCh <= 0xDBFF:
+                        # first surrogate
+                        surrogate = outCh
+                    else:
+                        flag = combine_flags(flag, rutf8.unichr_to_flag(outCh))
+                        outsize += 1
+                        rutf8.unichr_as_utf8_append(result, outCh, True)
+
+            else:
+                # now leaving a base-64 section
+                inShift = False
+
+                if base64bits > 0: # left-over bits
+                    if base64bits >= 6:
+                        # We've seen at least one base-64 character
+                        aaa
+                        pos += 1
+                        msg = "partial character in shift sequence"
+                        res, pos = errorhandler(errors, 'utf7',
+                                                msg, s, pos-1, pos)
+                        result.append(res)
+                        continue
+                    else:
+                        # Some bits remain; they should be zero
+                        if base64buffer != 0:
+                            bbb
+                            pos += 1
+                            msg = "non-zero padding bits in shift sequence"
+                            res, pos = errorhandler(errors, 'utf7',
+                                                    msg, s, pos-1, pos)
+                            result.append(res)
+                            continue
+
+                if surrogate and _utf7_DECODE_DIRECT(ord(ch)):
+                    outsize += 1
+                    flag = rutf8.FLAG_HAS_SURROGATES
+                    rutf8.unichr_as_utf8_append(result, surrogate, True)
+                surrogate = 0
+
+                if ch == '-':
+                    # '-' is absorbed; other terminating characters are
+                    # preserved
+                    pos += 1
+
+        elif ch == '+':
+            startinpos = pos
+            pos += 1 # consume '+'
+            if pos < size and s[pos] == '-': # '+-' encodes '+'
+                pos += 1
+                result.append('+')
+                outsize += 1
+            else: # begin base64-encoded section
+                inShift = 1
+                surrogate = 0
+                shiftOutStartPos = result.getlength()
+                base64bits = 0
+                base64buffer = 0
+
+        elif _utf7_DECODE_DIRECT(ord(ch)): # character decodes at itself
+            xxx
+            result.append(unichr(ord(ch)))
+            pos += 1
+        else:
+            yyy
+            startinpos = pos
+            pos += 1
+            msg = "unexpected special character"
+            res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
+            result.append(res)
+
+    # end of string
+    final_length = result.getlength()
+    if inShift and final: # in shift sequence, no more to follow
+        # if we're in an inconsistent state, that's an error
+        inShift = 0
+        if (surrogate or
+            base64bits >= 6 or
+            (base64bits > 0 and base64buffer != 0)):
+            msg = "unterminated shift sequence"
+            xxxx
+            res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, pos)
+            reslen, resflags = rutf8.check_utf8(res, True)
+            outsize += reslen
+            flag = combine_flags(flag, resflags)
+            result.append(res)
+            final_length = result.getlength()
+    elif inShift:
+        pos = startinpos
+        final_length = shiftOutStartPos # back off output
+
+    assert final_length >= 0
+    return result.build()[:final_length], pos, outsize, flag
+
+def utf8_encode_utf_7(s, errors, errorhandler=None):
+    size = len(s)
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+
+    encodeSetO = encodeWhiteSpace = False
+
+    inShift = False
+    base64bits = 0
+    base64buffer = 0
+
+    pos = 0
+    while pos < size:
+        oc = rutf8.codepoint_at_pos(s, pos)
+        if not inShift:
+            if oc == ord('+'):
+                result.append('+-')
+            elif _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
+                result.append(chr(oc))
+            else:
+                result.append('+')
+                inShift = True
+                base64bits, base64buffer = _utf7_ENCODE_CHAR(
+                    result, oc, base64bits, base64buffer)
+        else:
+            if _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
+                # shifting out
+                if base64bits: # output remaining bits
+                    result.append(_utf7_TO_BASE64(base64buffer << (6-base64bits)))
+                    base64buffer = 0
+                    base64bits = 0
+
+                inShift = False
+                ## Characters not in the BASE64 set implicitly unshift the
+                ## sequence so no '-' is required, except if the character is
+                ## itself a '-'
+                if _utf7_IS_BASE64(oc) or oc == ord('-'):
+                    result.append('-')
+                result.append(chr(oc))
+            else:
+                base64bits, base64buffer = _utf7_ENCODE_CHAR(
+                    result, oc, base64bits, base64buffer)
+        pos = rutf8.next_codepoint_pos(s, pos)
+
+    if base64bits:
+        result.append(_utf7_TO_BASE64(base64buffer << (6 - base64bits)))
+    if inShift:
+        result.append('-')
+
+    return result.build()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -37,6 +37,7 @@
             if decode:
                 w_cls = space.w_UnicodeDecodeError
                 w_input = space.newbytes(input)
+                length = len(input)
             else:
                 w_cls = space.w_UnicodeEncodeError
                 length, flag = rutf8.check_utf8(input, allow_surrogates=True)
@@ -61,17 +62,13 @@
             w_replace, w_newpos = space.fixedview(w_res, 2)
             newpos = space.int_w(w_newpos)
             if newpos < 0:
-                newpos = len(input) + newpos
-            if newpos < 0 or newpos > len(input):
+                newpos = length + newpos
+            if newpos < 0 or newpos > length:
                 raise oefmt(space.w_IndexError,
                             "position %d from error handler out of bounds",
                             newpos)
             w_replace = space.convert_to_w_unicode(w_replace)
-            replace = w_replace._utf8.decode('utf8')
-            if decode:
-                return replace, newpos
-            else:
-                return replace, None, newpos
+            return w_replace._utf8, newpos
         return call_errorhandler
 
     def make_decode_errorhandler(self, space):
@@ -384,8 +381,7 @@
         func = getattr(unicodehelper, rname)
         utf8len = w_arg._length
         # XXX deal with func() returning length or not
-        result = func(w_arg._utf8, utf8len,
-            errors, state.encode_error_handler)
+        result = func(w_arg._utf8, errors, state.encode_error_handler)
         return space.newtuple([space.newbytes(result), space.newint(utf8len)])
     wrap_encoder.func_name = rname
     globals()[name] = wrap_encoder
@@ -403,7 +399,7 @@
         final = space.is_true(w_final)
         state = space.fromcache(CodecState)
         func = getattr(unicodehelper, rname)
-        result, consumed, length, flag = func(string, len(string), errors,
+        result, consumed, length, flag = func(string, errors,
                                               final, state.decode_error_handler)
         return space.newtuple([space.newutf8(result, length, flag),
                                space.newint(consumed)])
@@ -476,8 +472,6 @@
     try:
         lgt, flag = rutf8.check_utf8(string, allow_surrogates=True)
     except rutf8.CheckError:
-        # XXX do the way around runicode - we can optimize it later if we
-        # decide we care about obscure cases
         res, consumed, lgt, flag = unicodehelper.str_decode_utf8(string,
             len(string), errors, final, state.decode_error_handler)
         return space.newtuple([space.newutf8(res, lgt, flag),
@@ -695,7 +689,7 @@
     unicode_name_handler = state.get_unicodedata_handler(space)
 
     result, consumed, lgt, flag = unicodehelper.str_decode_unicode_escape(
-        string, len(string), errors,
+        string, errors,
         final, state.decode_error_handler,
         unicode_name_handler)
 
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -9,7 +9,6 @@
 from rpython.rlib.rstring import (
     StringBuilder, split, rsplit, UnicodeBuilder, replace_count, startswith,
     endswith)
-from rpython.rlib.runicode import make_unicode_escape_function
 from rpython.rlib import rutf8, jit
 
 from pypy.interpreter import unicodehelper
@@ -48,9 +47,16 @@
         else:
             assert flag == rutf8.FLAG_REGULAR
             self._index_storage = rutf8.null_storage()
+        # XXX checking, remove before any performance measurements
+        #     ifdef not_running_in_benchmark
         lgt, flag_check = rutf8.check_utf8(utf8str, True)
         assert lgt == length
-        assert flag == flag_check
+        if flag_check == rutf8.FLAG_ASCII:
+            # there are cases where we copy part of REGULAR that happens
+            # to be ascii
+            assert flag in (rutf8.FLAG_ASCII, rutf8.FLAG_REGULAR)
+        else:
+            assert flag == flag_check
         # the storage can be one of:
         # - null, unicode with no surrogates
         # - rutf8.UTF8_HAS_SURROGATES
@@ -351,7 +357,7 @@
             elif unicodedb.islower(ch):
                 ch = unicodedb.toupper(ch)
             if ch >= 0x80:
-                flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
             rutf8.unichr_as_utf8_append(builder, ch)
         return W_UnicodeObject(builder.build(), self._length, flag)
 
@@ -376,7 +382,7 @@
             else:
                 ch = unicodedb.tolower(ch)
             if ch >= 0x80:
-                flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
             rutf8.unichr_as_utf8_append(builder, ch)
             previous_is_cased = unicodedb.iscased(ch)
         return builder.build(), flag
@@ -402,7 +408,7 @@
                     codepoint = space.int_w(w_newval)
                 elif isinstance(w_newval, W_UnicodeObject):
                     result.append(w_newval._utf8)
-                    flag = self._combine_flags(flag, w_newval._get_flag())
+                    flag = unicodehelper.combine_flags(flag, w_newval._get_flag())
                     result_length += w_newval._length
                     continue
                 else:
@@ -411,7 +417,7 @@
                                 "or unicode")
             try:
                 if codepoint >= 0x80:
-                    flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
+                    flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
                 rutf8.unichr_as_utf8_append(result, codepoint,
                                             allow_surrogates=True)
                 result_length += 1
@@ -535,7 +541,7 @@
         while pos < len(self._utf8):
             lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos))
             if lower >= 0x80:
-                flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
             rutf8.unichr_as_utf8_append(builder, lower) # XXX allow surrogates?
             pos = rutf8.next_codepoint_pos(self._utf8, pos)
         return W_UnicodeObject(builder.build(), self._len(), flag)
@@ -623,15 +629,6 @@
             return True
         return endswith(value, prefix, start, end)
 
-    @staticmethod
-    def _combine_flags(self_flag, other_flag):
-        if self_flag == rutf8.FLAG_ASCII and other_flag == rutf8.FLAG_ASCII:
-            return rutf8.FLAG_ASCII
-        elif (self_flag == rutf8.FLAG_HAS_SURROGATES or
-              other_flag == rutf8.FLAG_HAS_SURROGATES):
-            return rutf8.FLAG_HAS_SURROGATES
-        return rutf8.FLAG_REGULAR
-
     def _get_flag(self):
         if self.is_ascii():
             return rutf8.FLAG_ASCII
@@ -646,7 +643,7 @@
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
             raise
-        flag = self._combine_flags(self._get_flag(), w_other._get_flag())
+        flag = unicodehelper.combine_flags(self._get_flag(), w_other._get_flag())
         return W_UnicodeObject(self._utf8 + w_other._utf8,
                                self._len() + w_other._len(), flag)
 
@@ -671,7 +668,7 @@
             # XXX Maybe the extra copy here is okay? It was basically going to
             #     happen anyway, what with being placed into the builder
             w_u = self.convert_arg_to_w_unicode(space, w_s)
-            flag = self._combine_flags(flag, w_u._get_flag())
+            flag = unicodehelper.combine_flags(flag, w_u._get_flag())
             unwrapped.append(w_u._utf8)
             lgt += w_u._length
             prealloc_size += len(unwrapped[i])
@@ -723,7 +720,7 @@
             uchar = rutf8.codepoint_at_pos(value, i)
             uchar = unicodedb.toupper(uchar)
             if uchar >= 0x80:
-                flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
             i = rutf8.next_codepoint_pos(value, i)
             rutf8.unichr_as_utf8_append(builder, uchar)
         return W_UnicodeObject(builder.build(), self._length, flag)
@@ -837,14 +834,14 @@
         ch = unicodedb.toupper(uchar)
         rutf8.unichr_as_utf8_append(builder, ch)
         if ch >= 0x80:
-            flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
+            flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
         while i < len(value):
             uchar = rutf8.codepoint_at_pos(value, i)
             i = rutf8.next_codepoint_pos(value, i)
             ch = unicodedb.tolower(uchar)
             rutf8.unichr_as_utf8_append(builder, ch)
             if ch >= 0x80:
-                flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
         return W_UnicodeObject(builder.build(), self._len(), flag)
 
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
@@ -930,7 +927,7 @@
         except OverflowError:
             raise oefmt(space.w_OverflowError, "replace string is too long")
 
-        flag = self._combine_flags(self._get_flag(), w_by._get_flag())
+        flag = unicodehelper.combine_flags(self._get_flag(), w_by._get_flag())
         newlength = self._length + replacements * (w_by._length - w_sub._length)
         return W_UnicodeObject(res, newlength, flag)
 
@@ -1052,7 +1049,7 @@
         if w_fillchar._len() != 1:
             raise oefmt(space.w_TypeError,
                         "rjust() argument 2 must be a single character")
-        flag = self._combine_flags(self._get_flag(), w_fillchar._get_flag())
+        flag = unicodehelper.combine_flags(self._get_flag(), w_fillchar._get_flag())
         d = width - lgt
         if d > 0:
             if len(w_fillchar._utf8) == 1:
@@ -1071,7 +1068,7 @@
         if w_fillchar._len() != 1:
             raise oefmt(space.w_TypeError,
                         "ljust() argument 2 must be a single character")
-        flag = self._combine_flags(self._get_flag(), w_fillchar._get_flag())
+        flag = unicodehelper.combine_flags(self._get_flag(), w_fillchar._get_flag())
         d = width - self._len()
         if d > 0:
             if len(w_fillchar._utf8) == 1:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -452,6 +452,13 @@
     ('ofs', lltype.FixedSizeArray(lltype.Char, 16)))
     ))))
 
+def unichr_to_flag(ch):
+    if ch <= 0x7F:
+        return FLAG_ASCII
+    elif 0xD800 <= ch <= 0xDFFF:
+        return FLAG_HAS_SURROGATES
+    return FLAG_REGULAR
+
 FLAG_REGULAR = 0
 FLAG_HAS_SURROGATES = 1
 FLAG_ASCII = 2