[pypy-commit] pypy utf8-unicode2: Refactor Utf8Builder API some; don't allow .append(<some int>)

Fri Aug 8 09:22:42 CEST 2014

Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72719:b5e27ed82427
Date: 2014-08-08 02:12 -0500
http://bitbucket.org/pypy/pypy/changeset/b5e27ed82427/

Log:	Refactor Utf8Builder API some; don't allow .append(<some int>)

diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -9,10 +9,10 @@
 
 def build_utf8str():
     builder = Utf8Builder()
-    builder.append('A') #0x41
-    builder.append(0x10F) #0xC4 0x8F
-    builder.append(0x20AC) #0xE2 0x82 0xAC
-    builder.append(0x1F63D) #0xF0 0x9F 0x98 0xBD
+    builder.append_ascii('A') #0x41
+    builder.append_codepoint(0x10F) #0xC4 0x8F
+    builder.append_codepoint(0x20AC) #0xE2 0x82 0xAC
+    builder.append_codepoint(0x1F63D) #0xF0 0x9F 0x98 0xBD
     return builder.build()
 
 def test_builder():
@@ -88,7 +88,7 @@
 
 def test_unicode_literal_comparison():
     builder = Utf8Builder()
-    builder.append(0x10F)
+    builder.append_codepoint(0x10F)
     s = builder.build()
     assert s == u'\u010F'
     assert s[0] == u'\u010F'
diff --git a/pypy/interpreter/test/test_utf8_codecs.py b/pypy/interpreter/test/test_utf8_codecs.py
--- a/pypy/interpreter/test/test_utf8_codecs.py
+++ b/pypy/interpreter/test/test_utf8_codecs.py
@@ -58,7 +58,7 @@
             assert t is s
             assert start == startingpos
             assert stop == endingpos
-            return "42424242", None, stop
+            return Utf8Str("42424242"), None, stop
         encoder = self.getencoder(encoding)
         result = encoder(s, len(s), "foo!", errorhandler)
         assert called[0]
@@ -85,8 +85,8 @@
                 assert stop == endingpos
                 if msg is not None:
                     assert errmsg == msg
-                return "42424242", stop
-            return "", endingpos
+                return Utf8Str("42424242"), stop
+            return Utf8Str(""), endingpos
         decoder = self.getdecoder(encoding)
         if addstuff:
             s += "some rest in ascii"
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -21,7 +21,7 @@
     # Like unichr, but returns a Utf8Str object
     # TODO: Do this without the builder so its faster
     b = Utf8Builder()
-    b.append(value)
+    b.append_codepoint(value)
     return b.build()
 
 def utf8ord_bytes(bytes, start):
@@ -130,6 +130,13 @@
             else:
                 self._len = len(data)
 
+        if not we_are_translated():
+            self.bytes.decode('utf8')
+
+            if self._is_ascii:
+                for i in self.bytes:
+                    assert ord(i) < 128
+
     def _calc_length(self):
         pos = 0
         length = 0
@@ -559,15 +566,15 @@
                     i += 1
                     c2 = intmask(array[i])
                     if c2 == 0:
-                        builder.append(c)
+                        builder.append_codepoint(c)
                         break
                     elif not (0xDC00 <= c2 <= 0xDFFF):
-                        builder.append(c)
+                        builder.append_codepoint(c)
                         c = c2
                     else:
                         c = (((c & 0x3FF)<<10) | (c2 & 0x3FF)) + 0x10000;
 
-            builder.append(c)
+            builder.append_codepoint(c)
             i += 1
 
         return builder.build()
@@ -587,15 +594,15 @@
                     i += 1
                     c2 = intmask(array[i])
                     if c2 == 0:
-                        builder.append(c)
+                        builder.append_codepoint(c)
                         break
                     elif not (0xDC00 <= c2 <= 0xDFFF):
-                        builder.append(c)
+                        builder.append_codepoint(c)
                         c = c2
                     else:
                         c = (((c & 0x3FF)<<10) | (c2 & 0x3FF)) + 0x10000;
 
-            builder.append(c)
+            builder.append_codepoint(c)
             i += 1
 
         return builder.build()
@@ -613,12 +620,12 @@
                     i += 1
                     c2 = intmask(array[i])
                     if not (0xDC00 <= c2 <= 0xDFFF):
-                        builder.append(c)
+                        builder.append_codepoint(c)
                         c = c2
                     else:
                         c = (((c & 0x3FF)<<10) | (c2 & 0x3FF)) + 0x10000;
 
-            builder.append(c)
+            builder.append_codepoint(c)
             i += 1
 
         return builder.build()
@@ -634,42 +641,54 @@
         self._length = 0
 
 
+    def append_codepoint(self, c):
+        if c < 0x80:
+            self._builder.append(chr(c))
+        elif c < 0x800:
+            self._builder.append(chr(0xC0 | (c >> 6)))
+            self._builder.append(chr(0x80 | (c & 0x3F)))
+            self._is_ascii = False
+        elif c < 0x10000:
+            self._builder.append(chr(0xE0 | (c >> 12)))
+            self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
+            self._builder.append(chr(0x80 | (c & 0x3F)))
+            self._is_ascii = False
+        elif c <= 0x10FFFF:
+            self._builder.append(chr(0xF0 | (c >> 18)))
+            self._builder.append(chr(0x80 | (c >> 12 & 0x3F)))
+            self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
+            self._builder.append(chr(0x80 | (c & 0x3F)))
+            self._is_ascii = False
+        else:
+            raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
+        self._length += 1
+
+    def append_ascii(self, str):
+        if not we_are_translated():
+            # XXX For testing purposes, make sure this is actually ascii
+            for i in str:
+                assert ord(i) < 128
+
+        self._builder.append(str)
+        self._length += len(str)
+
+    def append_utf8(self, ustr):
+        self._builder.append(ustr.bytes)
+        if not ustr._is_ascii:
+            self._is_ascii = False
+        self._length += len(ustr)
+
+    def _append_bytes(self, bytes, is_ascii=False):
+        # XXX This breaks getlength()
+        self._builder.append(bytes)
+        self._is_ascii = self._is_ascii and is_ascii
+
     @specialize.argtype(1)
     def append(self, c):
         if isinstance(c, Utf8Str):
-            self._builder.append(c.bytes)
-            if not c._is_ascii:
-                self._is_ascii = False
-            self._length += len(c)
-
-        elif isinstance(c, int):
-            if c < 0x80:
-                self._builder.append(chr(c))
-            elif c < 0x800:
-                self._builder.append(chr(0xC0 | (c >> 6)))
-                self._builder.append(chr(0x80 | (c & 0x3F)))
-                self._is_ascii = False
-            elif c < 0x10000:
-                self._builder.append(chr(0xE0 | (c >> 12)))
-                self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
-                self._builder.append(chr(0x80 | (c & 0x3F)))
-                self._is_ascii = False
-            elif c <= 0x10FFFF:
-                self._builder.append(chr(0xF0 | (c >> 18)))
-                self._builder.append(chr(0x80 | (c >> 12 & 0x3F)))
-                self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
-                self._builder.append(chr(0x80 | (c & 0x3F)))
-                self._is_ascii = False
-            else:
-                raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
-            self._length += 1
+            self.append_utf8(c)
         else:
-            assert isinstance(c, str)
-            self._builder.append(c)
-
-            # XXX The assumption here is that the bytes being appended are
-            #     ASCII, ie 1:1 byte:char
-            self._length += len(c)
+            self.append_ascii(c)
 
     @specialize.argtype(1)
     def append_slice(self, s, start, end):
@@ -685,18 +704,12 @@
                             type(s))
         self._length += end - start
 
-    @specialize.argtype(1)
     def append_multiple_char(self, c, count):
         # TODO: What do I do when I have an int? Is it fine to just loop over
         #       .append(c) then? Should (can) I force a resize first?
-        if isinstance(c, int):
-            self._builder.append_multiple_char(chr(c), count)
-            return
-
-        if isinstance(c, str):
-            self._builder.append_multiple_char(c, count)
-        else:
-            self._builder.append_multiple_char(c.bytes, count)
+        if ord(c) > 127:
+            raise ValueError("an ascii char is required")
+        self._builder.append_multiple_char(c, count)
         self._length += count
 
     def getlength(self):
@@ -705,6 +718,7 @@
     def build(self):
         return Utf8Str(self._builder.build(), self._is_ascii)
 
+
 class WCharContextManager(object):
     def __init__(self, str):
         self.str = str
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -34,7 +34,7 @@
 
         # Non-escape characters are interpreted as Unicode ordinals
         if ch != '\\':
-            builder.append(ord(ch))
+            builder.append_codepoint(ord(ch))
             pos += 1
             continue
 
@@ -44,23 +44,23 @@
             message = "\\ at end of string"
             res, pos = errorhandler(errors, "unicodeescape",
                                     message, s, pos-1, size)
-            builder.append(res)
+            builder.append_utf8(res)
             continue
 
         ch = s[pos]
         pos += 1
         # \x escapes
         if ch == '\n': pass
-        elif ch == '\\': builder.append('\\')
-        elif ch == '\'': builder.append('\'')
-        elif ch == '\"': builder.append('\"')
-        elif ch == 'b' : builder.append('\b')
-        elif ch == 'f' : builder.append('\f')
-        elif ch == 't' : builder.append('\t')
-        elif ch == 'n' : builder.append('\n')
-        elif ch == 'r' : builder.append('\r')
-        elif ch == 'v' : builder.append('\v')
-        elif ch == 'a' : builder.append('\a')
+        elif ch == '\\': builder.append_ascii('\\')
+        elif ch == '\'': builder.append_ascii('\'')
+        elif ch == '\"': builder.append_ascii('\"')
+        elif ch == 'b' : builder.append_ascii('\b')
+        elif ch == 'f' : builder.append_ascii('\f')
+        elif ch == 't' : builder.append_ascii('\t')
+        elif ch == 'n' : builder.append_ascii('\n')
+        elif ch == 'r' : builder.append_ascii('\r')
+        elif ch == 'v' : builder.append_ascii('\v')
+        elif ch == 'a' : builder.append_ascii('\a')
         elif '0' <= ch <= '7':
             x = ord(ch) - ord('0')
             if pos < size:
@@ -73,7 +73,7 @@
                         if '0' <= ch <= '7':
                             pos += 1
                             x = (x<<3) + ord(ch) - ord('0')
-            builder.append(x)
+            builder.append_codepoint(x)
         # hex escapes
         # \xXX
         elif ch == 'x':
@@ -105,7 +105,7 @@
                            "(can't load unicodedata module)")
                 res, pos = errorhandler(errors, "unicodeescape",
                                         message, s, pos-1, size)
-                builder.append(res)
+                builder.append_utf8(res)
                 continue
 
             if look < size and s[look] == '{':
@@ -120,21 +120,21 @@
                     if code < 0:
                         res, pos = errorhandler(errors, "unicodeescape",
                                                 message, s, pos-1, look+1)
-                        builder.append(res)
+                        builder.append_utf8(res)
                         continue
                     pos = look + 1
-                    builder.append(code)
+                    builder.append_codepoint(code)
                 else:
                     res, pos = errorhandler(errors, "unicodeescape",
                                             message, s, pos-1, look+1)
-                    builder.append(res)
+                    builder.append_utf8(res)
             else:
                 res, pos = errorhandler(errors, "unicodeescape",
                                         message, s, pos-1, look+1)
-                builder.append(res)
+                builder.append_utf8(res)
         else:
-            builder.append('\\')
-            builder.append(ord(ch))
+            builder.append_ascii('\\')
+            builder.append_codepoint(ord(ch))
 
     return builder.build(), pos
 
@@ -149,7 +149,7 @@
             endinpos += 1
         res, pos = errorhandler(errors, encoding,
                                 message, s, pos-2, endinpos)
-        builder.append(res)
+        builder.append_utf8(res)
     else:
         try:
             chr = r_uint(int(s[pos:pos+digits], 16))
@@ -159,18 +159,18 @@
                 endinpos += 1
             res, pos = errorhandler(errors, encoding,
                                     message, s, pos-2, endinpos)
-            builder.append(res)
+            builder.append_utf8(res)
         else:
             # when we get here, chr is a 32-bit unicode character
             if chr <= MAXUNICODE:
-                builder.append(intmask(chr))
+                builder.append_codepoint(intmask(chr))
                 pos += digits
 
             else:
                 message = "illegal Unicode character"
                 res, pos = errorhandler(errors, encoding,
                                         message, s, pos-2, pos+digits)
-                builder.append(res)
+                builder.append_utf8(res)
     return pos
 
 def make_unicode_escape_function(pass_printable=False, unicode_output=False,
@@ -288,7 +288,7 @@
 
         # Non-escape characters are interpreted as Unicode ordinals
         if ch != '\\':
-            result.append(ord(ch))
+            result.append_codepoint(ord(ch))
             pos += 1
             continue
 
@@ -299,18 +299,18 @@
             pos += 1
             if pos == size or s[pos] != '\\':
                 break
-            result.append('\\')
+            result.append_ascii('\\')
 
         # we have a backslash at the end of the string, stop here
         if pos >= size:
-            result.append('\\')
+            result.append_ascii('\\')
             break
 
         if ((pos - bs) & 1 == 0 or
             pos >= size or
             (s[pos] != 'u' and s[pos] != 'U')):
-            result.append('\\')
-            result.append(ord(s[pos]))
+            result.append_ascii('\\')
+            result.append_codepoint(ord(s[pos]))
             pos += 1
             continue
 
@@ -350,7 +350,7 @@
     pos = 0
     result = Utf8Builder(size)
     while pos < size:
-        result.append(ord(s[pos]))
+        result.append_codepoint(ord(s[pos]))
         pos += 1
     return result.build(), pos
 
@@ -370,12 +370,12 @@
     while pos < size:
         c = s[pos]
         if ord(c) < 128:
-            result.append(c)
+            result.append_ascii(c)
             pos += 1
         else:
             r, pos = errorhandler(errors, "ascii", "ordinal not in range(128)",
                                   s,  pos, pos + 1)
-            result.append(r)
+            result.append_utf8(r)
     return result.build(), pos
 
 
@@ -416,8 +416,9 @@
                 # py3k only
                 result.append(rs)
                 continue
+
             for ch in ru:
-                cd = utf8.ORD(ch, 0)
+                cd = utf8ord(ch, 0)
                 if cd < limit:
                     result.append(chr(cd))
                 else:
@@ -470,9 +471,10 @@
             if (iter.pos != len(s) and oc <= 0xDBFF and
                 0xDC00 <= iter.peek_next() <= 0xDFFF):
                 oc2 = iter.next()
-                result.append(((oc - 0xD800) << 10 | (oc2 - 0xDC00)) + 0x10000)
+                result.append_codepoint(
+                        ((oc - 0xD800) << 10 | (oc2 - 0xDC00)) + 0x10000)
             elif allow_surrogates:
-                result.append(oc)
+                result.append_codepoint(oc)
             else:
                 ru, rs, pos = errorhandler(errors, 'utf8',
                                         'surrogates not allowed', s,
@@ -480,17 +482,17 @@
                 iter.move(pos - iter.pos)
                 if rs is not None:
                     # py3k only
-                    result.append(rs)
+                    result.append_utf8(rs)
                     continue
                 for ch in ru:
                     if ord(ch) < 0x80:
-                        result.append(ch)
+                        result.append_ascii(ch)
                     else:
                         errorhandler('strict', 'utf8',
                                     'surrogates not allowed',
                                     s, pos-1, pos)
         else:
-            result.append(oc)
+            result.append_codepoint(oc)
 
     return result.build().bytes
 
@@ -516,7 +518,7 @@
         # fast path for ASCII
         # XXX maybe use a while loop here
         if ordch1 < 0x80:
-            result.append(ordch1)
+            result.append_codepoint(ordch1)
             pos += 1
             continue
 
@@ -532,7 +534,7 @@
                 r, pos = errorhandler(errors, 'utf8',
                                       'unexpected end of data',
                                       s, pos, pos+1)
-                result.append(r)
+                result.append_utf8(r)
                 break
             ordch2 = ord(s[pos+1])
             if n == 3:
@@ -544,14 +546,14 @@
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
                                           s, pos, pos+1)
-                    result.append(r)
+                    result.append_utf8(r)
                     continue
                 else:
                     # second byte valid, but third byte missing
                     r, pos = errorhandler(errors, 'utf8',
                                       'unexpected end of data',
                                       s, pos, pos+2)
-                    result.append(r)
+                    result.append_utf8(r)
                     break
             elif n == 4:
                 # 4-bytes seq with 1 or 2 continuation bytes
@@ -562,28 +564,28 @@
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
                                           s, pos, pos+1)
-                    result.append(r)
+                    result.append_utf8(r)
                     continue
                 elif charsleft == 2 and ord(s[pos+2])>>6 != 0x2:   # 0b10
                     # third byte invalid, take the first two and continue
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
                                           s, pos, pos+2)
-                    result.append(r)
+                    result.append_utf8(r)
                     continue
                 else:
                     # there's only 1 or 2 valid cb, but the others are missing
                     r, pos = errorhandler(errors, 'utf8',
                                       'unexpected end of data',
                                       s, pos, pos+charsleft+1)
-                    result.append(r)
+                    result.append_utf8(r)
                     break
 
         if n == 0:
             r, pos = errorhandler(errors, 'utf8',
                                   'invalid start byte',
                                   s, pos, pos+1)
-            result.append(r)
+            result.append_utf8(r)
 
         elif n == 1:
             assert 0, "ascii should have gone through the fast path"
@@ -594,11 +596,11 @@
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
-                result.append(r)
+                result.append_utf8(r)
                 continue
             # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
-            result.append(((ordch1 & 0x1F) << 6) +    # 0b00011111
-                           (ordch2 & 0x3F))           # 0b00111111
+            result.append_codepoint(((ordch1 & 0x1F) << 6) +    # 0b00011111
+                                     (ordch2 & 0x3F))           # 0b00111111
             pos += 2
 
         elif n == 3:
@@ -612,18 +614,18 @@
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
-                result.append(r)
+                result.append_utf8(r)
                 continue
             elif ordch3>>6 != 0x2:     # 0b10
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
-                result.append(r)
+                result.append_utf8(r)
                 continue
             # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
-            result.append((((ordch1 & 0x0F) << 12) +     # 0b00001111
-                           ((ordch2 & 0x3F) << 6) +      # 0b00111111
-                            (ordch3 & 0x3F)))            # 0b00111111
+            result.append_codepoint((((ordch1 & 0x0F) << 12) +     # 0b00001111
+                                     ((ordch2 & 0x3F) << 6) +      # 0b00111111
+                                      (ordch3 & 0x3F)))            # 0b00111111
             pos += 3
 
         elif n == 4:
@@ -636,19 +638,19 @@
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
-                result.append(r)
+                result.append_utf8(r)
                 continue
             elif ordch3>>6 != 0x2:     # 0b10
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
-                result.append(r)
+                result.append_utf8(r)
                 continue
             elif ordch4>>6 != 0x2:     # 0b10
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+3)
-                result.append(r)
+                result.append_utf8(r)
                 continue
             # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
             c = (((ordch1 & 0x07) << 18) +      # 0b00000111
@@ -659,7 +661,7 @@
             # TODO: Why doesn't this raise an error when c > MAXUNICODE? If I'm
             #       converting utf8 -> utf8 is this necessary
             if c <= MAXUNICODE:
-                result.append(c)
+                result.append_codepoint(c)
             pos += 4
 
     return pos
@@ -748,13 +750,13 @@
                 break
             r, pos = errorhandler(errors, 'utf16', "truncated data",
                                   s, pos, len(s))
-            result.append(r)
+            result.append_utf8(r)
             if len(s) - pos < 2:
                 break
         ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
         pos += 2
         if ch < 0xD800 or ch > 0xDFFF:
-            result.append(ch)
+            result.append_codepoint(ch)
             continue
         # UTF-16 code pair:
         if len(s) - pos < 2:
@@ -763,26 +765,26 @@
                 break
             errmsg = "unexpected end of data"
             r, pos = errorhandler(errors, 'utf16', errmsg, s, pos, len(s))
-            result.append(r)
+            result.append_utf8(r)
             if len(s) - pos < 2:
                 break
         elif 0xD800 <= ch <= 0xDBFF:
             ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
             pos += 2
             if 0xDC00 <= ch2 <= 0xDFFF:
-                result.append((((ch & 0x3FF)<<10) |
-                              (ch2 & 0x3FF)) + 0x10000)
+                result.append_codepoint((((ch & 0x3FF)<<10) |
+                                          (ch2 & 0x3FF)) + 0x10000)
                 continue
             else:
                 r, pos = errorhandler(errors, 'utf16',
                                       "illegal UTF-16 surrogate",
                                       s, pos - 4, pos - 2)
-                result.append(r)
+                result.append_utf8(r)
         else:
             r, pos = errorhandler(errors, 'utf16',
                                   "illegal encoding",
                                   s, pos - 2, pos)
-            result.append(r)
+            result.append_utf8(r)
     return result.build(), pos, bo
 
 def create_surrogate_pair(val):
@@ -930,7 +932,7 @@
                 break
             r, pos = errorhandler(errors, encodingname, "truncated data",
                                   s, pos, len(s))
-            result.append(r)
+            result.append_utf8(r)
             if len(s) - pos < 4:
                 break
             continue
@@ -940,10 +942,10 @@
             r, pos = errorhandler(errors, encodingname,
                                   "codepoint not in range(0x110000)",
                                   s, pos, len(s))
-            result.append(r)
+            result.append_utf8(r)
             continue
 
-        result.append(ch)
+        result.append_codepoint(ch)
         pos += 4
     return result.build(), pos, bo
 
@@ -1131,19 +1133,19 @@
                     if surrogate:
                         # expecting a second surrogate
                         if outCh >= 0xDC00 and outCh <= 0xDFFFF:
-                            result.append((((surrogate & 0x3FF)<<10) |
-                                           (outCh & 0x3FF)) + 0x10000)
+                            result.append_codepoint((((surrogate & 0x3FF)<<10) |
+                                                      (outCh & 0x3FF)) + 0x10000)
                             surrogate = 0
                             continue
                         else:
-                            result.append(surrogate)
+                            result.append_codepoint(surrogate)
                             surrogate = 0
                             # Not done with outCh: falls back to next line
                     if outCh >= 0xD800 and outCh <= 0xDBFF:
                         # first surrogate
                         surrogate = outCh
                     else:
-                        result.append(outCh)
+                        result.append_codepoint(outCh)
 
             else:
                 # now leaving a base-64 section
@@ -1151,7 +1153,7 @@
                 pos += 1
 
                 if surrogate:
-                    result.append(surrogate)
+                    result.append_codepoint(surrogate)
                     surrogate = 0
 
                 if base64bits > 0: # left-over bits
@@ -1160,7 +1162,7 @@
                         msg = "partial character in shift sequence"
                         res, pos = errorhandler(errors, 'utf7',
                                                 msg, s, pos-1, pos)
-                        result.append(res)
+                        result.append_utf8(res)
                         continue
                     else:
                         # Some bits remain; they should be zero
@@ -1168,7 +1170,7 @@
                             msg = "non-zero padding bits in shift sequence"
                             res, pos = errorhandler(errors, 'utf7',
                                                     msg, s, pos-1, pos)
-                            result.append(res)
+                            result.append_utf8(res)
                             continue
 
                 if ch == '-':
@@ -1178,13 +1180,13 @@
                     base64buffer = 0
                     surrogate = 0
                 else:
-                    result.append(ch)
+                    result.append_codepoint(oc)
 
         elif ch == '+':
             pos += 1 # consume '+'
             if pos < size and s[pos] == '-': # '+-' encodes '+'
                 pos += 1
-                result.append('+')
+                result.append_ascii('+')
             else: # begin base64-encoded section
                 inShift = 1
                 shiftOutStartPos = pos - 1
@@ -1192,13 +1194,13 @@
                 base64buffer = 0
 
         elif _utf7_DECODE_DIRECT(oc): # character decodes at itself
-            result.append(chr(oc))
+            result.append_codepoint(oc)
             pos += 1
         else:
             pos += 1
             msg = "unexpected special character"
             res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
-            result.append(res)
+            result.append_utf8(res)
 
     # end of string
 
@@ -1209,7 +1211,7 @@
             (base64bits > 0 and base64buffer != 0)):
             msg = "unterminated shift sequence"
             res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, pos)
-            result.append(res)
+            result.append_utf8(res)
     elif inShift:
         pos = shiftOutStartPos # back off output
 
@@ -1298,9 +1300,9 @@
             r, pos = errorhandler(errors, "charmap",
                                   "character maps to <undefined>",
                                   s,  pos, pos + 1)
-            result.append(r)
+            result.append_utf8(r)
             continue
-        result.append(c)
+        result.append_utf8(c)
         pos += 1
     return result.build(), pos
 
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -231,9 +231,9 @@
         pos = start
         while pos < end:
             code = utf8ord(obj, pos)
-            builder.append("&#")
-            builder.append(str(code))
-            builder.append(";")
+            builder.append_ascii("&#")
+            builder.append_ascii(str(code))
+            builder.append_ascii(";")
             pos += 1
         return space.newtuple([space.wrap(builder.build()), w_end])
     else:
@@ -254,13 +254,13 @@
             oc = utf8ord(obj, pos)
             num = hex(oc)
             if (oc >= 0x10000):
-                builder.append("\\U")
+                builder.append_ascii("\\U")
                 zeros = 8
             elif (oc >= 0x100):
-                builder.append("\\u")
+                builder.append_ascii("\\u")
                 zeros = 4
             else:
-                builder.append("\\x")
+                builder.append_ascii("\\x")
                 zeros = 2
             lnum = len(num)
             nb = zeros + 2 - lnum # num starts with '0x'
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -141,9 +141,9 @@
                         i += 1
                     else:
                         seennl |= SEEN_CR
-                    builder.append('\n')
+                    builder.append_ascii('\n')
                     continue
-                builder.append(c)
+                builder.append_codepoint(c)
             output = builder.build()
 
         self.seennl |= seennl
@@ -614,7 +614,7 @@
         # Keep reading chunks until we have n characters to return
         while True:
             data = self._get_decoded_chars(remaining)
-            builder.append(data)
+            builder.append_utf8(data)
             remaining -= len(data)
 
             if remaining <= 0: # Done
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -418,8 +418,8 @@
                 result.append_multiple_char(' ', padding)
                 # pad with spaces on the left
             if sign:
-                # TODO: Why r[0]?
                 result.append(r[0])        # the sign
+            # prefix is only ever '' or '0x', i.e. always ASCII
             result.append(prefix)               # the prefix
             if padnumber == '0':
                 result.append_multiple_char('0', padding)
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -7,7 +7,7 @@
 from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, ORD, utf8chr
 from pypy.interpreter.utf8_codecs import (
     unicode_encode_latin_1, unicode_encode_ascii, str_decode_ascii)
-from rpython.rlib import rstring, runicode, rlocale, rfloat, jit
+from rpython.rlib import rstring, rlocale, rfloat, jit
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rfloat import copysign, formatd
 
@@ -698,6 +698,7 @@
             need_separator = False
             done = False
             previous = 0
+
             while True:
                 group = ord(grouping[grouping_state])
                 if group > 0:
@@ -750,9 +751,7 @@
 
             if spec.n_sign:
                 if self.is_unicode:
-                    # TODO: A better way to do this might be to check if
-                    # spec.sign < 127 ...
-                    sign  = str_decode_ascii(chr(spec.sign), 1, 'strict')[0]
+                    sign = str_decode_ascii(chr(spec.sign), 1, 'strict')[0]
                 else:
                     sign = chr(spec.sign)
                 out.append(sign)