[pypy-commit] pypy unicode-utf8: fixes until we get to formatting problems

Mon Nov 20 10:55:02 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r93101:f074b4987d57
Date: 2017-11-20 16:54 +0100
http://bitbucket.org/pypy/pypy/changeset/f074b4987d57/

Log:	fixes until we get to formatting problems

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1759,20 +1759,6 @@
 
     def utf8_w(self, w_obj):
         return w_obj.utf8_w(self)
-
-    @specialize.argtype(1)
-    def unicode_w(self, w_obj):
-        return self.utf8_w(w_obj).decode('utf8')
-
-    def realunicode_w(self, w_obj):
-        return self.realutf8_w(w_obj).decode('utf8')
-
-    def newunicode(self, u):
-        from pypy.interpreter import unicodehelper
-        assert isinstance(u, unicode)
-        # XXX let's disallow that
-        return self.newutf8(u.encode("utf8"), len(u), unicodehelper._get_flag(u))
-
     def convert_to_w_unicode(self, w_obj):
         return w_obj.convert_to_w_unicode(self)
 
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -61,10 +61,10 @@
 
 @given(strategies.text())
 def test_unicode_raw_escape(u):
-    r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict')
+    r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None)
     assert r == u.encode("raw-unicode-escape")
 
 @given(strategies.text())
 def test_unicode_escape(u):
-    r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict")
+    r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
     assert r == u.encode("unicode-escape")
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -60,14 +60,12 @@
             return True
     return False
 
-def _get_flag(u):
-    flag = rutf8.FLAG_ASCII
-    for c in u:
-        if 0xD800 <= ord(c) <= 0xDFFF:
-            return rutf8.FLAG_HAS_SURROGATES
-        if ord(c) >= 0x80:
-            flag = rutf8.FLAG_REGULAR
-    return flag
+def get_flag_from_code(oc):
+    if oc <= 0x7F:
+        return rutf8.FLAG_ASCII
+    if 0xD800 <= oc <= 0xDFFF:
+        return rutf8.FLAG_HAS_SURROGATES
+    return rutf8.FLAG_REGULAR
 
 # These functions take and return unwrapped rpython strings
 def decode_unicode_escape(space, string):
@@ -134,7 +132,11 @@
     return ress, len(s), lgt, flag
 
 def str_decode_latin_1(s, errors, final, errorhandler):
-    xxx
+    try:
+        rutf8.check_ascii(s)
+        return s, len(s), len(s), rutf8.FLAG_ASCII
+    except rutf8.CheckError:
+        return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
 
 def utf8_encode_latin_1(s, errors, errorhandler):
     try:
@@ -208,7 +210,6 @@
     slen = len(s)
     res = StringBuilder(slen)
     pos = 0
-    continuation_bytes = 0
     end = len(s)
     while pos < end:
         ordch1 = ord(s[pos])
@@ -229,6 +230,7 @@
         if ordch1 <= 0xDF:
             if pos >= end:
                 if not final:
+                    pos -= 1
                     break
                 r, pos = errorhandler(errors, "utf8", "unexpected end of data",
                     s, pos - 1, pos)
@@ -243,7 +245,6 @@
                 continue
             # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
             pos += 1
-            continuation_bytes += 1
             res.append(chr(ordch1))
             res.append(chr(ordch2))
             continue
@@ -251,6 +252,7 @@
         if ordch1 <= 0xEF:
             if (pos + 2) > end:
                 if not final:
+                    pos -= 1
                     break
                 r, pos = errorhandler(errors, "utf8", "unexpected end of data",
                     s, pos - 1, pos + 1)
@@ -272,7 +274,6 @@
             pos += 2
 
             # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
-            continuation_bytes += 2
             res.append(chr(ordch1))
             res.append(chr(ordch2))
             res.append(chr(ordch3))
@@ -281,6 +282,7 @@
         if ordch1 <= 0xF4:
             if (pos + 3) > end:
                 if not final:
+                    pos -= 1
                     break
                 r, pos = errorhandler(errors, "utf8", "unexpected end of data",
                     s, pos - 1, pos)
@@ -312,15 +314,12 @@
             res.append(chr(ordch2))
             res.append(chr(ordch3))
             res.append(chr(ordch4))
-            continuation_bytes += 3
             continue
 
         r, pos = errorhandler(errors, "utf8", "invalid start byte",
                 s, pos - 1, pos)
         res.append(r)
 
-    assert pos == end
-    assert pos - continuation_bytes >= 0
     r = res.build()
     lgt, flag = rutf8.check_utf8(r, True)
     return r, pos, lgt, flag
@@ -352,19 +351,14 @@
         else:
             # when we get here, chr is a 32-bit unicode character
             if chr > 0x10ffff:
-                UUU
                 message = "illegal Unicode character"
                 res, pos = errorhandler(errors, encoding,
                                         message, s, pos-2, pos+digits)
+                size, flag = rutf8.check_utf8(res)
                 builder.append(res)
             else:
                 rutf8.unichr_as_utf8_append(builder, chr, True)
-                if chr <= 0x7f:
-                    flag = rutf8.FLAG_ASCII
-                elif 0xd800 <= chr <= 0xdfff:
-                    flag = rutf8.FLAG_HAS_SURROGATES
-                else:
-                    flag = rutf8.FLAG_REGULAR
+                flag = get_flag_from_code(chr)
                 pos += digits
                 size = 1
 
@@ -508,22 +502,22 @@
                         builder.append(res)
                         continue
                     pos = look + 1
-                    XXX
-                    if code <= MAXUNICODE:
-                        builder.append(UNICHR(code))
-                    else:
-                        code -= 0x10000L
-                        builder.append(unichr(0xD800 + (code >> 10)))
-                        builder.append(unichr(0xDC00 + (code & 0x03FF)))
+                    outsize += 1
+                    flag = combine_flags(flag, get_flag_from_code(code))
+                    rutf8.unichr_as_utf8_append(builder, code)
                 else:
-                    YYY
                     res, pos = errorhandler(errors, "unicodeescape",
                                             message, s, pos-1, look+1)
+                    newsize, newflag = rutf8.check_utf8(res, True)
+                    flag = combine_flags(flag, newflag)
+                    outsize += newsize
                     builder.append(res)
             else:
-                AAA
                 res, pos = errorhandler(errors, "unicodeescape",
                                         message, s, pos-1, look+1)
+                newsize, newflag = rutf8.check_utf8(res, True)
+                flag = combine_flags(flag, newflag)
+                outsize += newsize
                 builder.append(res)
         else:
             builder.append('\\')
@@ -602,7 +596,7 @@
     for i in range(zeros-1, -1, -1):
         result.append(TABLE[(char >> (4 * i)) & 0x0f])
 
-def utf8_encode_raw_unicode_escape(s, errors, errorhandler=None):
+def utf8_encode_raw_unicode_escape(s, errors, errorhandler):
     # errorhandler is not used: this function cannot cause Unicode errors
     size = len(s)
     if size == 0:
@@ -621,7 +615,7 @@
     return result.build()
 
 
-def utf8_encode_unicode_escape(s, errors):
+def utf8_encode_unicode_escape(s, errors, errorhandler):
     return _utf8_encode_unicode_escape(s)
 
 # ____________________________________________________________
@@ -851,7 +845,7 @@
     assert final_length >= 0
     return result.build()[:final_length], pos, outsize, flag
 
-def utf8_encode_utf_7(s, errors, errorhandler=None):
+def utf8_encode_utf_7(s, errors, errorhandler):
     size = len(s)
     if size == 0:
         return ''
@@ -1294,3 +1288,153 @@
                              errorhandler=None, allow_surrogates=True):
     return unicode_encode_utf_32_helper(s, errors, errorhandler,
                                         allow_surrogates, "little")
+
+# ____________________________________________________________
+# unicode-internal
+
+def str_decode_unicode_internal(s, errors, final=False,
+                                errorhandler=None):
+    size = len(s)
+    if size == 0:
+        return '', 0, 0, rutf8.FLAG_ASCII
+
+    unicode_bytes = 4
+    if BYTEORDER == "little":
+        start = 0
+        stop = unicode_bytes
+        step = 1
+    else:
+        start = unicode_bytes - 1
+        stop = -1
+        step = -1
+
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        if pos > size - unicode_bytes:
+            res, pos = errorhandler(errors, "unicode_internal",
+                                    "truncated input",
+                                    s, pos, size)
+            result.append(res)
+            if pos > size - unicode_bytes:
+                break
+            continue
+        t = r_uint(0)
+        h = 0
+        for j in range(start, stop, step):
+            t += r_uint(ord(s[pos + j])) << (h*8)
+            h += 1
+        if t > 0x10ffff:
+            res, pos = errorhandler(errors, "unicode_internal",
+                                    "unichr(%d) not in range" % (t,),
+                                    s, pos, pos + unicode_bytes)
+            result.append(res)
+            continue
+        rutf8.unichr_as_utf8_append(result, intmask(t))
+        pos += unicode_bytes
+    r = result.build()
+    lgt, flag = rutf8.check_utf8(r, True)
+    return r, pos, lgt, flag
+
+def utf8_encode_unicode_internal(s, errors, errorhandler):
+    size = len(s)
+    if size == 0:
+        return ''
+
+    result = StringBuilder(size * 4)
+    pos = 0
+    while pos < size:
+        oc = rutf8.codepoint_at_pos(s, pos)
+        if BYTEORDER == "little":
+            result.append(chr(oc       & 0xFF))
+            result.append(chr(oc >>  8 & 0xFF))
+            result.append(chr(oc >> 16 & 0xFF))
+            result.append(chr(oc >> 24 & 0xFF))
+        else:
+            result.append(chr(oc >> 24 & 0xFF))
+            result.append(chr(oc >> 16 & 0xFF))
+            result.append(chr(oc >>  8 & 0xFF))
+            result.append(chr(oc       & 0xFF))
+        pos = rutf8.next_codepoint_pos(s, pos)
+
+    return result.build()
+
+# ____________________________________________________________
+# Charmap
+
+ERROR_CHAR = u'\ufffe'.encode('utf8')
+
+@specialize.argtype(4)
+def str_decode_charmap(s, errors, final=False,
+                       errorhandler=None, mapping=None):
+    "mapping can be a rpython dictionary, or a dict-like object."
+
+    # Default to Latin-1
+    if mapping is None:
+        return str_decode_latin_1(s, errors, final=final,
+                                  errorhandler=errorhandler)
+    size = len(s)
+    if size == 0:
+        return '', 0, 0, rutf8.FLAG_ASCII
+
+    pos = 0
+    result = StringBuilder(size)
+    while pos < size:
+        ch = s[pos]
+
+        c = mapping.get(ch, ERROR_CHAR)
+        if c == ERROR_CHAR:
+            r, pos = errorhandler(errors, "charmap",
+                                  "character maps to <undefined>",
+                                  s,  pos, pos + 1)
+            result.append(r)
+            continue
+        result.append(c)
+        pos += 1
+    r = result.build()
+    lgt, flag = rutf8.check_utf8(r, True)
+    return r, pos, lgt, flag
+
+def utf8_encode_charmap(s, errors, errorhandler=None,
+                           mapping=None):
+    YYY
+    if mapping is None:
+        return unicode_encode_latin_1(s, size, errors,
+                                      errorhandler=errorhandler)
+
+    if errorhandler is None:
+        errorhandler = default_unicode_error_encode
+
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        c = mapping.get(ch, '')
+        if len(c) == 0:
+            # collect all unencodable chars. Important for narrow builds.
+            collend = pos + 1
+            while collend < size and mapping.get(s[collend], '') == '':
+                collend += 1
+            ru, rs, pos = errorhandler(errors, "charmap",
+                                       "character maps to <undefined>",
+                                       s, pos, collend)
+            if rs is not None:
+                # py3k only
+                result.append(rs)
+                continue
+            for ch2 in ru:
+                c2 = mapping.get(ch2, '')
+                if len(c2) == 0:
+                    errorhandler(
+                        "strict", "charmap",
+                        "character maps to <undefined>",
+                        s,  pos, pos + 1)
+                result.append(c2)
+            continue
+        result.append(c)
+        pos += 1
+    return result.build()
+
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,7 +1,6 @@
 from rpython.rlib import jit, rutf8
 from rpython.rlib.objectmodel import we_are_translated, not_rpython
 from rpython.rlib.rstring import UnicodeBuilder
-from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -563,14 +562,14 @@
 
         if space.isinstance_w(w_ch, space.w_unicode):
             # Charmap may return a unicode string
-            return space.unicode_w(w_ch)
+            return space.utf8_w(w_ch)
         elif space.isinstance_w(w_ch, space.w_int):
             # Charmap may return a number
             x = space.int_w(w_ch)
             if not 0 <= x <= 0x10FFFF:
                 raise oefmt(space.w_TypeError,
                     "character mapping must be in range(0x110000)")
-            return code_to_unichr(x)
+            return rutf8.unichr_as_utf8(x)
         elif space.is_w(w_ch, space.w_None):
             # Charmap may return None
             return errorchar
@@ -614,12 +613,13 @@
 
 @unwrap_spec(string='bufferstr', errors='text_or_none')
 def charmap_decode(space, string, errors="strict", w_mapping=None):
-    from pypy.interpreter.unicodehelper import DecodeWrapper
+    from pypy.interpreter import unicodehelper
 
     if errors is None:
         errors = 'strict'
     if len(string) == 0:
-        return space.newtuple([space.newunicode(u''), space.newint(0)])
+        return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII),
+                               space.newint(0)])
 
     if space.is_none(w_mapping):
         mapping = None
@@ -628,14 +628,14 @@
 
     final = True
     state = space.fromcache(CodecState)
-    result, consumed = runicode.str_decode_charmap(
-        string, len(string), errors,
-        final, DecodeWrapper(state.decode_error_handler).handle, mapping)
-    return space.newtuple([space.newunicode(result), space.newint(consumed)])
+    result, consumed, lgt, flag = unicodehelper.str_decode_charmap(
+        string, errors, final, state.decode_error_handler, mapping)
+    return space.newtuple([space.newutf8(result, lgt, flag),
+                           space.newint(consumed)])
 
 @unwrap_spec(utf8='utf8', errors='text_or_none')
 def charmap_encode(space, utf8, errors="strict", w_mapping=None):
-    from pypy.interpreter.unicodehelper import EncodeWrapper
+    from pypy.interpreter import unicodehelper
 
     if errors is None:
         errors = 'strict'
@@ -645,10 +645,8 @@
         mapping = Charmap_Encode(space, w_mapping)
 
     state = space.fromcache(CodecState)
-    uni = utf8.decode('utf8')
-    result = runicode.unicode_encode_charmap(
-        uni, len(uni), errors,
-        EncodeWrapper(state.encode_error_handler).handle, mapping)
+    result = unicodehelper.unicode_encode_charmap(
+        utf8, errors, state.encode_error_handler, mapping)
     return space.newtuple([space.newbytes(result), space.newint(len(uni))])
 
 
@@ -707,7 +705,7 @@
 
 @unwrap_spec(errors='text_or_none')
 def unicode_internal_decode(space, w_string, errors="strict"):
-    from pypy.interpreter.unicodehelper import DecodeWrapper
+    from pypy.interpreter import unicodehelper
 
     if errors is None:
         errors = 'strict'
@@ -718,14 +716,16 @@
     string = space.readbuf_w(w_string).as_str()
 
     if len(string) == 0:
-        return space.newtuple([space.newunicode(u''), space.newint(0)])
+        return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII),
+                               space.newint(0)])
 
     final = True
     state = space.fromcache(CodecState)
-    result, consumed = runicode.str_decode_unicode_internal(
-        string, len(string), errors,
-        final, DecodeWrapper(state.decode_error_handler).handle)
-    return space.newtuple([space.newunicode(result), space.newint(consumed)])
+    result, consumed, lgt, flag = unicodehelper.str_decode_unicode_internal(
+        string, errors,
+        final, state.decode_error_handler)
+    return space.newtuple([space.newutf8(result, lgt, flag),
+                           space.newint(consumed)])
 
 # ____________________________________________________________
 # support for the "string escape" codec
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -15,7 +15,6 @@
                          'utf-32', 'utf-32-le', 'utf-32-be',
                          'raw_unicode_escape',
                          'unicode_escape', 'unicode_internal'):
-            print encoding
             assert unicode(u.encode(encoding),encoding) == u
 
     def test_ucs4(self):
diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py
--- a/pypy/module/exceptions/interp_exceptions.py
+++ b/pypy/module/exceptions/interp_exceptions.py
@@ -285,7 +285,7 @@
 
     def descr_init(self, space, w_object, w_start, w_end, w_reason):
         # typechecking
-        space.realunicode_w(w_object)
+        space.utf8_w(w_object)
         space.int_w(w_start)
         space.int_w(w_end)
         space.realtext_w(w_reason)
@@ -719,7 +719,7 @@
     def descr_init(self, space, w_encoding, w_object, w_start, w_end, w_reason):
         # typechecking
         space.realtext_w(w_encoding)
-        space.realunicode_w(w_object)  # XXX realutf8()?
+        space.utf8_w(w_object)
         space.int_w(w_start)
         space.int_w(w_end)
         space.realtext_w(w_reason)
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -432,8 +432,7 @@
 
         def fmt_s(self, w_value):
             space = self.space
-            got_unicode = space.isinstance_w(w_value,
-                                                         space.w_unicode)
+            got_unicode = space.isinstance_w(w_value, space.w_unicode)
             if not do_unicode:
                 if got_unicode:
                     raise NeedUnicodeFormattingError
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -164,9 +164,9 @@
         if isinstance(x, str):
             return self.newtext(x)
         if isinstance(x, unicode):
-            from pypy.interpreter import unicodehelper
-            return self.newutf8(x.encode('utf8'), len(x),
-                                unicodehelper._get_flag(x))
+            x = x.encode('utf8')
+            lgt, flag = rutf8.check_utf8(x, True)
+            return self.newutf8(x, lgt, flag)
         if isinstance(x, float):
             return W_FloatObject(x)
         if isinstance(x, W_Root):