[pypy-commit] pypy unicode-utf8: general progress

Tue Nov 21 11:19:51 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r93114:cefc9ed0b4c5
Date: 2017-11-21 17:19 +0100
http://bitbucket.org/pypy/pypy/changeset/cefc9ed0b4c5/

Log:	general progress

diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -74,8 +74,8 @@
     substr = s[ps : q]
     if rawmode or '\\' not in s[ps:]:
         if need_encoding:
-            utf, (lgt, flag) = unicodehelper.decode_utf8(space, substr)
-            w_u = space.newutf8(utf, lgt, flag)
+            lgt, flag = unicodehelper.check_utf8_or_raise(space, substr)
+            w_u = space.newutf8(substr, lgt, flag)
             w_v = unicodehelper.encode(space, w_u, encoding)
             return w_v
         else:
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1094,9 +1094,9 @@
         byteorder = BYTEORDER
 
     pos = 0
+    index = 0
     while pos < size:
         ch = rutf8.codepoint_at_pos(s, pos)
-        pos = rutf8.next_codepoint_pos(s, pos)
 
         if ch < 0xD800:
             _STORECHAR(result, ch, byteorder)
@@ -1106,27 +1106,27 @@
         elif ch >= 0xE000 or allow_surrogates:
             _STORECHAR(result, ch, byteorder)
         else:
-            ru, pos = errorhandler(errors, public_encoding_name,
+            ru, newindex = errorhandler(errors, public_encoding_name,
                                    'surrogates not allowed',
                                     s, pos-1, pos)
-            xxx
-            #if rs is not None:
-            #    # py3k only
-            #    if len(rs) % 2 != 0:
-            #        errorhandler('strict', public_encoding_name,
-            #                     'surrogates not allowed',
-            #                     s, pos-1, pos)
-            #    result.append(rs)
-            #    continue
-            for ch in ru:
+            for j in range(newindex - index):
+                pos = rutf8.next_codepoint_pos(s, pos)
+            j = 0
+            while j < len(ru):
+                ch = rutf8.codepoint_at_pos(ru, j)
                 if ord(ch) < 0xD800:
                     _STORECHAR(result, ord(ch), byteorder)
                 else:
                     errorhandler('strict', public_encoding_name,
                                  'surrogates not allowed',
                                  s, pos-1, pos)
+                j = rutf8.next_codepoint_pos(ru, j)
+            index = newindex
             continue
 
+        pos = rutf8.next_codepoint_pos(s, pos)
+        index += 1
+
     return result.build()
 
 def utf8_encode_utf_16(s, errors,
@@ -1285,32 +1285,30 @@
         byteorder = BYTEORDER
 
     pos = 0
+    index = 0
     while pos < size:
         ch = rutf8.codepoint_at_pos(s, pos)
         pos = rutf8.next_codepoint_pos(s, pos)
-        ch2 = 0
         if not allow_surrogates and 0xD800 <= ch < 0xE000:
-            ru, pos = errorhandler(errors, public_encoding_name,
+            ru, newindex = errorhandler(errors, public_encoding_name,
                                         'surrogates not allowed',
                                         s, pos-1, pos)
-            XXX
-            if rs is not None:
-                # py3k only
-                if len(rs) % 4 != 0:
-                    errorhandler('strict', public_encoding_name,
-                                    'surrogates not allowed',
-                                    s, pos-1, pos)
-                result.append(rs)
-                continue
-            for ch in ru:
+            for j in range(newindex - index):
+                pos = rutf8.next_codepoint_pos(s, pos)
+            j = 0
+            while j < len(ru):
+                ch = rutf8.codepoint_at_pos(ru, j)
                 if ord(ch) < 0xD800:
                     _STORECHAR32(result, ord(ch), byteorder)
                 else:
                     errorhandler('strict', public_encoding_name,
-                                    'surrogates not allowed',
-                                    s, pos-1, pos)
+                                 'surrogates not allowed',
+                                 s, pos-1, pos)
+                j = rutf8.next_codepoint_pos(ru, j)
+            index = newindex
             continue
         _STORECHAR32(result, ch, byteorder)
+        index += 1
 
     return result.build()
 
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -2,8 +2,9 @@
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.translator.tool.cbuild import ExternalCompilationInfo
 from rpython.translator import cdir
+from rpython.rlib import rutf8
 
-UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'
+UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'.encode("utf8")
 
 
 class EncodeDecodeError(Exception):
@@ -126,7 +127,7 @@
                                     errorcb, namecb, stringdata)
         src = pypy_cjk_dec_outbuf(decodebuf)
         length = pypy_cjk_dec_outlen(decodebuf)
-        return rffi.wcharpsize2unicode(src, length)
+        return rffi.wcharpsize2utf8(src, length)
 
 def multibytecodec_decerror(decodebuf, e, errors,
                             errorcb, namecb, stringdata):
@@ -148,7 +149,7 @@
     if errors == "strict":
         raise EncodeDecodeError(start, end, reason)
     elif errors == "ignore":
-        replace = u""
+        replace = ""
     elif errors == "replace":
         replace = UNICODE_REPLACEMENT_CHARACTER
     else:
@@ -156,8 +157,12 @@
         replace, end = errorcb(errors, namecb, reason,
                                stringdata, start, end)
         # 'replace' is RPython unicode here
-    with rffi.scoped_nonmoving_unicodebuffer(replace) as inbuf:
-        r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end)
+    lgt, _ = rutf8.check_utf8(replace, True)
+    inbuf = rffi.utf82wcharp(replace, lgt)
+    try:
+        r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
+    finally:
+        lltype.free(inbuf, flavor='raw')
     if r == MBERR_NOMEMORY:
         raise MemoryError
 
@@ -256,6 +261,7 @@
             replace = "?"
     else:
         assert errorcb
+        XXX
         retu, rets, end = errorcb(errors, namecb, reason,
                                   unicodedata.encode("utf8"), start, end)
         if rets is not None:
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -1,3 +1,6 @@
+
+from rpython.rlib import rutf8
+
 from pypy.interpreter.baseobjspace import W_Root
 from pypy.interpreter.gateway import interp2app, unwrap_spec
 from pypy.interpreter.typedef import TypeDef
@@ -18,13 +21,14 @@
         state = space.fromcache(CodecState)
         #
         try:
-            u_output = c_codecs.decode(self.codec, input, errors,
+            utf8_output = c_codecs.decode(self.codec, input, errors,
                                      state.decode_error_handler, self.name)
         except c_codecs.EncodeDecodeError as e:
             raise wrap_unicodedecodeerror(space, e, input, self.name)
         except RuntimeError:
             raise wrap_runtimeerror(space)
-        return space.newtuple([space.newunicode(u_output),
+        lgt, flag = rutf8.check_utf8(utf8_output, True)
+        return space.newtuple([space.newutf8(utf8_output, lgt, flag),
                                space.newint(len(input))])
 
     @unwrap_spec(input='utf8', errors="text_or_none")
@@ -74,7 +78,7 @@
             space.newtext(e.reason)]))
 
 def wrap_unicodeencodeerror(space, e, input, inputlen, name):
-    flag = 13
+    _, flag = rutf8.check_utf8(input, True)
     raise OperationError(
         space.w_UnicodeEncodeError,
         space.newtuple([
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -841,8 +841,7 @@
                 prefix = "0x"
             as_str = value.format(LONG_DIGITS[:base], prefix)
             if self.is_unicode:
-                XXX
-                return as_str.decode("latin-1")
+                return rutf8.decode_latin_1(as_str)
             return as_str
 
         def _int_to_base(self, base, value):
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1009,6 +1009,29 @@
  wcharp2unicoden, wcharpsize2unicode, unicode2wchararray, unicode2rawmem,
  ) = make_string_mappings(unicode)
 
+def wcharpsize2utf8(w, size):
+    """ Helper to convert the first 'size' items of a WCHARP to utf8 in one go.
+    Equivalent to wcharpsize2unicode().encode("utf8")
+    """
+    from rpython.rlib import rutf8
+
+    s = StringBuilder(size)  # presumably 'size' is only an initial capacity hint -- TODO confirm
+    for i in range(size):  # one iteration per wide character, not per output byte
+        rutf8.unichr_as_utf8_append(s, ord(w[i]))  # append code point w[i] to the builder
+    return s.build()
+
+def utf82wcharp(utf8, utf8len):  # copy the code points of 'utf8' into a new raw CWCHARP array
+    from rpython.rlib import rutf8
+
+    w = lltype.malloc(CWCHARP.TO, utf8len, flavor='raw')  # raw malloc of utf8len items; not freed here
+    i = 0  # byte position inside 'utf8'
+    index = 0  # item position inside 'w'
+    while i < len(utf8):  # no check against utf8len: assumes it is the code point count -- TODO confirm
+        w[index] = unichr(rutf8.codepoint_at_pos(utf8, i))
+        i = rutf8.next_codepoint_pos(utf8, i)  # step to the start of the next code point
+        index += 1
+    return w  # no terminating item is written after the last code point
+
 # char**
 CCHARPP = lltype.Ptr(lltype.Array(CCHARP, hints={'nolength': True}))
 
diff --git a/rpython/rtyper/lltypesystem/test/test_rffi.py b/rpython/rtyper/lltypesystem/test/test_rffi.py
--- a/rpython/rtyper/lltypesystem/test/test_rffi.py
+++ b/rpython/rtyper/lltypesystem/test/test_rffi.py
@@ -590,6 +590,14 @@
         res = fn(expected_extra_mallocs=range(30))
         assert res == 32 * len(d)
 
+    def test_wcharp_to_utf8(self):  # round-trip check for wcharpsize2utf8 on a 3-item buffer
+        wchar = lltype.malloc(CWCHARP.TO, 3, flavor='raw')  # raw buffer, freed at the end of the test
+        wchar[0] = u'\u1234'
+        wchar[1] = u'\x80'
+        wchar[2] = u'a'
+        assert wcharpsize2utf8(wchar, 3).decode("utf8") == u'\u1234\x80a'
+        lltype.free(wchar, flavor='raw')  # NOTE(review): not reached if the assert above fails
+
 class TestRffiInternals:
     def test_struct_create(self):
         X = CStruct('xx', ('one', INT))