[pypy-commit] pypy unicode-utf8: interpreter fixes

Tue Nov 21 04:30:34 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r93109:86548802b11b
Date: 2017-11-21 10:29 +0100
http://bitbucket.org/pypy/pypy/changeset/86548802b11b/

Log:	interpreter fixes

diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -231,11 +231,14 @@
     return s[pt:ps]
 
 def decode_utf8_recode(space, s, ps, end, recode_encoding):
-    lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, end)
-    w_v = unicodehelper.encode(space, space.newutf8(s[ps:end], lgt, flag),
+    p = ps
+    while p < end and ord(s[p]) & 0x80:
+        p += 1
+    lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, p)
+    w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt, flag),
                                recode_encoding)
     v = space.bytes_w(w_v)
-    return v, ps
+    return v, p
 
 def raise_app_valueerror(space, msg):
     raise OperationError(space.w_ValueError, space.newtext(msg))
diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py
--- a/pypy/interpreter/pyparser/test/test_parsestring.py
+++ b/pypy/interpreter/pyparser/test/test_parsestring.py
@@ -10,7 +10,7 @@
             assert space.str_w(w_ret) == value
         elif isinstance(value, unicode):
             assert space.type(w_ret) == space.w_unicode
-            assert space.unicode_w(w_ret) == value
+            assert space.utf8_w(w_ret).decode('utf8') == value
         else:
             assert False
 
@@ -102,7 +102,4 @@
     def test_decode_unicode_utf8(self):
         buf = parsestring.decode_unicode_utf8(self.space,
                                               'u"\xf0\x9f\x92\x8b"', 2, 6)
-        if sys.maxunicode == 65535:
-            assert buf == r"\U0000d83d\U0000dc8b"
-        else:
-            assert buf == r"\U0001f48b"
+        assert buf == r"\U0001f48b"
diff --git a/pypy/interpreter/test/test_objspace.py b/pypy/interpreter/test/test_objspace.py
--- a/pypy/interpreter/test/test_objspace.py
+++ b/pypy/interpreter/test/test_objspace.py
@@ -216,9 +216,7 @@
         space = self.space
         w = space.wrap
         assert space.text0_w(w("123")) == "123"
-        exc = space.raises_w(space.w_TypeError, space.text0_w, w("123\x004"))
-        assert space.unicode0_w(w(u"123")) == u"123"
-        exc = space.raises_w(space.w_TypeError, space.unicode0_w, w(u"123\x004"))
+        space.raises_w(space.w_TypeError, space.text0_w, w("123\x004"))
 
     def test_getindex_w(self):
         w_instance1 = self.space.appexec([], """():
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -351,12 +351,12 @@
         try:
             chr = r_uint(int(s[pos:pos+digits], 16))
         except ValueError:
-            aaaa
             endinpos = pos
             while s[endinpos] in hexdigits:
                 endinpos += 1
             res, pos = errorhandler(errors, encoding,
                                     message, s, pos-2, endinpos)
+            size, flag = rutf8.check_utf8(res, True)
             builder.append(res)
         else:
             # when we get here, chr is a 32-bit unicode character
@@ -1392,7 +1392,7 @@
     while pos < size:
         ch = s[pos]
 
-        c = mapping.get(ch, ERROR_CHAR)
+        c = mapping.get(ord(ch), ERROR_CHAR)
         if c == ERROR_CHAR:
             r, pos = errorhandler(errors, "charmap",
                                   "character maps to <undefined>",
@@ -1407,20 +1407,17 @@
 
 def utf8_encode_charmap(s, errors, errorhandler=None,
                            mapping=None):
-    YYY
+    size = len(s)
     if mapping is None:
-        return unicode_encode_latin_1(s, size, errors,
-                                      errorhandler=errorhandler)
-
-    if errorhandler is None:
-        errorhandler = default_unicode_error_encode
+        return utf8_encode_latin_1(s, size, errors,
+                                   errorhandler=errorhandler)
 
     if size == 0:
         return ''
     result = StringBuilder(size)
     pos = 0
     while pos < size:
-        ch = s[pos]
+        ch = rutf8.codepoint_at_pos(s, pos)
 
         c = mapping.get(ch, '')
         if len(c) == 0:
@@ -1428,9 +1425,10 @@
             collend = pos + 1
             while collend < size and mapping.get(s[collend], '') == '':
                 collend += 1
-            ru, rs, pos = errorhandler(errors, "charmap",
-                                       "character maps to <undefined>",
-                                       s, pos, collend)
+            rs, pos = errorhandler(errors, "charmap",
+                                   "character maps to <undefined>",
+                                   s, pos, collend)
+            XXXX
             if rs is not None:
                 # py3k only
                 result.append(rs)
@@ -1445,6 +1443,6 @@
                 result.append(c2)
             continue
         result.append(c)
-        pos += 1
+        pos = rutf8.next_codepoint_pos(s, pos)
     return result.build()
 
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -551,10 +551,10 @@
 
         # get the character from the mapping
         if self.mapping_w is not None:
-            w_ch = self.mapping_w[ord(ch)]
+            w_ch = self.mapping_w[ch]
         else:
             try:
-                w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+                w_ch = space.getitem(self.w_mapping, space.newint(ch))
             except OperationError as e:
                 if not e.match(space, space.w_LookupError):
                     raise
@@ -587,7 +587,7 @@
 
         # get the character from the mapping
         try:
-            w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+            w_ch = space.getitem(self.w_mapping, space.newint(ch))
         except OperationError as e:
             if not e.match(space, space.w_LookupError):
                 raise
@@ -633,8 +633,8 @@
     return space.newtuple([space.newutf8(result, lgt, flag),
                            space.newint(consumed)])
 
-@unwrap_spec(utf8='utf8', errors='text_or_none')
-def charmap_encode(space, utf8, errors="strict", w_mapping=None):
+@unwrap_spec(errors='text_or_none')
+def charmap_encode(space, w_unicode, errors="strict", w_mapping=None):
     from pypy.interpreter import unicodehelper
 
     if errors is None:
@@ -645,9 +645,10 @@
         mapping = Charmap_Encode(space, w_mapping)
 
     state = space.fromcache(CodecState)
-    result = unicodehelper.unicode_encode_charmap(
-        utf8, errors, state.encode_error_handler, mapping)
-    return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+    w_uni = unicodehelper.convert_arg_to_w_unicode(space, w_unicode)
+    result = unicodehelper.utf8_encode_charmap(
+        space.utf8_w(w_uni), errors, state.encode_error_handler, mapping)
+    return space.newtuple([space.newbytes(result), space.newint(w_uni._len())])
 
 
 @unwrap_spec(chars='utf8')