[pypy-commit] pypy unicode-utf8: Review for surrogates

Thu Nov 23 04:48:57 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93136:dc6582a05b85
Date: 2017-11-23 10:48 +0100
http://bitbucket.org/pypy/pypy/changeset/dc6582a05b85/

Log:	Review for surrogates

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -370,14 +370,15 @@
             builder.append(res)
         else:
             # when we get here, chr is a 32-bit unicode character
-            if chr > 0x10ffff:
+            try:
+                rutf8.unichr_as_utf8_append(builder, intmask(chr), True)
+            except ValueError:
                 message = "illegal Unicode character"
                 res, pos = errorhandler(errors, encoding,
                                         message, s, pos-2, pos+digits)
                 size, flag = rutf8.check_utf8(res, True)
                 builder.append(res)
             else:
-                rutf8.unichr_as_utf8_append(builder, intmask(chr), True)
                 flag = rutf8.get_flag_from_code(intmask(chr))
                 pos += digits
                 size = 1
@@ -466,7 +467,7 @@
                             pos += 1
                             x = (x<<3) + ord(ch) - ord('0')
             outsize += 1
-            if x >= 0x7F:
+            if x > 0x7F:
                 rutf8.unichr_as_utf8_append(builder, x)
                 flag = combine_flags(rutf8.FLAG_REGULAR, flag)
             else:
@@ -524,7 +525,9 @@
                     pos = look + 1
                     outsize += 1
                     flag = combine_flags(flag, rutf8.get_flag_from_code(code))
-                    rutf8.unichr_as_utf8_append(builder, code)
+                    rutf8.unichr_as_utf8_append(builder, code,
+                                                allow_surrogates=True)
+                    # xxx 'code' is probably always within range here...
                 else:
                     res, pos = errorhandler(errors, "unicodeescape",
                                             message, s, pos-1, look+1)
@@ -772,7 +775,8 @@
                             surrogate = 0
                             continue
                         else:
-                            rutf8.unichr_as_utf8_append(result, surrogate)
+                            rutf8.unichr_as_utf8_append(result, surrogate,
+                                                        allow_surrogates=True)
                             flag = rutf8.FLAG_HAS_SURROGATES
                             outsize += 1
                             surrogate = 0
@@ -1236,7 +1240,7 @@
             result.append(r)
             continue
 
-        rutf8.unichr_as_utf8_append(result, ch)
+        rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=True)
         pos += 4
     r = result.build()
     lgt, flag = rutf8.check_utf8(r, True)
@@ -1360,7 +1364,7 @@
                                     s, pos, pos + unicode_bytes)
             result.append(res)
             continue
-        rutf8.unichr_as_utf8_append(result, intmask(t))
+        rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True)
         pos += unicode_bytes
     r = result.build()
     lgt, flag = rutf8.check_utf8(r, True)
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -127,7 +127,7 @@
                                     errorcb, namecb, stringdata)
         src = pypy_cjk_dec_outbuf(decodebuf)
         length = pypy_cjk_dec_outlen(decodebuf)
-        return rffi.wcharpsize2utf8(src, length)
+        return rffi.wcharpsize2utf8(src, length) # assumes no out-of-range chars
 
 def multibytecodec_decerror(decodebuf, e, errors,
                             errorcb, namecb, stringdata):
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1012,6 +1012,7 @@
 def wcharpsize2utf8(w, size):
     """ Helper to convert WCHARP pointer to utf8 in one go.
     Equivalent to wcharpsize2unicode().encode("utf8")
+    Raises ValueError if characters are outside range(0x110000)!
     """
     from rpython.rlib import rutf8