[pypy-commit] pypy default: An attempt to reduce differences in rlib/runicode.py,

Mon Sep 24 23:45:50 CEST 2012

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r57525:ff3afc19472c
Date: 2012-09-24 23:34 +0200
http://bitbucket.org/pypy/pypy/changeset/ff3afc19472c/

Log:	An attempt to reduce differences in rlib/runicode.py, between
	default and py3k branches.
	- by default, utf_8 functions don't allow lone surrogates. The 2.7
	implementation has to pass 'allow_surrogates=True'
	- unicode_encode_unicode_escape() is a generated function. py3k
	needs a very similar function for repr(), but which returns unicode.
	and the 'u' suffix is now an option.

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -405,7 +405,6 @@
          "ascii_encode",
          "latin_1_encode",
          "utf_7_encode",
-         "utf_8_encode",
          "utf_16_encode",
          "utf_16_be_encode",
          "utf_16_le_encode",
@@ -422,7 +421,6 @@
          "ascii_decode",
          "latin_1_decode",
          "utf_7_decode",
-         "utf_8_decode",
          "utf_16_decode",
          "utf_16_be_decode",
          "utf_16_le_decode",
@@ -437,6 +435,30 @@
     make_encoder_wrapper('mbcs_encode')
     make_decoder_wrapper('mbcs_decode')
 
+# utf-8 functions are not regular, because we have to pass
+# "allow_surrogates=True"
+ at unwrap_spec(uni=unicode, errors='str_or_None')
+def utf_8_encode(space, uni, errors="strict"):
+    if errors is None:
+        errors = 'strict'
+    state = space.fromcache(CodecState)
+    result = runicode.unicode_encode_utf_8(
+        uni, len(uni), errors, state.encode_error_handler,
+        allow_surrogates=True)
+    return space.newtuple([space.wrap(result), space.wrap(len(uni))])
+
+ at unwrap_spec(string='bufferstr', errors='str_or_None')
+def utf_8_decode(space, string, errors="strict", w_final=False):
+    if errors is None:
+        errors = 'strict'
+    final = space.is_true(w_final)
+    state = space.fromcache(CodecState)
+    result, consumed = runicode.str_decode_utf_8(
+        string, len(string), errors,
+        final, state.decode_error_handler,
+        allow_surrogates=True)
+    return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
 @unwrap_spec(data=str, errors='str_or_None', byteorder=int)
 def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=False):
     if errors is None:
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -13,7 +13,7 @@
 from pypy.rlib.objectmodel import compute_hash, specialize
 from pypy.rlib.objectmodel import compute_unique_id
 from pypy.rlib.rstring import UnicodeBuilder
-from pypy.rlib.runicode import unicode_encode_unicode_escape
+from pypy.rlib.runicode import make_unicode_escape_function
 from pypy.module.unicodedata import unicodedb
 from pypy.tool.sourcetools import func_with_new_name
 from pypy.rlib import jit
@@ -918,10 +918,13 @@
                     space.wrap("character mapping must return integer, None or unicode"))
     return W_UnicodeObject(u''.join(result))
 
+_repr_function, _ = make_unicode_escape_function(
+    pass_printable=False, unicode_output=False, quotes=True, prefix='u')
+
 def repr__Unicode(space, w_unicode):
     chars = w_unicode._value
     size = len(chars)
-    s = unicode_encode_unicode_escape(chars, size, "strict", quotes=True)
+    s = _repr_function(chars, size, "strict")
     return space.wrap(s)
 
 def mod__Unicode_ANY(space, w_format, w_values):
diff --git a/pypy/objspace/std/unicodetype.py b/pypy/objspace/std/unicodetype.py
--- a/pypy/objspace/std/unicodetype.py
+++ b/pypy/objspace/std/unicodetype.py
@@ -236,13 +236,14 @@
             if encoding == 'ascii':
                 u = space.unicode_w(w_object)
                 eh = encode_error_handler(space)
-                return space.wrap(unicode_encode_ascii(u, len(u), None,
-                                                       errorhandler=eh))
+                return space.wrap(unicode_encode_ascii(
+                        u, len(u), None, errorhandler=eh))
             if encoding == 'utf-8':
                 u = space.unicode_w(w_object)
                 eh = encode_error_handler(space)
-                return space.wrap(unicode_encode_utf_8(u, len(u), None,
-                                                       errorhandler=eh))
+                return space.wrap(unicode_encode_utf_8(
+                        u, len(u), None, errorhandler=eh,
+                        allow_surrogates=True))
         from pypy.module._codecs.interp_codecs import lookup_codec
         w_encoder = space.getitem(lookup_codec(space, encoding), space.wrap(0))
     if errors is None:
@@ -265,15 +266,14 @@
             # XXX error handling
             s = space.bufferstr_w(w_obj)
             eh = decode_error_handler(space)
-            return space.wrap(str_decode_ascii(s, len(s), None,
-                                               final=True,
-                                               errorhandler=eh)[0])
+            return space.wrap(str_decode_ascii(
+                    s, len(s), None, final=True, errorhandler=eh)[0])
         if encoding == 'utf-8':
             s = space.bufferstr_w(w_obj)
             eh = decode_error_handler(space)
-            return space.wrap(str_decode_utf_8(s, len(s), None,
-                                               final=True,
-                                               errorhandler=eh)[0])
+            return space.wrap(str_decode_utf_8(
+                    s, len(s), None, final=True, errorhandler=eh,
+                    allow_surrogates=True)[0])
     w_codecs = space.getbuiltinmodule("_codecs")
     w_decode = space.getattr(w_codecs, space.wrap("decode"))
     if errors is None:
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -77,12 +77,14 @@
 ]
 
 def str_decode_utf_8(s, size, errors, final=False,
-                     errorhandler=None):
+                     errorhandler=None, allow_surrogates=False):
     if errorhandler is None:
         errorhandler = raise_unicode_exception_decode
-    return str_decode_utf_8_impl(s, size, errors, final, errorhandler)
+    return str_decode_utf_8_impl(s, size, errors, final, errorhandler,
+                                 allow_surrogates=allow_surrogates)
 
-def str_decode_utf_8_impl(s, size, errors, final, errorhandler):
+def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
+                          allow_surrogates):
     if size == 0:
         return u'', 0
 
@@ -184,8 +186,7 @@
             if (ordch2>>6 != 0x2 or    # 0b10
                 (ordch1 == 0xe0 and ordch2 < 0xa0)
                 # surrogates shouldn't be valid UTF-8!
-                # Uncomment the line below to make them invalid.
-                # or (ordch1 == 0xed and ordch2 > 0x9f)
+                or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
                 ):
                 r, pos = errorhandler(errors, 'utf-8',
                                       'invalid continuation byte',
@@ -254,13 +255,16 @@
     result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
     result.append((chr((0x80 | (ch & 0x3f)))))
 
-def unicode_encode_utf_8(s, size, errors, errorhandler=None):
+def unicode_encode_utf_8(s, size, errors, errorhandler=None,
+                         allow_surrogates=False):
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_encode
     assert(size >= 0)
     result = StringBuilder(size)
-    i = 0
-    while i < size:
-        ch = ord(s[i])
-        i += 1
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
         if ch < 0x80:
             # Encode ASCII
             result.append(chr(ch))
@@ -272,20 +276,32 @@
             # Encode UCS2 Unicode ordinals
             if ch < 0x10000:
                 # Special case: check for high surrogate
-                if 0xD800 <= ch <= 0xDBFF and i != size:
-                    ch2 = ord(s[i])
-                    # Check for low surrogate and combine the two to
-                    # form a UCS4 value
-                    if 0xDC00 <= ch2 <= 0xDFFF:
-                        ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
-                        i += 1
-                        _encodeUCS4(result, ch3)
+                if 0xD800 <= ch <= 0xDFFF:
+                    if pos != size:
+                        ch2 = ord(s[pos])
+                        # Check for low surrogate and combine the two to
+                        # form a UCS4 value
+                        if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+                            ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+                            pos += 1
+                            _encodeUCS4(result, ch3)
+                            continue
+                    if not allow_surrogates:
+                        r, pos = errorhandler(errors, 'utf-8',
+                                              'surrogates not allowed',
+                                              s, pos-1, pos)
+                        for ch in r:
+                            if ord(ch) < 0x80:
+                                result.append(chr(ord(ch)))
+                            else:
+                                errorhandler('strict', 'utf-8',
+                                             'surrogates not allowed',
+                                             s, pos-1, pos)
                         continue
-                # Fall through: handles isolated high surrogates
+                    # else: Fall through and handles isolated high surrogates
                 result.append((chr((0xe0 | (ch >> 12)))))
                 result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
                 result.append((chr((0x80 | (ch & 0x3f)))))
-                continue
             else:
                 _encodeUCS4(result, ch)
     return result.build()
@@ -1202,28 +1218,41 @@
 
     return builder.build(), pos
 
-def make_unicode_escape_function():
+def make_unicode_escape_function(pass_printable=False, unicode_output=False,
+                                 quotes=False, prefix=None):
     # Python3 has two similar escape functions: One to implement
     # encode('unicode_escape') and which outputs bytes, and unicode.__repr__
     # which outputs unicode.  They cannot share RPython code, so we generate
     # them with the template below.
     # Python2 does not really need this, but it reduces diffs between branches.
-    def unicode_escape(s, size, errors, errorhandler=None, quotes=False):
+
+    if unicode_output:
+        STRING_BUILDER = UnicodeBuilder
+        STR = unicode
+        CHR = UNICHR
+    else:
+        STRING_BUILDER = StringBuilder
+        STR = str
+        CHR = chr
+
+    def unicode_escape(s, size, errors, errorhandler=None):
         # errorhandler is not used: this function cannot cause Unicode errors
-        result = StringBuilder(size)
+        result = STRING_BUILDER(size)
 
         if quotes:
+            if prefix:
+                result.append(STR(prefix))
             if s.find(u'\'') != -1 and s.find(u'\"') == -1:
                 quote = ord('\"')
-                result.append('u"')
+                result.append(STR('"'))
             else:
                 quote = ord('\'')
-                result.append('u\'')
+                result.append(STR('\''))
         else:
             quote = 0
 
             if size == 0:
-                return ''
+                return STR('')
 
         pos = 0
         while pos < size:
@@ -1232,8 +1261,8 @@
 
             # Escape quotes
             if quotes and (oc == quote or ch == '\\'):
-                result.append('\\')
-                result.append(chr(oc))
+                result.append(STR('\\'))
+                result.append(CHR(oc))
                 pos += 1
                 continue
 
@@ -1248,7 +1277,7 @@
 
                 if 0xDC00 <= oc2 <= 0xDFFF:
                     ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
-                    raw_unicode_escape_helper(result, ucs)
+                    char_escape_helper(result, ucs)
                     pos += 1
                     continue
                 # Fall through: isolated surrogates are copied as-is
@@ -1256,46 +1285,51 @@
 
             # Map special whitespace to '\t', \n', '\r'
             if ch == '\t':
-                result.append('\\t')
+                result.append(STR('\\t'))
             elif ch == '\n':
-                result.append('\\n')
+                result.append(STR('\\n'))
             elif ch == '\r':
-                result.append('\\r')
+                result.append(STR('\\r'))
             elif ch == '\\':
-                result.append('\\\\')
+                result.append(STR('\\\\'))
 
             # Map non-printable or non-ascii to '\xhh' or '\uhhhh'
-            elif oc < 32 or oc >= 0x7F:
-                raw_unicode_escape_helper(result, oc)
+            elif pass_printable and not unicodedb.isprintable(oc):
+                char_escape_helper(result, oc)
+            elif not pass_printable and (oc < 32 or oc >= 0x7F):
+                char_escape_helper(result, oc)
 
             # Copy everything else as-is
             else:
-                result.append(chr(oc))
+                result.append(CHR(oc))
             pos += 1
 
         if quotes:
-            result.append(chr(quote))
+            result.append(CHR(quote))
         return result.build()
 
     def char_escape_helper(result, char):
         num = hex(char)
+        if STR is unicode:
+            num = num.decode('ascii')
         if char >= 0x10000:
-            result.append("\\U")
+            result.append(STR("\\U"))
             zeros = 8
         elif char >= 0x100:
-            result.append("\\u")
+            result.append(STR("\\u"))
             zeros = 4
         else:
-            result.append("\\x")
+            result.append(STR("\\x"))
             zeros = 2
         lnum = len(num)
         nb = zeros + 2 - lnum # num starts with '0x'
         if nb > 0:
-            result.append_multiple_char('0', nb)
+            result.append_multiple_char(STR('0'), nb)
         result.append_slice(num, 2, lnum)
 
     return unicode_escape, char_escape_helper
 
+# This function is also used by _codecs/interp_codecs.py
 (unicode_encode_unicode_escape, raw_unicode_escape_helper
  ) = make_unicode_escape_function()
 
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -118,6 +118,9 @@
         for i in range(10000):
             for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
                              "utf-32 utf-32-be utf-32-le").split():
+                if encoding == 'utf-8' and 0xd800 <= i <= 0xdfff:
+                    # Don't try to encode lone surrogates
+                    continue
                 self.checkdecode(unichr(i), encoding)
 
     def test_random(self):
@@ -242,9 +245,8 @@
             self.checkdecode(s, "utf-8")
 
     def test_utf8_surrogate(self):
-        # A surrogate should not be valid utf-8, but python 2.x accepts them.
-        # This test will raise an error with python 3.x
-        self.checkdecode(u"\ud800", "utf-8")
+        # surrogates used to be allowed by python 2.x
+        raises(UnicodeDecodeError, self.checkdecode, u"\ud800", "utf-8")
 
     def test_invalid_start_byte(self):
         """
@@ -691,12 +693,16 @@
             self.checkencode(s, "utf-8")
 
     def test_utf8_surrogates(self):
-        # check replacing of two surrogates by single char while encoding
         # make sure that the string itself is not marshalled
         u = u"\ud800"
         for i in range(4):
             u += u"\udc00"
-        self.checkencode(u, "utf-8")
+        if runicode.MAXUNICODE < 65536:
+            # Check replacing of two surrogates by single char while encoding
+            self.checkencode(u, "utf-8")
+        else:
+            # This is not done in wide unicode builds
+            raises(UnicodeEncodeError, self.checkencode, u, "utf-8")
 
     def test_ascii_error(self):
         self.checkencodeerror(u"abc\xFF\xFF\xFFcde", "ascii", 3, 6)