[pypy-commit] pypy default: Tweak the RPython and PyPy ord() to behave like CPython's when given strange inputs

Mon Sep 28 10:35:21 CEST 2015

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r79878:47c87779c73a
Date: 2015-09-28 10:35 +0200
http://bitbucket.org/pypy/pypy/changeset/47c87779c73a/

Log:	Tweak the RPython and PyPy ord() to behave like CPython's when given
	strange inputs: never return negative numbers on 64-bit. Also fix
	the repr() of unicodes containing such a character. (Tested in the
	array module because it's hard to make invalid unichars otherwise.)

diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -844,6 +844,18 @@
         b.byteswap()
         assert a != b
 
+    def test_unicode_ord_positive(self):
+        import sys
+        if sys.maxunicode == 0xffff:
+            skip("test for 32-bit unicodes")
+        a = self.array('u', '\xff\xff\xff\xff')
+        assert len(a) == 1
+        assert repr(a[0]) == "u'\Uffffffff'"
+        if sys.maxint == 2147483647:
+            assert ord(a[0]) == -1
+        else:
+            assert ord(a[0]) == 4294967295
+
     def test_weakref(self):
         import weakref
         a = self.array('c', 'Hi!')
diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py
--- a/rpython/annotator/unaryop.py
+++ b/rpython/annotator/unaryop.py
@@ -652,11 +652,11 @@
     def len(self):
         return immutablevalue(1)
 
+class __extend__(SomeChar):
+
     def ord(self):
         return SomeInteger(nonneg=True)
 
-class __extend__(SomeChar):
-
     def method_isspace(self):
         return s_Bool
 
@@ -675,6 +675,13 @@
     def method_upper(self):
         return self
 
+class __extend__(SomeUnicodeCodePoint):
+
+    def ord(self):
+        # warning, on 32-bit with 32-bit unichars, this might return
+        # negative numbers
+        return SomeInteger()
+
 class __extend__(SomeIterator):
 
     def iter(self):
diff --git a/rpython/jit/metainterp/test/test_ajit.py b/rpython/jit/metainterp/test/test_ajit.py
--- a/rpython/jit/metainterp/test/test_ajit.py
+++ b/rpython/jit/metainterp/test/test_ajit.py
@@ -4320,14 +4320,14 @@
         
         self.meta_interp(allfuncs, [9, 2000])
 
-    def test_unichar_might_be_signed(self):
-        py.test.skip("wchar_t is sometimes a signed 32-bit integer type, "
-                     "but RPython inteprets it as unsigned (but still "
-                     "translates to wchar_t, so can create confusion)")
+    def test_unichar_ord_is_never_signed_on_64bit(self):
+        import sys
+        if sys.maxunicode == 0xffff:
+            py.test.skip("test for 32-bit unicodes")
         def f(x):
-            return rffi.cast(lltype.Signed, rffi.cast(lltype.UniChar, x))
+            return ord(rffi.cast(lltype.UniChar, x))
         res = self.interp_operations(f, [-1])
-        if rffi.r_wchar_t.SIGN:
+        if sys.maxint == 2147483647:
             assert res == -1
         else:
-            assert res == 2 ** 16 - 1 or res == 2 ** 32 - 1
+            assert res == 4294967295
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1403,11 +1403,10 @@
             result.append(CHR(quote))
         return result.build()
 
+    TABLE = STR('0123456789abcdef')
+
     def char_escape_helper(result, char):
-        num = hex(char)
-        if STR is unicode:
-            num = num.decode('ascii')
-        if char >= 0x10000:
+        if char >= 0x10000 or char < 0:
             result.append(STR("\\U"))
             zeros = 8
         elif char >= 0x100:
@@ -1416,11 +1415,8 @@
         else:
             result.append(STR("\\x"))
             zeros = 2
-        lnum = len(num)
-        nb = zeros + 2 - lnum # num starts with '0x'
-        if nb > 0:
-            result.append_multiple_char(STR('0'), nb)
-        result.append_slice(num, 2, lnum)
+        for i in range(zeros-1, -1, -1):
+            result.append(TABLE[(char >> (4 * i)) & 0x0f])
 
     return unicode_escape, char_escape_helper
 
diff --git a/rpython/translator/c/src/int.h b/rpython/translator/c/src/int.h
--- a/rpython/translator/c/src/int.h
+++ b/rpython/translator/c/src/int.h
@@ -231,8 +231,12 @@
 #define OP_TRUNCATE_LONGLONG_TO_INT(x,r) r = (Signed)(x)
 #define OP_TRUNCATE_LONGLONGLONG_TO_INT(x,r) r = (Signed)(x)
 
-#define OP_CAST_UNICHAR_TO_INT(x,r)    r = (Signed)((Unsigned)(x)) /*?*/
-#define OP_CAST_INT_TO_UNICHAR(x,r)    r = (unsigned int)(x)
+/* Casting from UniChar to int goes first via "unsigned int".
+   On 64-bit platforms, this forces a signed 32-bit wchar_t
+   to an unsigned integer, which is also what CPython's ord()
+   does. */
+#define OP_CAST_UNICHAR_TO_INT(x,r)    r = ((unsigned int)(x))
+#define OP_CAST_INT_TO_UNICHAR(x,r)    r = (x)
 
 /* bool operations */