[pypy-commit] pypy default: merge branch 'unicode-dtype'

rlamy noreply at buildbot.pypy.org
Mon Jul 6 19:46:43 CEST 2015


Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: 
Changeset: r78474:d62f1b272ea9
Date: 2015-07-06 18:44 +0100
http://bitbucket.org/pypy/pypy/changeset/d62f1b272ea9/

Log:	merge branch 'unicode-dtype'

diff --git a/pypy/module/micronumpy/boxes.py b/pypy/module/micronumpy/boxes.py
--- a/pypy/module/micronumpy/boxes.py
+++ b/pypy/module/micronumpy/boxes.py
@@ -196,7 +196,12 @@
                     "'%T' object is not iterable", self)
 
     def descr_str(self, space):
-        return space.wrap(self.get_dtype(space).itemtype.str_format(self, add_quotes=False))
+        tp = self.get_dtype(space).itemtype
+        return space.wrap(tp.str_format(self, add_quotes=False))
+
+    def descr_repr(self, space):
+        tp = self.get_dtype(space).itemtype
+        return space.wrap(tp.str_format(self, add_quotes=True))
 
     def descr_format(self, space, w_spec):
         return space.format(self.item(space), w_spec)
@@ -618,16 +623,25 @@
         return W_StringBox(arr, 0, arr.dtype)
 
 class W_UnicodeBox(W_CharacterBox):
+    def __init__(self, value):
+        self._value = value
+
+    def convert_to(self, space, dtype):
+        if dtype.is_unicode():
+            return self
+        elif dtype.is_object():
+            return W_ObjectBox(space.wrap(self._value))
+        else:
+            raise oefmt(space.w_NotImplementedError,
+                        "Conversion from unicode not implemented yet")
+
+    def get_dtype(self, space):
+        from pypy.module.micronumpy.descriptor import new_unicode_dtype
+        return new_unicode_dtype(space, len(self._value))
+
     def descr__new__unicode_box(space, w_subtype, w_arg):
-        raise oefmt(space.w_NotImplementedError, "Unicode is not supported yet")
-        from pypy.module.micronumpy.descriptor import new_unicode_dtype
-        arg = space.unicode_w(space.unicode_from_object(w_arg))
-        # XXX size computations, we need tests anyway
-        arr = VoidBoxStorage(len(arg), new_unicode_dtype(space, len(arg)))
-        # XXX not this way, we need store
-        #for i in range(len(arg)):
-        #    arr.storage[i] = arg[i]
-        return W_UnicodeBox(arr, 0, arr.dtype)
+        value = space.unicode_w(space.unicode_from_object(w_arg))
+        return W_UnicodeBox(value)
 
 class W_ObjectBox(W_GenericBox):
     descr__new__, _get_dtype, descr_reduce = new_dtype_getter(NPY.OBJECT)
@@ -649,7 +663,7 @@
     __getitem__ = interp2app(W_GenericBox.descr_getitem),
     __iter__ = interp2app(W_GenericBox.descr_iter),
     __str__ = interp2app(W_GenericBox.descr_str),
-    __repr__ = interp2app(W_GenericBox.descr_str),
+    __repr__ = interp2app(W_GenericBox.descr_repr),
     __format__ = interp2app(W_GenericBox.descr_format),
     __int__ = interp2app(W_GenericBox.descr_int),
     __long__ = interp2app(W_GenericBox.descr_long),
diff --git a/pypy/module/micronumpy/casting.py b/pypy/module/micronumpy/casting.py
--- a/pypy/module/micronumpy/casting.py
+++ b/pypy/module/micronumpy/casting.py
@@ -325,6 +325,8 @@
         return complex_dtype
     elif space.isinstance_w(w_obj, space.w_str):
         return variable_dtype(space, 'S%d' % space.len_w(w_obj))
+    elif space.isinstance_w(w_obj, space.w_unicode):
+        return new_unicode_dtype(space, space.len_w(w_obj))
     return object_dtype
 
 @signature(ann.instance(W_Dtype), ann.instance(W_Dtype), returns=ann.bool())
diff --git a/pypy/module/micronumpy/test/test_dtypes.py b/pypy/module/micronumpy/test/test_dtypes.py
--- a/pypy/module/micronumpy/test/test_dtypes.py
+++ b/pypy/module/micronumpy/test/test_dtypes.py
@@ -1052,20 +1052,6 @@
         assert d.name == "unicode256"
         assert d.num == 19
 
-    def test_string_boxes(self):
-        from numpy import str_
-        assert isinstance(str_(3), str_)
-
-    def test_unicode_boxes(self):
-        from numpy import unicode_
-        import sys
-        if '__pypy__' in sys.builtin_module_names:
-            exc = raises(NotImplementedError, unicode_, 3)
-            assert exc.value.message.find('not supported yet') >= 0
-        else:
-            u = unicode_(3)
-            assert isinstance(u, unicode)
-
     def test_character_dtype(self):
         import numpy as np
         from numpy import array, character
@@ -1133,7 +1119,7 @@
 
     def test_array_from_record(self):
         import numpy as np
-        a = np.array(('???', -999, -12345678.9), 
+        a = np.array(('???', -999, -12345678.9),
                      dtype=[('c', '|S3'), ('a', '<i8'), ('b', '<f8')])
         # Change the order of the keys
         b = np.array(a, dtype=[('a', '<i8'), ('b', '<f8'), ('c', '|S3')])
@@ -1141,7 +1127,7 @@
         assert b.dtype.fields['a'][1] == 0
         assert b['a'] == -999
         a = np.array(('N/A', 1e+20, 1e+20, 999999),
-                     dtype=[('name', '|S4'), ('x', '<f8'), 
+                     dtype=[('name', '|S4'), ('x', '<f8'),
                             ('y', '<f8'), ('block', '<i8', (2, 3))])
         assert (a['block'] == 999999).all()
 
diff --git a/pypy/module/micronumpy/test/test_ndarray.py b/pypy/module/micronumpy/test/test_ndarray.py
--- a/pypy/module/micronumpy/test/test_ndarray.py
+++ b/pypy/module/micronumpy/test/test_ndarray.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
 import py
 import sys
 
@@ -322,6 +323,14 @@
         assert b.flags['C']
         assert (b == a).all()
 
+    def test_unicode(self):
+        import numpy as np
+        a = np.array([3, u'Aÿ', ''], dtype='U3')
+        assert a.shape == (3,)
+        assert a.dtype == np.dtype('U3')
+        assert a[0] == u'3'
+        assert a[1] == u'Aÿ'
+
     def test_dtype_attribute(self):
         import numpy as np
         a = np.array(40000, dtype='uint16')
@@ -380,6 +389,9 @@
         assert zeros((), dtype='S') == ''
         assert zeros((), dtype='S').shape == ()
         assert zeros((), dtype='S').dtype == '|S1'
+        assert zeros(5, dtype='U')[4] == u''
+        assert zeros(5, dtype='U').shape == (5,)
+        assert zeros(5, dtype='U').dtype == '<U1'
 
     def test_check_shape(self):
         import numpy as np
@@ -2423,6 +2435,12 @@
         a.fill(12)
         assert (a == '1').all()
 
+    def test_unicode_filling(self):
+        import numpy as np
+        a = np.empty((10,10), dtype='U1')
+        a.fill(12)
+        assert (a == u'1').all()
+
     def test_boolean_indexing(self):
         import numpy as np
         a = np.zeros((1, 3))
diff --git a/pypy/module/micronumpy/test/test_object_arrays.py b/pypy/module/micronumpy/test/test_object_arrays.py
--- a/pypy/module/micronumpy/test/test_object_arrays.py
+++ b/pypy/module/micronumpy/test/test_object_arrays.py
@@ -171,4 +171,8 @@
         assert 'a' * 100 in str(a)
         b = a.astype('S')
         assert 'a' * 100 in str(b)
-
+        a = np.array([123], dtype='U')
+        assert a[0] == u'123'
+        b = a.astype('O')
+        assert b[0] == u'123'
+        assert type(b[0]) is unicode
diff --git a/pypy/module/micronumpy/test/test_scalar.py b/pypy/module/micronumpy/test/test_scalar.py
--- a/pypy/module/micronumpy/test/test_scalar.py
+++ b/pypy/module/micronumpy/test/test_scalar.py
@@ -1,3 +1,4 @@
+# -*- encoding:utf-8 -*-
 from pypy.module.micronumpy.test.test_base import BaseNumpyAppTest
 
 class AppTestScalar(BaseNumpyAppTest):
@@ -457,3 +458,25 @@
 
         for t in complex64, complex128:
             _do_test(t, 17j, -17j)
+
+    def test_string_boxes(self):
+        from numpy import str_
+        assert isinstance(str_(3), str_)
+        assert str_(3) == '3'
+        assert str(str_(3)) == '3'
+        assert repr(str_(3)) == "'3'"
+
+    def test_unicode_boxes(self):
+        from numpy import unicode_
+        u = unicode_(3)
+        assert isinstance(u, unicode)
+        assert u == u'3'
+
+    def test_unicode_repr(self):
+        from numpy import unicode_
+        u = unicode_(3)
+        assert str(u) == '3'
+        assert repr(u) == "u'3'"
+        u = unicode_(u'Aÿ')
+        # raises(UnicodeEncodeError, "str(u)")  # XXX
+        assert repr(u) == repr(u'Aÿ')
diff --git a/pypy/module/micronumpy/test/test_selection.py b/pypy/module/micronumpy/test/test_selection.py
--- a/pypy/module/micronumpy/test/test_selection.py
+++ b/pypy/module/micronumpy/test/test_selection.py
@@ -210,22 +210,28 @@
             assert (c == a).all(), msg
 
     def test_sort_unicode(self):
+        import sys
         from numpy import array
         # test unicode sorts.
         s = 'aaaaaaaa'
-        try:
-            a = array([s + chr(i) for i in range(101)], dtype=unicode)
-            b = a[::-1].copy()
-        except:
-            skip('unicode type not supported yet')
-        for kind in ['q', 'm', 'h'] :
+        a = array([s + chr(i) for i in range(101)], dtype=unicode)
+        b = a[::-1].copy()
+        for kind in ['q', 'm', 'h']:
             msg = "unicode sort, kind=%s" % kind
-            c = a.copy();
-            c.sort(kind=kind)
-            assert (c == a).all(), msg
-            c = b.copy();
-            c.sort(kind=kind)
-            assert (c == a).all(), msg
+            c = a.copy()
+            if '__pypy__' in sys.builtin_module_names:
+                exc = raises(NotImplementedError, "c.sort(kind=kind)")
+                assert 'non-numeric types' in exc.value.message
+            else:
+                c.sort(kind=kind)
+                assert (c == a).all(), msg
+            c = b.copy()
+            if '__pypy__' in sys.builtin_module_names:
+                exc = raises(NotImplementedError, "c.sort(kind=kind)")
+                assert 'non-numeric types' in exc.value.message
+            else:
+                c.sort(kind=kind)
+                assert (c == a).all(), msg
 
     def test_sort_objects(self):
         # test object array sorts.
diff --git a/pypy/module/micronumpy/types.py b/pypy/module/micronumpy/types.py
--- a/pypy/module/micronumpy/types.py
+++ b/pypy/module/micronumpy/types.py
@@ -11,7 +11,7 @@
     most_neg_value_of, LONG_BIT
 from rpython.rlib.rawstorage import (alloc_raw_storage,
     raw_storage_getitem_unaligned, raw_storage_setitem_unaligned)
-from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib.rstruct.ieee import (float_pack, float_unpack, unpack_float,
                                        pack_float80, unpack_float80)
 from rpython.rlib.rstruct.nativefmttable import native_is_bigendian
@@ -50,6 +50,7 @@
             pass
         return _raw_storage_getitem_unaligned(T, storage, offset)
 '''
+
 def simple_unary_op(func):
     specialize.argtype(1)(func)
     @functools.wraps(func)
@@ -2177,7 +2178,7 @@
             self._store(storage, i, offset, box, width)
 
 class UnicodeType(FlexibleType):
-    T = lltype.Char
+    T = lltype.UniChar
     num = NPY.UNICODE
     kind = NPY.UNICODELTR
     char = NPY.UNICODELTR
@@ -2189,58 +2190,121 @@
     def coerce(self, space, dtype, w_item):
         if isinstance(w_item, boxes.W_UnicodeBox):
             return w_item
-        raise OperationError(space.w_NotImplementedError, space.wrap(
-            "coerce (probably from set_item) not implemented for unicode type"))
+        value = space.unicode_w(space.unicode_from_object(w_item))
+        return boxes.W_UnicodeBox(value)
 
     def store(self, arr, i, offset, box, native):
         assert isinstance(box, boxes.W_UnicodeBox)
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        value = box._value
+        with arr as storage:
+            self._store(storage, i, offset, box, arr.dtype.elsize)
+
+    @jit.unroll_safe
+    def _store(self, storage, i, offset, box, width):
+        size = min(width // 4, len(box._value))
+        for k in range(size):
+            index = i + offset + 4*k
+            data = rffi.cast(Int32.T, ord(box._value[k]))
+            raw_storage_setitem_unaligned(storage, index, data)
+        for k in range(size, width // 4):
+            index = i + offset + 4*k
+            data = rffi.cast(Int32.T, 0)
+            raw_storage_setitem_unaligned(storage, index, data)
 
     def read(self, arr, i, offset, dtype):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        if dtype is None:
+            dtype = arr.dtype
+        size = dtype.elsize // 4
+        builder = UnicodeBuilder(size)
+        with arr as storage:
+            for k in range(size):
+                index = i + offset + 4*k
+                codepoint = raw_storage_getitem_unaligned(
+                    Int32.T, arr.storage, index)
+                char = unichr(codepoint)
+                if char == u'\0':
+                    break
+                builder.append(char)
+        return boxes.W_UnicodeBox(builder.build())
 
     def str_format(self, item, add_quotes=True):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(item, boxes.W_UnicodeBox)
+        if add_quotes:
+            w_unicode = self.to_builtin_type(self.space, item)
+            return self.space.str_w(self.space.repr(w_unicode))
+        else:
+            # Same as W_UnicodeBox.descr_repr() but without quotes and prefix
+            from rpython.rlib.runicode import unicode_encode_unicode_escape
+            return unicode_encode_unicode_escape(item._value,
+                                                 len(item._value), 'strict')
 
     def to_builtin_type(self, space, box):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(box, boxes.W_UnicodeBox)
+        return space.wrap(box._value)
 
     def eq(self, v1, v2):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v1, boxes.W_UnicodeBox)
+        assert isinstance(v2, boxes.W_UnicodeBox)
+        return v1._value == v2._value
 
     def ne(self, v1, v2):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v1, boxes.W_UnicodeBox)
+        assert isinstance(v2, boxes.W_UnicodeBox)
+        return v1._value != v2._value
 
     def lt(self, v1, v2):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v1, boxes.W_UnicodeBox)
+        assert isinstance(v2, boxes.W_UnicodeBox)
+        return v1._value < v2._value
 
     def le(self, v1, v2):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v1, boxes.W_UnicodeBox)
+        assert isinstance(v2, boxes.W_UnicodeBox)
+        return v1._value <= v2._value
 
     def gt(self, v1, v2):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v1, boxes.W_UnicodeBox)
+        assert isinstance(v2, boxes.W_UnicodeBox)
+        return v1._value > v2._value
 
     def ge(self, v1, v2):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v1, boxes.W_UnicodeBox)
+        assert isinstance(v2, boxes.W_UnicodeBox)
+        return v1._value >= v2._value
 
     def logical_and(self, v1, v2):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v1, boxes.W_UnicodeBox)
+        assert isinstance(v2, boxes.W_UnicodeBox)
+        if bool(v1) and bool(v2):
+            return Bool._True
+        return Bool._False
 
     def logical_or(self, v1, v2):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v1, boxes.W_UnicodeBox)
+        assert isinstance(v2, boxes.W_UnicodeBox)
+        if bool(v1) or bool(v2):
+            return Bool._True
+        return Bool._False
 
     def logical_not(self, v):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
-
-    @str_binary_op
+        assert isinstance(v, boxes.W_UnicodeBox)
+        return not bool(v)
+
     def logical_xor(self, v1, v2):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v1, boxes.W_UnicodeBox)
+        assert isinstance(v2, boxes.W_UnicodeBox)
+        a = bool(v1)
+        b = bool(v2)
+        return (not b and a) or (not a and b)
 
     def bool(self, v):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(v, boxes.W_UnicodeBox)
+        return bool(v._value)
 
     def fill(self, storage, width, native, box, start, stop, offset, gcstruct):
-        raise oefmt(self.space.w_NotImplementedError, "unicode type not completed")
+        assert isinstance(box, boxes.W_UnicodeBox)
+        for i in xrange(start, stop, width):
+            self._store(storage, i, offset, box, width)
 
 
 class VoidType(FlexibleType):


More information about the pypy-commit mailing list