[pypy-commit] pypy unicode-utf8: fix micronumpy

fijal pypy.commits at gmail.com
Mon Dec 11 01:38:36 EST 2017


Author: fijal
Branch: unicode-utf8
Changeset: r93358:fadafada40af
Date: 2017-12-11 08:37 +0200
http://bitbucket.org/pypy/pypy/changeset/fadafada40af/

Log:	fix micronumpy

diff --git a/pypy/module/micronumpy/boxes.py b/pypy/module/micronumpy/boxes.py
--- a/pypy/module/micronumpy/boxes.py
+++ b/pypy/module/micronumpy/boxes.py
@@ -11,6 +11,7 @@
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib import jit
+from rpython.rlib.rutf8 import get_utf8_length
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.tool.sourcetools import func_with_new_name
 from pypy.module.micronumpy import constants as NPY
@@ -636,7 +637,8 @@
         if dtype.is_unicode():
             return self
         elif dtype.is_object():
-            return W_ObjectBox(space.newunicode(self._value))
+            return W_ObjectBox(space.newutf8(self._value,
+                               get_utf8_length(self._value)))
         else:
             raise oefmt(space.w_NotImplementedError,
                         "Conversion from unicode not implemented yet")
@@ -646,7 +648,7 @@
         return new_unicode_dtype(space, len(self._value))
 
     def descr__new__unicode_box(space, w_subtype, w_arg):
-        value = space.unicode_w(space.unicode_from_object(w_arg))
+        value = space.utf8_w(space.unicode_from_object(w_arg))
         return W_UnicodeBox(value)
 
 class W_ObjectBox(W_GenericBox):
diff --git a/pypy/module/micronumpy/compile.py b/pypy/module/micronumpy/compile.py
--- a/pypy/module/micronumpy/compile.py
+++ b/pypy/module/micronumpy/compile.py
@@ -197,7 +197,7 @@
         return StringObject(obj)
     newbytes = newtext
 
-    def newunicode(self, obj):
+    def newutf8(self, obj, l):
         raise NotImplementedError
 
     def newlist(self, items):
@@ -305,10 +305,10 @@
         raise NotImplementedError
     text_w = bytes_w
 
-    def unicode_w(self, w_obj):
+    def utf8_w(self, w_obj):
         # XXX
         if isinstance(w_obj, StringObject):
-            return unicode(w_obj.v)
+            return w_obj.v
         raise NotImplementedError
 
     def int(self, w_obj):
diff --git a/pypy/module/micronumpy/types.py b/pypy/module/micronumpy/types.py
--- a/pypy/module/micronumpy/types.py
+++ b/pypy/module/micronumpy/types.py
@@ -1,6 +1,7 @@
 import functools
 import math
 from rpython.rlib.unroll import unrolling_iterable
+from rpython.rlib.rutf8 import Utf8StringIterator, get_utf8_length, Utf8StringBuilder
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.objspace.std.floatobject import float2string
 from pypy.objspace.std.complexobject import str_format
@@ -2271,23 +2272,29 @@
         if isinstance(w_item, boxes.W_UnicodeBox):
             return w_item
         if isinstance(w_item, boxes.W_ObjectBox):
-            value = space.unicode_w(space.unicode_from_object(w_item.w_obj))
+            value = space.utf8_w(space.unicode_from_object(w_item.w_obj))
         else:
-            value = space.unicode_w(space.unicode_from_object(w_item))
+            value = space.utf8_w(space.unicode_from_object(w_item))
         return boxes.W_UnicodeBox(value)
 
+    def convert_utf8_to_unichar_list(self, utf8):
+        l = []
+        for ch in Utf8StringIterator(utf8):
+            l.append(unichr(ch))
+        return l
+
     def store(self, arr, i, offset, box, native):
         assert isinstance(box, boxes.W_UnicodeBox)
-        value = box._value
         with arr as storage:
             self._store(storage, i, offset, box, arr.dtype.elsize)
 
     @jit.unroll_safe
     def _store(self, storage, i, offset, box, width):
-        size = min(width // 4, len(box._value))
+        v = self.convert_utf8_to_unichar_list(box._value)
+        size = min(width // 4, len(v))
         for k in range(size):
             index = i + offset + 4*k
-            data = rffi.cast(Int32.T, ord(box._value[k]))
+            data = rffi.cast(Int32.T, ord(v[k]))
             raw_storage_setitem_unaligned(storage, index, data)
         # zero out the remaining memory
         for index in range(size * 4 + i + offset, width):
@@ -2298,16 +2305,16 @@
         if dtype is None:
             dtype = arr.dtype
         size = dtype.elsize // 4
-        builder = UnicodeBuilder(size)
+        builder = Utf8StringBuilder(size)
         with arr as storage:
             for k in range(size):
                 index = i + offset + 4*k
-                codepoint = raw_storage_getitem_unaligned(
-                    Int32.T, arr.storage, index)
-                char = unichr(codepoint)
-                if char == u'\0':
+                codepoint = rffi.cast(lltype.Signed,
+                    raw_storage_getitem_unaligned(
+                    Int32.T, arr.storage, index))
+                if codepoint == 0:
                     break
-                builder.append(char)
+                builder.append_code(codepoint)
         return boxes.W_UnicodeBox(builder.build())
 
     def str_format(self, item, add_quotes=True):
@@ -2323,7 +2330,7 @@
 
     def to_builtin_type(self, space, box):
         assert isinstance(box, boxes.W_UnicodeBox)
-        return space.newunicode(box._value)
+        return space.newutf8(box._value, get_utf8_length(box._value))
 
     def eq(self, v1, v2):
         assert isinstance(v1, boxes.W_UnicodeBox)


More information about the pypy-commit mailing list