[pypy-commit] pypy unicode-utf8: fix unicodedata module
fijal
pypy.commits at gmail.com
Sat Dec 9 08:11:35 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r93326:b4456e64ff3b
Date: 2017-12-09 15:10 +0200
http://bitbucket.org/pypy/pypy/changeset/b4456e64ff3b/
Log: fix unicodedata module
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -7,11 +7,8 @@
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.typedef import TypeDef, interp_attrproperty
from rpython.rlib.rarithmetic import r_longlong
-from rpython.rlib.objectmodel import we_are_translated
-from rpython.rlib.runicode import MAXUNICODE
from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
-from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate
-import sys
+from rpython.rlib.rutf8 import Utf8StringBuilder, unichr_as_utf8
# Constants for Hangul characters
@@ -30,49 +27,17 @@
# unicode code point.
-if MAXUNICODE > 0xFFFF:
- # Target is wide build
- def unichr_to_code_w(space, w_unichr):
- if not space.isinstance_w(w_unichr, space.w_unicode):
- raise oefmt(
- space.w_TypeError, 'argument 1 must be unicode, not %T',
- w_unichr)
+# Target is wide build
+def unichr_to_code_w(space, w_unichr):
+ if not space.isinstance_w(w_unichr, space.w_unicode):
+ raise oefmt(
+ space.w_TypeError, 'argument 1 must be unicode, not %T',
+ w_unichr)
- if not we_are_translated() and sys.maxunicode == 0xFFFF:
- # Host CPython is narrow build, accept surrogates
- try:
- return ord_accepts_surrogate(space.unicode_w(w_unichr))
- except TypeError:
- raise oefmt(space.w_TypeError,
- "need a single Unicode character as parameter")
- else:
- if not space.len_w(w_unichr) == 1:
- raise oefmt(space.w_TypeError,
- "need a single Unicode character as parameter")
- return space.int_w(space.ord(w_unichr))
-
-else:
- # Target is narrow build
- def unichr_to_code_w(space, w_unichr):
- if not space.isinstance_w(w_unichr, space.w_unicode):
- raise oefmt(
- space.w_TypeError, 'argument 1 must be unicode, not %T',
- w_unichr)
-
- if not we_are_translated() and sys.maxunicode > 0xFFFF:
- # Host CPython is wide build, forbid surrogates
- if not space.len_w(w_unichr) == 1:
- raise oefmt(space.w_TypeError,
- "need a single Unicode character as parameter")
- return space.int_w(space.ord(w_unichr))
-
- else:
- # Accept surrogates
- try:
- return ord_accepts_surrogate(space.unicode_w(w_unichr))
- except TypeError:
- raise oefmt(space.w_TypeError,
- "need a single Unicode character as parameter")
+ if not space.len_w(w_unichr) == 1:
+ raise oefmt(space.w_TypeError,
+ "need a single Unicode character as parameter")
+ return space.int_w(space.ord(w_unichr))
class UCD(W_Root):
@@ -110,7 +75,7 @@
except KeyError:
msg = space.mod(space.newtext("undefined character name '%s'"), space.newtext(name))
raise OperationError(space.w_KeyError, msg)
- return space.newunicode(code_to_unichr(code))
+ return space.newutf8(unichr_as_utf8(code), 1)
def name(self, space, w_unichr, w_default=None):
code = unichr_to_code_w(space, w_unichr)
@@ -259,10 +224,10 @@
result[0] = ch
if not composed: # If decomposed normalization we are done
- return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+ return self.build(space, result, stop=j)
if j <= 1:
- return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+ return self.build(space, result, stop=j)
current = result[0]
starter_pos = 0
@@ -310,7 +275,13 @@
result[starter_pos] = current
- return space.newunicode(u''.join([unichr(i) for i in result[:next_insert]]))
+ return self.build(space, result, stop=next_insert)
+
+ def build(self, space, r, stop):
+ builder = Utf8StringBuilder(stop * 3)
+ for i in range(stop):
+ builder.append_code(r[i])
+ return space.newutf8(builder.build(), stop)
methods = {}
diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py
--- a/pypy/module/unicodedata/test/test_hyp.py
+++ b/pypy/module/unicodedata/test/test_hyp.py
@@ -1,3 +1,4 @@
+
import pytest
try:
from hypothesis import given, strategies as st, example, settings
@@ -5,12 +6,14 @@
pytest.skip("hypothesis required")
from pypy.module.unicodedata.interp_ucd import ucd
+from rpython.rlib.rutf8 import get_utf8_length
def make_normalization(space, NF_code):
def normalize(s):
- w_s = space.newunicode(s)
+ u = s.encode('utf8')
+ w_s = space.newutf8(u, get_utf8_length(u))
w_res = ucd.normalize(space, NF_code, w_s)
- return space.unicode_w(w_res)
+ return space.utf8_w(w_res).decode('utf8')
return normalize
all_forms = ['NFC', 'NFD', 'NFKC', 'NFKD']
More information about the pypy-commit mailing list