[pypy-commit] pypy unicode-utf8: fixes until we get to formatting problems
fijal
pypy.commits at gmail.com
Mon Nov 20 10:55:02 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r93101:f074b4987d57
Date: 2017-11-20 16:54 +0100
http://bitbucket.org/pypy/pypy/changeset/f074b4987d57/
Log: fixes until we get to formatting problems
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1759,20 +1759,6 @@
def utf8_w(self, w_obj):
return w_obj.utf8_w(self)
-
- @specialize.argtype(1)
- def unicode_w(self, w_obj):
- return self.utf8_w(w_obj).decode('utf8')
-
- def realunicode_w(self, w_obj):
- return self.realutf8_w(w_obj).decode('utf8')
-
- def newunicode(self, u):
- from pypy.interpreter import unicodehelper
- assert isinstance(u, unicode)
- # XXX let's disallow that
- return self.newutf8(u.encode("utf8"), len(u), unicodehelper._get_flag(u))
-
def convert_to_w_unicode(self, w_obj):
return w_obj.convert_to_w_unicode(self)
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -61,10 +61,10 @@
@given(strategies.text())
def test_unicode_raw_escape(u):
- r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict')
+ r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None)
assert r == u.encode("raw-unicode-escape")
@given(strategies.text())
def test_unicode_escape(u):
- r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict")
+ r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
assert r == u.encode("unicode-escape")
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -60,14 +60,12 @@
return True
return False
-def _get_flag(u):
- flag = rutf8.FLAG_ASCII
- for c in u:
- if 0xD800 <= ord(c) <= 0xDFFF:
- return rutf8.FLAG_HAS_SURROGATES
- if ord(c) >= 0x80:
- flag = rutf8.FLAG_REGULAR
- return flag
+def get_flag_from_code(oc):
+ if oc <= 0x7F:
+ return rutf8.FLAG_ASCII
+ if 0xD800 <= oc <= 0xDFFF:
+ return rutf8.FLAG_HAS_SURROGATES
+ return rutf8.FLAG_REGULAR
# These functions take and return unwrapped rpython strings
def decode_unicode_escape(space, string):
@@ -134,7 +132,11 @@
return ress, len(s), lgt, flag
def str_decode_latin_1(s, errors, final, errorhandler):
- xxx
+ try:
+ rutf8.check_ascii(s)
+ return s, len(s), len(s), rutf8.FLAG_ASCII
+ except rutf8.CheckError:
+ return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
def utf8_encode_latin_1(s, errors, errorhandler):
try:
@@ -208,7 +210,6 @@
slen = len(s)
res = StringBuilder(slen)
pos = 0
- continuation_bytes = 0
end = len(s)
while pos < end:
ordch1 = ord(s[pos])
@@ -229,6 +230,7 @@
if ordch1 <= 0xDF:
if pos >= end:
if not final:
+ pos -= 1
break
r, pos = errorhandler(errors, "utf8", "unexpected end of data",
s, pos - 1, pos)
@@ -243,7 +245,6 @@
continue
# 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
pos += 1
- continuation_bytes += 1
res.append(chr(ordch1))
res.append(chr(ordch2))
continue
@@ -251,6 +252,7 @@
if ordch1 <= 0xEF:
if (pos + 2) > end:
if not final:
+ pos -= 1
break
r, pos = errorhandler(errors, "utf8", "unexpected end of data",
s, pos - 1, pos + 1)
@@ -272,7 +274,6 @@
pos += 2
# 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
- continuation_bytes += 2
res.append(chr(ordch1))
res.append(chr(ordch2))
res.append(chr(ordch3))
@@ -281,6 +282,7 @@
if ordch1 <= 0xF4:
if (pos + 3) > end:
if not final:
+ pos -= 1
break
r, pos = errorhandler(errors, "utf8", "unexpected end of data",
s, pos - 1, pos)
@@ -312,15 +314,12 @@
res.append(chr(ordch2))
res.append(chr(ordch3))
res.append(chr(ordch4))
- continuation_bytes += 3
continue
r, pos = errorhandler(errors, "utf8", "invalid start byte",
s, pos - 1, pos)
res.append(r)
- assert pos == end
- assert pos - continuation_bytes >= 0
r = res.build()
lgt, flag = rutf8.check_utf8(r, True)
return r, pos, lgt, flag
@@ -352,19 +351,14 @@
else:
# when we get here, chr is a 32-bit unicode character
if chr > 0x10ffff:
- UUU
message = "illegal Unicode character"
res, pos = errorhandler(errors, encoding,
message, s, pos-2, pos+digits)
+ size, flag = rutf8.check_utf8(res)
builder.append(res)
else:
rutf8.unichr_as_utf8_append(builder, chr, True)
- if chr <= 0x7f:
- flag = rutf8.FLAG_ASCII
- elif 0xd800 <= chr <= 0xdfff:
- flag = rutf8.FLAG_HAS_SURROGATES
- else:
- flag = rutf8.FLAG_REGULAR
+ flag = get_flag_from_code(chr)
pos += digits
size = 1
@@ -508,22 +502,22 @@
builder.append(res)
continue
pos = look + 1
- XXX
- if code <= MAXUNICODE:
- builder.append(UNICHR(code))
- else:
- code -= 0x10000L
- builder.append(unichr(0xD800 + (code >> 10)))
- builder.append(unichr(0xDC00 + (code & 0x03FF)))
+ outsize += 1
+ flag = combine_flags(flag, get_flag_from_code(code))
+ rutf8.unichr_as_utf8_append(builder, code)
else:
- YYY
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, look+1)
+ newsize, newflag = rutf8.check_utf8(res, True)
+ flag = combine_flags(flag, newflag)
+ outsize += newsize
builder.append(res)
else:
- AAA
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, look+1)
+ newsize, newflag = rutf8.check_utf8(res, True)
+ flag = combine_flags(flag, newflag)
+ outsize += newsize
builder.append(res)
else:
builder.append('\\')
@@ -602,7 +596,7 @@
for i in range(zeros-1, -1, -1):
result.append(TABLE[(char >> (4 * i)) & 0x0f])
-def utf8_encode_raw_unicode_escape(s, errors, errorhandler=None):
+def utf8_encode_raw_unicode_escape(s, errors, errorhandler):
# errorhandler is not used: this function cannot cause Unicode errors
size = len(s)
if size == 0:
@@ -621,7 +615,7 @@
return result.build()
-def utf8_encode_unicode_escape(s, errors):
+def utf8_encode_unicode_escape(s, errors, errorhandler):
return _utf8_encode_unicode_escape(s)
# ____________________________________________________________
@@ -851,7 +845,7 @@
assert final_length >= 0
return result.build()[:final_length], pos, outsize, flag
-def utf8_encode_utf_7(s, errors, errorhandler=None):
+def utf8_encode_utf_7(s, errors, errorhandler):
size = len(s)
if size == 0:
return ''
@@ -1294,3 +1288,153 @@
errorhandler=None, allow_surrogates=True):
return unicode_encode_utf_32_helper(s, errors, errorhandler,
allow_surrogates, "little")
+
+# ____________________________________________________________
+# unicode-internal
+
+def str_decode_unicode_internal(s, errors, final=False,
+ errorhandler=None):
+ size = len(s)
+ if size == 0:
+ return '', 0, 0, rutf8.FLAG_ASCII
+
+ unicode_bytes = 4
+ if BYTEORDER == "little":
+ start = 0
+ stop = unicode_bytes
+ step = 1
+ else:
+ start = unicode_bytes - 1
+ stop = -1
+ step = -1
+
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ if pos > size - unicode_bytes:
+ res, pos = errorhandler(errors, "unicode_internal",
+ "truncated input",
+ s, pos, size)
+ result.append(res)
+ if pos > size - unicode_bytes:
+ break
+ continue
+ t = r_uint(0)
+ h = 0
+ for j in range(start, stop, step):
+ t += r_uint(ord(s[pos + j])) << (h*8)
+ h += 1
+ if t > 0x10ffff:
+ res, pos = errorhandler(errors, "unicode_internal",
+ "unichr(%d) not in range" % (t,),
+ s, pos, pos + unicode_bytes)
+ result.append(res)
+ continue
+ rutf8.unichr_as_utf8_append(result, intmask(t))
+ pos += unicode_bytes
+ r = result.build()
+ lgt, flag = rutf8.check_utf8(r, True)
+ return r, pos, lgt, flag
+
+def utf8_encode_unicode_internal(s, errors, errorhandler):
+ size = len(s)
+ if size == 0:
+ return ''
+
+ result = StringBuilder(size * 4)
+ pos = 0
+ while pos < size:
+ oc = rutf8.codepoint_at_pos(s, pos)
+ if BYTEORDER == "little":
+ result.append(chr(oc & 0xFF))
+ result.append(chr(oc >> 8 & 0xFF))
+ result.append(chr(oc >> 16 & 0xFF))
+ result.append(chr(oc >> 24 & 0xFF))
+ else:
+ result.append(chr(oc >> 24 & 0xFF))
+ result.append(chr(oc >> 16 & 0xFF))
+ result.append(chr(oc >> 8 & 0xFF))
+ result.append(chr(oc & 0xFF))
+ pos = rutf8.next_codepoint_pos(s, pos)
+
+ return result.build()
+
+# ____________________________________________________________
+# Charmap
+
+ERROR_CHAR = u'\ufffe'.encode('utf8')
+
+@specialize.argtype(4)
+def str_decode_charmap(s, errors, final=False,
+ errorhandler=None, mapping=None):
+ "mapping can be a rpython dictionary, or a dict-like object."
+
+ # Default to Latin-1
+ if mapping is None:
+ return str_decode_latin_1(s, errors, final=final,
+ errorhandler=errorhandler)
+ size = len(s)
+ if size == 0:
+ return '', 0, 0, rutf8.FLAG_ASCII
+
+ pos = 0
+ result = StringBuilder(size)
+ while pos < size:
+ ch = s[pos]
+
+ c = mapping.get(ch, ERROR_CHAR)
+ if c == ERROR_CHAR:
+ r, pos = errorhandler(errors, "charmap",
+ "character maps to <undefined>",
+ s, pos, pos + 1)
+ result.append(r)
+ continue
+ result.append(c)
+ pos += 1
+ r = result.build()
+ lgt, flag = rutf8.check_utf8(r, True)
+ return r, pos, lgt, flag
+
+def utf8_encode_charmap(s, errors, errorhandler=None,
+ mapping=None):
+ YYY
+ if mapping is None:
+ return unicode_encode_latin_1(s, size, errors,
+ errorhandler=errorhandler)
+
+ if errorhandler is None:
+ errorhandler = default_unicode_error_encode
+
+ if size == 0:
+ return ''
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ c = mapping.get(ch, '')
+ if len(c) == 0:
+ # collect all unencodable chars. Important for narrow builds.
+ collend = pos + 1
+ while collend < size and mapping.get(s[collend], '') == '':
+ collend += 1
+ ru, rs, pos = errorhandler(errors, "charmap",
+ "character maps to <undefined>",
+ s, pos, collend)
+ if rs is not None:
+ # py3k only
+ result.append(rs)
+ continue
+ for ch2 in ru:
+ c2 = mapping.get(ch2, '')
+ if len(c2) == 0:
+ errorhandler(
+ "strict", "charmap",
+ "character maps to <undefined>",
+ s, pos, pos + 1)
+ result.append(c2)
+ continue
+ result.append(c)
+ pos += 1
+ return result.build()
+
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,7 +1,6 @@
from rpython.rlib import jit, rutf8
from rpython.rlib.objectmodel import we_are_translated, not_rpython
from rpython.rlib.rstring import UnicodeBuilder
-from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -563,14 +562,14 @@
if space.isinstance_w(w_ch, space.w_unicode):
# Charmap may return a unicode string
- return space.unicode_w(w_ch)
+ return space.utf8_w(w_ch)
elif space.isinstance_w(w_ch, space.w_int):
# Charmap may return a number
x = space.int_w(w_ch)
if not 0 <= x <= 0x10FFFF:
raise oefmt(space.w_TypeError,
"character mapping must be in range(0x110000)")
- return code_to_unichr(x)
+ return rutf8.unichr_as_utf8(x)
elif space.is_w(w_ch, space.w_None):
# Charmap may return None
return errorchar
@@ -614,12 +613,13 @@
@unwrap_spec(string='bufferstr', errors='text_or_none')
def charmap_decode(space, string, errors="strict", w_mapping=None):
- from pypy.interpreter.unicodehelper import DecodeWrapper
+ from pypy.interpreter import unicodehelper
if errors is None:
errors = 'strict'
if len(string) == 0:
- return space.newtuple([space.newunicode(u''), space.newint(0)])
+ return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII),
+ space.newint(0)])
if space.is_none(w_mapping):
mapping = None
@@ -628,14 +628,14 @@
final = True
state = space.fromcache(CodecState)
- result, consumed = runicode.str_decode_charmap(
- string, len(string), errors,
- final, DecodeWrapper(state.decode_error_handler).handle, mapping)
- return space.newtuple([space.newunicode(result), space.newint(consumed)])
+ result, consumed, lgt, flag = unicodehelper.str_decode_charmap(
+ string, errors, final, state.decode_error_handler, mapping)
+ return space.newtuple([space.newutf8(result, lgt, flag),
+ space.newint(consumed)])
@unwrap_spec(utf8='utf8', errors='text_or_none')
def charmap_encode(space, utf8, errors="strict", w_mapping=None):
- from pypy.interpreter.unicodehelper import EncodeWrapper
+ from pypy.interpreter import unicodehelper
if errors is None:
errors = 'strict'
@@ -645,10 +645,8 @@
mapping = Charmap_Encode(space, w_mapping)
state = space.fromcache(CodecState)
- uni = utf8.decode('utf8')
- result = runicode.unicode_encode_charmap(
- uni, len(uni), errors,
- EncodeWrapper(state.encode_error_handler).handle, mapping)
+ result = unicodehelper.unicode_encode_charmap(
+ utf8, errors, state.encode_error_handler, mapping)
return space.newtuple([space.newbytes(result), space.newint(len(uni))])
@@ -707,7 +705,7 @@
@unwrap_spec(errors='text_or_none')
def unicode_internal_decode(space, w_string, errors="strict"):
- from pypy.interpreter.unicodehelper import DecodeWrapper
+ from pypy.interpreter import unicodehelper
if errors is None:
errors = 'strict'
@@ -718,14 +716,16 @@
string = space.readbuf_w(w_string).as_str()
if len(string) == 0:
- return space.newtuple([space.newunicode(u''), space.newint(0)])
+ return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII),
+ space.newint(0)])
final = True
state = space.fromcache(CodecState)
- result, consumed = runicode.str_decode_unicode_internal(
- string, len(string), errors,
- final, DecodeWrapper(state.decode_error_handler).handle)
- return space.newtuple([space.newunicode(result), space.newint(consumed)])
+ result, consumed, lgt, flag = unicodehelper.str_decode_unicode_internal(
+ string, errors,
+ final, state.decode_error_handler)
+ return space.newtuple([space.newutf8(result, lgt, flag),
+ space.newint(consumed)])
# ____________________________________________________________
# support for the "string escape" codec
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -15,7 +15,6 @@
'utf-32', 'utf-32-le', 'utf-32-be',
'raw_unicode_escape',
'unicode_escape', 'unicode_internal'):
- print encoding
assert unicode(u.encode(encoding),encoding) == u
def test_ucs4(self):
diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py
--- a/pypy/module/exceptions/interp_exceptions.py
+++ b/pypy/module/exceptions/interp_exceptions.py
@@ -285,7 +285,7 @@
def descr_init(self, space, w_object, w_start, w_end, w_reason):
# typechecking
- space.realunicode_w(w_object)
+ space.utf8_w(w_object)
space.int_w(w_start)
space.int_w(w_end)
space.realtext_w(w_reason)
@@ -719,7 +719,7 @@
def descr_init(self, space, w_encoding, w_object, w_start, w_end, w_reason):
# typechecking
space.realtext_w(w_encoding)
- space.realunicode_w(w_object) # XXX realutf8()?
+ space.utf8_w(w_object)
space.int_w(w_start)
space.int_w(w_end)
space.realtext_w(w_reason)
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -432,8 +432,7 @@
def fmt_s(self, w_value):
space = self.space
- got_unicode = space.isinstance_w(w_value,
- space.w_unicode)
+ got_unicode = space.isinstance_w(w_value, space.w_unicode)
if not do_unicode:
if got_unicode:
raise NeedUnicodeFormattingError
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -164,9 +164,9 @@
if isinstance(x, str):
return self.newtext(x)
if isinstance(x, unicode):
- from pypy.interpreter import unicodehelper
- return self.newutf8(x.encode('utf8'), len(x),
- unicodehelper._get_flag(x))
+ x = x.encode('utf8')
+ lgt, flag = rutf8.check_utf8(x, True)
+ return self.newutf8(x, lgt, flag)
if isinstance(x, float):
return W_FloatObject(x)
if isinstance(x, W_Root):
More information about the pypy-commit
mailing list