[pypy-commit] pypy unicode-utf8: Review for surrogates
arigo
pypy.commits at gmail.com
Thu Nov 23 04:48:57 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93136:dc6582a05b85
Date: 2017-11-23 10:48 +0100
http://bitbucket.org/pypy/pypy/changeset/dc6582a05b85/
Log: Review for surrogates
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -370,14 +370,15 @@
builder.append(res)
else:
# when we get here, chr is a 32-bit unicode character
- if chr > 0x10ffff:
+ try:
+ rutf8.unichr_as_utf8_append(builder, intmask(chr), True)
+ except ValueError:
message = "illegal Unicode character"
res, pos = errorhandler(errors, encoding,
message, s, pos-2, pos+digits)
size, flag = rutf8.check_utf8(res, True)
builder.append(res)
else:
- rutf8.unichr_as_utf8_append(builder, intmask(chr), True)
flag = rutf8.get_flag_from_code(intmask(chr))
pos += digits
size = 1
@@ -466,7 +467,7 @@
pos += 1
x = (x<<3) + ord(ch) - ord('0')
outsize += 1
- if x >= 0x7F:
+ if x > 0x7F:
rutf8.unichr_as_utf8_append(builder, x)
flag = combine_flags(rutf8.FLAG_REGULAR, flag)
else:
@@ -524,7 +525,9 @@
pos = look + 1
outsize += 1
flag = combine_flags(flag, rutf8.get_flag_from_code(code))
- rutf8.unichr_as_utf8_append(builder, code)
+ rutf8.unichr_as_utf8_append(builder, code,
+ allow_surrogates=True)
+ # xxx 'code' is probably always within range here...
else:
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, look+1)
@@ -772,7 +775,8 @@
surrogate = 0
continue
else:
- rutf8.unichr_as_utf8_append(result, surrogate)
+ rutf8.unichr_as_utf8_append(result, surrogate,
+ allow_surrogates=True)
flag = rutf8.FLAG_HAS_SURROGATES
outsize += 1
surrogate = 0
@@ -1236,7 +1240,7 @@
result.append(r)
continue
- rutf8.unichr_as_utf8_append(result, ch)
+ rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=True)
pos += 4
r = result.build()
lgt, flag = rutf8.check_utf8(r, True)
@@ -1360,7 +1364,7 @@
s, pos, pos + unicode_bytes)
result.append(res)
continue
- rutf8.unichr_as_utf8_append(result, intmask(t))
+ rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True)
pos += unicode_bytes
r = result.build()
lgt, flag = rutf8.check_utf8(r, True)
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -127,7 +127,7 @@
errorcb, namecb, stringdata)
src = pypy_cjk_dec_outbuf(decodebuf)
length = pypy_cjk_dec_outlen(decodebuf)
- return rffi.wcharpsize2utf8(src, length)
+ return rffi.wcharpsize2utf8(src, length) # assumes no out-of-range chars
def multibytecodec_decerror(decodebuf, e, errors,
errorcb, namecb, stringdata):
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1012,6 +1012,7 @@
def wcharpsize2utf8(w, size):
""" Helper to convert WCHARP pointer to utf8 in one go.
Equivalent to wcharpsize2unicode().encode("utf8")
+ Raises ValueError if characters are outside range(0x110000)!
"""
from rpython.rlib import rutf8
More information about the pypy-commit mailing list