[pypy-commit] pypy unicode-utf8-py3: add dummy allow_surrogates kwarg to all encoding functions, fix translation
mattip
pypy.commits at gmail.com
Mon Sep 17 01:45:57 EDT 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95137:613679b79386
Date: 2018-09-17 08:44 +0300
http://bitbucket.org/pypy/pypy/changeset/613679b79386/
Log: add dummy allow_surrogates kwarg to all encoding functions, fix translation
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -245,7 +245,7 @@
s = start + ru + end
return s
-def utf8_encode_latin_1(s, errors, errorhandler):
+def utf8_encode_latin_1(s, errors, errorhandler, allow_surrogates=False):
try:
rutf8.check_ascii(s)
return s
@@ -311,7 +311,7 @@
return result.build()
if _WIN32:
- def utf8_encode_mbcs(s, errors, errorhandler):
+ def utf8_encode_mbcs(s, errors, errorhandler, allow_surrogates=False):
res = rutf8.utf8_encode_mbcs(s, errors, errorhandler,
force_replace=False)
return res
@@ -321,7 +321,7 @@
res, size = runicode.str_decode_mbcs(s, slen, errors, final=final,
errorhandler=errorhandler, force_ignore=force_ignore)
res_utf8 = runicode.unicode_encode_utf_8(res, len(res), 'strict')
- return res_utf8, len(res)
+ return res_utf8, len(res), len(res)
def str_decode_utf8(s, errors, final, errorhandler, allow_surrogates=False):
""" Same as checking for the valid utf8, but we know the utf8 is not
@@ -686,7 +686,7 @@
for i in range(zeros-1, -1, -1):
result.append(TABLE[(char >> (4 * i)) & 0x0f])
-def utf8_encode_raw_unicode_escape(s, errors, errorhandler):
+def utf8_encode_raw_unicode_escape(s, errors, errorhandler, allow_surrogates=False):
# errorhandler is not used: this function cannot cause Unicode errors
size = len(s)
if size == 0:
@@ -705,7 +705,7 @@
return result.build()
-def utf8_encode_unicode_escape(s, errors, errorhandler):
+def utf8_encode_unicode_escape(s, errors, errorhandler, allow_surrogates=False):
return _utf8_encode_unicode_escape(s)
# ____________________________________________________________
@@ -938,7 +938,7 @@
assert final_length >= 0
return result.build()[:final_length], outsize, size
-def utf8_encode_utf_7(s, errors, errorhandler):
+def utf8_encode_utf_7(s, errors, errorhandler, allow_surrogates=False):
size = len(s)
if size == 0:
return ''
@@ -1002,7 +1002,7 @@
errorhandler=encode_unicode_error_handler(space),
allow_surrogates=allow_surrogates)
-def encode_utf8sp(space, uni):
+def encode_utf8sp(space, uni, allow_surrogates=True):
# Surrogate-preserving utf-8 encoding. Any surrogate character
# turns into its 3-bytes encoding, whether it is paired or not.
# This should always be reversible, and the reverse is
@@ -1202,7 +1202,8 @@
errors, public_encoding_name, 'surrogates not allowed',
s, pos, pos+1)
#for cp in rutf8.Utf8StringIterator(res_8):
- for cp in res_8:
+ for ch in res_8:
+ cp = ord(ch)
if cp < 0xD800 or allow_surrogates:
_STORECHAR(result, cp, byteorder)
else:
@@ -1566,7 +1567,7 @@
lgt = rutf8.check_utf8(r, True)
return r, lgt
-def utf8_encode_unicode_internal(s, errors, errorhandler):
+def utf8_encode_unicode_internal(s, errors, errorhandler, allow_surrogates=False):
size = len(s)
if size == 0:
return ''
@@ -1625,7 +1626,7 @@
lgt = rutf8.codepoints_in_utf8(r)
return r, lgt, pos
-def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
+def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None, allow_surrogates=False):
if mapping is None:
return utf8_encode_latin_1(s, errors, errorhandler=errorhandler)
size = len(s)
@@ -1667,7 +1668,7 @@
# ____________________________________________________________
# Decimal Encoder
-def unicode_encode_decimal(s, errors, errorhandler=None):
+def unicode_encode_decimal(s, errors, errorhandler=None, allow_surrogates=False):
"""Converts whitespace to ' ', decimal characters to their
corresponding ASCII digit and all other Latin-1 characters except
\0 as-is. Characters outside this range (Unicode ordinals 1-256)
More information about the pypy-commit mailing list