[pypy-commit] pypy unicode-utf8: (fijal, arigo) kill helper methods that don't belong to rutf8
fijal
pypy.commits at gmail.com
Fri Feb 24 14:00:11 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90342:4caeffafa080
Date: 2017-02-24 19:59 +0100
http://bitbucket.org/pypy/pypy/changeset/4caeffafa080/
Log: (fijal, arigo) kill helper methods that don't belong to rutf8
They also don't belong in runicode; this is a more minimal change for
now, but it should be fixed at some point
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -39,18 +39,20 @@
state = space.fromcache(interp_codecs.CodecState)
unicodedata_handler = state.get_unicodedata_handler(space)
# XXX pick better length, maybe
- result, consumed, length = rutf8.str_decode_utf8_escape(
+ # XXX that guy does not belong in runicode (nor in rutf8)
+ result_u, consumed = runicode.str_decode_unicode_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space),
unicodedata_handler=unicodedata_handler)
- return result, length
+ return result_u.encode('utf8'), len(result_u)
def decode_raw_unicode_escape(space, string):
# XXX pick better length, maybe
- result, consumed, length = rutf8.str_decode_raw_utf8_escape(
+ # XXX that guy does not belong in runicode (nor in rutf8)
+ result_u, consumed = runicode.str_decode_raw_unicode_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space))
- return result, length
+ return result_u.encode('utf8'), len(result_u)
def check_utf8(space, string):
# Surrogates are accepted and not treated specially at all.
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -1,6 +1,7 @@
from rpython.rlib.rstring import StringBuilder
from rpython.rlib import runicode, jit
+from rpython.rlib.rarithmetic import r_uint
from rpython.rlib.nonconst import NonConstant
from rpython.tool.sourcetools import func_with_new_name
@@ -29,6 +30,36 @@
chr((0x80 | (code & 0x3f)))), lgt
raise ValueError
+def unichr_as_utf8_append(builder, code):
+ """ Encode code (numeric value) as utf8 encoded string
+ """
+ if code < 0:
+ raise ValueError
+ lgt = 1
+ if code >= runicode.MAXUNICODE:
+ lgt = 2
+ if code < 0x80:
+ # Encode ASCII
+ builder.append(chr(code))
+ return 1
+ if code < 0x0800:
+ # Encode Latin-1
+ builder.append(chr((0xc0 | (code >> 6))))
+ builder.append(chr((0x80 | (code & 0x3f))))
+ return lgt
+ if code < 0x10000:
+ builder.append(chr((0xe0 | (code >> 12))))
+ builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
+ builder.append(chr((0x80 | (code & 0x3f))))
+ return lgt
+ if code < 0x10ffff:
+ builder.append(chr((0xf0 | (code >> 18))))
+ builder.append(chr((0x80 | ((code >> 12) & 0x3f))))
+ builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
+ builder.append(chr((0x80 | (code & 0x3f))))
+ return lgt
+ raise ValueError
+
def next_codepoint_pos(code, pos):
""" Gives the position of the next codepoint after pos, -1
if it's the last one (assumes valid utf8)
@@ -50,6 +81,13 @@
def default_unicode_error_check(*args):
xxx
+def default_unicode_error_decode(errors, encoding, message, s, pos, endpos, lgt):
+ if errors == 'replace':
+ return '\xef\xbf\xbd', endpos, lgt + 1 # u'\ufffd'
+ if errors == 'ignore':
+ return '', endpos, lgt
+ raise UnicodeDecodeError(encoding, s, pos, endpos, message)
+
def check_newline_utf8(s, pos):
chr1 = ord(s[pos])
if 0xa <= chr1 <= 0xd:
@@ -207,184 +245,3 @@
return pos, lgt
str_check_utf8_elidable = jit.elidable(
func_with_new_name(str_check_utf8_impl, "str_check_utf8_elidable"))
-
-
-def str_decode_raw_utf8_escape(s, size, errors, final=False,
- errorhandler=None):
- lgt = 0
- if errorhandler is None:
- errorhandler = None # default_unicode_error_decode
- if size == 0:
- return '', 0, 0
- result = StringBuilder(size)
- pos = 0
- while pos < size:
- ch = s[pos]
-
- # Non-escape characters are interpreted as Unicode ordinals
- if ch != '\\':
- result.append(ch)
- pos += 1
- lgt += 1
- continue
-
- # \u-escapes are only interpreted iff the number of leading
- # backslashes is odd
- bs = pos
- while pos < size:
- pos += 1
- if pos == size or s[pos] != '\\':
- break
- lgt += 1
- result.append('\\')
-
- # we have a backslash at the end of the string, stop here
- if pos >= size:
- lgt += 1
- result.append('\\')
- break
-
- if ((pos - bs) & 1 == 0 or
- pos >= size or
- (s[pos] != 'u' and s[pos] != 'U')):
- result.append('\\')
- result.append(s[pos])
- lgt += 2
- pos += 1
- continue
-
- digits = 4 if s[pos] == 'u' else 8
- message = "truncated \\uXXXX"
- pos += 1
- xxx # change hexescape to deal with utf8
- pos = hexescape(result, s, pos, digits,
- "rawunicodeescape", errorhandler, message, errors)
-
- return result.build(), pos, lgt
-
-def str_decode_utf8_escape(s, size, errors, final=False,
- errorhandler=None,
- unicodedata_handler=None):
- if errorhandler is None:
- errorhandler = default_unicode_error_decode
-
- if size == 0:
- return '', 0, 0
-
- lgt = 0
- builder = StringBuilder(size)
- pos = 0
- while pos < size:
- ch = s[pos]
-
- # Non-escape characters are interpreted as Unicode ordinals
- if ch != '\\':
- builder.append(ch)
- pos += 1
- lgt += 1
- continue
-
- # - Escapes
- pos += 1
- if pos >= size:
- message = "\\ at end of string"
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, size)
- builder.append(res)
- lgt += 1
- continue
-
- ch = s[pos]
- pos += 1
- # \x escapes
- if ch == '\n': pass
- elif ch == '\\': builder.append('\\'); lgt += 1
- elif ch == '\'': builder.append('\''); lgt += 1
- elif ch == '\"': builder.append('\"'); lgt += 1
- elif ch == 'b' : builder.append('\b'); lgt += 1
- elif ch == 'f' : builder.append('\f'); lgt += 1
- elif ch == 't' : builder.append('\t'); lgt += 1
- elif ch == 'n' : builder.append('\n'); lgt += 1
- elif ch == 'r' : builder.append('\r'); lgt += 1
- elif ch == 'v' : builder.append('\v'); lgt += 1
- elif ch == 'a' : builder.append('\a'); lgt += 1
- elif '0' <= ch <= '7':
- xxx
- x = ord(ch) - ord('0')
- if pos < size:
- ch = s[pos]
- if '0' <= ch <= '7':
- pos += 1
- x = (x<<3) + ord(ch) - ord('0')
- if pos < size:
- ch = s[pos]
- if '0' <= ch <= '7':
- pos += 1
- x = (x<<3) + ord(ch) - ord('0')
- builder.append(unichr(x))
- # hex escapes
- # \xXX
- elif ch == 'x':
- xxx
- digits = 2
- message = "truncated \\xXX escape"
- pos = hexescape(builder, s, pos, digits,
- "unicodeescape", errorhandler, message, errors)
-
- # \uXXXX
- elif ch == 'u':
- xxx
- digits = 4
- message = "truncated \\uXXXX escape"
- pos = hexescape(builder, s, pos, digits,
- "unicodeescape", errorhandler, message, errors)
-
- # \UXXXXXXXX
- elif ch == 'U':
- xxx
- digits = 8
- message = "truncated \\UXXXXXXXX escape"
- pos = hexescape(builder, s, pos, digits,
- "unicodeescape", errorhandler, message, errors)
-
- # \N{name}
- elif ch == 'N' and unicodedata_handler is not None:
- xxx
- message = "malformed \\N character escape"
- look = pos
-
- if look < size and s[look] == '{':
- # look for the closing brace
- while look < size and s[look] != '}':
- look += 1
- if look < size and s[look] == '}':
- # found a name. look it up in the unicode database
- message = "unknown Unicode character name"
- name = s[pos+1:look]
- code = unicodedata_handler.call(name)
- if code < 0:
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, look+1)
- builder.append(res)
- continue
- pos = look + 1
- if code <= MAXUNICODE:
- builder.append(UNICHR(code))
- else:
- code -= 0x10000L
- builder.append(unichr(0xD800 + (code >> 10)))
- builder.append(unichr(0xDC00 + (code & 0x03FF)))
- else:
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, look+1)
- builder.append(res)
- else:
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, look+1)
- builder.append(res)
- else:
- builder.append('\\')
- builder.append(ch)
- lgt += 2
-
- return builder.build(), pos, lgt
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -47,11 +47,6 @@
assert consumed == len(s)
assert length == len(u)
-@given(strategies.binary())
-def test_str_decode_raw_utf8_escape(uni):
- return # XXX fix details
- rutf8.str_decode_raw_utf8_escape(uni, len(uni), None)
-
@given(strategies.characters())
def test_next_pos(uni):
skips = []
More information about the pypy-commit
mailing list