[pypy-commit] pypy unicode-utf8: (fijal, arigo) kill helper methods that don't belong to rutf8

Fri Feb 24 14:00:11 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90342:4caeffafa080
Date: 2017-02-24 19:59 +0100
http://bitbucket.org/pypy/pypy/changeset/4caeffafa080/

Log:	(fijal, arigo) kill helper methods that don't belong to rutf8

	They don't belong in runicode either; this is a more minimal change
	for now, but it should be fixed properly at some point

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -39,18 +39,20 @@
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
     # XXX pick better length, maybe
-    result, consumed, length = rutf8.str_decode_utf8_escape(
+    # XXX that guy does not belong in runicode (nor in rutf8)
+    result_u, consumed = runicode.str_decode_unicode_escape(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
         unicodedata_handler=unicodedata_handler)
-    return result, length
+    return result_u.encode('utf8'), len(result_u)
 
 def decode_raw_unicode_escape(space, string):
     # XXX pick better length, maybe
-    result, consumed, length = rutf8.str_decode_raw_utf8_escape(
+    # XXX that guy does not belong in runicode (nor in rutf8)
+    result_u, consumed = runicode.str_decode_raw_unicode_escape(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space))
-    return result, length
+    return result_u.encode('utf8'), len(result_u)
 
 def check_utf8(space, string):
     # Surrogates are accepted and not treated specially at all.
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -1,6 +1,7 @@
 
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib import runicode, jit
+from rpython.rlib.rarithmetic import r_uint
 from rpython.rlib.nonconst import NonConstant
 from rpython.tool.sourcetools import func_with_new_name
 
@@ -29,6 +30,36 @@
                 chr((0x80 | (code & 0x3f)))), lgt
     raise ValueError
 
+def unichr_as_utf8_append(builder, code):
+    """ Encode code (numeric value) as utf8 encoded string
+    """
+    if code < 0:
+        raise ValueError
+    lgt = 1
+    if code >= runicode.MAXUNICODE:
+        lgt = 2
+    if code < 0x80:
+        # Encode ASCII
+        builder.append(chr(code))
+        return 1
+    if code < 0x0800:
+        # Encode Latin-1
+        builder.append(chr((0xc0 | (code >> 6))))
+        builder.append(chr((0x80 | (code & 0x3f))))
+        return lgt
+    if code < 0x10000:
+        builder.append(chr((0xe0 | (code >> 12))))
+        builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
+        builder.append(chr((0x80 | (code & 0x3f))))
+        return lgt
+    if code < 0x10ffff:
+        builder.append(chr((0xf0 | (code >> 18))))
+        builder.append(chr((0x80 | ((code >> 12) & 0x3f))))
+        builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
+        builder.append(chr((0x80 | (code & 0x3f))))
+        return lgt
+    raise ValueError
+
 def next_codepoint_pos(code, pos):
     """ Gives the position of the next codepoint after pos, -1
     if it's the last one (assumes valid utf8)
@@ -50,6 +81,13 @@
 def default_unicode_error_check(*args):
     xxx
 
+def default_unicode_error_decode(errors, encoding, message, s, pos, endpos, lgt):
+    if errors == 'replace':
+        return '\xef\xbf\xbd', endpos, lgt + 1 # u'\ufffd'
+    if errors == 'ignore':
+        return '', endpos, lgt
+    raise UnicodeDecodeError(encoding, s, pos, endpos, message)
+
 def check_newline_utf8(s, pos):
     chr1 = ord(s[pos])
     if 0xa <= chr1 <= 0xd:
@@ -207,184 +245,3 @@
     return pos, lgt
 str_check_utf8_elidable = jit.elidable(
     func_with_new_name(str_check_utf8_impl, "str_check_utf8_elidable"))
-
-
-def str_decode_raw_utf8_escape(s, size, errors, final=False,
-                               errorhandler=None):
-    lgt = 0
-    if errorhandler is None:
-        errorhandler = None # default_unicode_error_decode
-    if size == 0:
-        return '', 0, 0
-    result = StringBuilder(size)
-    pos = 0
-    while pos < size:
-        ch = s[pos]
-
-        # Non-escape characters are interpreted as Unicode ordinals
-        if ch != '\\':
-            result.append(ch)
-            pos += 1
-            lgt += 1
-            continue
-
-        # \u-escapes are only interpreted iff the number of leading
-        # backslashes is odd
-        bs = pos
-        while pos < size:
-            pos += 1
-            if pos == size or s[pos] != '\\':
-                break
-            lgt += 1
-            result.append('\\')
-
-        # we have a backslash at the end of the string, stop here
-        if pos >= size:
-            lgt += 1
-            result.append('\\')
-            break
-
-        if ((pos - bs) & 1 == 0 or
-            pos >= size or
-            (s[pos] != 'u' and s[pos] != 'U')):
-            result.append('\\')
-            result.append(s[pos])
-            lgt += 2
-            pos += 1
-            continue
-
-        digits = 4 if s[pos] == 'u' else 8
-        message = "truncated \\uXXXX"
-        pos += 1
-        xxx # change hexescape to deal with utf8
-        pos = hexescape(result, s, pos, digits,
-                        "rawunicodeescape", errorhandler, message, errors)
-
-    return result.build(), pos, lgt
-
-def str_decode_utf8_escape(s, size, errors, final=False,
-                              errorhandler=None,
-                              unicodedata_handler=None):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_decode
-
-    if size == 0:
-        return '', 0, 0
-
-    lgt = 0
-    builder = StringBuilder(size)
-    pos = 0
-    while pos < size:
-        ch = s[pos]
-
-        # Non-escape characters are interpreted as Unicode ordinals
-        if ch != '\\':
-            builder.append(ch)
-            pos += 1
-            lgt += 1
-            continue
-
-        # - Escapes
-        pos += 1
-        if pos >= size:
-            message = "\\ at end of string"
-            res, pos = errorhandler(errors, "unicodeescape",
-                                    message, s, pos-1, size)
-            builder.append(res)
-            lgt += 1
-            continue
-
-        ch = s[pos]
-        pos += 1
-        # \x escapes
-        if ch == '\n': pass
-        elif ch == '\\': builder.append('\\'); lgt += 1
-        elif ch == '\'': builder.append('\''); lgt += 1
-        elif ch == '\"': builder.append('\"'); lgt += 1
-        elif ch == 'b' : builder.append('\b'); lgt += 1
-        elif ch == 'f' : builder.append('\f'); lgt += 1
-        elif ch == 't' : builder.append('\t'); lgt += 1
-        elif ch == 'n' : builder.append('\n'); lgt += 1
-        elif ch == 'r' : builder.append('\r'); lgt += 1
-        elif ch == 'v' : builder.append('\v'); lgt += 1
-        elif ch == 'a' : builder.append('\a'); lgt += 1
-        elif '0' <= ch <= '7':
-            xxx
-            x = ord(ch) - ord('0')
-            if pos < size:
-                ch = s[pos]
-                if '0' <= ch <= '7':
-                    pos += 1
-                    x = (x<<3) + ord(ch) - ord('0')
-                    if pos < size:
-                        ch = s[pos]
-                        if '0' <= ch <= '7':
-                            pos += 1
-                            x = (x<<3) + ord(ch) - ord('0')
-            builder.append(unichr(x))
-        # hex escapes
-        # \xXX
-        elif ch == 'x':
-            xxx
-            digits = 2
-            message = "truncated \\xXX escape"
-            pos = hexescape(builder, s, pos, digits,
-                            "unicodeescape", errorhandler, message, errors)
-
-        # \uXXXX
-        elif ch == 'u':
-            xxx
-            digits = 4
-            message = "truncated \\uXXXX escape"
-            pos = hexescape(builder, s, pos, digits,
-                            "unicodeescape", errorhandler, message, errors)
-
-        #  \UXXXXXXXX
-        elif ch == 'U':
-            xxx
-            digits = 8
-            message = "truncated \\UXXXXXXXX escape"
-            pos = hexescape(builder, s, pos, digits,
-                            "unicodeescape", errorhandler, message, errors)
-
-        # \N{name}
-        elif ch == 'N' and unicodedata_handler is not None:
-            xxx
-            message = "malformed \\N character escape"
-            look = pos
-
-            if look < size and s[look] == '{':
-                # look for the closing brace
-                while look < size and s[look] != '}':
-                    look += 1
-                if look < size and s[look] == '}':
-                    # found a name.  look it up in the unicode database
-                    message = "unknown Unicode character name"
-                    name = s[pos+1:look]
-                    code = unicodedata_handler.call(name)
-                    if code < 0:
-                        res, pos = errorhandler(errors, "unicodeescape",
-                                                message, s, pos-1, look+1)
-                        builder.append(res)
-                        continue
-                    pos = look + 1
-                    if code <= MAXUNICODE:
-                        builder.append(UNICHR(code))
-                    else:
-                        code -= 0x10000L
-                        builder.append(unichr(0xD800 + (code >> 10)))
-                        builder.append(unichr(0xDC00 + (code & 0x03FF)))
-                else:
-                    res, pos = errorhandler(errors, "unicodeescape",
-                                            message, s, pos-1, look+1)
-                    builder.append(res)
-            else:
-                res, pos = errorhandler(errors, "unicodeescape",
-                                        message, s, pos-1, look+1)
-                builder.append(res)
-        else:
-            builder.append('\\')
-            builder.append(ch)
-            lgt += 2
-
-    return builder.build(), pos, lgt
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -47,11 +47,6 @@
         assert consumed == len(s)
         assert length == len(u)
 
-@given(strategies.binary())
-def test_str_decode_raw_utf8_escape(uni):
-    return # XXX fix details
-    rutf8.str_decode_raw_utf8_escape(uni, len(uni), None)
-
 @given(strategies.characters())
 def test_next_pos(uni):
     skips = []