[pypy-commit] pypy unicode-utf8: shuffle stuff around to new file rutf8
fijal
pypy.commits at gmail.com
Tue Feb 21 07:10:22 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90255:76c0add1d61e
Date: 2017-02-21 13:10 +0100
http://bitbucket.org/pypy/pypy/changeset/76c0add1d61e/
Log: shuffle stuff around to new file rutf8
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,6 +1,6 @@
from pypy.interpreter.error import OperationError
from rpython.rlib.objectmodel import specialize
-from rpython.rlib import runicode
+from rpython.rlib import runicode, rutf8
from pypy.module._codecs import interp_codecs
@specialize.memo()
@@ -39,7 +39,7 @@
state = space.fromcache(interp_codecs.CodecState)
unicodedata_handler = state.get_unicodedata_handler(space)
# XXX pick better length, maybe
- result, consumed = runicode.str_decode_utf8_escape(
+ result, consumed = rutf8.str_decode_utf8_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space),
unicodedata_handler=unicodedata_handler)
@@ -47,7 +47,7 @@
def decode_raw_unicode_escape(space, string):
# XXX pick better length, maybe
- result, consumed = runicode.str_decode_raw_utf8_escape(
+ result, consumed = rutf8.str_decode_raw_utf8_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space))
return result
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -5,7 +5,7 @@
from pypy.interpreter import gateway
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import unwrap_spec, WrappedDefault
-from rpython.rlib.runicode import UNICHR, unichr_as_utf8
+from rpython.rlib.rutf8 import unichr_as_utf8
from rpython.rlib.rfloat import isnan, isinf, round_double
from rpython.rlib import rfloat
import __builtin__
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -7,8 +7,8 @@
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
from rpython.rlib.runicode import (
make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
- unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
- check_ascii, AsciiCheckError)
+ unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii)
+from rpython.rlib import rutf8
from pypy.interpreter import unicodehelper
from pypy.interpreter.baseobjspace import W_Root
@@ -579,8 +579,8 @@
return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
s = space.bytes_w(w_bytes)
try:
- check_ascii(s)
- except AsciiCheckError:
+ rutf8.check_ascii(s)
+ except rutf8.AsciiCheckError:
# raising UnicodeDecodeError is messy, "please crash for me"
return utf8_from_encoded_object(space, w_bytes, "ascii", "strict")
return W_UnicodeObject(s)
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -42,36 +42,6 @@
if len(u) == 1:
return ord(u[0])
raise TypeError
-
-def unichr_as_utf8(code):
- """ Encode code (numeric value) as utf8 encoded string
- """
- if code < 0:
- raise ValueError
- if code < 0x80:
- # Encode ASCII
- return chr(code)
- if code < 0x0800:
- # Encode Latin-1
- return chr((0xc0 | (code >> 6))) + chr((0x80 | (code & 0x3f)))
- if code < 0x10000:
- return (chr((0xe0 | (code >> 12))) +
- chr((0x80 | ((code >> 6) & 0x3f))) +
- chr((0x80 | (code & 0x3f))))
- if code < 0x10ffff:
- return (chr((0xf0 | (code >> 18))) +
- chr((0x80 | ((code >> 12) & 0x3f))) +
- chr((0x80 | ((code >> 6) & 0x3f))) +
- chr((0x80 | (code & 0x3f))))
- raise ValueError
-
-class AsciiCheckError(Exception):
- pass
-
-def check_ascii(s):
- for i in range(0, len(s)):
- if ord(s[i]) & 0x80:
- raise AsciiCheckError
if MAXUNICODE > sys.maxunicode:
# A version of unichr which allows codes outside the BMP
@@ -1407,129 +1377,6 @@
return builder.build(), pos
-def str_decode_utf8_escape(s, size, errors, final=False,
- errorhandler=None,
- unicodedata_handler=None):
- if errorhandler is None:
- errorhandler = default_unicode_error_decode
-
- if size == 0:
- return '', 0
-
- builder = StringBuilder(size)
- pos = 0
- while pos < size:
- ch = s[pos]
-
- # Non-escape characters are interpreted as Unicode ordinals
- if ch != '\\':
- builder.append(ch)
- pos += 1
- continue
-
- # - Escapes
- pos += 1
- if pos >= size:
- message = "\\ at end of string"
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, size)
- builder.append(res)
- continue
-
- ch = s[pos]
- pos += 1
- # \x escapes
- if ch == '\n': pass
- elif ch == '\\': builder.append('\\')
- elif ch == '\'': builder.append('\'')
- elif ch == '\"': builder.append('\"')
- elif ch == 'b' : builder.append('\b')
- elif ch == 'f' : builder.append('\f')
- elif ch == 't' : builder.append('\t')
- elif ch == 'n' : builder.append('\n')
- elif ch == 'r' : builder.append('\r')
- elif ch == 'v' : builder.append('\v')
- elif ch == 'a' : builder.append('\a')
- elif '0' <= ch <= '7':
- xxx
- x = ord(ch) - ord('0')
- if pos < size:
- ch = s[pos]
- if '0' <= ch <= '7':
- pos += 1
- x = (x<<3) + ord(ch) - ord('0')
- if pos < size:
- ch = s[pos]
- if '0' <= ch <= '7':
- pos += 1
- x = (x<<3) + ord(ch) - ord('0')
- builder.append(unichr(x))
- # hex escapes
- # \xXX
- elif ch == 'x':
- xxx
- digits = 2
- message = "truncated \\xXX escape"
- pos = hexescape(builder, s, pos, digits,
- "unicodeescape", errorhandler, message, errors)
-
- # \uXXXX
- elif ch == 'u':
- xxx
- digits = 4
- message = "truncated \\uXXXX escape"
- pos = hexescape(builder, s, pos, digits,
- "unicodeescape", errorhandler, message, errors)
-
- # \UXXXXXXXX
- elif ch == 'U':
- xxx
- digits = 8
- message = "truncated \\UXXXXXXXX escape"
- pos = hexescape(builder, s, pos, digits,
- "unicodeescape", errorhandler, message, errors)
-
- # \N{name}
- elif ch == 'N' and unicodedata_handler is not None:
- xxx
- message = "malformed \\N character escape"
- look = pos
-
- if look < size and s[look] == '{':
- # look for the closing brace
- while look < size and s[look] != '}':
- look += 1
- if look < size and s[look] == '}':
- # found a name. look it up in the unicode database
- message = "unknown Unicode character name"
- name = s[pos+1:look]
- code = unicodedata_handler.call(name)
- if code < 0:
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, look+1)
- builder.append(res)
- continue
- pos = look + 1
- if code <= MAXUNICODE:
- builder.append(UNICHR(code))
- else:
- code -= 0x10000L
- builder.append(unichr(0xD800 + (code >> 10)))
- builder.append(unichr(0xDC00 + (code & 0x03FF)))
- else:
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, look+1)
- builder.append(res)
- else:
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, look+1)
- builder.append(res)
- else:
- builder.append('\\')
- builder.append(ch)
-
- return builder.build(), pos
-
def make_unicode_escape_function(pass_printable=False, unicode_output=False,
quotes=False, prefix=None):
# Python3 has two similar escape functions: One to implement
@@ -1650,54 +1497,6 @@
# ____________________________________________________________
# Raw unicode escape
-def str_decode_raw_utf8_escape(s, size, errors, final=False,
- errorhandler=None):
- if errorhandler is None:
- errorhandler = default_unicode_error_decode
- if size == 0:
- return '', 0
- result = StringBuilder(size)
- pos = 0
- while pos < size:
- ch = s[pos]
-
- # Non-escape characters are interpreted as Unicode ordinals
- if ch != '\\':
- result.append(ch)
- pos += 1
- continue
-
- # \u-escapes are only interpreted iff the number of leading
- # backslashes is odd
- bs = pos
- while pos < size:
- pos += 1
- if pos == size or s[pos] != '\\':
- break
- result.append('\\')
-
- # we have a backslash at the end of the string, stop here
- if pos >= size:
- result.append('\\')
- break
-
- if ((pos - bs) & 1 == 0 or
- pos >= size or
- (s[pos] != 'u' and s[pos] != 'U')):
- result.append('\\')
- result.append(s[pos])
- pos += 1
- continue
-
- digits = 4 if s[pos] == 'u' else 8
- message = "truncated \\uXXXX"
- pos += 1
- xxx # change hexescape to deal with utf8
- pos = hexescape(result, s, pos, digits,
- "rawunicodeescape", errorhandler, message, errors)
-
- return result.build(), pos
-
def str_decode_raw_unicode_escape(s, size, errors, final=False,
errorhandler=None):
if errorhandler is None:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rutf8.py
@@ -0,0 +1,203 @@
+
+from rpython.rlib.rstring import StringBuilder
+
+def unichr_as_utf8(code):
+    """ Encode code (numeric value) as a utf8 encoded string
+
+    Returns a string of one to four bytes.  Raises ValueError when
+    code is negative or greater than 0x10FFFF.  Values in the
+    surrogate range are not rejected; they take the three-byte path
+    like every other value below 0x10000.
+    """
+    if code < 0:
+        raise ValueError
+    if code < 0x80:
+        # one byte: ASCII
+        return chr(code)
+    if code < 0x0800:
+        # two bytes: 0x80-0x7FF
+        return chr((0xc0 | (code >> 6))) + chr((0x80 | (code & 0x3f)))
+    if code < 0x10000:
+        # three bytes: 0x800-0xFFFF
+        return (chr((0xe0 | (code >> 12))) +
+                chr((0x80 | ((code >> 6) & 0x3f))) +
+                chr((0x80 | (code & 0x3f))))
+    if code <= 0x10ffff:
+        # four bytes: 0x10000-0x10FFFF.  The upper bound is inclusive:
+        # 0x10FFFF is itself a valid code point and must not be rejected.
+        return (chr((0xf0 | (code >> 18))) +
+                chr((0x80 | ((code >> 12) & 0x3f))) +
+                chr((0x80 | ((code >> 6) & 0x3f))) +
+                chr((0x80 | (code & 0x3f))))
+    raise ValueError
+
+class AsciiCheckError(Exception):
+    """ Raised by check_ascii() when a character has bit 0x80 set.
+    """
+
+def check_ascii(s):
+    """ Raise AsciiCheckError if any character of s has bit 0x80 set.
+
+    Returns None when no character has that bit set (including for
+    the empty string).
+    """
+    for ch in s:
+        if ord(ch) & 0x80:
+            raise AsciiCheckError
+
+def str_decode_raw_utf8_escape(s, size, errors, final=False,
+ errorhandler=None):
+ """ Decode raw-unicode-escape data in s[0:size] into a string built
+ with StringBuilder.
+
+ Returns a tuple (decoded string, pos).  For size == 0 the result is
+ ('', 0).  The 'final' argument is accepted but never read here.
+
+ NOTE(review): unfinished port.  The u/U escape branch evaluates the
+ bare name 'xxx' and then calls 'hexescape'; neither name is defined
+ in the visible part of this file, so reaching that branch is
+ expected to raise NameError -- confirm before relying on it.
+ """
+ if errorhandler is None:
+ # NOTE(review): this assignment leaves errorhandler as None; the
+ # trailing comment names the default that is presumably intended
+ # once it is available in this module.
+ errorhandler = None # default_unicode_error_decode
+ if size == 0:
+ return '', 0
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ # Non-escape characters are interpreted as Unicode ordinals
+ # (here: every character other than a backslash is appended
+ # to the result unchanged)
+ if ch != '\\':
+ result.append(ch)
+ pos += 1
+ continue
+
+ # \u-escapes are only interpreted iff the number of leading
+ # backslashes is odd
+ # bs remembers where the run of backslashes starts; every
+ # backslash after the first one is appended while scanning.
+ bs = pos
+ while pos < size:
+ pos += 1
+ if pos == size or s[pos] != '\\':
+ break
+ result.append('\\')
+
+ # we have a backslash at the end of the string, stop here
+ if pos >= size:
+ result.append('\\')
+ break
+
+ # Even-length run, or the next character is neither 'u' nor 'U':
+ # emit the remaining backslash and that character literally.
+ if ((pos - bs) & 1 == 0 or
+ pos >= size or
+ (s[pos] != 'u' and s[pos] != 'U')):
+ result.append('\\')
+ result.append(s[pos])
+ pos += 1
+ continue
+
+ # 'u' takes 4 hex digits, 'U' takes 8.
+ # NOTE(review): the same "\\uXXXX" message is used for both forms.
+ digits = 4 if s[pos] == 'u' else 8
+ message = "truncated \\uXXXX"
+ pos += 1
+ xxx # change hexescape to deal with utf8
+ pos = hexescape(result, s, pos, digits,
+ "rawunicodeescape", errorhandler, message, errors)
+
+ return result.build(), pos
+
+def str_decode_utf8_escape(s, size, errors, final=False,
+ errorhandler=None,
+ unicodedata_handler=None):
+ """ Decode unicode-escape data in s[0:size] into a string built with
+ StringBuilder.
+
+ Returns a tuple (decoded string, pos).  For size == 0 the result is
+ ('', 0).  Characters other than a backslash are appended unchanged.
+ Named-character escapes are only recognised when unicodedata_handler
+ is not None; its call(name) method is used for the lookup and a
+ negative result is reported through errorhandler.  The 'final'
+ argument is accepted but never read here.
+
+ NOTE(review): unfinished port.  The octal, x, u, U and N branches
+ all start with the bare name 'xxx', and the function also uses
+ default_unicode_error_decode, hexescape, MAXUNICODE and UNICHR;
+ none of these is defined in the visible part of this file, so those
+ paths are expected to raise NameError -- confirm before relying on
+ them.
+ """
+ if errorhandler is None:
+ errorhandler = default_unicode_error_decode
+
+ if size == 0:
+ return '', 0
+
+ builder = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ # Non-escape characters are interpreted as Unicode ordinals
+ # (here: appended to the builder unchanged)
+ if ch != '\\':
+ builder.append(ch)
+ pos += 1
+ continue
+
+ # - Escapes
+ pos += 1
+ if pos >= size:
+ # Lone backslash at the end of the input: errorhandler returns
+ # the replacement text and the position to resume from.
+ message = "\\ at end of string"
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, size)
+ builder.append(res)
+ continue
+
+ ch = s[pos]
+ pos += 1
+ # \x escapes
+ # Single-character escapes; a backslash followed by a newline
+ # appends nothing.
+ if ch == '\n': pass
+ elif ch == '\\': builder.append('\\')
+ elif ch == '\'': builder.append('\'')
+ elif ch == '\"': builder.append('\"')
+ elif ch == 'b' : builder.append('\b')
+ elif ch == 'f' : builder.append('\f')
+ elif ch == 't' : builder.append('\t')
+ elif ch == 'n' : builder.append('\n')
+ elif ch == 'r' : builder.append('\r')
+ elif ch == 'v' : builder.append('\v')
+ elif ch == 'a' : builder.append('\a')
+ # Octal escape: the first digit plus up to two further octal digits
+ # are accumulated into x.
+ elif '0' <= ch <= '7':
+ xxx
+ x = ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ builder.append(unichr(x))
+ # hex escapes
+ # \xXX
+ elif ch == 'x':
+ xxx
+ digits = 2
+ message = "truncated \\xXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \uXXXX
+ elif ch == 'u':
+ xxx
+ digits = 4
+ message = "truncated \\uXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \UXXXXXXXX
+ elif ch == 'U':
+ xxx
+ digits = 8
+ message = "truncated \\UXXXXXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \N{name}
+ elif ch == 'N' and unicodedata_handler is not None:
+ xxx
+ message = "malformed \\N character escape"
+ look = pos
+
+ if look < size and s[look] == '{':
+ # look for the closing brace
+ while look < size and s[look] != '}':
+ look += 1
+ if look < size and s[look] == '}':
+ # found a name. look it up in the unicode database
+ message = "unknown Unicode character name"
+ name = s[pos+1:look]
+ code = unicodedata_handler.call(name)
+ if code < 0:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ continue
+ pos = look + 1
+ if code <= MAXUNICODE:
+ builder.append(UNICHR(code))
+ else:
+ # Above MAXUNICODE: split into two values, 0xD800 plus the
+ # high bits and 0xDC00 plus the low 10 bits (0x10000L is a
+ # Python 2 long literal).
+ code -= 0x10000L
+ builder.append(unichr(0xD800 + (code >> 10)))
+ builder.append(unichr(0xDC00 + (code & 0x03FF)))
+ else:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ else:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ # Any other escape character: keep the backslash and the character.
+ else:
+ builder.append('\\')
+ builder.append(ch)
+
+ return builder.build(), pos
More information about the pypy-commit
mailing list