[pypy-commit] pypy unicode-utf8: remove dead code and fix obscure length tracking bug
fijal
pypy.commits at gmail.com
Sun Feb 26 17:42:18 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90371:1c4b1dc9e28e
Date: 2017-02-26 21:35 +0100
http://bitbucket.org/pypy/pypy/changeset/1c4b1dc9e28e/
Log: remove dead code and fix obscure length tracking bug
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1682,6 +1682,9 @@
def unicode_w(self, w_obj):
return self.utf8_w(w_obj).decode('utf8')
+ def realunicode_w(self, w_obj):
+ return self.realutf8_w(w_obj).decode('utf8')
+
def newunicode(self, u):
assert isinstance(u, unicode)
return self.newutf8(u.encode("utf8"), len(u))
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -108,11 +108,11 @@
return _create_list_from_unicode(self._value)
def ord(self, space):
- if len(self._value) != 1:
+ if self._len() != 1:
raise oefmt(space.w_TypeError,
"ord() expected a character, but string of length %d "
"found", len(self._value))
- return space.newint(ord(self._value[0]))
+ return space.newint(rutf8.codepoint_at_pos(self._utf8, 0))
def _new(self, value):
return W_UnicodeObject(value.encode('utf8'), len(value))
@@ -503,7 +503,7 @@
lgt += 1
if keepends:
eol = pos
- lgt += 2
+ lgt += 1
strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
return space.newlist(strs_w)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -66,6 +66,39 @@
return pos + 1
return pos + ord(runicode._utf8_code_length[chr1 - 0x80])
+def codepoint_at_pos(code, pos):
+ """ Give a codepoint in code at pos - assumes valid utf8, no checking!
+ """
+ import pdb
+ pdb.set_trace()
+ ordch1 = ord(code[pos])
+ if ordch1 < 0x80:
+ return ordch1
+
+ n = ord(runicode._utf8_code_length[ordch1 - 0x80])
+ if n == 2:
+ ordch2 = ord(code[pos+1])
+ # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+ return (((ordch1 & 0x1F) << 6) + # 0b00011111
+ (ordch2 & 0x3F)) # 0b00111111
+ elif n == 3:
+ ordch2 = ord(code[pos+1])
+ ordch3 = ord(code[pos+2])
+ # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+ return (((ordch1 & 0x0F) << 12) + # 0b00001111
+ ((ordch2 & 0x3F) << 6) + # 0b00111111
+ (ordch3 & 0x3F)) # 0b00111111
+ elif n == 4:
+ ordch2 = ord(code[pos+1])
+ ordch3 = ord(code[pos+2])
+ ordch4 = ord(code[pos+3])
+ # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+ return (((ordch1 & 0x07) << 18) + # 0b00000111
+ ((ordch2 & 0x3F) << 12) + # 0b00111111
+ ((ordch3 & 0x3F) << 6) + # 0b00111111
+ (ordch4 & 0x3F)) # 0b00111111
+ assert False, "unreachable"
+
class AsciiCheckError(Exception):
def __init__(self, pos):
self.pos = pos
@@ -75,9 +108,6 @@
if ord(s[i]) & 0x80:
raise AsciiCheckError(i)
-def default_unicode_error_check(*args):
- xxx
-
def utf8_encode_ascii(s, errors, encoding, msg, errorhandler):
res = StringBuilder(len(s))
u_pos = 0
@@ -109,14 +139,6 @@
pos += 1
return result.build(), pos, -1
-
-def default_unicode_error_decode(errors, encoding, message, s, pos, endpos, lgt):
- if errors == 'replace':
- return '\xef\xbf\xbd', endpos, lgt + 1 # u'\ufffd'
- if errors == 'ignore':
- return '', endpos, lgt
- raise UnicodeDecodeError(encoding, s, pos, endpos, message)
-
def check_newline_utf8(s, pos):
chr1 = ord(s[pos])
if 0xa <= chr1 <= 0xd:
More information about the pypy-commit mailing list