[pypy-commit] pypy unicode-utf8: remove dead code and fix obscure length tracking bug

Sun Feb 26 17:42:18 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90371:1c4b1dc9e28e
Date: 2017-02-26 21:35 +0100
http://bitbucket.org/pypy/pypy/changeset/1c4b1dc9e28e/

Log:	remove dead code and fix obscure length tracking bug

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1682,6 +1682,9 @@
     def unicode_w(self, w_obj):
         return self.utf8_w(w_obj).decode('utf8')
 
+    def realunicode_w(self, w_obj):
+        return self.realutf8_w(w_obj).decode('utf8')
+
     def newunicode(self, u):
         assert isinstance(u, unicode)
         return self.newutf8(u.encode("utf8"), len(u))
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -108,11 +108,11 @@
         return _create_list_from_unicode(self._value)
 
     def ord(self, space):
-        if len(self._value) != 1:
+        if self._len() != 1:  # same length the error message reports below
             raise oefmt(space.w_TypeError,
                          "ord() expected a character, but string of length %d "
-                         "found", len(self._value))
-        return space.newint(ord(self._value[0]))
+                         "found", self._len())
+        return space.newint(rutf8.codepoint_at_pos(self._utf8, 0))
 
     def _new(self, value):
         return W_UnicodeObject(value.encode('utf8'), len(value))
@@ -503,7 +503,7 @@
                     lgt += 1
             if keepends:
                 eol = pos
-                lgt += 2
+                lgt += 1
             strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
         return space.newlist(strs_w)
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -66,6 +66,39 @@
         return pos + 1
     return pos + ord(runicode._utf8_code_length[chr1 - 0x80])
 
+def codepoint_at_pos(code, pos):
+    """ Give a codepoint in code at pos - assumes valid utf8, no checking!
+    'code' is a utf8 byte string, 'pos' the index of a character's first
+    byte; the 1-4 bytes starting there are decoded to an integer codepoint.
+    """
+    ordch1 = ord(code[pos])
+    if ordch1 < 0x80:
+        return ordch1
+
+    n = ord(runicode._utf8_code_length[ordch1 - 0x80])
+    if n == 2:
+        ordch2 = ord(code[pos+1])
+        # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+        return (((ordch1 & 0x1F) << 6) +    # 0b00011111
+                 (ordch2 & 0x3F))           # 0b00111111
+    elif n == 3:
+        ordch2 = ord(code[pos+1])
+        ordch3 = ord(code[pos+2])
+        # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+        return (((ordch1 & 0x0F) << 12) +     # 0b00001111
+                ((ordch2 & 0x3F) << 6) +      # 0b00111111
+                (ordch3 & 0x3F))              # 0b00111111
+    elif n == 4:
+        ordch2 = ord(code[pos+1])
+        ordch3 = ord(code[pos+2])
+        ordch4 = ord(code[pos+3])
+        # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+        return (((ordch1 & 0x07) << 18) +      # 0b00000111
+                ((ordch2 & 0x3F) << 12) +      # 0b00111111
+                ((ordch3 & 0x3F) << 6) +       # 0b00111111
+                (ordch4 & 0x3F))               # 0b00111111
+    assert False, "unreachable"
+
 class AsciiCheckError(Exception):
     def __init__(self, pos):
         self.pos = pos
@@ -75,9 +108,6 @@
         if ord(s[i]) & 0x80:
             raise AsciiCheckError(i)
 
-def default_unicode_error_check(*args):
-    xxx
-
 def utf8_encode_ascii(s, errors, encoding, msg, errorhandler):
     res = StringBuilder(len(s))
     u_pos = 0
@@ -109,14 +139,6 @@
         pos += 1
     return result.build(), pos, -1
 
-
-def default_unicode_error_decode(errors, encoding, message, s, pos, endpos, lgt):
-    if errors == 'replace':
-        return '\xef\xbf\xbd', endpos, lgt + 1 # u'\ufffd'
-    if errors == 'ignore':
-        return '', endpos, lgt
-    raise UnicodeDecodeError(encoding, s, pos, endpos, message)
-
 def check_newline_utf8(s, pos):
     chr1 = ord(s[pos])
     if 0xa <= chr1 <= 0xd: