[pypy-commit] pypy utf8-unicode2: pobjspace and interpreter tests now pass

Tue Jul 8 09:43:29 CEST 2014

Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72383:104602bd7dd9
Date: 2014-07-08 02:37 -0500
http://bitbucket.org/pypy/pypy/changeset/104602bd7dd9/

Log:	pobjspace and interpreter tests now pass

diff too long, truncating to 2000 out of 2118 lines

diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py
--- a/pypy/interpreter/astcompiler/test/test_astbuilder.py
+++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py
@@ -8,6 +8,7 @@
 from pypy.interpreter.pyparser.error import SyntaxError
 from pypy.interpreter.astcompiler.astbuilder import ast_from_node
 from pypy.interpreter.astcompiler import ast, consts
+from pypy.interpreter.utf8 import Utf8Str
 
 
 class TestAstBuilder:
@@ -1103,7 +1104,7 @@
         assert info.encoding == "utf-7"
         s = ast_from_node(space, tree, info).body[0].value
         assert isinstance(s, ast.Str)
-        assert space.eq_w(s.s, space.wrap(sentence))
+        assert space.eq_w(s.s, space.wrap(Utf8Str.from_unicode(sentence)))
 
     def test_string_bug(self):
         space = self.space
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -919,11 +919,7 @@
         import sys
         d = {}
         exec '# -*- coding: utf-8 -*-\n\nu = u"\xf0\x9f\x92\x8b"' in d
-        if sys.maxunicode > 65535 and self.maxunicode > 65535:
-            expected_length = 1
-        else:
-            expected_length = 2
-        assert len(d['u']) == expected_length
+        assert len(d['u']) == 1
 
 
 class TestOptimizations:
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -153,6 +153,10 @@
                 const = code_hook(space, const, hidden_applevel, code_hook)
             if isinstance(const, unicode):
                 const = Utf8Str.from_unicode(const)
+            if isinstance(const, tuple):
+                const = tuple(x if not isinstance(x, unicode)
+                                else Utf8Str.from_unicode(x)
+                                for x in const)
             newconsts_w[num] = space.wrap(const)
             num += 1
         # stick the underlying CPython magic value, if the code object
diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py
--- a/pypy/interpreter/pyparser/test/test_parsestring.py
+++ b/pypy/interpreter/pyparser/test/test_parsestring.py
@@ -102,7 +102,4 @@
     def test_decode_unicode_utf8(self):
         buf = parsestring.decode_unicode_utf8(self.space,
                                               'u"\xf0\x9f\x92\x8b"', 2, 6)
-        if sys.maxunicode == 65535:
-            assert buf == r"\U0000d83d\U0000dc8b"
-        else:
-            assert buf == r"\U0001f48b"
+        assert buf == r"\U0001f48b"
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -4,6 +4,7 @@
 from pypy.interpreter import gateway, argument
 from pypy.interpreter.gateway import ObjSpace, W_Root, WrappedDefault
 from pypy.interpreter.signature import Signature
+from pypy.interpreter.utf8 import Utf8Str
 import py
 import sys
 
@@ -519,7 +520,7 @@
                                                       unicode])
         w_app_g3_u = space.wrap(app_g3_u)
         assert self.space.eq_w(
-            space.call_function(w_app_g3_u, w(u"foo")),
+            space.call_function(w_app_g3_u, w(Utf8Str("foo"))),
             w(3))
         assert self.space.eq_w(
             space.call_function(w_app_g3_u, w("baz")),
diff --git a/pypy/interpreter/test/test_objspace.py b/pypy/interpreter/test/test_objspace.py
--- a/pypy/interpreter/test/test_objspace.py
+++ b/pypy/interpreter/test/test_objspace.py
@@ -2,6 +2,7 @@
 from pypy.interpreter.error import OperationError
 from pypy.interpreter.function import Function
 from pypy.interpreter.pycode import PyCode
+from pypy.interpreter.utf8 import Utf8Str
 from rpython.rlib.rarithmetic import r_longlong, r_ulonglong
 import sys
 
@@ -217,8 +218,9 @@
         w = space.wrap
         assert space.str0_w(w("123")) == "123"
         exc = space.raises_w(space.w_TypeError, space.str0_w, w("123\x004"))
-        assert space.unicode0_w(w(u"123")) == u"123"
-        exc = space.raises_w(space.w_TypeError, space.unicode0_w, w(u"123\x004"))
+        assert space.unicode0_w(w(Utf8Str("123"))) == u"123"
+        exc = space.raises_w(space.w_TypeError, space.unicode0_w,
+                             w(Utf8Str.from_unicode(u"123\x004")))
 
     def test_getindex_w(self):
         w_instance1 = self.space.appexec([], """():
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -35,13 +35,15 @@
         iter.move(i)
         if i != 4:
             assert iter.peek_next() == [0x41, 0x10F, 0x20AC, 0x1F63D][i]
-        assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D][i:]
+        l = list(iter)
+        assert l == [0x41, 0x10F, 0x20AC, 0x1F63D][i:]
 
     for i in range(1, 5):
         iter = s.codepoint_iter()
         list(iter) # move the iterator to the end
         iter.move(-i)
-        assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D][4-i:]
+        l = list(iter)
+        assert l == [0x41, 0x10F, 0x20AC, 0x1F63D][4-i:]
 
     iter = s.char_iter()
     l = [s.bytes.decode('utf8') for s in list(iter)]
@@ -50,6 +52,27 @@
     else:
         assert l == [u'A', u'\u010F', u'\u20AC', u'\U00001F63D']
 
+def test_reverse_iterator():
+    s = build_utf8str()
+    iter = s.reverse_codepoint_iter()
+    assert iter.peek_next() == 0x1F63D
+    assert list(iter) == [0x1F63D, 0x20AC, 0x10F, 0x41]
+
+    for i in range(1, 5):
+        iter = s.reverse_codepoint_iter()
+        iter.move(i)
+        if i != 4:
+            assert iter.peek_next() == [0x1F63D, 0x20AC, 0x10F, 0x41][i]
+        l = list(iter)
+        assert l == [0x1F63D, 0x20AC, 0x10F, 0x41][i:]
+
+    for i in range(1, 5):
+        iter = s.reverse_codepoint_iter()
+        list(iter) # move the iterator to the end
+        iter.move(-i)
+        l = list(iter)
+        assert l == [0x1F63D, 0x20AC, 0x10F, 0x41][4-i:]
+
 def test_builder_append_slice():
     builder = Utf8Builder()
     builder.append_slice(Utf8Str.from_unicode(u"0ê0"), 1, 2)
@@ -57,6 +80,10 @@
 
     assert builder.build() == u"êes"
 
+def test_eq():
+    assert Utf8Str('test') == Utf8Str('test')
+    assert Utf8Str('test') != Utf8Str('test1')
+
 def test_unicode_literal_comparison():
     builder = Utf8Builder()
     builder.append(0x10F)
@@ -152,5 +179,17 @@
 
     assert s.split() == u.split()
     assert s.split(' ') == u.split(' ')
-    assert s.split(maxsplit=1) == u.split(None, 1)
+    assert s.split(maxsplit=2) == u.split(None, 2)
+    assert s.split(' ', 2) == u.split(' ', 2)
     assert s.split('\n') == [s]
+
+def test_rsplit():
+    # U+00A0 is a non-breaking space
+    u = u"one two three\xA0four"
+    s = Utf8Str.from_unicode(u)
+
+    assert s.rsplit() == u.rsplit()
+    assert s.rsplit(' ') == u.rsplit(' ')
+    assert s.rsplit(maxsplit=2) == u.rsplit(None, 2)
+    assert s.rsplit(' ', 2) == u.rsplit(' ', 2)
+    assert s.rsplit('\n') == [s]
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -104,6 +104,9 @@
             return Utf8Str('')
         # TODO: If start > _len or stop >= _len, then raise exception 
 
+        if stop > len(self):
+            stop = len(self)
+
         if self._is_ascii:
             return Utf8Str(self.bytes[start:stop], True)
 
@@ -124,6 +127,12 @@
         return Utf8Str(self.bytes[start_byte:stop_byte], is_ascii,
                        stop - start)
 
+    def byte_slice(self, start, end):
+        return Utf8Str(self.bytes[start:end], self._is_ascii)
+
+    def __repr__(self):
+        return "<Utf8Str: %r>" % unicode(self)
+
     def __add__(self, other):
         return Utf8Str(self.bytes + other.bytes,
                        self._is_ascii and other._is_ascii)
@@ -134,6 +143,9 @@
     def __len__(self):
         return self._len
 
+    def __hash__(self):
+        return hash(self.bytes)
+
     def __eq__(self, other):
         """NOT_RPYTHON"""
         if isinstance(other, Utf8Str):
@@ -143,6 +155,27 @@
 
         return False
 
+    def __ne__(self, other):
+        """NOT_RPYTHON"""
+        if isinstance(other, Utf8Str):
+            return self.bytes != other.bytes
+        if isinstance(other, unicode):
+            return unicode(self.bytes, 'utf8') != other
+
+        return True
+
+    def __lt__(self, other):
+        return self.bytes < other.bytes
+
+    def __le__(self, other):
+        return self.bytes <= other.bytes
+
+    def __gt__(self, other):
+        return self.bytes > other.bytes
+
+    def __ge__(self, other):
+        return self.bytes >= other.bytes
+
     @specialize.argtype(1)
     def __contains__(self, other):
         if isinstance(other, Utf8Str):
@@ -158,11 +191,20 @@
     def __iter__(self):
         return self.char_iter()
 
+    def __unicode__(self):
+        return unicode(self.bytes, 'utf8')
+
     def char_iter(self):
-        return Utf8StrCharIterator(self)
+        return Utf8CharacterIter(self)
+
+    def reverse_char_iter(self):
+        return Utf8ReverseCharacterIter(self)
 
     def codepoint_iter(self):
-        return Utf8StrCodePointIterator(self)
+        return Utf8CodePointIter(self)
+
+    def reverse_codepoint_iter(self):
+        return Utf8ReverseCodePointIter(self)
 
     @specialize.argtype(1, 2)
     def _bound_check(self, start, end):
@@ -270,12 +312,11 @@
             else:
                 break
 
-            iter.prev_count(1)
             start_byte = iter.byte_pos
-            iter.next_count(1)
 
             if maxsplit == 0:
-                res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)]))
+                res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)],
+                           self._is_ascii))
                 break
 
             for cd in iter:
@@ -283,12 +324,12 @@
                     break
             else:
                 # Hit the end of the string
-                res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)]))
+                res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)],
+                           self._is_ascii))
                 break
 
-            iter.prev_count(1)
-            res.append(Utf8Str(self.bytes[start_byte:iter.byte_pos]))
-            iter.next_count(1)
+            res.append(Utf8Str(self.bytes[start_byte:iter.byte_pos],
+                               self._is_ascii))
             maxsplit -= 1
 
         return res
@@ -302,15 +343,54 @@
                 other_bytes = other.bytes
             return [Utf8Str(s) for s in self.bytes.rsplit(other_bytes, maxsplit)]
 
-        # TODO: I need to make a reverse_codepoint_iter first
+        res = []
+        iter = self.reverse_codepoint_iter()
+        while True:
+            # Find the start of the next word
+            for cd in iter:
+                if not unicodedb.isspace(cd):
+                    break
+            else:
+                break
 
+            start_byte = self.next_char(iter.byte_pos)
+
+            if maxsplit == 0:
+                res.append(Utf8Str(self.bytes[0:start_byte], self._is_ascii))
+                break
+
+            # Find the end of the word
+            for cd in iter:
+                if unicodedb.isspace(cd):
+                    break
+            else:
+                # We hit the end of the string
+                res.append(Utf8Str(self.bytes[0:start_byte], self._is_ascii))
+                break
+
+            end_byte = self.next_char(iter.byte_pos)
+            res.append(Utf8Str(self.bytes[end_byte:start_byte],
+                               self._is_ascii))
+            maxsplit -= 1
+
+        res.reverse()
+        return res
+
+    @specialize.argtype(1)
     def join(self, other):
         if len(other) == 0:
             return Utf8Str('')
 
-        assert isinstance(other[0], Utf8Str)
-        return Utf8Str(self.bytes.join([s.bytes for s in other]),
-                       self._is_ascii and all(s._is_ascii for s in other))
+        if isinstance(other[0], Utf8Str):
+            return Utf8Str(
+                self.bytes.join([s.bytes for s in other]),
+                self._is_ascii and all(s._is_ascii for s in other)
+            )
+        else:
+            return Utf8Str(
+                self.bytes.join([s for s in other]),
+                self._is_ascii and all(s._is_ascii for s in other)
+            )
 
     def as_unicode(self):
         """NOT_RPYTHON"""
@@ -321,83 +401,18 @@
         """NOT_RPYTHON"""
         return Utf8Str(u.encode('utf-8'))
 
-class Utf8StrCodePointIterator(object):
-    def __init__(self, ustr):
-        self.ustr = ustr
-        self.pos = 0
-        self.byte_pos = 0
+    def next_char(self, byte_pos):
+        return byte_pos + utf8_code_length[ord(self.bytes[byte_pos])]
 
-        if len(ustr) != 0:
-            self.current = utf8ord_bytes(ustr.bytes, 0)
-        else:
-            self.current = -1
+    def prev_char(self, byte_pos):
+        if byte_pos == 0:
+            return -1
+        byte_pos -= 1
+        while utf8_code_length[ord(self.bytes[byte_pos])] == 0:
+            byte_pos -= 1
+        return byte_pos
 
-    def __iter__(self):
-        return self
 
-    def next(self):
-        if self.pos == len(self.ustr):
-            raise StopIteration()
-        self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
-
-        self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
-        self.pos += 1
-
-        return self.current
-
-    def next_count(self, count=1):
-        self.pos += count
-        while count > 1:
-            self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
-            count -= 1
-        self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
-        self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
-
-    def prev_count(self, count=1):
-        self.pos -= count
-        while count > 0:
-            self.byte_pos -= 1
-            while utf8_code_length[ord(self.ustr.bytes[self.byte_pos])] == 0:
-                self.byte_pos -= 1
-            count -= 1
-
-        self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
-
-    def move(self, count):
-        if count > 0:
-            self.next_count(count)
-        elif count < 0:
-            self.prev_count(-count)
-
-    def peek_next(self):
-        return utf8ord_bytes(self.ustr.bytes, self.byte_pos)
-
-class Utf8StrCharIterator(object):
-    def __init__(self, ustr):
-        self.ustr = ustr
-        self.byte_pos = 0
-        self.current = self._get_current()
-
-    def __iter__(self):
-        return self
-
-    def _get_current(self):
-        if self.byte_pos == len(self.ustr.bytes):
-            return None
-        length = utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
-        return Utf8Str(''.join([self.ustr.bytes[i]
-                        for i in range(self.byte_pos, self.byte_pos + length)]),
-                       length == 1)
-
-    def next(self):
-        #import pdb; pdb.set_trace()
-        ret = self.current
-        if ret is None:
-            raise StopIteration()
-
-        self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
-        self.current = self._get_current()
-        return ret
 
 class Utf8Builder(object):
     @specialize.argtype(1)
@@ -452,9 +467,168 @@
             raise TypeError("Invalid type '%s' for Utf8Str.append_slice" %
                             type(s))
 
+    @specialize.argtype(1)
     def append_multiple_char(self, c, count):
-        self._builder.append_multiple_char(c, count)
+        # TODO: What do I do when I have an int? Is it fine to just loop over
+        #       .append(c) then? Should (can) I force a resize first?
+        if isinstance(c, int):
+            self._builder.append_multiple_char(chr(c), count)
+            return
+
+        if len(c) > 1:
+            import pdb; pdb.set_trace()
+        if isinstance(c, str):
+            self._builder.append_multiple_char(c, count)
+        else:
+            self._builder.append_multiple_char(c.bytes, count)
 
     def build(self):
         return Utf8Str(self._builder.build(), self._is_ascii)
 
+# _______________________________________________
+
+# iter.current is the current (i.e. the last returned) element
+# iter.pos is the position of the current element
+# iter.byte_pos is the byte position of the current element
+# In the before-the-start state, for forward iterators iter.pos and
+# iter.byte_pos are -1. For reverse iterators, they are len(ustr) and
+# len(ustr.bytes) respectively.
+
+class ForwardIterBase(object):
+    def __init__(self, ustr):
+        self.ustr = ustr
+        self.pos = -1
+
+        self._byte_pos = 0
+        self.byte_pos = -1
+        self.current = self._default
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        if self.pos + 1 == len(self.ustr):
+            raise StopIteration()
+
+        self.pos += 1
+        self.byte_pos = self._byte_pos
+
+        self.current = self._value(self.byte_pos)
+
+        self._byte_pos = self.ustr.next_char(self._byte_pos)
+        return self.current
+
+    def peek_next(self):
+        return self._value(self._byte_pos)
+
+    def peek_prev(self):
+        return self._value(self._move_backward(self.byte_pos))
+
+    def move(self, count):
+        if count > 0:
+            self.pos += count
+
+            while count != 1:
+                self._byte_pos = self.ustr.next_char(self._byte_pos)
+                count -= 1
+            self.byte_pos = self._byte_pos
+            self._byte_pos = self.ustr.next_char(self._byte_pos)
+            self.current = self._value(self.byte_pos)
+
+        elif count < 0:
+            self.pos += count
+            while count < -1:
+                self.byte_pos = self.ustr.prev_char(self.byte_pos)
+                count += 1
+            self._byte_pos = self.byte_pos
+            self.byte_pos = self.ustr.prev_char(self.byte_pos)
+            self.current = self._value(self.byte_pos)
+
+    def copy(self):
+        iter = self.__class__(self.ustr)
+        iter.pos = self.pos
+        iter.byte_pos = self.byte_pos
+        iter._byte_pos = self._byte_pos
+        iter.current = self.current
+        return iter
+
+class ReverseIterBase(object):
+    def __init__(self, ustr):
+        self.ustr = ustr
+        self.pos = len(ustr)
+        self.byte_pos = len(ustr.bytes)
+        self.current = self._default
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        if self.pos == 0:
+            raise StopIteration()
+
+        self.pos -= 1
+        self.byte_pos = self.ustr.prev_char(self.byte_pos)
+        self.current = self._value(self.byte_pos)
+        return self.current
+
+    def peek_next(self):
+        return self._value(self.ustr.prev_char(self.byte_pos))
+
+    def peek_prev(self):
+        return self._value(self.ustr.next_char(self.byte_pos))
+
+    def move(self, count):
+        if count > 0:
+            self.pos -= count
+            while count != 0:
+                self.byte_pos = self.ustr.prev_char(self.byte_pos)
+                count -= 1
+            self.current = self._value(self.byte_pos)
+        elif count < 0:
+            self.pos -= count
+            while count != 0:
+                self.byte_pos = self.ustr.next_char(self.byte_pos)
+                count += 1
+            self.current = self._value(self.byte_pos)
+
+    def copy(self):
+        iter = self.__class__(self.ustr)
+        iter.pos = self.pos
+        iter.byte_pos = self.byte_pos
+        iter.current = self.current
+        return iter
+
+def make_iterator(name, base, calc_value, default):
+    class C(base):
+        _default = default
+        _value = calc_value
+    C.__name__ = name
+    return C
+
+def codepoint_calc_value(self, byte_pos):
+    if byte_pos == -1 or byte_pos == len(self.ustr.bytes):
+        return -1
+    return utf8ord_bytes(self.ustr.bytes, byte_pos)
+
+def character_calc_value(self, byte_pos):
+    if byte_pos == -1 or byte_pos == len(self.ustr.bytes):
+        return None
+    length = utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+    return Utf8Str(''.join([self.ustr.bytes[i]
+                    for i in range(self.byte_pos, self.byte_pos + length)]),
+                    length == 1)
+
+Utf8CodePointIter = make_iterator("Utf8CodePointIter", ForwardIterBase,
+                                  codepoint_calc_value, -1)
+Utf8CharacterIter = make_iterator("Utf8CharacterIter", ForwardIterBase,
+                                  character_calc_value, None)
+Utf8ReverseCodePointIter = make_iterator(
+    "Utf8ReverseCodePointIter", ReverseIterBase, codepoint_calc_value, -1)
+Utf8ReverseCharacterIter = make_iterator(
+    "Utf8ReverseCharacterIter", ReverseIterBase, character_calc_value, None)
+
+del make_iterator
+del codepoint_calc_value
+del character_calc_value
+del ForwardIterBase
+del ReverseIterBase
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -208,7 +208,6 @@
 
         pos = 0
         while pos < size:
-            #oc = ORD(s, pos)
             oc = utf8ord(s, pos)
 
             # Escape quotes
@@ -460,10 +459,10 @@
     else:
         return s.bytes
 
-    iter.move(-1)
     result = Utf8Builder(len(s.bytes))
     result.append_slice(s.bytes, 0, iter.byte_pos)
 
+    iter.move(-1)
     for oc in iter:
         if oc >= 0xD800 and oc <= 0xDFFF:
             # Check the next character to see if this is a surrogate pair
@@ -741,7 +740,6 @@
 
     result = Utf8Builder(size // 2)
 
-    #XXX I think the errors are not correctly handled here
     while pos < size:
         # remaining bytes at the end? (size should be even)
         if len(s) - pos < 2:
@@ -869,7 +867,8 @@
 
 def str_decode_utf_32_helper(s, size, errors, final=True,
                              errorhandler=None,
-                             byteorder="native"):
+                             byteorder="native",
+                             encodingname='utf32'):
     if errorhandler is None:
         errorhandler = default_unicode_error_decode
     bo = 0
@@ -924,7 +923,7 @@
         if len(s) - pos < 4:
             if not final:
                 break
-            r, pos = errorhandler(errors, 'utf32', "truncated data",
+            r, pos = errorhandler(errors, encodingname, "truncated data",
                                   s, pos, len(s))
             result.append(r)
             if len(s) - pos < 4:
@@ -933,7 +932,8 @@
         ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
               (ord(s[pos + iorder[1]]) << 8)  | ord(s[pos + iorder[0]]))
         if ch >= 0x110000:
-            r, pos = errorhandler(errors, 'utf32', "codepoint not in range(0x110000)",
+            r, pos = errorhandler(errors, encodingname,
+                                  "codepoint not in range(0x110000)",
                                   s, pos, len(s))
             result.append(r)
             continue
@@ -1097,7 +1097,7 @@
     if errorhandler is None:
         errorhandler = default_unicode_error_decode
     if size == 0:
-        return u'', 0
+        return Utf8Str(''), 0
 
     inShift = False
     base64bits = 0
@@ -1345,9 +1345,12 @@
 def str_decode_unicode_internal(s, size, errors, final=False,
                                 errorhandler=None):
     if BYTEORDER == 'little':
-        return str_decode_utf_32_le(s, size, errors, errorhandler)
+        result, length, byteorder = str_decode_utf_32_helper(
+            s, size, errors, final, errorhandler, "little", "unicode_internal")
     else:
-        return str_decode_utf_32_be(s, size, errors, errorhandler)
+        result, length, byteorder = str_decode_utf_32_helper(
+            s, size, errors, final, errorhandler, "internal", "unicode_internal")
+    return result, length
 
 def unicode_encode_unicode_internal(s, size, errors, errorhandler=None):
     if BYTEORDER == 'little':
@@ -1561,6 +1564,7 @@
 
 def default_unicode_error_decode(errors, encoding, msg, s,
                                  startingpos, endingpos):
+    """NOT_RPYTHON"""
     if errors == 'replace':
         return _unicode_error_replacement, endingpos
     if errors == 'ignore':
@@ -1570,9 +1574,10 @@
 
 def default_unicode_error_encode(errors, encoding, msg, u,
                                  startingpos, endingpos):
+    """NOT_RPYTHON"""
     if errors == 'replace':
         return '?', None, endingpos
     if errors == 'ignore':
         return '', None, endingpos
-    raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
+    raise UnicodeEncodeError(encoding, unicode(u), startingpos, endingpos, msg)
 
diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py
--- a/pypy/objspace/std/bytearrayobject.py
+++ b/pypy/objspace/std/bytearrayobject.py
@@ -9,6 +9,7 @@
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import WrappedDefault, interp2app, unwrap_spec
 from pypy.interpreter.signature import Signature
+from pypy.interpreter.utf8_codecs import str_decode_latin_1
 from pypy.objspace.std.sliceobject import W_SliceObject
 from pypy.objspace.std.stdtypedef import StdTypeDef
 from pypy.objspace.std.stringmethods import StringMethods, _get_buffer
@@ -154,9 +155,11 @@
         w_dict = self.getdict(space)
         if w_dict is None:
             w_dict = space.w_None
+        ustr = str_decode_latin_1(''.join(self.data), len(self.data),
+                                  'strict')[0]
         return space.newtuple([
             space.type(self), space.newtuple([
-                space.wrap(''.join(self.data).decode('latin-1')),
+                space.wrap(ustr),
                 space.wrap('latin-1')]),
             w_dict])
 
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -4,7 +4,9 @@
 import string
 
 from pypy.interpreter.error import OperationError, oefmt
-from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, ORD
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, ORD, utf8chr
+from pypy.interpreter.utf8_codecs import (
+    unicode_encode_latin_1, unicode_encode_ascii, str_decode_ascii)
 from rpython.rlib import rstring, runicode, rlocale, rfloat, jit
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rfloat import copysign, formatd
@@ -20,7 +22,7 @@
     result = 0
     i = start
     while i < end:
-        digit = ord(s[i]) - ord('0')
+        digit = ORD(s, i) - ord('0')
         if 0 <= digit <= 9:
             if result > (sys.maxint - digit) / 10:
                 raise oefmt(space.w_ValueError,
@@ -63,22 +65,24 @@
                 out = Utf8Builder()
             else:
                 out = rstring.StringBuilder()
+
             if not level:
                 raise OperationError(space.w_ValueError,
                                      space.wrap("Recursion depth exceeded"))
             level -= 1
-            s = self.template
-            return self._do_build_string(start, end, level, out, s)
+            return self._do_build_string(start, end, level, out, self.template)
 
         @jit.look_inside_iff(lambda self, start, end, level, out, s: jit.isconstant(s))
         def _do_build_string(self, start, end, level, out, s):
             space = self.space
             last_literal = i = start
+
             while i < end:
                 c = ORD(s, i)
                 i += 1
                 if c == ord("{") or c == ord("}"):
                     at_end = i == end
+
                     # Find escaped "{" and "}"
                     markup_follows = True
                     if c == ord("}"):
@@ -87,6 +91,7 @@
                                                  space.wrap("Single '}'"))
                         i += 1
                         markup_follows = False
+
                     if c == ord("{"):
                         if at_end:
                             raise OperationError(space.w_ValueError,
@@ -94,6 +99,7 @@
                         if ORD(s, i) == ord("{"):
                             i += 1
                             markup_follows = False
+
                     # Attach literal data, ending with { or }
                     out.append_slice(s, last_literal, i - 1)
                     if not markup_follows:
@@ -101,6 +107,7 @@
                             end_literal = i - 1
                             assert end_literal > last_literal
                             literal = self.template[last_literal:end_literal]
+
                             w_entry = space.newtuple([
                                 space.wrap(literal),
                                 space.w_None, space.w_None, space.w_None])
@@ -108,6 +115,7 @@
                             self.last_end = i
                         last_literal = i
                         continue
+
                     nested = 1
                     field_start = i
                     recursive = False
@@ -121,6 +129,7 @@
                             if not nested:
                                 break
                         i += 1
+
                     if nested:
                         raise OperationError(space.w_ValueError,
                                              space.wrap("Unmatched '{'"))
@@ -139,41 +148,43 @@
             # Find ":" or "!"
             i = start
             while i < end:
-                c = s[i]
+                c = ORD(s, i)
                 if c == ord(":") or c == ord("!"):
                     end_name = i
+
                     if c == ord("!"):
                         i += 1
                         if i == end:
                             w_msg = self.space.wrap("expected conversion")
                             raise OperationError(self.space.w_ValueError, w_msg)
-                        conversion = s[i]
+
+                        conversion = ORD(s, i)
                         i += 1
                         if i < end:
-                            if s[i] != ':':
+                            if ORD(s, i) != ord(':'):
                                 w_msg = self.space.wrap("expected ':' after"
                                                         " format specifier")
                                 raise OperationError(self.space.w_ValueError,
                                                      w_msg)
                             i += 1
                     else:
-                        conversion = None
+                        conversion = -1
                         i += 1
                     return s[start:end_name], conversion, i
                 i += 1
-            return s[start:end], None, end
+            return s[start:end], -1, end
 
         @jit.unroll_safe
         def _get_argument(self, name):
             # First, find the argument.
             space = self.space
             i = 0
-            end = len(name)
-            while i < end:
-                c = name[i]
+            while i < len(name):
+                c = ORD(name, i)
                 if c == ord("[") or c == ord("."):
                     break
                 i += 1
+
             empty = not i
             if empty:
                 index = -1
@@ -181,12 +192,14 @@
                 index, stop = _parse_int(self.space, name, 0, i)
                 if stop != i:
                     index = -1
+
             use_numeric = empty or index != -1
             if self.auto_numbering_state == ANS_INIT and use_numeric:
                 if empty:
                     self.auto_numbering_state = ANS_AUTO
                 else:
                     self.auto_numbering_state = ANS_MANUAL
+
             if use_numeric:
                 if self.auto_numbering_state == ANS_MANUAL:
                     if empty:
@@ -204,7 +217,8 @@
                 kwarg = name[:i]
                 if self.is_unicode:
                     try:
-                        arg_key = kwarg.encode("latin-1")
+                        arg_key = unicode_encode_latin_1(kwarg, len(kwarg),
+                                                         'strict')
                     except UnicodeEncodeError:
                         # Not going to be found in a dict of strings.
                         raise OperationError(space.w_KeyError, space.wrap(kwarg))
@@ -220,7 +234,7 @@
                 except IndexError:
                     w_msg = space.wrap("index out of range")
                     raise OperationError(space.w_IndexError, w_msg)
-            return self._resolve_lookups(w_arg, name, i, end)
+            return self._resolve_lookups(w_arg, name, i, len(name))
 
         @jit.unroll_safe
         def _resolve_lookups(self, w_obj, name, start, end):
@@ -228,15 +242,16 @@
             space = self.space
             i = start
             while i < end:
-                c = name[i]
+                c = ORD(name, i)
                 if c == ord("."):
                     i += 1
                     start = i
                     while i < end:
-                        c = name[i]
+                        c = ORD(name, i)
                         if c == ord("[") or c == ord("."):
                             break
                         i += 1
+
                     if start == i:
                         w_msg = space.wrap("Empty attribute in format string")
                         raise OperationError(space.w_ValueError, w_msg)
@@ -247,18 +262,17 @@
                         self.parser_list_w.append(space.newtuple([
                             space.w_True, w_attr]))
                 elif c == ord("["):
-                    got_bracket = False
                     i += 1
                     start = i
                     while i < end:
-                        c = name[i]
+                        c = ORD(name, i)
                         if c == ord("]"):
-                            got_bracket = True
                             break
                         i += 1
-                    if not got_bracket:
+                    else:
                         raise OperationError(space.w_ValueError,
                                              space.wrap("Missing ']'"))
+
                     index, reached = _parse_int(self.space, name, start, i)
                     if index != -1 and reached == i:
                         w_item = space.wrap(index)
@@ -285,29 +299,30 @@
                 if c == ord("[") or c == ord("."):
                     break
                 i += 1
+
             if i == 0:
                 index = -1
             else:
                 index, stop = _parse_int(self.space, name, 0, i)
                 if stop != i:
                     index = -1
+
             if index >= 0:
                 w_first = space.wrap(index)
             else:
                 w_first = space.wrap(name[:i])
-            #
+
             self.parser_list_w = []
             self._resolve_lookups(None, name, i, end)
-            #
+
             return space.newtuple([w_first,
                                    space.iter(space.newlist(self.parser_list_w))])
 
         def _convert(self, w_obj, conversion):
             space = self.space
-            conv = ORD(conversion, 0)
-            if conv == ord("r"):
+            if conversion == ord("r"):
                 return space.repr(w_obj)
-            elif conv == ord("s"):
+            elif conversion == ord("s"):
                 if self.is_unicode:
                     return space.call_function(space.w_unicode, w_obj)
                 return space.str(w_obj)
@@ -318,7 +333,7 @@
         def _render_field(self, start, end, recursive, level):
             name, conversion, spec_start = self._parse_field(start, end)
             spec = self.template[spec_start:end]
-            #
+
             if self.parser_list_w is not None:
                 # used from formatter_parser()
                 if level == 1:    # ignore recursive calls
@@ -333,12 +348,13 @@
                     self.parser_list_w.append(w_entry)
                     self.last_end = end + 1
                 return self.empty
-            #
+
             w_obj = self._get_argument(name)
-            if conversion is not None:
+            if conversion != -1:
                 w_obj = self._convert(w_obj, conversion)
             if recursive:
                 spec = self._build_string(spec_start, end, level)
+
             w_rendered = self.space.format(w_obj, self.space.wrap(spec))
             unwrapper = "unicode_w" if self.is_unicode else "str_w"
             to_interp = getattr(self.space, unwrapper)
@@ -348,7 +364,7 @@
             self.parser_list_w = []
             self.last_end = 0
             self._build_string(0, len(self.template), 2)
-            #
+
             space = self.space
             if self.last_end < len(self.template):
                 w_lastentry = space.newtuple([
@@ -413,7 +429,7 @@
         def __init__(self, space, is_unicode, spec):
             self.space = space
             self.is_unicode = is_unicode
-            self.empty = u"" if is_unicode else ""
+            self.empty = Utf8Str("") if is_unicode else ""
             self.spec = spec
 
         def _is_alignment(self, c):
@@ -429,78 +445,76 @@
 
         def _parse_spec(self, default_type, default_align):
             space = self.space
-            self._fill_char = self._lit("\0")[0]
-            self._align = default_align
+            self._fill_char = ord("\0")
+
+            self._align = ord(default_align)
             self._alternate = False
-            self._sign = "\0"
+            self._sign = ord("\0")
             self._thousands_sep = False
             self._precision = -1
-            the_type = default_type
+
             spec = self.spec
             if not spec:
                 return True
+
             length = len(spec)
             i = 0
             got_align = True
-            if length - i >= 2 and self._is_alignment(spec[i + 1]):
-                self._align = spec[i + 1]
-                self._fill_char = spec[i]
+
+            if length - i >= 2 and self._is_alignment(ORD(spec, i + 1)):
+                self._align = ORD(spec, i + 1)
+                self._fill_char = ORD(spec, i)
                 i += 2
-            elif length - i >= 1 and self._is_alignment(spec[i]):
-                self._align = spec[i]
+            elif length - i >= 1 and self._is_alignment(ORD(spec, i)):
+                self._align = ORD(spec, i)
                 i += 1
             else:
                 got_align = False
-            if length - i >= 1 and self._is_sign(spec[i]):
-                self._sign = spec[i]
+
+            if length - i >= 1 and self._is_sign(ORD(spec, i)):
+                self._sign = ORD(spec, i)
                 i += 1
-            if length - i >= 1 and spec[i] == "#":
+            if length - i >= 1 and ORD(spec, i) == ord("#"):
                 self._alternate = True
                 i += 1
-            if self._fill_char == "\0" and length - i >= 1 and spec[i] == "0":
-                self._fill_char = self._lit("0")[0]
+
+            if (self._fill_char == ord("\0") and length - i >= 1 and
+                ORD(spec, i) == ord("0")):
+                self._fill_char = ord("0")
                 if not got_align:
-                    self._align = "="
+                    self._align = ord("=")
                 i += 1
+
             self._width, i = _parse_int(self.space, spec, i, length)
-            if length != i and spec[i] == ",":
+            if length != i and ORD(spec, i) == ord(","):
                 self._thousands_sep = True
                 i += 1
-            if length != i and spec[i] == ".":
+            if length != i and ORD(spec, i) == ord("."):
                 i += 1
                 self._precision, i = _parse_int(self.space, spec, i, length)
                 if self._precision == -1:
                     raise OperationError(space.w_ValueError,
                                          space.wrap("no precision given"))
+
             if length - i > 1:
                 raise OperationError(space.w_ValueError,
                                      space.wrap("invalid format spec"))
             if length - i == 1:
-                presentation_type = spec[i]
                 if self.is_unicode:
                     try:
-                        the_type = spec[i].encode("ascii")[0]
+                        self._type = unicode_encode_ascii(spec[i], 1, 'strict')[0]
                     except UnicodeEncodeError:
                         raise OperationError(space.w_ValueError,
                                              space.wrap("invalid presentation type"))
                 else:
-                    the_type = presentation_type
+                    self._type = spec[i]
                 i += 1
-            self._type = the_type
+            else:
+                self._type = default_type
+
             if self._thousands_sep:
-                tp = self._type
-                if (tp == "d" or
-                    tp == "e" or
-                    tp == "f" or
-                    tp == "g" or
-                    tp == "E" or
-                    tp == "G" or
-                    tp == "%" or
-                    tp == "F" or
-                    tp == "\0"):
-                    # ok
-                    pass
-                else:
+                if self._type not in ('d', 'e', 'f', 'g', 'E', 'G', '%', 'F',
+                                      '\0'):
                     raise OperationError(space.w_ValueError,
                                          space.wrap("invalid type with ','"))
             return False
@@ -511,12 +525,13 @@
                 total = self._width
             else:
                 total = length
+
             align = self._align
-            if align == ">":
+            if align == ord(">"):
                 left = total - length
-            elif align == "^":
+            elif align == ord("^"):
                 left = (total - length) / 2
-            elif align == "<" or align == "=":
+            elif align == ord("<") or align == ord("="):
                 left = 0
             else:
                 raise AssertionError("shouldn't be here")
@@ -525,22 +540,16 @@
             self._right_pad = right
             return total
 
-        def _lit(self, s):
-            if self.is_unicode:
-                return s.decode("ascii")
-            else:
-                return s
-
         def _pad(self, string):
             builder = self._builder()
-            builder.append_multiple_char(self._fill_char, self._left_pad)
+            builder.append_multiple_char(chr(self._fill_char), self._left_pad)
             builder.append(string)
-            builder.append_multiple_char(self._fill_char, self._right_pad)
+            builder.append_multiple_char(chr(self._fill_char), self._right_pad)
             return builder.build()
 
         def _builder(self):
             if self.is_unicode:
-                return rstring.UnicodeBuilder()
+                return Utf8Builder()
             else:
                 return rstring.StringBuilder()
 
@@ -555,23 +564,25 @@
                 return space.wrap(string)
             if self._type != "s":
                 self._unknown_presentation("string")
-            if self._sign != "\0":
+            if self._sign != ord("\0"):
                 msg = "Sign not allowed in string format specifier"
                 raise OperationError(space.w_ValueError, space.wrap(msg))
             if self._alternate:
                 msg = "Alternate form not allowed in string format specifier"
                 raise OperationError(space.w_ValueError, space.wrap(msg))
-            if self._align == "=":
+            if self._align == ord("="):
                 msg = "'=' alignment not allowed in string format specifier"
                 raise OperationError(space.w_ValueError, space.wrap(msg))
+
             length = len(string)
             precision = self._precision
             if precision != -1 and length >= precision:
                 assert precision >= 0
                 length = precision
                 string = string[:precision]
-            if self._fill_char == "\0":
-                self._fill_char = self._lit(" ")[0]
+
+            if self._fill_char == ord("\0"):
+                self._fill_char = ord(" ")
             self._calc_padding(string, length)
             return space.wrap(self._pad(string))
 
@@ -586,9 +597,11 @@
                 dec = "."
                 thousands = ""
                 grouping = "\256"
+
             if self.is_unicode:
-                self._loc_dec = dec.decode("ascii")
-                self._loc_thousands = thousands.decode("ascii")
+                self._loc_dec = str_decode_ascii(dec, len(dec), 'strict')[0]
+                self._loc_thousands = str_decode_ascii(
+                    thousands, len(thousands), 'strict')[0]
             else:
                 self._loc_dec = dec
                 self._loc_thousands = thousands
@@ -617,41 +630,45 @@
             spec.n_rpadding = 0
             spec.n_min_width = 0
             spec.n_total = 0
-            spec.sign = "\0"
+            spec.sign = ord("\0")
             spec.n_sign = 0
+
             sign = self._sign
-            if sign == "+":
+            if sign == ord("+"):
                 spec.n_sign = 1
-                spec.sign = "-" if sign_char == "-" else "+"
-            elif sign == " ":
+                spec.sign = ord("-") if sign_char == "-" else ord("+")
+            elif sign == ord(" "):
                 spec.n_sign = 1
-                spec.sign = "-" if sign_char == "-" else " "
+                spec.sign = ord("-") if sign_char == "-" else ord(" ")
             elif sign_char == "-":
                 spec.n_sign = 1
-                spec.sign = "-"
+                spec.sign = ord("-")
             extra_length = (spec.n_sign + spec.n_prefix + spec.n_decimal +
                             spec.n_remainder) # Not padding or digits
-            if self._fill_char == "0" and self._align == "=":
+
+            if self._fill_char == ord("0") and self._align == ord("="):
                 spec.n_min_width = self._width - extra_length
             if self._loc_thousands:
                 self._group_digits(spec, digits[to_number:])
                 n_grouped_digits = len(self._grouped_digits)
             else:
                 n_grouped_digits = spec.n_digits
+
             n_padding = self._width - (extra_length + n_grouped_digits)
             if n_padding > 0:
                 align = self._align
-                if align == "<":
+                if align == ord("<"):
                     spec.n_rpadding = n_padding
-                elif align == ">":
+                elif align == ord(">"):
                     spec.n_lpadding = n_padding
-                elif align == "^":
+                elif align == ord("^"):
                     spec.n_lpadding = n_padding // 2
                     spec.n_rpadding = n_padding - spec.n_lpadding
-                elif align == "=":
+                elif align == ord("="):
                     spec.n_spadding = n_padding
                 else:
                     raise AssertionError("shouldn't reach")
+
             spec.n_total = spec.n_lpadding + spec.n_sign + spec.n_prefix + \
                            spec.n_spadding + n_grouped_digits + \
                            spec.n_decimal + spec.n_remainder + spec.n_rpadding
@@ -720,21 +737,28 @@
         def _fill_number(self, spec, num, to_digits, to_prefix, fill_char,
                          to_remainder, upper, grouped_digits=None):
             out = self._builder()
+
             if spec.n_lpadding:
-                out.append_multiple_char(fill_char[0], spec.n_lpadding)
+                out.append_multiple_char(chr(fill_char), spec.n_lpadding)
+
             if spec.n_sign:
                 if self.is_unicode:
-                    sign = spec.sign.decode("ascii")
+                    # TODO: A better way to do this might be to check if
+                    # spec.sign < 127 ...
+                    sign  = str_decode_ascii(chr(spec.sign), 1, 'strict')[0]
                 else:
-                    sign = spec.sign
+                    sign = chr(spec.sign)
                 out.append(sign)
+
             if spec.n_prefix:
                 pref = num[to_prefix:to_prefix + spec.n_prefix]
                 if upper:
                     pref = self._upcase_string(pref)
                 out.append(pref)
+
             if spec.n_spadding:
-                out.append_multiple_char(fill_char[0], spec.n_spadding)
+                out.append_multiple_char(chr(fill_char), spec.n_spadding)
+
             if spec.n_digits != 0:
                 if self._loc_thousands:
                     if grouped_digits is not None:
@@ -749,12 +773,13 @@
                 if upper:
                     digits = self._upcase_string(digits)
                 out.append(digits)
+
             if spec.n_decimal:
-                out.append(self._lit(".")[0])
+                out.append(".")
             if spec.n_remainder:
                 out.append(num[to_remainder:])
             if spec.n_rpadding:
-                out.append_multiple_char(fill_char[0], spec.n_rpadding)
+                out.append_multiple_char(chr(fill_char), spec.n_rpadding)
             #if complex, need to call twice - just retun the buffer
             return out.build()
 
@@ -764,14 +789,14 @@
                 msg = "precision not allowed in integer type"
                 raise OperationError(space.w_ValueError, space.wrap(msg))
             sign_char = "\0"
-            tp = self._type
-            if tp == "c":
-                if self._sign != "\0":
+
+            if self._type == "c":
+                if self._sign != ord("\0"):
                     msg = "sign not allowed with 'c' presentation type"
                     raise OperationError(space.w_ValueError, space.wrap(msg))
                 value = space.int_w(w_num)
                 if self.is_unicode:
-                    result = runicode.UNICHR(value)
+                    result = utf8chr(value)
                 else:
                     result = chr(value)
                 n_digits = 1
@@ -781,16 +806,16 @@
                 to_prefix = 0
                 to_numeric = 0
             else:
-                if tp == "b":
+                if self._type == "b":
                     base = 2
                     skip_leading = 2
-                elif tp == "o":
+                elif self._type == "o":
                     base = 8
                     skip_leading = 2
-                elif tp == "x" or tp == "X":
+                elif self._type == "x" or self._type == "X":
                     base = 16
                     skip_leading = 2
-                elif tp == "n" or tp == "d":
+                elif self._type == "n" or self._type == "d":
                     base = 10
                     skip_leading = 0
                 else:
@@ -801,7 +826,7 @@
                     result = self._long_to_base(base, space.bigint_w(w_num))
                 n_prefix = skip_leading if self._alternate else 0
                 to_prefix = 0
-                if result[0] == "-":
+                if ORD(result, 0) == ord("-"):
                     sign_char = "-"
                     skip_leading += 1
                     to_prefix += 1
@@ -809,10 +834,10 @@
                 n_remainder = 0
                 to_remainder = 0
                 to_numeric = skip_leading
-            self._get_locale(tp)
+            self._get_locale(self._type)
             spec = self._calc_num_width(n_prefix, sign_char, to_numeric, n_digits,
                                         n_remainder, False, result)
-            fill = self._lit(" ") if self._fill_char == "\0" else self._fill_char
+            fill = ord(" ") if self._fill_char == ord("\0") else self._fill_char
             upper = self._type == "X"
             return self.space.wrap(self._fill_number(spec, result, to_numeric,
                                      to_prefix, fill, to_remainder, upper))
@@ -827,14 +852,14 @@
                 prefix = "0x"
             as_str = value.format(LONG_DIGITS[:base], prefix)
             if self.is_unicode:
-                return as_str.decode("ascii")
+                return str_decode_ascii(as_str, len(as_str), 'strict')[0]
             return as_str
 
         def _int_to_base(self, base, value):
             if base == 10:
                 s = str(value)
                 if self.is_unicode:
-                    return s.decode("ascii")
+                    return str_decode_ascii(s, len(s), 'strict')[0]
                 return s
             # This part is slow.
             negative = value < 0
@@ -879,22 +904,10 @@
                 if self.is_unicode:
                     return space.call_function(space.w_unicode, w_num)
                 return self.space.str(w_num)
-            tp = self._type
-            if (tp == "b" or
-                tp == "c" or
-                tp == "d" or
-                tp == "o" or
-                tp == "x" or
-                tp == "X" or
-                tp == "n"):
+
+            if self._type in ("b", "c", "d", "o", "x", "X", "n"):
                 return self._format_int_or_long(w_num, kind)
-            elif (tp == "e" or
-                  tp == "E" or
-                  tp == "f" or
-                  tp == "F" or
-                  tp == "g" or
-                  tp == "G" or
-                  tp == "%"):
+            elif self._type in ("e", "E", "f", "F", "g", "G", "%"):
                 w_float = space.float(w_num)
                 return self._format_float(w_float)
             else:
@@ -921,6 +934,7 @@
             if self._alternate:
                 msg = "alternate form not allowed in float formats"
                 raise OperationError(space.w_ValueError, space.wrap(msg))
+
             tp = self._type
             self._get_locale(tp)
             if tp == "\0":
@@ -929,6 +943,7 @@
                 flags |= rfloat.DTSF_ADD_DOT_0
             elif tp == "n":
                 tp = "g"
+
             value = space.float_w(w_float)
             if tp == "%":
                 tp = "f"
@@ -936,6 +951,7 @@
                 add_pct = True
             else:
                 add_pct = False
+
             if self._precision == -1:
                 self._precision = default_precision
             result, special = rfloat.double_to_string(value, tp,
@@ -943,22 +959,26 @@
             if add_pct:
                 result += "%"
             n_digits = len(result)
-            if result[0] == "-":
+
+            if ORD(result, 0) == ord("-"):
                 sign = "-"
                 to_number = 1
                 n_digits -= 1
             else:
                 sign = "\0"
                 to_number = 0
+
             have_dec_point, to_remainder = self._parse_number(result, to_number)
             n_remainder = len(result) - to_remainder
+
             if self.is_unicode:
-                digits = result.decode("ascii")
+                digits = str_decode_ascii(result , len(result), 'strict')[0]
             else:
                 digits = result
+
             spec = self._calc_num_width(0, sign, to_number, n_digits,
                                         n_remainder, have_dec_point, digits)
-            fill = self._lit(" ") if self._fill_char == "\0" else self._fill_char
+            fill = ord(" ") if self._fill_char == ord("\0") else self._fill_char
             return self.space.wrap(self._fill_number(spec, digits, to_number, 0,
                                       fill, to_remainder, False))
 
@@ -968,30 +988,23 @@
                 if self.is_unicode:
                     return space.call_function(space.w_unicode, w_float)
                 return space.str(w_float)
-            tp = self._type
-            if (tp == "\0" or
-                tp == "e" or
-                tp == "E" or
-                tp == "f" or
-                tp == "F" or
-                tp == "g" or
-                tp == "G" or
-                tp == "n" or
-                tp == "%"):
+
+            if self._type in ("\0", "e", "E", "f", "F", "g", "G", "n", "%"):
                 return self._format_float(w_float)
             self._unknown_presentation("float")
 
         def _format_complex(self, w_complex):
             space = self.space
+
             tp = self._type
             self._get_locale(tp)
             default_precision = 6
-            if self._align == "=":
+            if self._align == ord("="):
                 # '=' alignment is invalid
                 msg = ("'=' alignment flag is not allowed in"
                        " complex format specifier")
                 raise OperationError(space.w_ValueError, space.wrap(msg))
-            if self._fill_char == "0":
+            if self._fill_char == ord("0"):
                 #zero padding is invalid
                 msg = "Zero padding is not allowed in complex format specifier"
                 raise OperationError(space.w_ValueError, space.wrap(msg))
@@ -1047,7 +1060,7 @@
             tmp_fill_char = self._fill_char
             tmp_align = self._align
             tmp_width = self._width
-            self._fill_char = "\0"
-            self._align = "<"
+            self._fill_char = ord("\0")
+            self._align = ord("<")
             self._width = -1
 
@@ -1058,8 +1071,8 @@
                                                                to_imag_number)
 
             if self.is_unicode:
-                re_num = re_num.decode("ascii")
-                im_num = im_num.decode("ascii")
+                re_num = str_decode_ascii(re_num, len(re_num), 'strict')[0]
+                im_num = str_decode_ascii(im_num, len(im_num), 'strict')[0]
 
             #set remainder, in CPython _parse_number sets this
             #using n_re_digits causes tests to fail
@@ -1073,7 +1086,7 @@
             #self._grouped_digits will get overwritten in imaginary calc_num_width
             re_grouped_digits = self._grouped_digits
             if not skip_re:
-                self._sign = "+"
+                self._sign = ord("+")
             im_spec = self._calc_num_width(0, im_sign, to_imag_number, n_im_digits,
                                            im_n_remainder, im_have_dec,
                                            im_num)
@@ -1093,14 +1106,14 @@
 
             out = self._builder()
             fill = self._fill_char
-            if fill == "\0":
-                fill = self._lit(" ")[0]
+            if fill == ord("\0"):
+                fill = ord(" ")
 
             #compose the string
             #add left padding
-            out.append_multiple_char(fill, self._left_pad)
+            out.append_multiple_char(chr(fill), self._left_pad)
             if add_parens:
-                out.append(self._lit('(')[0])
+                out.append('(')
 
             #if the no. has a real component, add it
             if not skip_re:
@@ -1114,13 +1127,13 @@
                                          im_grouped_digits))
 
             #add 'j' character
-            out.append(self._lit('j')[0])
+            out.append('j')
 
             if add_parens:
-                out.append(self._lit(')')[0])
+                out.append(')')
 
             #add right padding
-            out.append_multiple_char(fill, self._right_pad)
+            out.append_multiple_char(chr(fill), self._right_pad)
 
             return self.space.wrap(out.build())
 
@@ -1131,15 +1144,8 @@
             #parse format specification, set associated variables
             if self._parse_spec("\0", ">"):
                 return space.str(w_complex)
-            tp = self._type
-            if (tp == "\0" or
-                tp == "e" or
-                tp == "E" or
-                tp == "f" or
-                tp == "F" or
-                tp == "g" or
-                tp == "G" or
-                tp == "n"):
+
+            if self._type in ('\0', 'e', 'E', 'f', 'F', 'g', 'G', 'n'):
                 return self._format_complex(w_complex)
             self._unknown_presentation("complex")
     return Formatter
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -162,9 +162,6 @@
         if isinstance(x, Utf8Str):
             return wrapunicode(self, x)
 
-        if isinstance(x, unicode):
-            import pdb; pdb.set_trace()
-
         if isinstance(x, float):
             return W_FloatObject(x)
         if isinstance(x, W_Root):
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -585,13 +585,13 @@
         by = self._op_val(space, w_sep)
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
-        res = self._split(value, by, maxsplit)
+        res = self._rsplit(value, by, maxsplit)
 
         return self._newlist_unwrapped(space, res)
 
     @staticmethod
     def _rsplit(value, sep=None, maxsplit=-1):
-        return value.split(sep, maxsplit)
+        return rsplit(value, sep, maxsplit)
 
     @unwrap_spec(keepends=bool)
     def descr_splitlines(self, space, keepends=False):
@@ -606,7 +606,8 @@
             eol = pos
             pos += 1
             # read CRLF as one line break
-            if pos < length and value[eol] == '\r' and value[pos] == '\n':
+            if (pos < length and ORD(value, eol) == ord('\r') and
+                                 ORD(value, pos) == ord('\n')):
                 pos += 1
             if keepends:
                 eol = pos
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -3,6 +3,7 @@
 
 from pypy.objspace.std.dictmultiobject import (W_DictMultiObject,
     BytesDictStrategy, ObjectDictStrategy)
+from pypy.interpreter.utf8 import Utf8Str
 
 
 class TestW_DictObject(object):
@@ -142,8 +143,9 @@
 
     def test_listview_unicode_dict(self):
         w = self.space.wrap
+        w_u = lambda x: w(Utf8Str.from_unicode(x))
         w_d = self.space.newdict()
-        w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(2))])
+        w_d.initialize_content([(w_u(u"a"), w(1)), (w_u(u"b"), w(2))])
         assert self.space.listview_unicode(w_d) == [u"a", u"b"]
 
     def test_listview_int_dict(self):
@@ -154,7 +156,8 @@
 
     def test_keys_on_string_unicode_int_dict(self, monkeypatch):
         w = self.space.wrap
-        
+        w_u = lambda x: w(Utf8Str.from_unicode(x))
+
         w_d = self.space.newdict()
         w_d.initialize_content([(w(1), w("a")), (w(2), w("b"))])
         w_l = self.space.call_method(w_d, "keys")
@@ -174,7 +177,7 @@
         # but we need space.newlist_unicode for it
         monkeypatch.undo() 
         w_d = self.space.newdict()
-        w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(6))])
+        w_d.initialize_content([(w_u(u"a"), w(1)), (w_u(u"b"), w(6))])
         w_l = self.space.call_method(w_d, "keys")
         assert sorted(self.space.listview_unicode(w_l)) == [u"a", u"b"]
 
diff --git a/pypy/objspace/std/test/test_index.py b/pypy/objspace/std/test/test_index.py
--- a/pypy/objspace/std/test/test_index.py
+++ b/pypy/objspace/std/test/test_index.py
@@ -1,5 +1,7 @@
 from py.test import raises
 
+from pypy.interpreter.utf8 import Utf8Str
+
 class AppTest_IndexProtocol:
     def setup_class(self):
         w_oldstyle = self.space.appexec([], """():
@@ -263,7 +265,7 @@
 class AppTest_UnicodeTestCase(SeqTestCase, StringTestCase):
     def setup_method(self, method):
         SeqTestCase.setup_method(self, method)
-        self.w_seq = self.space.wrap(u"this is a test")
+        self.w_seq = self.space.wrap(Utf8Str.from_unicode(u"this is a test"))
         self.w_const = self.space.appexec([], """(): return unicode""")
 
 
diff --git a/pypy/objspace/std/test/test_lengthhint.py b/pypy/objspace/std/test/test_lengthhint.py
--- a/pypy/objspace/std/test/test_lengthhint.py
+++ b/pypy/objspace/std/test/test_lengthhint.py
@@ -1,3 +1,4 @@
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.module._collections.interp_deque import W_Deque
 from pypy.module.itertools.interp_itertools import W_Repeat
 
@@ -71,7 +72,7 @@
         self._test_length_hint(self.space.wrap('P' * self.SIZE))
 
     def test_unicode(self):
-        self._test_length_hint(self.space.wrap(u'Y' * self.SIZE))
+        self._test_length_hint(self.space.wrap(Utf8Str('Y' * self.SIZE)))
 
     def test_tuple(self):
         self._test_length_hint(self.space.wrap(tuple(self.ITEMS)))
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -1,4 +1,5 @@
 import sys
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.objspace.std.listobject import (
     W_ListObject, EmptyListStrategy, ObjectListStrategy, IntegerListStrategy,
     FloatListStrategy, BytesListStrategy, RangeListStrategy,
@@ -11,20 +12,22 @@
     def test_check_strategy(self):
         space = self.space
         w = space.wrap
+        w_u = lambda x: w(Utf8Str(x))
         assert isinstance(W_ListObject(space, []).strategy, EmptyListStrategy)
         assert isinstance(W_ListObject(space, [w(1),w('a')]).strategy, ObjectListStrategy)
         assert isinstance(W_ListObject(space, [w(1),w(2),w(3)]).strategy,
                           IntegerListStrategy)
         assert isinstance(W_ListObject(space, [w('a'), w('b')]).strategy,
                           BytesListStrategy)
-        assert isinstance(W_ListObject(space, [w(u'a'), w(u'b')]).strategy,
+        assert isinstance(W_ListObject(space, [w_u('a'), w_u('b')]).strategy,
                           UnicodeListStrategy)
-        assert isinstance(W_ListObject(space, [w(u'a'), w('b')]).strategy,
+        assert isinstance(W_ListObject(space, [w_u('a'), w('b')]).strategy,
                           ObjectListStrategy) # mixed unicode and bytes
 
     def test_empty_to_any(self):
         space = self.space
         w = space.wrap
+        w_u = lambda x: w(Utf8Str(x))
         l = W_ListObject(space, [])
         assert isinstance(l.strategy, EmptyListStrategy)
         l.append(w((1,3)))
@@ -42,7 +45,7 @@
 
         l = W_ListObject(space, [])
         assert isinstance(l.strategy, EmptyListStrategy)
-        l.append(w(u'a'))
+        l.append(w_u('a'))
         assert isinstance(l.strategy, UnicodeListStrategy)
 
         l = W_ListObject(space, [])
@@ -70,9 +73,10 @@
 
     def test_unicode_to_any(self):
         space = self.space
-        l = W_ListObject(space, [space.wrap(u'a'), space.wrap(u'b'), space.wrap(u'c')])
+        w_u = lambda x: space.wrap(Utf8Str(x))
+        l = W_ListObject(space, [w_u('a'), w_u('b'), w_u('c')])
         assert isinstance(l.strategy, UnicodeListStrategy)
-        l.append(space.wrap(u'd'))
+        l.append(w_u('d'))
         assert isinstance(l.strategy, UnicodeListStrategy)
         l.append(space.wrap(3))
         assert isinstance(l.strategy, ObjectListStrategy)
@@ -89,6 +93,7 @@
     def test_setitem(self):
         space = self.space
         w = space.wrap
+        w_u = lambda x: w(Utf8Str(x))
         # This should work if test_listobject.py passes
         l = W_ListObject(space, [w('a'),w('b'),w('c')])
         assert space.eq_w(l.getitem(0), w('a'))
@@ -110,7 +115,7 @@
         assert isinstance(l.strategy, ObjectListStrategy)
 
         # UnicodeStrategy to ObjectStrategy
-        l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
+        l = W_ListObject(space, [w_u('a'),w_u('b'),w_u('c')])
         assert isinstance(l.strategy, UnicodeListStrategy)
         l.setitem(0, w(2))
         assert isinstance(l.strategy, ObjectListStrategy)
@@ -124,6 +129,7 @@
     def test_insert(self):
         space = self.space
         w = space.wrap
+        w_u = lambda x: w(Utf8Str(x))
         # no change
         l = W_ListObject(space, [w(1),w(2),w(3)])
         assert isinstance(l.strategy, IntegerListStrategy)
@@ -137,7 +143,7 @@
         assert isinstance(l.strategy, ObjectListStrategy)
 
         # UnicodeStrategy
-        l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
+        l = W_ListObject(space, [w_u('a'),w_u('b'),w_u('c')])
         assert isinstance(l.strategy, UnicodeListStrategy)
         l.insert(3, w(2))
         assert isinstance(l.strategy, ObjectListStrategy)
@@ -186,6 +192,7 @@
     def test_setslice(self):
         space = self.space
         w = space.wrap
+        w_u = lambda x: w(Utf8Str(x))
 
         l = W_ListObject(space, [])
         assert isinstance(l.strategy, EmptyListStrategy)
@@ -217,7 +224,7 @@
         assert isinstance(l.strategy, ObjectListStrategy)
 
         # UnicodeStrategy to ObjectStrategy
-        l = W_ListObject(space, [w(u'a'), w(u'b'), w(u'c')])
+        l = W_ListObject(space, [w_u('a'), w_u('b'), w_u('c')])
         assert isinstance(l.strategy, UnicodeListStrategy)
         l.setslice(0, 1, 2, W_ListObject(space, [w(1), w(2), w(3)]))
         assert isinstance(l.strategy, ObjectListStrategy)
@@ -242,6 +249,8 @@
         def wrapitems(items):
             items_w = []
             for i in items:
+                if isinstance(i, unicode):
+                    i = Utf8Str.from_unicode(i)
                 items_w.append(space.wrap(i))
             return items_w
 
@@ -323,6 +332,7 @@
     def test_empty_extend_with_any(self):
         space = self.space
         w = space.wrap
+        w_u = lambda x: w(Utf8Str(x))
 
         empty = W_ListObject(space, [])
         assert isinstance(empty.strategy, EmptyListStrategy)
@@ -336,7 +346,7 @@
 
         empty = W_ListObject(space, [])
         assert isinstance(empty.strategy, EmptyListStrategy)
-        empty.extend(W_ListObject(space, [w(u"a"), w(u"b"), w(u"c")]))
+        empty.extend(W_ListObject(space, [w_u("a"), w_u("b"), w_u("c")]))
         assert isinstance(empty.strategy, UnicodeListStrategy)
 
         empty = W_ListObject(space, [])
@@ -588,11 +598,12 @@
         assert self.space.eq_w(l3, W_ListObject(self.space, [self.space.wrap(1), self.space.wrap(2), self.space.wrap(3), self.space.wrap(4), self.space.wrap(5)]))
 
     def test_unicode(self):
+        wrap_u = lambda x: self.space.wrap(Utf8Str(x))
         l1 = W_ListObject(self.space, [self.space.wrap("eins"), self.space.wrap("zwei")])
         assert isinstance(l1.strategy, BytesListStrategy)
-        l2 = W_ListObject(self.space, [self.space.wrap(u"eins"), self.space.wrap(u"zwei")])
+        l2 = W_ListObject(self.space, [wrap_u("eins"), wrap_u("zwei")])
         assert isinstance(l2.strategy, UnicodeListStrategy)
-        l3 = W_ListObject(self.space, [self.space.wrap("eins"), self.space.wrap(u"zwei")])
+        l3 = W_ListObject(self.space, [self.space.wrap("eins"), wrap_u(u"zwei")])
         assert isinstance(l3.strategy, ObjectListStrategy)
 
     def test_listview_bytes(self):
@@ -603,20 +614,22 @@
 
     def test_listview_unicode(self):
         space = self.space
+        wrap_u = lambda x: self.space.wrap(Utf8Str(x))
         assert space.listview_unicode(space.wrap(1)) == None
-        w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
+        w_l = self.space.newlist([wrap_u('a'), wrap_u('b')])
         assert space.listview_unicode(w_l) == [u"a", u"b"]
 
     def test_string_join_uses_listview_bytes(self):
         space = self.space
+        wrap_u = lambda x: self.space.wrap(Utf8Str(x))
         w_l = self.space.newlist([self.space.wrap('a'), self.space.wrap('b')])
         w_l.getitems = None
         assert space.str_w(space.call_method(space.wrap("c"), "join", w_l)) == "acb"
         #
         # the same for unicode
-        w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
+        w_l = self.space.newlist([wrap_u('a'), wrap_u('b')])
         w_l.getitems = None
-        assert space.unicode_w(space.call_method(space.wrap(u"c"), "join", w_l)) == u"acb"
+        assert space.unicode_w(space.call_method(wrap_u("c"), "join", w_l)) == u"acb"
 
     def test_string_join_returns_same_instance(self):
         space = self.space
@@ -626,10 +639,11 @@
         assert space.is_w(space.call_method(space.wrap(" -- "), "join", w_l), w_text)
         #
         # the same for unicode
-        w_text = space.wrap(u"text")
+        w_base = space.wrap(Utf8Str(" -- "))
+        w_text = space.wrap(Utf8Str("text"))
         w_l = self.space.newlist([w_text])
         w_l.getitems = None
-        assert space.is_w(space.call_method(space.wrap(u" -- "), "join", w_l), w_text)
+        assert space.is_w(space.call_method(w_base, "join", w_l), w_text)
 
     def test_newlist_bytes(self):
         space = self.space
@@ -656,7 +670,7 @@
 
     def test_unicode_uses_newlist_unicode(self):
         space = self.space
-        w_u = space.wrap(u"a b c")
+        w_u = space.wrap(Utf8Str("a b c"))
         space.newlist = None
         try:
             w_l = space.call_method(w_u, "split")
@@ -711,7 +725,8 @@
 
     def test_listview_unicode_list(self):
         space = self.space
-        w_l = W_ListObject(space, [space.wrap(u"a"), space.wrap(u"b")])
+        wrap_u = lambda x: self.space.wrap(Utf8Str(x))
+        w_l = W_ListObject(space, [wrap_u("a"), wrap_u("b")])
         assert self.space.listview_unicode(w_l) == [u"a", u"b"]
 
     def test_listview_int_list(self):
diff --git a/pypy/objspace/std/test/test_newformat.py b/pypy/objspace/std/test/test_newformat.py
--- a/pypy/objspace/std/test/test_newformat.py
+++ b/pypy/objspace/std/test/test_newformat.py
@@ -382,30 +382,30 @@
         assert l == [('abcd', None, None, None)]
         #
         l = list('ab{0}cd'._formatter_parser())
-        assert l == [('ab', '0', '', None), ('cd', None, None, None)]
+        assert l == [('ab', '0', '', -1), ('cd', None, None, None)]
         #
         l = list('{0}cd'._formatter_parser())
-        assert l == [('', '0', '', None), ('cd', None, None, None)]
+        assert l == [('', '0', '', -1), ('cd', None, None, None)]
         #
         l = list('ab{0}'._formatter_parser())
-        assert l == [('ab', '0', '', None)]
+        assert l == [('ab', '0', '', -1)]
         #
         l = list(''._formatter_parser())
         assert l == []
         #
         l = list('{0:123}'._formatter_parser())
-        assert l == [('', '0', '123', None)]
+        assert l == [('', '0', '123', -1)]
         #
         l = list('{0!x:123}'._formatter_parser())
-        assert l == [('', '0', '123', 'x')]
+        assert l == [('', '0', '123', ord('x'))]
         #
         l = list('{0!x:12{sdd}3}'._formatter_parser())
-        assert l == [('', '0', '12{sdd}3', 'x')]
+        assert l == [('', '0', '12{sdd}3', ord('x'))]
 
     def test_u_formatter_parser(self):
         l = list(u'{0!x:12{sdd}3}'._formatter_parser())
-        assert l == [(u'', u'0', u'12{sdd}3', u'x')]
-        for x in l[0]:
+        assert l == [(u'', u'0', u'12{sdd}3', ord(u'x'))]
+        for x in l[0][:-1]:
             assert isinstance(x, unicode)
 
     def test_formatter_parser_escape(self):