[pypy-commit] pypy utf8-unicode2: Simplify iterators. Use iterators consistently when encoding unicode strings

Thu Aug 14 08:17:55 CEST 2014

Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72787:9f7fc269657f
Date: 2014-08-14 01:02 -0500
http://bitbucket.org/pypy/pypy/changeset/9f7fc269657f/

Log:	Simplify iterators. Use iterators consistently when encoding unicode
	strings

diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -2,8 +2,7 @@
 
 import py
 import sys
-from pypy.interpreter.utf8 import (
-    Utf8Str, Utf8Builder, utf8chr, utf8ord)
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
 from rpython.rtyper.lltypesystem import rffi
 from rpython.rtyper.test.test_llinterp import interpret
 
@@ -29,24 +28,8 @@
 def test_iterator():
     s = build_utf8str()
     iter = s.codepoint_iter()
-    assert iter.peek_next() == 0x41
     assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D]
 
-    for i in range(1, 5):
-        iter = s.codepoint_iter()
-        iter.move(i)
-        if i != 4:
-            assert iter.peek_next() == [0x41, 0x10F, 0x20AC, 0x1F63D][i]
-        l = list(iter)
-        assert l == [0x41, 0x10F, 0x20AC, 0x1F63D][i:]
-
-    for i in range(1, 5):
-        iter = s.codepoint_iter()
-        list(iter) # move the iterator to the end
-        iter.move(-i)
-        l = list(iter)
-        assert l == [0x41, 0x10F, 0x20AC, 0x1F63D][4-i:]
-
     iter = s.char_iter()
     l = [s.bytes.decode('utf8') for s in list(iter)]
     if sys.maxunicode < 65536:
@@ -54,26 +37,17 @@
     else:
         assert l == [u'A', u'\u010F', u'\u20AC', u'\U0001F63D']
 
-def test_reverse_iterator():
+def test_new_iterator():
     s = build_utf8str()
-    iter = s.reverse_codepoint_iter()
-    assert iter.peek_next() == 0x1F63D
-    assert list(iter) == [0x1F63D, 0x20AC, 0x10F, 0x41]
+    i = s.iter()
+    while not i.finished():
+        assert utf8ord(s, i.pos()) == i.current()
+        i.move(1)
 
-    for i in range(1, 5):
-        iter = s.reverse_codepoint_iter()
-        iter.move(i)
-        if i != 4:
-            assert iter.peek_next() == [0x1F63D, 0x20AC, 0x10F, 0x41][i]
-        l = list(iter)
-        assert l == [0x1F63D, 0x20AC, 0x10F, 0x41][i:]
-
-    for i in range(1, 5):
-        iter = s.reverse_codepoint_iter()
-        list(iter) # move the iterator to the end
-        iter.move(-i)
-        l = list(iter)
-        assert l == [0x1F63D, 0x20AC, 0x10F, 0x41][4-i:]
+    i = s.iter(len(s) - 1)
+    while i.pos() >= 0:
+        assert utf8ord(s, i.pos()) == i.current()
+        i.move(-1)
 
 def test_builder_append_slice():
     builder = Utf8Builder()
@@ -146,7 +120,6 @@
     s = Utf8Str(' ')
     assert s.join([]) == u''
 
-    
     assert s.join([Utf8Str('one')]) == u'one'
     assert s.join([Utf8Str('one'), Utf8Str('two')]) == u'one two'
 
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -309,18 +309,15 @@
     def __unicode__(self):
         return unicode(self.bytes, 'utf8')
 
+    def iter(self, start=0):
+        return Utf8Iterator(self, start)
+
     def char_iter(self):
         return Utf8CharacterIter(self)
 
-    def reverse_char_iter(self):
-        return Utf8ReverseCharacterIter(self)
-
     def codepoint_iter(self):
         return Utf8CodePointIter(self)
 
-    def reverse_codepoint_iter(self):
-        return Utf8ReverseCodePointIter(self)
-
     @specialize.argtype(1, 2)
     def _bound_check(self, start, end):
         if start is None:
@@ -432,7 +429,7 @@
             else:
                 break
 
-            start_byte = iter.byte_pos
+            start_byte = iter._byte_pos
             assert start_byte >= 0
 
             if maxsplit == 0:
@@ -449,7 +446,7 @@
                            self._is_ascii))
                 break
 
-            end = iter.byte_pos
+            end = iter._byte_pos
             assert end >= 0
             res.append(Utf8Str(self.bytes[start_byte:end], self._is_ascii))
             maxsplit -= 1
@@ -466,32 +463,32 @@
                 other_bytes = other.bytes
             return [Utf8Str(s) for s in self.bytes.rsplit(other_bytes, maxsplit)]
 
+        if len(self) == 0:
+            return []
+
         res = []
-        iter = self.reverse_codepoint_iter()
+        iter = self.iter(len(self) - 1)
         while True:
             # Find the start of the next word
-            for cd in iter:
-                if not unicodedb.isspace(cd):
-                    break
-            else:
+            while iter.pos() >= 0 and unicodedb.isspace(iter.current()):
+                iter.move(-1)
+            if iter.pos() < 0:
                 break
 
-            start_byte = self.next_char(iter.byte_pos)
-
+            start_byte = self.next_char(iter.byte_pos())
             if maxsplit == 0:
                 res.append(Utf8Str(self.bytes[0:start_byte], self._is_ascii))
                 break
 
             # Find the end of the word
-            for cd in iter:
-                if unicodedb.isspace(cd):
-                    break
-            else:
+            while iter.pos() >= 0 and not unicodedb.isspace(iter.current()):
+                iter.move(-1)
+            if iter.pos() < 0:
                 # We hit the end of the string
                 res.append(Utf8Str(self.bytes[0:start_byte], self._is_ascii))
                 break
 
-            end_byte = self.next_char(iter.byte_pos)
+            end_byte = self.next_char(iter.byte_pos())
             res.append(Utf8Str(self.bytes[end_byte:start_byte],
                                self._is_ascii))
             maxsplit -= 1
@@ -756,117 +753,27 @@
 
 # _______________________________________________
 
-# iter.current is the current (ie the last returned) element
-# iter.pos isthe position of the current element
-# iter.byte_pos isthe byte position of the current element
-# In the before-the-start state, for foward iterators iter.pos and
-# iter.byte_pos are -1. For reverse iterators, they are len(ustr) and
-# len(ustr.bytes) respectively.
-
 class ForwardIterBase(object):
     def __init__(self, ustr):
-        self.ustr = ustr
-        self.pos = -1
-
-        self._byte_pos = 0
-        self.byte_pos = -1
-        self.current = self._default
+        self._str = ustr
+        self._byte_pos = -1
 
     def __iter__(self):
         return self
 
     def next(self):
-        if self.pos + 1 == len(self.ustr):
+        if self._byte_pos == -1:
+            if len(self._str) == 0:
+                raise StopIteration()
+            self._byte_pos = 0
+            return self._value(0)
+
+        self._byte_pos = self._str.next_char(self._byte_pos)
+        if self._byte_pos == len(self._str.bytes):
             raise StopIteration()
 
-        self.pos += 1
-        self.byte_pos = self._byte_pos
-
-        self.current = self._value(self.byte_pos)
-
-        self._byte_pos = self.ustr.next_char(self._byte_pos)
-        return self.current
-
-    def peek_next(self):
         return self._value(self._byte_pos)
 
-    def peek_prev(self):
-        return self._value(self._move_backward(self.byte_pos))
-
-    def move(self, count):
-        if count > 0:
-            self.pos += count
-
-            while count != 1:
-                self._byte_pos = self.ustr.next_char(self._byte_pos)
-                count -= 1
-            self.byte_pos = self._byte_pos
-            self._byte_pos = self.ustr.next_char(self._byte_pos)
-            self.current = self._value(self.byte_pos)
-
-        elif count < 0:
-            self.pos += count
-            while count < -1:
-                self.byte_pos = self.ustr.prev_char(self.byte_pos)
-                count += 1
-            self._byte_pos = self.byte_pos
-            self.byte_pos = self.ustr.prev_char(self.byte_pos)
-            self.current = self._value(self.byte_pos)
-
-    def copy(self):
-        iter = self.__class__(self.ustr)
-        iter.pos = self.pos
-        iter.byte_pos = self.byte_pos
-        iter._byte_pos = self._byte_pos
-        iter.current = self.current
-        return iter
-
-class ReverseIterBase(object):
-    def __init__(self, ustr):
-        self.ustr = ustr
-        self.pos = len(ustr)
-        self.byte_pos = len(ustr.bytes)
-        self.current = self._default
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        if self.pos == 0:
-            raise StopIteration()
-
-        self.pos -= 1
-        self.byte_pos = self.ustr.prev_char(self.byte_pos)
-        self.current = self._value(self.byte_pos)
-        return self.current
-
-    def peek_next(self):
-        return self._value(self.ustr.prev_char(self.byte_pos))
-
-    def peek_prev(self):
-        return self._value(self.ustr.next_char(self.byte_pos))
-
-    def move(self, count):
-        if count > 0:
-            self.pos -= count
-            while count != 0:
-                self.byte_pos = self.ustr.prev_char(self.byte_pos)
-                count -= 1
-            self.current = self._value(self.byte_pos)
-        elif count < 0:
-            self.pos -= count
-            while count != 0:
-                self.byte_pos = self.ustr.next_char(self.byte_pos)
-                count += 1
-            self.current = self._value(self.byte_pos)
-
-    def copy(self):
-        iter = self.__class__(self.ustr)
-        iter.pos = self.pos
-        iter.byte_pos = self.byte_pos
-        iter.current = self.current
-        return iter
-
 def make_iterator(name, base, calc_value, default):
     class C(object):
         import_from_mixin(base, ['__init__', '__iter__'])
@@ -876,32 +783,91 @@
     return C
 
 def codepoint_calc_value(self, byte_pos):
-    if byte_pos == -1 or byte_pos == len(self.ustr.bytes):
+    if byte_pos == -1 or byte_pos == len(self._str.bytes):
         return -1
-    return utf8ord_bytes(self.ustr.bytes, byte_pos)
+    return utf8ord_bytes(self._str.bytes, byte_pos)
 
 def character_calc_value(self, byte_pos):
-    if byte_pos == -1 or byte_pos == len(self.ustr.bytes):
+    if byte_pos == -1 or byte_pos == len(self._str.bytes):
         return None
-    length = utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
-    return Utf8Str(''.join([self.ustr.bytes[i]
-                    for i in range(self.byte_pos, self.byte_pos + length)]),
+    length = utf8_code_length[ord(self._str.bytes[self._byte_pos])]
+    return Utf8Str(''.join([self._str.bytes[i]
+                    for i in range(self._byte_pos, self._byte_pos + length)]),
                     length == 1)
 
 Utf8CodePointIter = make_iterator("Utf8CodePointIter", ForwardIterBase,
                                   codepoint_calc_value, -1)
 Utf8CharacterIter = make_iterator("Utf8CharacterIter", ForwardIterBase,
                                   character_calc_value, None)
-Utf8ReverseCodePointIter = make_iterator(
-    "Utf8ReverseCodePointIter", ReverseIterBase, codepoint_calc_value, -1)
-Utf8ReverseCharacterIter = make_iterator(
-    "Utf8ReverseCharacterIter", ReverseIterBase, character_calc_value, None)
 
 del make_iterator
 del codepoint_calc_value
 del character_calc_value
 del ForwardIterBase
-del ReverseIterBase
 
 
 
+# _______________________________________________
+
+class Utf8Iterator(object):
+    def __init__(self, str, start=0):
+        self._str = str
+
+        self._pos = start
+        self._byte_pos = str.index_of_char(start)
+
+        self._calc_current()
+
+    def _calc_current(self):
+        if self._pos >= len(self._str) or self._pos < 0:
+            raise IndexError()
+        else:
+            self._current = utf8ord_bytes(self._str.bytes, self._byte_pos)
+
+    def current(self):
+        if self._current == -1:
+            self._calc_current()
+        return self._current
+
+    def pos(self):
+        return self._pos
+
+    def byte_pos(self):
+        return self._byte_pos
+
+    def move(self, count):
+        # TODO: As an optimization, we could delay moving byte_pos until we
+        #       _calc_current
+        if count > 0:
+            self._pos += count
+
+            if self._pos < 0:
+                self._byte_pos = 0
+            else:
+                while count != 0:
+                    self._byte_pos = self._str.next_char(self._byte_pos)
+                    count -= 1
+            self._current = -1
+
+        elif count < 0:
+            self._pos += count
+
+            if self._pos < 0:
+                self._byte_pos = 0
+            else:
+                while count < 0:
+                    self._byte_pos = self._str.prev_char(self._byte_pos)
+                    count += 1
+            self._current = -1
+
+    def finished(self):
+        return self._pos >= len(self._str)
+
+    def copy(self):
+        i = Utf8Iterator(self._str)
+        i._pos = self._pos
+        i._byte_pos = self._byte_pos
+        i._current = self._current
+        return i
+
+
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -327,15 +327,15 @@
     if size == 0:
         return ''
     result = StringBuilder(size)
-    pos = 0
-    while pos < size:
-        oc = utf8ord(s, pos)
-
+    iter = s.iter()
+    while not iter.finished():
+        oc = iter.current()
         if oc < 0x100:
             result.append(chr(oc))
         else:
             raw_unicode_escape_helper(result, oc)
-        pos += 1
+
+        iter.move(1)
 
     return result.build()
 
@@ -397,28 +397,29 @@
     if size == 0:
         return ''
     result = StringBuilder(size)
-    pos = 0
-    while pos < size:
-        od = utf8ord(p, pos)
+    iter = p.iter()
+    while not iter.finished():
+        od = iter.current()
 
         if od < limit:
             result.append(chr(od))
-            pos += 1
+            iter.move(1)
         else:
-            # startpos for collecting unencodable chars
-            collstart = pos
-            collend = pos+1
-            while collend < len(p) and utf8ord(p, collend) >= limit:
-                collend += 1
+            coll = iter.copy()
+            while not coll.finished() and coll.current() >= limit:
+                coll.move(1)
+            collstart = iter.pos()
+            collend = coll.pos()
+
             ru, rs, pos = errorhandler(errors, encoding, reason, p,
                                        collstart, collend)
+            iter.move(pos - iter.pos())
             if rs is not None:
                 # py3k only
                 result.append(rs)
                 continue
 
-            for ch in ru:
-                cd = utf8ord(ch, 0)
+            for cd in ru.codepoint_iter():
                 if cd < limit:
                     result.append(chr(cd))
                 else:
@@ -452,41 +453,48 @@
                                      allow_surrogates)
 
 def unicode_encode_utf_8_impl(s, size, errors, errorhandler, allow_surrogates):
-    iter = s.codepoint_iter()
-    for oc in iter:
+    iter = s.iter()
+
+    while not iter.finished():
+        oc = iter.current()
         if oc >= 0xD800 and oc <= 0xDFFF:
             break
-        if iter.pos == size:
-            return s.bytes
-    else:
+        iter.move(1)
+    if iter.finished():
         return s.bytes
 
     result = Utf8Builder(len(s.bytes))
-    result.append_slice(s.bytes, 0, iter.byte_pos)
+    result.append_slice(s.bytes, 0, iter.byte_pos())
 
-    iter.move(-1)
-    for oc in iter:
+    while not iter.finished():
+        oc = iter.current()
+        iter.move(1)
+
         if oc >= 0xD800 and oc <= 0xDFFF:
             # Check the next character to see if this is a surrogate pair
-            if (iter.pos != len(s) and oc <= 0xDBFF and
-                0xDC00 <= iter.peek_next() <= 0xDFFF):
-                oc2 = iter.next()
+            if (not iter.finished() and oc <= 0xDBFF and
+                0xDC00 <= iter.current() <= 0xDFFF):
+
+                oc2 = iter.current()
                 result.append_codepoint(
                         ((oc - 0xD800) << 10 | (oc2 - 0xDC00)) + 0x10000)
+                iter.move(1)
+
             elif allow_surrogates:
                 result.append_codepoint(oc)
             else:
                 ru, rs, pos = errorhandler(errors, 'utf8',
                                         'surrogates not allowed', s,
-                                        iter.pos-1, iter.pos)
-                iter.move(pos - iter.pos)
+                                        iter.pos()-2, iter.pos()-1)
+                iter.move(pos - iter.pos())
                 if rs is not None:
                     # py3k only
                     result.append_utf8(rs)
+                    iter.move(1)
                     continue
-                for ch in ru:
-                    if ord(ch) < 0x80:
-                        result.append_ascii(ch)
+                for ch in ru.codepoint_iter():
+                    if ch < 0x80:
+                        result.append_ascii(chr(ch))
                     else:
                         errorhandler('strict', 'utf8',
                                     'surrogates not allowed',
@@ -809,10 +817,10 @@
         _STORECHAR(result, 0xFEFF, BYTEORDER)
         byteorder = BYTEORDER
 
-    i = 0
-    while i < size:
-        ch = utf8ord(s, i)
-        i += 1
+    iter = s.iter()
+    while not iter.finished():
+        ch = iter.current()
+        iter.move(1)
         ch, ch2 = create_surrogate_pair(ch)
 
         _STORECHAR(result, ch, byteorder)
@@ -980,16 +988,16 @@
         _STORECHAR32(result, 0xFEFF, BYTEORDER)
         byteorder = BYTEORDER
 
-    i = 0
-    while i < size:
-        ch = utf8ord(s, i)
-        i += 1
+    iter = s.iter()
+    while not iter.finished():
+        ch = iter.current()
+        iter.move(1)
         ch2 = 0
         if MAXUNICODE < 65536 and 0xD800 <= ch <= 0xDBFF and i < size:
-            ch2 = ord(s[i])
+            ch2 = iter.current()
             if 0xDC00 <= ch2 <= 0xDFFF:
                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
-                i += 1
+                iter.move(1)
         _STORECHAR32(result, ch, byteorder)
 
     return result.build()
@@ -1228,10 +1236,9 @@
     base64bits = 0
     base64buffer = 0
 
-    # TODO: Looping like this is worse than O(n)
-    pos = 0
-    while pos < size:
-        oc = utf8ord(s, pos)
+    iter = s.iter()
+    while not iter.finished():
+        oc = iter.current()
         if not inShift:
             if oc == ord('+'):
                 result.append('+-')
@@ -1260,7 +1267,7 @@
             else:
                 base64bits, base64buffer = _utf7_ENCODE_CHAR(
                     result, oc, base64bits, base64buffer)
-        pos += 1
+        iter.move(1)
 
     if base64bits:
         result.append(_utf7_TO_BASE64(base64buffer << (6 - base64bits)))
@@ -1318,15 +1325,17 @@
     if size == 0:
         return ''
     result = StringBuilder(size)
-    pos = 0
-    while pos < size:
-        ch = s[pos]
+
+    iter = s.iter()
+    while not iter.finished():
+        ch = utf8chr(iter.current())
 
         c = mapping.get(ch, '')
         if len(c) == 0:
             ru, rs, pos = errorhandler(errors, "charmap",
                                        "character maps to <undefined>",
-                                       s, pos, pos + 1)
+                                       s, iter.pos(), iter.pos() + 1)
+            iter.move(pos - iter.pos())
             if rs is not None:
                 # py3k only
                 result.append(rs)
@@ -1337,11 +1346,11 @@
                     errorhandler(
                         "strict", "charmap",
                         "character maps to <undefined>",
-                        s,  pos, pos + 1)
+                        s, iter.pos(), iter.pos() + 1)
                 result.append(c2)
             continue
         result.append(c)
-        pos += 1
+        iter.move(1)
     return result.build()
 
 # }}}
@@ -1367,9 +1376,9 @@
         errorhandler = default_unicode_error_decode
 
     if BYTEORDER == 'little':
-        iorder = [0, 1, 2, 3]
+        iorder = (0, 1, 2, 3)
     else:
-        iorder = [3, 2, 1, 0]
+        iorder = (3, 2, 1, 0)
 
     if size == 0:
         return Utf8Str(''), 0
@@ -1542,30 +1551,35 @@
     if size == 0:
         return ''
     result = StringBuilder(size)
-    pos = 0
-    while pos < size:
-        ch = utf8ord(s, pos)
+
+    iter = s.iter()
+    while not iter.finished():
+        ch = iter.current()
+
         if unicodedb.isspace(ch):
             result.append(' ')
-            pos += 1
+            iter.move(1)
             continue
+
         try:
             decimal = unicodedb.decimal(ch)
         except KeyError:
             pass
         else:
             result.append(chr(48 + decimal))
-            pos += 1
+            iter.move(1)
             continue
+
         if 0 < ch < 256:
             result.append(chr(ch))
-            pos += 1
+            iter.move(1)
             continue
+
         # All other characters are considered unencodable
-        collstart = pos
-        collend = collstart + 1
-        while collend < size:
-            ch = utf8ord(s, collend)
+        colliter = iter.copy()
+        colliter.move(1)
+        while not colliter.finished():
+            ch = colliter.current()
             try:
                 if (0 < ch < 256 or
                     unicodedb.isspace(ch) or
@@ -1574,15 +1588,19 @@
             except KeyError:
                 # not a decimal
                 pass
-            collend += 1
+            colliter.move(1)
+
+        collstart = iter.pos()
+        collend = colliter.pos()
+
         msg = "invalid decimal Unicode string"
         ru, rs, pos = errorhandler(errors, 'decimal',
                                    msg, s, collstart, collend)
+        iter.move(pos - iter.pos())
         if rs is not None:
             # py3k only
             errorhandler('strict', 'decimal', msg, s, collstart, collend)
-        for i in range(len(ru)):
-            ch = utf8.ORD(ru, i)
+        for ch in ru.codepoint_iter():
             if unicodedb.isspace(ch):
                 result.append(' ')
                 continue