[pypy-commit] pypy unicode-utf8: implement strip - fails on narrow unicode builds

Sat Mar 4 22:59:30 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90549:bd07f3fee9c7
Date: 2017-03-04 23:57 +0100
http://bitbucket.org/pypy/pypy/changeset/bd07f3fee9c7/

Log:	implement strip - fails on narrow unicode builds

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -125,6 +125,8 @@
         return rutf8.compute_length_utf8(self._utf8)
 
     def _val(self, space):
+        #import pdb
+        #pdb.set_trace()
         return self._utf8.decode('utf8')
 
     @staticmethod
@@ -511,7 +513,8 @@
             if keepends:
                 eol = pos
                 lgt += 1
-            strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
+            # XXX find out why lgt calculation is off
+            strs_w.append(W_UnicodeObject(value[sol:eol], -1))
         return space.newlist(strs_w)
 
     @unwrap_spec(width=int)
@@ -666,24 +669,58 @@
 
         return W_UnicodeObject(value, self._len())
 
+    def _utf8_sliced(self, start, stop, lgt):
+        assert start >= 0  # start and stop are used as slice bounds on self._utf8 below
+        assert stop >= 0
+        #if start == 0 and stop == len(s) and space.is_w(space.type(orig_obj),
+        #                                                space.w_bytes):
+        #    return orig_obj
+        return W_UnicodeObject(self._utf8[start:stop], lgt)  # lgt is forwarded unchanged as the second constructor argument
+
     def _strip_none(self, space, left, right):
         "internal function called by str_xstrip methods"
         value = self._utf8
 
         lpos = 0
-        rpos = self._len()
+        rpos = len(value)
+        lgt = self._len()
 
         if left:
-            while lpos < rpos and self._isspace(value[lpos]):
-                lpos += 1
+            while lpos < rpos and rutf8.isspace(value, lpos):
+                lpos = rutf8.next_codepoint_pos(value, lpos)
+                lgt -= 1
 
         if right:
-            while rpos > lpos and self._isspace(value[rpos - 1]):
-                rpos -= 1
+            while rpos > lpos and rutf8.isspace(value,
+                                         rutf8.prev_codepoint_pos(value, rpos)):
+                rpos = rutf8.prev_codepoint_pos(value, rpos)
+                lgt -= 1
 
         assert rpos >= lpos    # annotator hint, don't remove
-        return self._sliced(space, value, lpos, rpos, self)
+        return self._utf8_sliced(lpos, rpos, lgt)
 
+    def _strip(self, space, w_chars, left, right, name='strip'):
+        "internal function called by str_xstrip methods"
+        value = self._utf8
+        chars = self.convert_arg_to_w_unicode(space, w_chars, strict=name)._utf8
+
+        lpos = 0  # start offset into value of the part that is kept
+        rpos = len(value)  # end offset (exclusive) into value of the part that is kept
+        lgt = self._len()  # decremented once for every stripped codepoint
+
+        if left:  # strip leading codepoints that occur in chars
+            while lpos < rpos and rutf8.utf8_in_chars(value, lpos, chars):
+                lpos = rutf8.next_codepoint_pos(value, lpos)
+                lgt -= 1
+
+        if right:  # strip trailing codepoints that occur in chars
+            while rpos > lpos and rutf8.utf8_in_chars(value,
+                    rutf8.prev_codepoint_pos(value, rpos), chars):
+                rpos = rutf8.prev_codepoint_pos(value, rpos)  # NOTE(review): same lookup as in the loop condition, computed a second time
+                lgt -= 1
+
+        assert rpos >= lpos    # annotator hint, don't remove
+        return self._utf8_sliced(lpos, rpos, lgt)  # value[lpos:rpos] together with the adjusted lgt
 
     def descr_getnewargs(self, space):
         return space.newtuple([W_UnicodeObject(self._utf8, self._length)])
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -217,6 +217,24 @@
         return True
     return False
 
+def utf8_in_chars(value, pos, chars):
+    """ equivalent of u'x' in u'xyz', just done in utf8
+    """
+    lgt = next_codepoint_pos(value, pos) - pos  # encoded width of the codepoint starting at pos
+    i = 0  # offset of the current candidate codepoint in chars
+    while i < len(chars):
+        j = next_codepoint_pos(chars, i)  # offset just past the current candidate
+        if j - i != lgt:  # widths differ, so this candidate cannot match
+            i = j
+            continue
+        for k in range(lgt):  # same width: compare element by element
+            if value[k + pos] != chars[i + k]:
+                break
+        else:  # loop ended without break: every element matched
+            return True
+        i = j
+    return False  # no codepoint in chars matched
+
 class Utf8CheckError(Exception):
     def __init__(self, msg, startpos, endpos):
         self.msg = msg
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -70,3 +70,15 @@
             assert rutf8.isspace(unichr(i).encode('utf8'), 0)
         else:
             assert not rutf8.isspace(unichr(i).encode('utf8'), 0)
+
+@given(strategies.integers(min_value=0, max_value=sys.maxunicode),
+       strategies.characters())
+def test_utf8_in_chars(i, uni):
+    if not uni:
+        return
+    if unichr(i) in uni:
+        response = True
+    else:
+        response = False
+    r = rutf8.utf8_in_chars(unichr(i).encode('utf8'), 0, uni.encode('utf8'))
+    assert r == response