[pypy-commit] pypy unicode-utf8: implement strip - fails on narrow unicode builds
fijal
pypy.commits at gmail.com
Sat Mar 4 22:59:30 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r90549:bd07f3fee9c7
Date: 2017-03-04 23:57 +0100
http://bitbucket.org/pypy/pypy/changeset/bd07f3fee9c7/
Log: implement strip - fails on narrow unicode builds
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -125,6 +125,8 @@
return rutf8.compute_length_utf8(self._utf8)
def _val(self, space):
+ #import pdb
+ #pdb.set_trace()
return self._utf8.decode('utf8')
@staticmethod
@@ -511,7 +513,8 @@
if keepends:
eol = pos
lgt += 1
- strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
+ # XXX find out why lgt calculation is off
+ strs_w.append(W_UnicodeObject(value[sol:eol], -1))
return space.newlist(strs_w)
@unwrap_spec(width=int)
@@ -666,24 +669,58 @@
return W_UnicodeObject(value, self._len())
+ def _utf8_sliced(self, start, stop, lgt):
+ assert start >= 0
+ assert stop >= 0
+ #if start == 0 and stop == len(s) and space.is_w(space.type(orig_obj),
+ # space.w_bytes):
+ # return orig_obj
+ return W_UnicodeObject(self._utf8[start:stop], lgt)
+
def _strip_none(self, space, left, right):
"internal function called by str_xstrip methods"
value = self._utf8
lpos = 0
- rpos = self._len()
+ rpos = len(value)
+ lgt = self._len()
if left:
- while lpos < rpos and self._isspace(value[lpos]):
- lpos += 1
+ while lpos < rpos and rutf8.isspace(value, lpos):
+ lpos = rutf8.next_codepoint_pos(value, lpos)
+ lgt -= 1
if right:
- while rpos > lpos and self._isspace(value[rpos - 1]):
- rpos -= 1
+ while rpos > lpos and rutf8.isspace(value,
+ rutf8.prev_codepoint_pos(value, rpos)):
+ rpos = rutf8.prev_codepoint_pos(value, rpos)
+ lgt -= 1
assert rpos >= lpos # annotator hint, don't remove
- return self._sliced(space, value, lpos, rpos, self)
+ return self._utf8_sliced(lpos, rpos, lgt)
+ def _strip(self, space, w_chars, left, right, name='strip'):
+ "internal function called by str_xstrip methods"
+ value = self._utf8
+ chars = self.convert_arg_to_w_unicode(space, w_chars, strict=name)._utf8
+
+ lpos = 0
+ rpos = len(value)
+ lgt = self._len()
+
+ if left:
+ while lpos < rpos and rutf8.utf8_in_chars(value, lpos, chars):
+ lpos = rutf8.next_codepoint_pos(value, lpos)
+ lgt -= 1
+
+ if right:
+ while rpos > lpos and rutf8.utf8_in_chars(value,
+ rutf8.prev_codepoint_pos(value, rpos), chars):
+ rpos = rutf8.prev_codepoint_pos(value, rpos)
+ lgt -= 1
+
+ assert rpos >= lpos # annotator hint, don't remove
+ return self._utf8_sliced(lpos, rpos, lgt)
def descr_getnewargs(self, space):
return space.newtuple([W_UnicodeObject(self._utf8, self._length)])
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -217,6 +217,24 @@
return True
return False
+def utf8_in_chars(value, pos, chars):
+ """ equivalent of u'x' in u'xyz', just done in utf8
+ """
+ lgt = next_codepoint_pos(value, pos) - pos
+ i = 0
+ while i < len(chars):
+ j = next_codepoint_pos(chars, i)
+ if j - i != lgt:
+ i = j
+ continue
+ for k in range(lgt):
+ if value[k + pos] != chars[i + k]:
+ break
+ else:
+ return True
+ i = j
+ return False
+
class Utf8CheckError(Exception):
def __init__(self, msg, startpos, endpos):
self.msg = msg
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -70,3 +70,15 @@
assert rutf8.isspace(unichr(i).encode('utf8'), 0)
else:
assert not rutf8.isspace(unichr(i).encode('utf8'), 0)
+
+@given(strategies.integers(min_value=0, max_value=sys.maxunicode),
+ strategies.characters())
+def test_utf8_in_chars(i, uni):
+ if not uni:
+ return
+ if unichr(i) in uni:
+ response = True
+ else:
+ response = False
+ r = rutf8.utf8_in_chars(unichr(i).encode('utf8'), 0, uni.encode('utf8'))
+ assert r == response
More information about the pypy-commit
mailing list