[pypy-commit] pypy unicode-utf8: first attempt at find
fijal
pypy.commits at gmail.com
Sat Oct 7 12:26:55 EDT 2017
Author: fijal
Branch: unicode-utf8
Changeset: r92638:3ff5e711f1a0
Date: 2017-10-07 18:26 +0200
http://bitbucket.org/pypy/pypy/changeset/3ff5e711f1a0/
Log: first attempt at find
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -117,6 +117,7 @@
def _new_from_list(self, value):
u = u''.join(value)
return W_UnicodeObject(u.encode('utf8'), len(u))
+
def _empty(self):
return W_UnicodeObject.EMPTY
@@ -412,6 +413,23 @@
"character mapping must be in range(0x110000)")
return W_UnicodeObject(result.build(), result_length)
+ def descr_find(self, space, w_sub, w_start=None, w_end=None):
+ value, start, end, ofs = self._convert_idx_params(space, w_start, w_end)
+
+ w_sub = self.convert_arg_to_w_unicode(space, w_sub)
+ # XXX for now just create index
+ storage = self._get_index_storage()
+ start_index = rutf8.codepoint_position_at_index(self._utf8, storage,
+ start)
+ end_index = rutf8.codepoint_position_at_index(self._utf8, storage, end)
+
+ res_index = value.find(w_sub._utf8, start_index, end_index)
+ if res_index == -1:
+ return space.newint(-1)
+
+ res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+ return space.newint(res)
+
def descr_encode(self, space, w_encoding=None, w_errors=None):
encoding, errors = _get_encoding_and_errors(space, w_encoding,
w_errors)
@@ -732,14 +750,17 @@
descr_rmul = descr_mul
+ def _get_index_storage(self):
+ if self._index_storage == rutf8.null_storage():
+ self._index_storage = rutf8.create_utf8_index_storage(self._utf8,
+ self._length)
+ return self._index_storage
+
def _getitem_result(self, space, index):
if index >= self._length:
raise oefmt(space.w_IndexError, "string index out of range")
- if self._index_storage == rutf8.null_storage():
- self._index_storage = rutf8.create_utf8_index_storage(self._utf8,
- self._length)
- start = rutf8.codepoint_position_at_index(self._utf8,
- self._index_storage, index)
+ storage = self._get_index_storage()
+ start = rutf8.codepoint_position_at_index(self._utf8, storage, index)
end = rutf8.next_codepoint_pos(self._utf8, start)
return W_UnicodeObject(self._utf8[start:end], 1)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -290,16 +290,18 @@
#@jit.elidable
-def check_utf8(s, allow_surrogates=False):
+def check_utf8(s, allow_surrogates=False, force_len=-1):
"""Check that 's' is a utf-8-encoded byte string.
Returns the length (number of chars) or raise CheckError.
Note that surrogates are not handled specially here.
"""
- import pdb
- pdb.set_trace()
pos = 0
continuation_bytes = 0
- while pos < len(s):
+ if force_len == -1:
+ end = len(s)
+ else:
+ end = force_len
+ while pos < end:
ordch1 = ord(s[pos])
pos += 1
# fast path for ASCII
@@ -310,7 +312,7 @@
raise CheckError(pos - 1)
if ordch1 <= 0xDF:
- if pos >= len(s):
+ if pos >= end:
raise CheckError(pos - 1)
ordch2 = ord(s[pos])
pos += 1
@@ -322,7 +324,7 @@
continue
if ordch1 <= 0xEF:
- if (pos + 2) > len(s):
+ if (pos + 2) > end:
raise CheckError(pos - 1)
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
@@ -336,7 +338,7 @@
continue
if ordch1 <= 0xF4:
- if (pos + 3) > len(s):
+ if (pos + 3) > end:
raise CheckError(pos - 1)
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
@@ -353,7 +355,7 @@
raise CheckError(pos - 1)
- assert pos == len(s)
+ assert pos == end
return pos - continuation_bytes
@jit.elidable
More information about the pypy-commit mailing list