[pypy-commit] pypy unicode-utf8: first attempt at find

Sat Oct 7 12:26:55 EDT 2017

Author: fijal
Branch: unicode-utf8
Changeset: r92638:3ff5e711f1a0
Date: 2017-10-07 18:26 +0200
http://bitbucket.org/pypy/pypy/changeset/3ff5e711f1a0/

Log:	first attempt at find

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -117,6 +117,7 @@
     def _new_from_list(self, value):
         u = u''.join(value)
         return W_UnicodeObject(u.encode('utf8'), len(u))
+
     def _empty(self):
         return W_UnicodeObject.EMPTY
 
@@ -412,6 +413,23 @@
                             "character mapping must be in range(0x110000)")
         return W_UnicodeObject(result.build(), result_length)
 
+    def descr_find(self, space, w_sub, w_start=None, w_end=None):
+        value, start, end, ofs = self._convert_idx_params(space, w_start, w_end)
+
+        w_sub = self.convert_arg_to_w_unicode(space, w_sub)
+        # XXX for now just create index
+        storage = self._get_index_storage()
+        start_index = rutf8.codepoint_position_at_index(self._utf8, storage,
+                                                        start)
+        end_index = rutf8.codepoint_position_at_index(self._utf8, storage, end)
+
+        res_index = value.find(w_sub._utf8, start_index, end_index)
+        if res_index == -1:
+            return space.newint(-1)
+
+        res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+        return space.newint(res)
+
     def descr_encode(self, space, w_encoding=None, w_errors=None):
         encoding, errors = _get_encoding_and_errors(space, w_encoding,
                                                     w_errors)
@@ -732,14 +750,17 @@
 
     descr_rmul = descr_mul
 
+    def _get_index_storage(self):
+        if self._index_storage == rutf8.null_storage():
+            self._index_storage = rutf8.create_utf8_index_storage(self._utf8,
+                self._length)
+        return self._index_storage
+
     def _getitem_result(self, space, index):
         if index >= self._length:
             raise oefmt(space.w_IndexError, "string index out of range")
-        if self._index_storage == rutf8.null_storage():
-            self._index_storage = rutf8.create_utf8_index_storage(self._utf8,
-                self._length)
-        start = rutf8.codepoint_position_at_index(self._utf8,
-            self._index_storage, index)
+        storage = self._get_index_storage()
+        start = rutf8.codepoint_position_at_index(self._utf8, storage, index)
         end = rutf8.next_codepoint_pos(self._utf8, start)
         return W_UnicodeObject(self._utf8[start:end], 1)
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -290,16 +290,18 @@
 
 
 #@jit.elidable
-def check_utf8(s, allow_surrogates=False):
+def check_utf8(s, allow_surrogates=False, force_len=-1):
     """Check that 's' is a utf-8-encoded byte string.
     Returns the length (number of chars) or raise CheckError.
     Note that surrogates are not handled specially here.
     """
-    import pdb
-    pdb.set_trace()
     pos = 0
     continuation_bytes = 0
-    while pos < len(s):
+    if force_len == -1:
+        end = len(s)
+    else:
+        end = force_len
+    while pos < end:
         ordch1 = ord(s[pos])
         pos += 1
         # fast path for ASCII
@@ -310,7 +312,7 @@
             raise CheckError(pos - 1)
 
         if ordch1 <= 0xDF:
-            if pos >= len(s):
+            if pos >= end:
                 raise CheckError(pos - 1)
             ordch2 = ord(s[pos])
             pos += 1
@@ -322,7 +324,7 @@
             continue
 
         if ordch1 <= 0xEF:
-            if (pos + 2) > len(s):
+            if (pos + 2) > end:
                 raise CheckError(pos - 1)
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
@@ -336,7 +338,7 @@
             continue
 
         if ordch1 <= 0xF4:
-            if (pos + 3) > len(s):
+            if (pos + 3) > end:
                 raise CheckError(pos - 1)
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
@@ -353,7 +355,7 @@
 
         raise CheckError(pos - 1)
 
-    assert pos == len(s)
+    assert pos == end
     return pos - continuation_bytes
 
 @jit.elidable