[pypy-commit] pypy unicode-utf8: capitalize and {starts, ends}with.

Sat Oct 7 10:29:24 EDT 2017

Author: Jeremy Thurgood <firxen at gmail.com>
Branch: unicode-utf8
Changeset: r92634:2d6fe4fc14a3
Date: 2017-10-07 16:28 +0200
http://bitbucket.org/pypy/pypy/changeset/2d6fe4fc14a3/

Log:	capitalize and {starts,ends}with.

diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -632,16 +632,13 @@
                 return space.w_True
         return space.w_False
 
+    # This is overridden in unicodeobject, but the two above are not.
     def _startswith(self, space, value, w_prefix, start, end):
         prefix = self._op_val(space, w_prefix)
         if start > len(value):
-            return self._starts_ends_overflow(prefix)
+            return False
         return startswith(value, prefix, start, end)
 
-    def _starts_ends_overflow(self, prefix):
-        return False     # bug-to-bug compat: this is for strings and
-                         # bytearrays, but overridden for unicodes
-
     def descr_endswith(self, space, w_suffix, w_start=None, w_end=None):
         value, start, end, _ = self._convert_idx_params(space, w_start, w_end)
         if space.isinstance_w(w_suffix, space.w_tuple):
@@ -655,10 +652,11 @@
                 return space.w_True
         return space.w_False
 
+    # This is overridden in unicodeobject, but the two above are not.
     def _endswith(self, space, value, w_prefix, start, end):
         prefix = self._op_val(space, w_prefix)
         if start > len(value):
-            return self._starts_ends_overflow(prefix)
+            return False
         return endswith(value, prefix, start, end)
 
     def _strip(self, space, w_chars, left, right, name='strip'):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,8 +6,9 @@
 from rpython.rlib.buffer import StringBuffer
 from rpython.rlib.mutbuffer import MutableStringBuffer
 from rpython.rlib.rarithmetic import ovfcheck
-from rpython.rlib.rstring import StringBuilder, split, rsplit, UnicodeBuilder,\
-     replace_count
+from rpython.rlib.rstring import (
+    StringBuilder, split, rsplit, UnicodeBuilder, replace_count, startswith,
+    endswith)
 from rpython.rlib.runicode import make_unicode_escape_function
 from rpython.rlib import rutf8, jit
 
@@ -139,6 +140,10 @@
         return True
 
     @staticmethod
+    def _op_utf8(space, w_other, strict=None):
+        return W_UnicodeObject.convert_arg_to_w_unicode(space, w_other, strict)._utf8
+
+    @staticmethod
     def _op_val(space, w_other, strict=None):
         return W_UnicodeObject.convert_arg_to_w_unicode(space, w_other, strict)._utf8.decode('utf8')
 
@@ -520,8 +525,17 @@
             i = rutf8.next_codepoint_pos(val, i)
         return space.newbool(cased)
 
-    def _starts_ends_overflow(self, prefix):
-        return len(prefix) == 0
+    def _startswith(self, space, value, w_prefix, start, end):
+        prefix = self._op_utf8(space, w_prefix)
+        if start > len(value):
+            return len(prefix) == 0
+        return startswith(value, prefix, start, end)
+
+    def _endswith(self, space, value, w_prefix, start, end):
+        prefix = self._op_utf8(space, w_prefix)
+        if start > len(value):
+            return len(prefix) == 0
+        return endswith(value, prefix, start, end)
 
     def descr_add(self, space, w_other):
         try:
@@ -644,6 +658,21 @@
 
         return space.newlist_utf8(res)
 
+    def descr_capitalize(self, space):
+        value = self._utf8
+        if len(value) == 0:
+            return self._empty()
+
+        builder = StringBuilder(len(value))
+        uchar = rutf8.codepoint_at_pos(value, 0)
+        i = rutf8.next_codepoint_pos(value, 0)
+        rutf8.unichr_as_utf8_append(builder, unicodedb.toupper(uchar))
+        while i < len(value):
+            uchar = rutf8.codepoint_at_pos(value, i)
+            i = rutf8.next_codepoint_pos(value, i)
+            rutf8.unichr_as_utf8_append(builder, unicodedb.tolower(uchar))
+        return W_UnicodeObject(builder.build(), self._len())
+
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
     def descr_center(self, space, width, w_fillchar):
         value = self._utf8