[pypy-commit] pypy py3k: Handle the special case of \u03A3 in lower()

arigo pypy.commits at gmail.com
Wed Jul 27 12:17:15 EDT 2016


Author: Armin Rigo <arigo at tunes.org>
Branch: py3k
Changeset: r85878:904955c86e02
Date: 2016-07-25 11:15 +0200
http://bitbucket.org/pypy/pypy/changeset/904955c86e02/

Log:	Handle the special case of \u03A3 in lower()

diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -152,7 +152,7 @@
         builder = self._builder(len(value))
         builder.append(self._upper(value[0]))
         for i in range(1, len(value)):
-            builder.append(self._lower(value[i]))
+            builder.append(self._lower_in_str(value, i))
         return self._new(builder.build())
 
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
@@ -452,9 +452,13 @@
         value = self._val(space)
         builder = self._builder(len(value))
         for i in range(len(value)):
-            builder.append(self._lower(value[i]))
+            builder.append(self._lower_in_str(value, i))
         return self._new(builder.build())
 
+    def _lower_in_str(self, value, i):
+        # overridden in unicodeobject.py
+        return self._lower(value[i])
+
     def descr_partition(self, space, w_sub):
         from pypy.objspace.std.bytearrayobject import W_BytearrayObject
         value = self._val(space)
@@ -699,7 +703,7 @@
         for i in range(len(selfvalue)):
             ch = selfvalue[i]
             if self._isupper(ch):
-                builder.append(self._lower(ch))
+                builder.append(self._lower_in_str(selfvalue, i))
             elif self._islower(ch):
                 builder.append(self._upper(ch))
             else:
@@ -716,11 +720,12 @@
     def title(self, value):
         builder = self._builder(len(value))
         previous_is_cased = False
-        for ch in value:
+        for i in range(len(value)):
+            ch = value[i]
             if not previous_is_cased:
                 builder.append(self._title(ch))
             else:
-                builder.append(self._lower(ch))
+                builder.append(self._lower_in_str(value, i))
             previous_is_cased = self._iscased(ch)
         return builder.build()
 
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -978,10 +978,20 @@
         raises(TypeError, 'u"".encode("utf-8", None)')
 
     def test_casefold(self):
-        assert 'hello'.casefold() == 'hello'
-        assert 'hELlo'.casefold() == 'hello'
-        assert 'ß'.casefold() == 'ss'
-        assert 'fi'.casefold() == 'fi'
-        assert '\u03a3'.casefold() == '\u03c3'
-        assert 'A\u0345\u03a3'.casefold() == 'a\u03b9\u03c3'
-        assert '\u00b5'.casefold() == '\u03bc'
+        assert u'hello'.casefold() == u'hello'
+        assert u'hELlo'.casefold() == u'hello'
+        assert u'ß'.casefold() == u'ss'
+        assert u'fi'.casefold() == u'fi'
+        assert u'\u03a3'.casefold() == u'\u03c3'
+        assert u'A\u0345\u03a3'.casefold() == u'a\u03b9\u03c3'
+        assert u'\u00b5'.casefold() == u'\u03bc'
+
+    def test_lower_3a3(self):
+        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
+        assert u'\u03a3'.lower() == u'\u03c3'
+        assert u'\u0345\u03a3'.lower() == u'\u0345\u03c3'
+        assert u'A\u0345\u03a3'.lower() == u'a\u0345\u03c2'
+        assert u'A\u0345\u03a3a'.lower() == u'a\u0345\u03c3a'
+        assert u'A\u0345\u03a3'.lower() == u'a\u0345\u03c2'
+        assert u'A\u03a3\u0345'.lower() == u'a\u03c2\u0345'
+        assert u'\u03a3\u0345 '.lower() == u'\u03c3\u0345 '
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -173,7 +173,11 @@
         return u''.join([unichr(x) for x in
                          unicodedb.toupper_full(ord(ch))])
 
-    def _lower(self, ch):
+    def _lower_in_str(self, value, i):
+        ch = value[i]
+        if ord(ch) == 0x3A3:
+            # Obscure special case.
+            return self._handle_capital_sigma(value, i)
         return u''.join([unichr(x) for x in
                          unicodedb.tolower_full(ord(ch))])
 
@@ -181,6 +185,31 @@
         return u''.join([unichr(x) for x in
                          unicodedb.totitle_full(ord(ch))])
 
+    def _handle_capital_sigma(self, value, i):
+        # U+03A3 is in the Final_Sigma context when it is found like this:
+        #   \p{cased} \p{case-ignorable}* U+03A3 not(\p{case-ignorable}* \p{cased})
+        # where \p{xxx} is a character with property xxx.
+        j = i - 1
+        while j >= 0:
+            ch = value[j]
+            if not unicodedb.iscaseignorable(ord(ch)):
+                break
+            j -= 1
+        final_sigma = j >= 0 and unicodedb.iscased(ord(ch))
+        if final_sigma:
+            j = i + 1
+            length = len(value)
+            while j < length:
+                ch = value[j]
+                if not unicodedb.iscaseignorable(ord(ch)):
+                    break
+                j += 1
+            final_sigma = j == length or not unicodedb.iscased(ord(ch))
+        if final_sigma:
+            return unichr(0x3C2)
+        else:
+            return unichr(0x3C3)
+
     def _newlist_unwrapped(self, space, lst):
         return space.newlist_unicode(lst)
 


More information about the pypy-commit mailing list