[pypy-commit] pypy py3k: Handle the special case of \u03A3 in lower()
arigo
pypy.commits at gmail.com
Wed Jul 27 12:17:15 EDT 2016
Author: Armin Rigo <arigo at tunes.org>
Branch: py3k
Changeset: r85878:904955c86e02
Date: 2016-07-25 11:15 +0200
http://bitbucket.org/pypy/pypy/changeset/904955c86e02/
Log: Handle the special case of \u03A3 in lower()
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -152,7 +152,7 @@
builder = self._builder(len(value))
builder.append(self._upper(value[0]))
for i in range(1, len(value)):
- builder.append(self._lower(value[i]))
+ builder.append(self._lower_in_str(value, i))
return self._new(builder.build())
@unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
@@ -452,9 +452,13 @@
value = self._val(space)
builder = self._builder(len(value))
for i in range(len(value)):
- builder.append(self._lower(value[i]))
+ builder.append(self._lower_in_str(value, i))
return self._new(builder.build())
+ def _lower_in_str(self, value, i):
+ # overridden in unicodeobject.py
+ return self._lower(value[i])
+
def descr_partition(self, space, w_sub):
from pypy.objspace.std.bytearrayobject import W_BytearrayObject
value = self._val(space)
@@ -699,7 +703,7 @@
for i in range(len(selfvalue)):
ch = selfvalue[i]
if self._isupper(ch):
- builder.append(self._lower(ch))
+ builder.append(self._lower_in_str(selfvalue, i))
elif self._islower(ch):
builder.append(self._upper(ch))
else:
@@ -716,11 +720,12 @@
def title(self, value):
builder = self._builder(len(value))
previous_is_cased = False
- for ch in value:
+ for i in range(len(value)):
+ ch = value[i]
if not previous_is_cased:
builder.append(self._title(ch))
else:
- builder.append(self._lower(ch))
+ builder.append(self._lower_in_str(value, i))
previous_is_cased = self._iscased(ch)
return builder.build()
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -978,10 +978,20 @@
raises(TypeError, 'u"".encode("utf-8", None)')
def test_casefold(self):
- assert 'hello'.casefold() == 'hello'
- assert 'hELlo'.casefold() == 'hello'
- assert 'ß'.casefold() == 'ss'
- assert 'ﬁ'.casefold() == 'fi'
- assert '\u03a3'.casefold() == '\u03c3'
- assert 'A\u0345\u03a3'.casefold() == 'a\u03b9\u03c3'
- assert '\u00b5'.casefold() == '\u03bc'
+ assert u'hello'.casefold() == u'hello'
+ assert u'hELlo'.casefold() == u'hello'
+ assert u'ß'.casefold() == u'ss'
+ assert u'ﬁ'.casefold() == u'fi'
+ assert u'\u03a3'.casefold() == u'\u03c3'
+ assert u'A\u0345\u03a3'.casefold() == u'a\u03b9\u03c3'
+ assert u'\u00b5'.casefold() == u'\u03bc'
+
+ def test_lower_3a3(self):
+ # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
+ assert u'\u03a3'.lower() == u'\u03c3'
+ assert u'\u0345\u03a3'.lower() == u'\u0345\u03c3'
+ assert u'A\u0345\u03a3'.lower() == u'a\u0345\u03c2'
+ assert u'A\u0345\u03a3a'.lower() == u'a\u0345\u03c3a'
+ assert u'A\u0345\u03a3'.lower() == u'a\u0345\u03c2'
+ assert u'A\u03a3\u0345'.lower() == u'a\u03c2\u0345'
+ assert u'\u03a3\u0345 '.lower() == u'\u03c3\u0345 '
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -173,7 +173,11 @@
return u''.join([unichr(x) for x in
unicodedb.toupper_full(ord(ch))])
- def _lower(self, ch):
+ def _lower_in_str(self, value, i):
+ ch = value[i]
+ if ord(ch) == 0x3A3:
+ # Obscure special case.
+ return self._handle_capital_sigma(value, i)
return u''.join([unichr(x) for x in
unicodedb.tolower_full(ord(ch))])
@@ -181,6 +185,31 @@
return u''.join([unichr(x) for x in
unicodedb.totitle_full(ord(ch))])
+ def _handle_capital_sigma(self, value, i):
+ # U+03A3 is in the Final_Sigma context when it is found like this:
+ #\p{cased} \p{case-ignorable}* U+03A3 not(\p{case-ignorable}* \p{cased})
+ # where \p{xxx} is a character with property xxx.
+ j = i - 1
+ while j >= 0:
+ ch = value[j]
+ if not unicodedb.iscaseignorable(ord(ch)):
+ break
+ j -= 1
+ final_sigma = j >= 0 and unicodedb.iscased(ord(ch))
+ if final_sigma:
+ j = i + 1
+ length = len(value)
+ while j < length:
+ ch = value[j]
+ if not unicodedb.iscaseignorable(ord(ch)):
+ break
+ j += 1
+ final_sigma = j == length or not unicodedb.iscased(ord(ch))
+ if final_sigma:
+ return unichr(0x3C2)
+ else:
+ return unichr(0x3C3)
+
def _newlist_unwrapped(self, space, lst):
return space.newlist_unicode(lst)
More information about the pypy-commit
mailing list