[Python-checkins] [3.10] gh-78453: Move Unicode C API tests from test_unicode to test_capi.test_unicode (GH-99431). (GH-99617)

serhiy-storchaka webhook-mailer at python.org
Sun Nov 20 05:45:56 EST 2022

commit: 42b7b2179e27f6df110a1b528fc948784e497265
branch: 3.10
author: Serhiy Storchaka <storchaka at gmail.com>
committer: serhiy-storchaka <storchaka at gmail.com>
date: 2022-11-20T12:45:50+02:00

[3.10] gh-78453: Move Unicode C API tests from test_unicode to test_capi.test_unicode (GH-99431). (GH-99617)

(cherry picked from commit 06d4e02c3b3526b5d90e41a0a0befa8663e08f27)

Co-authored-by: Serhiy Storchaka <storchaka at gmail.com>

A Lib/test/test_capi/test_unicode.py
M Lib/test/test_unicode.py

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
new file mode 100644
index 000000000000..8c2ada8e5a58
--- /dev/null
+++ b/Lib/test/test_capi/test_unicode.py
@@ -0,0 +1,502 @@
+import unittest
+import sys
+import warnings
+from test import support
+from test.support import import_helper
+from test.support import warnings_helper
+try:
+    import _testcapi
+except ImportError:
+    _testcapi = None
+class CAPITest(unittest.TestCase):
+    # Test PyUnicode_FromFormat()
+    def test_from_format(self):
+        import_helper.import_module('ctypes')
+        from ctypes import (
+            c_char_p,
+            pythonapi, py_object, sizeof,
+            c_int, c_long, c_longlong, c_ssize_t,
+            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
+        name = "PyUnicode_FromFormat"
+        _PyUnicode_FromFormat = getattr(pythonapi, name)
+        _PyUnicode_FromFormat.argtypes = (c_char_p,)
+        _PyUnicode_FromFormat.restype = py_object
+        def PyUnicode_FromFormat(format, *args):
+            cargs = tuple(
+                py_object(arg) if isinstance(arg, str) else arg
+                for arg in args)
+            return _PyUnicode_FromFormat(format, *cargs)
+        def check_format(expected, format, *args):
+            text = PyUnicode_FromFormat(format, *args)
+            self.assertEqual(expected, text)
+        # ascii format, non-ascii argument
+        check_format('ascii\x7f=unicode\xe9',
+                     b'ascii\x7f=%U', 'unicode\xe9')
+        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
+        # raises an error
+        self.assertRaisesRegex(ValueError,
+            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
+            'string, got a non-ASCII byte: 0xe9$',
+            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
+        # test "%c"
+        check_format('\uabcd',
+                     b'%c', c_int(0xabcd))
+        check_format('\U0010ffff',
+                     b'%c', c_int(0x10ffff))
+        with self.assertRaises(OverflowError):
+            PyUnicode_FromFormat(b'%c', c_int(0x110000))
+        # Issue #18183
+        check_format('\U00010000\U00100000',
+                     b'%c%c', c_int(0x10000), c_int(0x100000))
+        # test "%"
+        check_format('%',
+                     b'%')
+        check_format('%',
+                     b'%%')
+        check_format('%s',
+                     b'%%s')
+        check_format('[%]',
+                     b'[%%]')
+        check_format('%abc',
+                     b'%%%s', b'abc')
+        # truncated string
+        check_format('abc',
+                     b'%.3s', b'abcdef')
+        check_format('abc[\ufffd',
+                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
+        check_format("'\\u20acABC'",
+                     b'%A', '\u20acABC')
+        check_format("'\\u20",
+                     b'%.5A', '\u20acABCDEF')
+        check_format("'\u20acABC'",
+                     b'%R', '\u20acABC')
+        check_format("'\u20acA",
+                     b'%.3R', '\u20acABCDEF')
+        check_format('\u20acAB',
+                     b'%.3S', '\u20acABCDEF')
+        check_format('\u20acAB',
+                     b'%.3U', '\u20acABCDEF')
+        check_format('\u20acAB',
+                     b'%.3V', '\u20acABCDEF', None)
+        check_format('abc[\ufffd',
+                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
+        # following tests comes from #7330
+        # test width modifier and precision modifier with %S
+        check_format("repr=  abc",
+                     b'repr=%5S', 'abc')
+        check_format("repr=ab",
+                     b'repr=%.2S', 'abc')
+        check_format("repr=   ab",
+                     b'repr=%5.2S', 'abc')
+        # test width modifier and precision modifier with %R
+        check_format("repr=   'abc'",
+                     b'repr=%8R', 'abc')
+        check_format("repr='ab",
+                     b'repr=%.3R', 'abc')
+        check_format("repr=  'ab",
+                     b'repr=%5.3R', 'abc')
+        # test width modifier and precision modifier with %A
+        check_format("repr=   'abc'",
+                     b'repr=%8A', 'abc')
+        check_format("repr='ab",
+                     b'repr=%.3A', 'abc')
+        check_format("repr=  'ab",
+                     b'repr=%5.3A', 'abc')
+        # test width modifier and precision modifier with %s
+        check_format("repr=  abc",
+                     b'repr=%5s', b'abc')
+        check_format("repr=ab",
+                     b'repr=%.2s', b'abc')
+        check_format("repr=   ab",
+                     b'repr=%5.2s', b'abc')
+        # test width modifier and precision modifier with %U
+        check_format("repr=  abc",
+                     b'repr=%5U', 'abc')
+        check_format("repr=ab",
+                     b'repr=%.2U', 'abc')
+        check_format("repr=   ab",
+                     b'repr=%5.2U', 'abc')
+        # test width modifier and precision modifier with %V
+        check_format("repr=  abc",
+                     b'repr=%5V', 'abc', b'123')
+        check_format("repr=ab",
+                     b'repr=%.2V', 'abc', b'123')
+        check_format("repr=   ab",
+                     b'repr=%5.2V', 'abc', b'123')
+        check_format("repr=  123",
+                     b'repr=%5V', None, b'123')
+        check_format("repr=12",
+                     b'repr=%.2V', None, b'123')
+        check_format("repr=   12",
+                     b'repr=%5.2V', None, b'123')
+        # test integer formats (%i, %d, %u)
+        check_format('010',
+                     b'%03i', c_int(10))
+        check_format('0010',
+                     b'%0.4i', c_int(10))
+        check_format('-123',
+                     b'%i', c_int(-123))
+        check_format('-123',
+                     b'%li', c_long(-123))
+        check_format('-123',
+                     b'%lli', c_longlong(-123))
+        check_format('-123',
+                     b'%zi', c_ssize_t(-123))
+        check_format('-123',
+                     b'%d', c_int(-123))
+        check_format('-123',
+                     b'%ld', c_long(-123))
+        check_format('-123',
+                     b'%lld', c_longlong(-123))
+        check_format('-123',
+                     b'%zd', c_ssize_t(-123))
+        check_format('123',
+                     b'%u', c_uint(123))
+        check_format('123',
+                     b'%lu', c_ulong(123))
+        check_format('123',
+                     b'%llu', c_ulonglong(123))
+        check_format('123',
+                     b'%zu', c_size_t(123))
+        # test long output
+        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
+        max_longlong = -min_longlong - 1
+        check_format(str(min_longlong),
+                     b'%lld', c_longlong(min_longlong))
+        check_format(str(max_longlong),
+                     b'%lld', c_longlong(max_longlong))
+        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
+        check_format(str(max_ulonglong),
+                     b'%llu', c_ulonglong(max_ulonglong))
+        PyUnicode_FromFormat(b'%p', c_void_p(-1))
+        # test padding (width and/or precision)
+        check_format('123'.rjust(10, '0'),
+                     b'%010i', c_int(123))
+        check_format('123'.rjust(100),
+                     b'%100i', c_int(123))
+        check_format('123'.rjust(100, '0'),
+                     b'%.100i', c_int(123))
+        check_format('123'.rjust(80, '0').rjust(100),
+                     b'%100.80i', c_int(123))
+        check_format('123'.rjust(10, '0'),
+                     b'%010u', c_uint(123))
+        check_format('123'.rjust(100),
+                     b'%100u', c_uint(123))
+        check_format('123'.rjust(100, '0'),
+                     b'%.100u', c_uint(123))
+        check_format('123'.rjust(80, '0').rjust(100),
+                     b'%100.80u', c_uint(123))
+        check_format('123'.rjust(10, '0'),
+                     b'%010x', c_int(0x123))
+        check_format('123'.rjust(100),
+                     b'%100x', c_int(0x123))
+        check_format('123'.rjust(100, '0'),
+                     b'%.100x', c_int(0x123))
+        check_format('123'.rjust(80, '0').rjust(100),
+                     b'%100.80x', c_int(0x123))
+        # test %A
+        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
+                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
+        # test %V
+        check_format('repr=abc',
+                     b'repr=%V', 'abc', b'xyz')
+        # test %p
+        # We cannot test the exact result,
+        # because it returns a hex representation of a C pointer,
+        # which is going to be different each time. But, we can test the format.
+        p_format_regex = r'^0x[a-zA-Z0-9]{3,}$'
+        p_format1 = PyUnicode_FromFormat(b'%p', 'abc')
+        self.assertIsInstance(p_format1, str)
+        self.assertRegex(p_format1, p_format_regex)
+        p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz')
+        self.assertIsInstance(p_format2, str)
+        self.assertRegex(p_format2,
+                         r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}')
+        # Extra args are ignored:
+        p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz')
+        self.assertIsInstance(p_format3, str)
+        self.assertRegex(p_format3, p_format_regex)
+        # Test string decode from parameter of %s using utf-8.
+        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
+        # '\u4eba\u6c11'
+        check_format('repr=\u4eba\u6c11',
+                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
+        #Test replace error handler.
+        check_format('repr=abc\ufffd',
+                     b'repr=%V', None, b'abc\xff')
+        # not supported: copy the raw format string. these tests are just here
+        # to check for crashes and should not be considered as specifications
+        check_format('%s',
+                     b'%1%s', b'abc')
+        check_format('%1abc',
+                     b'%1abc')
+        check_format('%+i',
+                     b'%+i', c_int(10))
+        check_format('%.%s',
+                     b'%.%s', b'abc')
+        # Issue #33817: empty strings
+        check_format('',
+                     b'')
+        check_format('',
+                     b'%s', b'')
+    # Test PyUnicode_AsWideChar()
+    @support.cpython_only
+    def test_aswidechar(self):
+        from _testcapi import unicode_aswidechar
+        import_helper.import_module('ctypes')
+        from ctypes import c_wchar, sizeof
+        wchar, size = unicode_aswidechar('abcdef', 2)
+        self.assertEqual(size, 2)
+        self.assertEqual(wchar, 'ab')
+        wchar, size = unicode_aswidechar('abc', 3)
+        self.assertEqual(size, 3)
+        self.assertEqual(wchar, 'abc')
+        wchar, size = unicode_aswidechar('abc', 4)
+        self.assertEqual(size, 3)
+        self.assertEqual(wchar, 'abc\0')
+        wchar, size = unicode_aswidechar('abc', 10)
+        self.assertEqual(size, 3)
+        self.assertEqual(wchar, 'abc\0')
+        wchar, size = unicode_aswidechar('abc\0def', 20)
+        self.assertEqual(size, 7)
+        self.assertEqual(wchar, 'abc\0def\0')
+        nonbmp = chr(0x10ffff)
+        if sizeof(c_wchar) == 2:
+            buflen = 3
+            nchar = 2
+        else: # sizeof(c_wchar) == 4
+            buflen = 2
+            nchar = 1
+        wchar, size = unicode_aswidechar(nonbmp, buflen)
+        self.assertEqual(size, nchar)
+        self.assertEqual(wchar, nonbmp + '\0')
+    # Test PyUnicode_AsWideCharString()
+    @support.cpython_only
+    def test_aswidecharstring(self):
+        from _testcapi import unicode_aswidecharstring
+        import_helper.import_module('ctypes')
+        from ctypes import c_wchar, sizeof
+        wchar, size = unicode_aswidecharstring('abc')
+        self.assertEqual(size, 3)
+        self.assertEqual(wchar, 'abc\0')
+        wchar, size = unicode_aswidecharstring('abc\0def')
+        self.assertEqual(size, 7)
+        self.assertEqual(wchar, 'abc\0def\0')
+        nonbmp = chr(0x10ffff)
+        if sizeof(c_wchar) == 2:
+            nchar = 2
+        else: # sizeof(c_wchar) == 4
+            nchar = 1
+        wchar, size = unicode_aswidecharstring(nonbmp)
+        self.assertEqual(size, nchar)
+        self.assertEqual(wchar, nonbmp + '\0')
+    # Test PyUnicode_AsUCS4()
+    @support.cpython_only
+    def test_asucs4(self):
+        from _testcapi import unicode_asucs4
+        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
+                  'a\ud800b\udfffc', '\ud834\udd1e']:
+            l = len(s)
+            self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
+            self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
+            self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
+            self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
+            self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
+            self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
+            s = '\0'.join([s, s])
+            self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
+            self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
+    # Test PyUnicode_AsUTF8()
+    @support.cpython_only
+    def test_asutf8(self):
+        from _testcapi import unicode_asutf8
+        bmp = '\u0100'
+        bmp2 = '\uffff'
+        nonbmp = chr(0x10ffff)
+        self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
+        self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
+        self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
+        self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
+    # Test PyUnicode_AsUTF8AndSize()
+    @support.cpython_only
+    def test_asutf8andsize(self):
+        from _testcapi import unicode_asutf8andsize
+        bmp = '\u0100'
+        bmp2 = '\uffff'
+        nonbmp = chr(0x10ffff)
+        self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
+        self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
+        self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
+        self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
+    # Test PyUnicode_FindChar()
+    @support.cpython_only
+    def test_findchar(self):
+        from _testcapi import unicode_findchar
+        for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
+            for i, ch in enumerate(str):
+                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
+                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
+        str = "!>_<!"
+        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
+        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
+        # start < end
+        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
+        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
+        # start >= end
+        self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
+        self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
+        # negative
+        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
+        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
+    # Test PyUnicode_CopyCharacters()
+    @support.cpython_only
+    def test_copycharacters(self):
+        from _testcapi import unicode_copycharacters
+        strings = [
+            'abcde', '\xa1\xa2\xa3\xa4\xa5',
+            '\u4f60\u597d\u4e16\u754c\uff01',
+            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
+        ]
+        for idx, from_ in enumerate(strings):
+            # wide -> narrow: exceed maxchar limitation
+            for to in strings[:idx]:
+                self.assertRaises(
+                    SystemError,
+                    unicode_copycharacters, to, 0, from_, 0, 5
+                )
+            # same kind
+            for from_start in range(5):
+                self.assertEqual(
+                    unicode_copycharacters(from_, 0, from_, from_start, 5),
+                    (from_[from_start:from_start+5].ljust(5, '\0'),
+                     5-from_start)
+                )
+            for to_start in range(5):
+                self.assertEqual(
+                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
+                    (from_[to_start:to_start+5].rjust(5, '\0'),
+                     5-to_start)
+                )
+            # narrow -> wide
+            # Tests omitted since this creates invalid strings.
+        s = strings[0]
+        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
+        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
+        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
+        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
+        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
+        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
+        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
+    @support.cpython_only
+    @support.requires_legacy_unicode_capi
+    def test_encode_decimal(self):
+        from _testcapi import unicode_encodedecimal
+        with warnings_helper.check_warnings():
+            warnings.simplefilter('ignore', DeprecationWarning)
+            self.assertEqual(unicode_encodedecimal('123'),
+                             b'123')
+            self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
+                             b'3.14')
+            self.assertEqual(unicode_encodedecimal(
+                             "\N{EM SPACE}3.14\N{EN SPACE}"), b' 3.14 ')
+            self.assertRaises(UnicodeEncodeError,
+                              unicode_encodedecimal, "123\u20ac", "strict")
+            self.assertRaisesRegex(
+                ValueError,
+                "^'decimal' codec can't encode character",
+                unicode_encodedecimal, "123\u20ac", "replace")
+    @support.cpython_only
+    @support.requires_legacy_unicode_capi
+    def test_transform_decimal(self):
+        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
+        with warnings_helper.check_warnings():
+            warnings.simplefilter('ignore', DeprecationWarning)
+            self.assertEqual(transform_decimal('123'),
+                             '123')
+            self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
+                             '3.14')
+            self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
+                             "\N{EM SPACE}3.14\N{EN SPACE}")
+            self.assertEqual(transform_decimal('123\u20ac'),
+                             '123\u20ac')
+    @support.cpython_only
+    def test_pep393_utf8_caching_bug(self):
+        # Issue #25709: Problem with string concatenation and utf-8 cache
+        from _testcapi import getargs_s_hash
+        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
+            s = ''
+            for i in range(5):
+                # Due to CPython specific optimization the 's' string can be
+                # resized in-place.
+                s += chr(k)
+                # Parsing with the "s#" format code calls indirectly
+                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
+                # encoded string cached in the Unicode object.
+                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
+                # Check that the second call returns the same result
+                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
+if __name__ == "__main__":
+    unittest.main()
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 0feb61629174..f6a1651e76f7 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -15,7 +15,6 @@
 import unicodedata
 import unittest
 import warnings
-from test.support import import_helper
 from test.support import warnings_helper
 from test import support, string_tests
 from test.support.script_helper import assert_python_failure
@@ -2570,492 +2569,6 @@ def test_check_encoding_errors(self):
         self.assertEqual(proc.rc, 10, proc)
-class CAPITest(unittest.TestCase):
-    # Test PyUnicode_FromFormat()
-    def test_from_format(self):
-        import_helper.import_module('ctypes')
-        from ctypes import (
-            c_char_p,
-            pythonapi, py_object, sizeof,
-            c_int, c_long, c_longlong, c_ssize_t,
-            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
-        name = "PyUnicode_FromFormat"
-        _PyUnicode_FromFormat = getattr(pythonapi, name)
-        _PyUnicode_FromFormat.argtypes = (c_char_p,)
-        _PyUnicode_FromFormat.restype = py_object
-        def PyUnicode_FromFormat(format, *args):
-            cargs = tuple(
-                py_object(arg) if isinstance(arg, str) else arg
-                for arg in args)
-            return _PyUnicode_FromFormat(format, *cargs)
-        def check_format(expected, format, *args):
-            text = PyUnicode_FromFormat(format, *args)
-            self.assertEqual(expected, text)
-        # ascii format, non-ascii argument
-        check_format('ascii\x7f=unicode\xe9',
-                     b'ascii\x7f=%U', 'unicode\xe9')
-        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
-        # raises an error
-        self.assertRaisesRegex(ValueError,
-            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
-            'string, got a non-ASCII byte: 0xe9$',
-            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
-        # test "%c"
-        check_format('\uabcd',
-                     b'%c', c_int(0xabcd))
-        check_format('\U0010ffff',
-                     b'%c', c_int(0x10ffff))
-        with self.assertRaises(OverflowError):
-            PyUnicode_FromFormat(b'%c', c_int(0x110000))
-        # Issue #18183
-        check_format('\U00010000\U00100000',
-                     b'%c%c', c_int(0x10000), c_int(0x100000))
-        # test "%"
-        check_format('%',
-                     b'%')
-        check_format('%',
-                     b'%%')
-        check_format('%s',
-                     b'%%s')
-        check_format('[%]',
-                     b'[%%]')
-        check_format('%abc',
-                     b'%%%s', b'abc')
-        # truncated string
-        check_format('abc',
-                     b'%.3s', b'abcdef')
-        check_format('abc[\ufffd',
-                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
-        check_format("'\\u20acABC'",
-                     b'%A', '\u20acABC')
-        check_format("'\\u20",
-                     b'%.5A', '\u20acABCDEF')
-        check_format("'\u20acABC'",
-                     b'%R', '\u20acABC')
-        check_format("'\u20acA",
-                     b'%.3R', '\u20acABCDEF')
-        check_format('\u20acAB',
-                     b'%.3S', '\u20acABCDEF')
-        check_format('\u20acAB',
-                     b'%.3U', '\u20acABCDEF')
-        check_format('\u20acAB',
-                     b'%.3V', '\u20acABCDEF', None)
-        check_format('abc[\ufffd',
-                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
-        # following tests comes from #7330
-        # test width modifier and precision modifier with %S
-        check_format("repr=  abc",
-                     b'repr=%5S', 'abc')
-        check_format("repr=ab",
-                     b'repr=%.2S', 'abc')
-        check_format("repr=   ab",
-                     b'repr=%5.2S', 'abc')
-        # test width modifier and precision modifier with %R
-        check_format("repr=   'abc'",
-                     b'repr=%8R', 'abc')
-        check_format("repr='ab",
-                     b'repr=%.3R', 'abc')
-        check_format("repr=  'ab",
-                     b'repr=%5.3R', 'abc')
-        # test width modifier and precision modifier with %A
-        check_format("repr=   'abc'",
-                     b'repr=%8A', 'abc')
-        check_format("repr='ab",
-                     b'repr=%.3A', 'abc')
-        check_format("repr=  'ab",
-                     b'repr=%5.3A', 'abc')
-        # test width modifier and precision modifier with %s
-        check_format("repr=  abc",
-                     b'repr=%5s', b'abc')
-        check_format("repr=ab",
-                     b'repr=%.2s', b'abc')
-        check_format("repr=   ab",
-                     b'repr=%5.2s', b'abc')
-        # test width modifier and precision modifier with %U
-        check_format("repr=  abc",
-                     b'repr=%5U', 'abc')
-        check_format("repr=ab",
-                     b'repr=%.2U', 'abc')
-        check_format("repr=   ab",
-                     b'repr=%5.2U', 'abc')
-        # test width modifier and precision modifier with %V
-        check_format("repr=  abc",
-                     b'repr=%5V', 'abc', b'123')
-        check_format("repr=ab",
-                     b'repr=%.2V', 'abc', b'123')
-        check_format("repr=   ab",
-                     b'repr=%5.2V', 'abc', b'123')
-        check_format("repr=  123",
-                     b'repr=%5V', None, b'123')
-        check_format("repr=12",
-                     b'repr=%.2V', None, b'123')
-        check_format("repr=   12",
-                     b'repr=%5.2V', None, b'123')
-        # test integer formats (%i, %d, %u)
-        check_format('010',
-                     b'%03i', c_int(10))
-        check_format('0010',
-                     b'%0.4i', c_int(10))
-        check_format('-123',
-                     b'%i', c_int(-123))
-        check_format('-123',
-                     b'%li', c_long(-123))
-        check_format('-123',
-                     b'%lli', c_longlong(-123))
-        check_format('-123',
-                     b'%zi', c_ssize_t(-123))
-        check_format('-123',
-                     b'%d', c_int(-123))
-        check_format('-123',
-                     b'%ld', c_long(-123))
-        check_format('-123',
-                     b'%lld', c_longlong(-123))
-        check_format('-123',
-                     b'%zd', c_ssize_t(-123))
-        check_format('123',
-                     b'%u', c_uint(123))
-        check_format('123',
-                     b'%lu', c_ulong(123))
-        check_format('123',
-                     b'%llu', c_ulonglong(123))
-        check_format('123',
-                     b'%zu', c_size_t(123))
-        # test long output
-        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
-        max_longlong = -min_longlong - 1
-        check_format(str(min_longlong),
-                     b'%lld', c_longlong(min_longlong))
-        check_format(str(max_longlong),
-                     b'%lld', c_longlong(max_longlong))
-        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
-        check_format(str(max_ulonglong),
-                     b'%llu', c_ulonglong(max_ulonglong))
-        PyUnicode_FromFormat(b'%p', c_void_p(-1))
-        # test padding (width and/or precision)
-        check_format('123'.rjust(10, '0'),
-                     b'%010i', c_int(123))
-        check_format('123'.rjust(100),
-                     b'%100i', c_int(123))
-        check_format('123'.rjust(100, '0'),
-                     b'%.100i', c_int(123))
-        check_format('123'.rjust(80, '0').rjust(100),
-                     b'%100.80i', c_int(123))
-        check_format('123'.rjust(10, '0'),
-                     b'%010u', c_uint(123))
-        check_format('123'.rjust(100),
-                     b'%100u', c_uint(123))
-        check_format('123'.rjust(100, '0'),
-                     b'%.100u', c_uint(123))
-        check_format('123'.rjust(80, '0').rjust(100),
-                     b'%100.80u', c_uint(123))
-        check_format('123'.rjust(10, '0'),
-                     b'%010x', c_int(0x123))
-        check_format('123'.rjust(100),
-                     b'%100x', c_int(0x123))
-        check_format('123'.rjust(100, '0'),
-                     b'%.100x', c_int(0x123))
-        check_format('123'.rjust(80, '0').rjust(100),
-                     b'%100.80x', c_int(0x123))
-        # test %A
-        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
-                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
-        # test %V
-        check_format('repr=abc',
-                     b'repr=%V', 'abc', b'xyz')
-        # test %p
-        # We cannot test the exact result,
-        # because it returns a hex representation of a C pointer,
-        # which is going to be different each time. But, we can test the format.
-        p_format_regex = r'^0x[a-zA-Z0-9]{3,}$'
-        p_format1 = PyUnicode_FromFormat(b'%p', 'abc')
-        self.assertIsInstance(p_format1, str)
-        self.assertRegex(p_format1, p_format_regex)
-        p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz')
-        self.assertIsInstance(p_format2, str)
-        self.assertRegex(p_format2,
-                         r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}')
-        # Extra args are ignored:
-        p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz')
-        self.assertIsInstance(p_format3, str)
-        self.assertRegex(p_format3, p_format_regex)
-        # Test string decode from parameter of %s using utf-8.
-        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
-        # '\u4eba\u6c11'
-        check_format('repr=\u4eba\u6c11',
-                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
-        #Test replace error handler.
-        check_format('repr=abc\ufffd',
-                     b'repr=%V', None, b'abc\xff')
-        # not supported: copy the raw format string. these tests are just here
-        # to check for crashes and should not be considered as specifications
-        check_format('%s',
-                     b'%1%s', b'abc')
-        check_format('%1abc',
-                     b'%1abc')
-        check_format('%+i',
-                     b'%+i', c_int(10))
-        check_format('%.%s',
-                     b'%.%s', b'abc')
-        # Issue #33817: empty strings
-        check_format('',
-                     b'')
-        check_format('',
-                     b'%s', b'')
-    # Test PyUnicode_AsWideChar()
-    @support.cpython_only
-    def test_aswidechar(self):
-        from _testcapi import unicode_aswidechar
-        import_helper.import_module('ctypes')
-        from ctypes import c_wchar, sizeof
-        wchar, size = unicode_aswidechar('abcdef', 2)
-        self.assertEqual(size, 2)
-        self.assertEqual(wchar, 'ab')
-        wchar, size = unicode_aswidechar('abc', 3)
-        self.assertEqual(size, 3)
-        self.assertEqual(wchar, 'abc')
-        wchar, size = unicode_aswidechar('abc', 4)
-        self.assertEqual(size, 3)
-        self.assertEqual(wchar, 'abc\0')
-        wchar, size = unicode_aswidechar('abc', 10)
-        self.assertEqual(size, 3)
-        self.assertEqual(wchar, 'abc\0')
-        wchar, size = unicode_aswidechar('abc\0def', 20)
-        self.assertEqual(size, 7)
-        self.assertEqual(wchar, 'abc\0def\0')
-        nonbmp = chr(0x10ffff)
-        if sizeof(c_wchar) == 2:
-            buflen = 3
-            nchar = 2
-        else: # sizeof(c_wchar) == 4
-            buflen = 2
-            nchar = 1
-        wchar, size = unicode_aswidechar(nonbmp, buflen)
-        self.assertEqual(size, nchar)
-        self.assertEqual(wchar, nonbmp + '\0')
-    # Test PyUnicode_AsWideCharString()
-    @support.cpython_only
-    def test_aswidecharstring(self):
-        from _testcapi import unicode_aswidecharstring
-        import_helper.import_module('ctypes')
-        from ctypes import c_wchar, sizeof
-        wchar, size = unicode_aswidecharstring('abc')
-        self.assertEqual(size, 3)
-        self.assertEqual(wchar, 'abc\0')
-        wchar, size = unicode_aswidecharstring('abc\0def')
-        self.assertEqual(size, 7)
-        self.assertEqual(wchar, 'abc\0def\0')
-        nonbmp = chr(0x10ffff)
-        if sizeof(c_wchar) == 2:
-            nchar = 2
-        else: # sizeof(c_wchar) == 4
-            nchar = 1
-        wchar, size = unicode_aswidecharstring(nonbmp)
-        self.assertEqual(size, nchar)
-        self.assertEqual(wchar, nonbmp + '\0')
-    # Test PyUnicode_AsUCS4()
-    @support.cpython_only
-    def test_asucs4(self):
-        from _testcapi import unicode_asucs4
-        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
-                  'a\ud800b\udfffc', '\ud834\udd1e']:
-            l = len(s)
-            self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
-            self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
-            self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
-            self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
-            self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
-            self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
-            s = '\0'.join([s, s])
-            self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
-            self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
-    # Test PyUnicode_AsUTF8()
-    @support.cpython_only
-    def test_asutf8(self):
-        from _testcapi import unicode_asutf8
-        bmp = '\u0100'
-        bmp2 = '\uffff'
-        nonbmp = chr(0x10ffff)
-        self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
-        self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
-        self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
-        self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
-    # Test PyUnicode_AsUTF8AndSize()
-    @support.cpython_only
-    def test_asutf8andsize(self):
-        from _testcapi import unicode_asutf8andsize
-        bmp = '\u0100'
-        bmp2 = '\uffff'
-        nonbmp = chr(0x10ffff)
-        self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
-        self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
-        self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
-        self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
-    # Test PyUnicode_FindChar()
-    @support.cpython_only
-    def test_findchar(self):
-        from _testcapi import unicode_findchar
-        for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
-            for i, ch in enumerate(str):
-                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
-                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
-        str = "!>_<!"
-        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
-        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
-        # start < end
-        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
-        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
-        # start >= end
-        self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
-        self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
-        # negative
-        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
-        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
-    # Test PyUnicode_CopyCharacters()
-    @support.cpython_only
-    def test_copycharacters(self):
-        from _testcapi import unicode_copycharacters
-        strings = [
-            'abcde', '\xa1\xa2\xa3\xa4\xa5',
-            '\u4f60\u597d\u4e16\u754c\uff01',
-            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
-        ]
-        for idx, from_ in enumerate(strings):
-            # wide -> narrow: exceed maxchar limitation
-            for to in strings[:idx]:
-                self.assertRaises(
-                    SystemError,
-                    unicode_copycharacters, to, 0, from_, 0, 5
-                )
-            # same kind
-            for from_start in range(5):
-                self.assertEqual(
-                    unicode_copycharacters(from_, 0, from_, from_start, 5),
-                    (from_[from_start:from_start+5].ljust(5, '\0'),
-                     5-from_start)
-                )
-            for to_start in range(5):
-                self.assertEqual(
-                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
-                    (from_[to_start:to_start+5].rjust(5, '\0'),
-                     5-to_start)
-                )
-            # narrow -> wide
-            # Tests omitted since this creates invalid strings.
-        s = strings[0]
-        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
-        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
-        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
-        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
-        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
-        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
-        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
-    @support.cpython_only
-    @support.requires_legacy_unicode_capi
-    def test_encode_decimal(self):
-        from _testcapi import unicode_encodedecimal
-        with warnings_helper.check_warnings():
-            warnings.simplefilter('ignore', DeprecationWarning)
-            self.assertEqual(unicode_encodedecimal('123'),
-                             b'123')
-            self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
-                             b'3.14')
-            self.assertEqual(unicode_encodedecimal(
-                             "\N{EM SPACE}3.14\N{EN SPACE}"), b' 3.14 ')
-            self.assertRaises(UnicodeEncodeError,
-                              unicode_encodedecimal, "123\u20ac", "strict")
-            self.assertRaisesRegex(
-                ValueError,
-                "^'decimal' codec can't encode character",
-                unicode_encodedecimal, "123\u20ac", "replace")
-    @support.cpython_only
-    @support.requires_legacy_unicode_capi
-    def test_transform_decimal(self):
-        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
-        with warnings_helper.check_warnings():
-            warnings.simplefilter('ignore', DeprecationWarning)
-            self.assertEqual(transform_decimal('123'),
-                             '123')
-            self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
-                             '3.14')
-            self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
-                             "\N{EM SPACE}3.14\N{EN SPACE}")
-            self.assertEqual(transform_decimal('123\u20ac'),
-                             '123\u20ac')
-    @support.cpython_only
-    def test_pep393_utf8_caching_bug(self):
-        # Issue #25709: Problem with string concatenation and utf-8 cache
-        from _testcapi import getargs_s_hash
-        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
-            s = ''
-            for i in range(5):
-                # Due to CPython specific optimization the 's' string can be
-                # resized in-place.
-                s += chr(k)
-                # Parsing with the "s#" format code calls indirectly
-                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
-                # encoded string cached in the Unicode object.
-                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
-                # Check that the second call returns the same result
-                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
 class StringModuleTest(unittest.TestCase):
     def test_formatter_parser(self):
         def parse(format):

More information about the Python-checkins mailing list