[Python-checkins] [3.10] gh-78453: Move Unicode C API tests from test_unicode to test_capi.test_unicode (GH-99431). (GH-99617)
serhiy-storchaka
webhook-mailer at python.org
Sun Nov 20 05:45:56 EST 2022
https://github.com/python/cpython/commit/42b7b2179e27f6df110a1b528fc948784e497265
commit: 42b7b2179e27f6df110a1b528fc948784e497265
branch: 3.10
author: Serhiy Storchaka <storchaka at gmail.com>
committer: serhiy-storchaka <storchaka at gmail.com>
date: 2022-11-20T12:45:50+02:00
summary:
[3.10] gh-78453: Move Unicode C API tests from test_unicode to test_capi.test_unicode (GH-99431). (GH-99617)
(cherry picked from commit 06d4e02c3b3526b5d90e41a0a0befa8663e08f27)
Co-authored-by: Serhiy Storchaka <storchaka at gmail.com>
files:
A Lib/test/test_capi/test_unicode.py
M Lib/test/test_unicode.py
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
new file mode 100644
index 000000000000..8c2ada8e5a58
--- /dev/null
+++ b/Lib/test/test_capi/test_unicode.py
@@ -0,0 +1,502 @@
+import unittest
+import sys
+import warnings
+from test import support
+from test.support import import_helper
+from test.support import warnings_helper
+
+try:
+ import _testcapi
+except ImportError:
+ _testcapi = None
+
+
+class CAPITest(unittest.TestCase):
+
+ # Test PyUnicode_FromFormat()
+ def test_from_format(self):
+ import_helper.import_module('ctypes')
+ from ctypes import (
+ c_char_p,
+ pythonapi, py_object, sizeof,
+ c_int, c_long, c_longlong, c_ssize_t,
+ c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
+ name = "PyUnicode_FromFormat"
+ _PyUnicode_FromFormat = getattr(pythonapi, name)
+ _PyUnicode_FromFormat.argtypes = (c_char_p,)
+ _PyUnicode_FromFormat.restype = py_object
+
+ def PyUnicode_FromFormat(format, *args):
+ cargs = tuple(
+ py_object(arg) if isinstance(arg, str) else arg
+ for arg in args)
+ return _PyUnicode_FromFormat(format, *cargs)
+
+ def check_format(expected, format, *args):
+ text = PyUnicode_FromFormat(format, *args)
+ self.assertEqual(expected, text)
+
+ # ascii format, non-ascii argument
+ check_format('ascii\x7f=unicode\xe9',
+ b'ascii\x7f=%U', 'unicode\xe9')
+
+ # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
+ # raises an error
+ self.assertRaisesRegex(ValueError,
+ r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
+ 'string, got a non-ASCII byte: 0xe9$',
+ PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
+
+ # test "%c"
+ check_format('\uabcd',
+ b'%c', c_int(0xabcd))
+ check_format('\U0010ffff',
+ b'%c', c_int(0x10ffff))
+ with self.assertRaises(OverflowError):
+ PyUnicode_FromFormat(b'%c', c_int(0x110000))
+ # Issue #18183
+ check_format('\U00010000\U00100000',
+ b'%c%c', c_int(0x10000), c_int(0x100000))
+
+ # test "%"
+ check_format('%',
+ b'%')
+ check_format('%',
+ b'%%')
+ check_format('%s',
+ b'%%s')
+ check_format('[%]',
+ b'[%%]')
+ check_format('%abc',
+ b'%%%s', b'abc')
+
+ # truncated string
+ check_format('abc',
+ b'%.3s', b'abcdef')
+ check_format('abc[\ufffd',
+ b'%.5s', 'abc[\u20ac]'.encode('utf8'))
+ check_format("'\\u20acABC'",
+ b'%A', '\u20acABC')
+ check_format("'\\u20",
+ b'%.5A', '\u20acABCDEF')
+ check_format("'\u20acABC'",
+ b'%R', '\u20acABC')
+ check_format("'\u20acA",
+ b'%.3R', '\u20acABCDEF')
+ check_format('\u20acAB',
+ b'%.3S', '\u20acABCDEF')
+ check_format('\u20acAB',
+ b'%.3U', '\u20acABCDEF')
+ check_format('\u20acAB',
+ b'%.3V', '\u20acABCDEF', None)
+ check_format('abc[\ufffd',
+ b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
+
+ # following tests comes from #7330
+ # test width modifier and precision modifier with %S
+ check_format("repr= abc",
+ b'repr=%5S', 'abc')
+ check_format("repr=ab",
+ b'repr=%.2S', 'abc')
+ check_format("repr= ab",
+ b'repr=%5.2S', 'abc')
+
+ # test width modifier and precision modifier with %R
+ check_format("repr= 'abc'",
+ b'repr=%8R', 'abc')
+ check_format("repr='ab",
+ b'repr=%.3R', 'abc')
+ check_format("repr= 'ab",
+ b'repr=%5.3R', 'abc')
+
+ # test width modifier and precision modifier with %A
+ check_format("repr= 'abc'",
+ b'repr=%8A', 'abc')
+ check_format("repr='ab",
+ b'repr=%.3A', 'abc')
+ check_format("repr= 'ab",
+ b'repr=%5.3A', 'abc')
+
+ # test width modifier and precision modifier with %s
+ check_format("repr= abc",
+ b'repr=%5s', b'abc')
+ check_format("repr=ab",
+ b'repr=%.2s', b'abc')
+ check_format("repr= ab",
+ b'repr=%5.2s', b'abc')
+
+ # test width modifier and precision modifier with %U
+ check_format("repr= abc",
+ b'repr=%5U', 'abc')
+ check_format("repr=ab",
+ b'repr=%.2U', 'abc')
+ check_format("repr= ab",
+ b'repr=%5.2U', 'abc')
+
+ # test width modifier and precision modifier with %V
+ check_format("repr= abc",
+ b'repr=%5V', 'abc', b'123')
+ check_format("repr=ab",
+ b'repr=%.2V', 'abc', b'123')
+ check_format("repr= ab",
+ b'repr=%5.2V', 'abc', b'123')
+ check_format("repr= 123",
+ b'repr=%5V', None, b'123')
+ check_format("repr=12",
+ b'repr=%.2V', None, b'123')
+ check_format("repr= 12",
+ b'repr=%5.2V', None, b'123')
+
+ # test integer formats (%i, %d, %u)
+ check_format('010',
+ b'%03i', c_int(10))
+ check_format('0010',
+ b'%0.4i', c_int(10))
+ check_format('-123',
+ b'%i', c_int(-123))
+ check_format('-123',
+ b'%li', c_long(-123))
+ check_format('-123',
+ b'%lli', c_longlong(-123))
+ check_format('-123',
+ b'%zi', c_ssize_t(-123))
+
+ check_format('-123',
+ b'%d', c_int(-123))
+ check_format('-123',
+ b'%ld', c_long(-123))
+ check_format('-123',
+ b'%lld', c_longlong(-123))
+ check_format('-123',
+ b'%zd', c_ssize_t(-123))
+
+ check_format('123',
+ b'%u', c_uint(123))
+ check_format('123',
+ b'%lu', c_ulong(123))
+ check_format('123',
+ b'%llu', c_ulonglong(123))
+ check_format('123',
+ b'%zu', c_size_t(123))
+
+ # test long output
+ min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
+ max_longlong = -min_longlong - 1
+ check_format(str(min_longlong),
+ b'%lld', c_longlong(min_longlong))
+ check_format(str(max_longlong),
+ b'%lld', c_longlong(max_longlong))
+ max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
+ check_format(str(max_ulonglong),
+ b'%llu', c_ulonglong(max_ulonglong))
+ PyUnicode_FromFormat(b'%p', c_void_p(-1))
+
+ # test padding (width and/or precision)
+ check_format('123'.rjust(10, '0'),
+ b'%010i', c_int(123))
+ check_format('123'.rjust(100),
+ b'%100i', c_int(123))
+ check_format('123'.rjust(100, '0'),
+ b'%.100i', c_int(123))
+ check_format('123'.rjust(80, '0').rjust(100),
+ b'%100.80i', c_int(123))
+
+ check_format('123'.rjust(10, '0'),
+ b'%010u', c_uint(123))
+ check_format('123'.rjust(100),
+ b'%100u', c_uint(123))
+ check_format('123'.rjust(100, '0'),
+ b'%.100u', c_uint(123))
+ check_format('123'.rjust(80, '0').rjust(100),
+ b'%100.80u', c_uint(123))
+
+ check_format('123'.rjust(10, '0'),
+ b'%010x', c_int(0x123))
+ check_format('123'.rjust(100),
+ b'%100x', c_int(0x123))
+ check_format('123'.rjust(100, '0'),
+ b'%.100x', c_int(0x123))
+ check_format('123'.rjust(80, '0').rjust(100),
+ b'%100.80x', c_int(0x123))
+
+ # test %A
+ check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
+ b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
+
+ # test %V
+ check_format('repr=abc',
+ b'repr=%V', 'abc', b'xyz')
+
+ # test %p
+ # We cannot test the exact result,
+ # because it returns a hex representation of a C pointer,
+ # which is going to be different each time. But, we can test the format.
+ p_format_regex = r'^0x[a-zA-Z0-9]{3,}$'
+ p_format1 = PyUnicode_FromFormat(b'%p', 'abc')
+ self.assertIsInstance(p_format1, str)
+ self.assertRegex(p_format1, p_format_regex)
+
+ p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz')
+ self.assertIsInstance(p_format2, str)
+ self.assertRegex(p_format2,
+ r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}')
+
+ # Extra args are ignored:
+ p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz')
+ self.assertIsInstance(p_format3, str)
+ self.assertRegex(p_format3, p_format_regex)
+
+ # Test string decode from parameter of %s using utf-8.
+ # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
+ # '\u4eba\u6c11'
+ check_format('repr=\u4eba\u6c11',
+ b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
+
+ #Test replace error handler.
+ check_format('repr=abc\ufffd',
+ b'repr=%V', None, b'abc\xff')
+
+ # not supported: copy the raw format string. these tests are just here
+ # to check for crashes and should not be considered as specifications
+ check_format('%s',
+ b'%1%s', b'abc')
+ check_format('%1abc',
+ b'%1abc')
+ check_format('%+i',
+ b'%+i', c_int(10))
+ check_format('%.%s',
+ b'%.%s', b'abc')
+
+ # Issue #33817: empty strings
+ check_format('',
+ b'')
+ check_format('',
+ b'%s', b'')
+
+ # Test PyUnicode_AsWideChar()
+ @support.cpython_only
+ def test_aswidechar(self):
+ from _testcapi import unicode_aswidechar
+ import_helper.import_module('ctypes')
+ from ctypes import c_wchar, sizeof
+
+ wchar, size = unicode_aswidechar('abcdef', 2)
+ self.assertEqual(size, 2)
+ self.assertEqual(wchar, 'ab')
+
+ wchar, size = unicode_aswidechar('abc', 3)
+ self.assertEqual(size, 3)
+ self.assertEqual(wchar, 'abc')
+
+ wchar, size = unicode_aswidechar('abc', 4)
+ self.assertEqual(size, 3)
+ self.assertEqual(wchar, 'abc\0')
+
+ wchar, size = unicode_aswidechar('abc', 10)
+ self.assertEqual(size, 3)
+ self.assertEqual(wchar, 'abc\0')
+
+ wchar, size = unicode_aswidechar('abc\0def', 20)
+ self.assertEqual(size, 7)
+ self.assertEqual(wchar, 'abc\0def\0')
+
+ nonbmp = chr(0x10ffff)
+ if sizeof(c_wchar) == 2:
+ buflen = 3
+ nchar = 2
+ else: # sizeof(c_wchar) == 4
+ buflen = 2
+ nchar = 1
+ wchar, size = unicode_aswidechar(nonbmp, buflen)
+ self.assertEqual(size, nchar)
+ self.assertEqual(wchar, nonbmp + '\0')
+
+ # Test PyUnicode_AsWideCharString()
+ @support.cpython_only
+ def test_aswidecharstring(self):
+ from _testcapi import unicode_aswidecharstring
+ import_helper.import_module('ctypes')
+ from ctypes import c_wchar, sizeof
+
+ wchar, size = unicode_aswidecharstring('abc')
+ self.assertEqual(size, 3)
+ self.assertEqual(wchar, 'abc\0')
+
+ wchar, size = unicode_aswidecharstring('abc\0def')
+ self.assertEqual(size, 7)
+ self.assertEqual(wchar, 'abc\0def\0')
+
+ nonbmp = chr(0x10ffff)
+ if sizeof(c_wchar) == 2:
+ nchar = 2
+ else: # sizeof(c_wchar) == 4
+ nchar = 1
+ wchar, size = unicode_aswidecharstring(nonbmp)
+ self.assertEqual(size, nchar)
+ self.assertEqual(wchar, nonbmp + '\0')
+
+ # Test PyUnicode_AsUCS4()
+ @support.cpython_only
+ def test_asucs4(self):
+ from _testcapi import unicode_asucs4
+ for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
+ 'a\ud800b\udfffc', '\ud834\udd1e']:
+ l = len(s)
+ self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
+ self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
+ self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
+ self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
+ self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
+ self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
+ s = '\0'.join([s, s])
+ self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
+ self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
+
+ # Test PyUnicode_AsUTF8()
+ @support.cpython_only
+ def test_asutf8(self):
+ from _testcapi import unicode_asutf8
+
+ bmp = '\u0100'
+ bmp2 = '\uffff'
+ nonbmp = chr(0x10ffff)
+
+ self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
+ self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
+ self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
+ self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
+
+ # Test PyUnicode_AsUTF8AndSize()
+ @support.cpython_only
+ def test_asutf8andsize(self):
+ from _testcapi import unicode_asutf8andsize
+
+ bmp = '\u0100'
+ bmp2 = '\uffff'
+ nonbmp = chr(0x10ffff)
+
+ self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
+ self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
+ self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
+ self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
+
+ # Test PyUnicode_FindChar()
+ @support.cpython_only
+ def test_findchar(self):
+ from _testcapi import unicode_findchar
+
+ for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
+ for i, ch in enumerate(str):
+ self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
+ self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
+
+ str = "!>_<!"
+ self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
+ self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
+ # start < end
+ self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
+ self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
+ # start >= end
+ self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
+ self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
+ # negative
+ self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
+ self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
+
+ # Test PyUnicode_CopyCharacters()
+ @support.cpython_only
+ def test_copycharacters(self):
+ from _testcapi import unicode_copycharacters
+
+ strings = [
+ 'abcde', '\xa1\xa2\xa3\xa4\xa5',
+ '\u4f60\u597d\u4e16\u754c\uff01',
+ '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
+ ]
+
+ for idx, from_ in enumerate(strings):
+ # wide -> narrow: exceed maxchar limitation
+ for to in strings[:idx]:
+ self.assertRaises(
+ SystemError,
+ unicode_copycharacters, to, 0, from_, 0, 5
+ )
+ # same kind
+ for from_start in range(5):
+ self.assertEqual(
+ unicode_copycharacters(from_, 0, from_, from_start, 5),
+ (from_[from_start:from_start+5].ljust(5, '\0'),
+ 5-from_start)
+ )
+ for to_start in range(5):
+ self.assertEqual(
+ unicode_copycharacters(from_, to_start, from_, to_start, 5),
+ (from_[to_start:to_start+5].rjust(5, '\0'),
+ 5-to_start)
+ )
+ # narrow -> wide
+ # Tests omitted since this creates invalid strings.
+
+ s = strings[0]
+ self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
+ self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
+ self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
+ self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
+ self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
+ self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
+ self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
+
+ @support.cpython_only
+ @support.requires_legacy_unicode_capi
+ def test_encode_decimal(self):
+ from _testcapi import unicode_encodedecimal
+ with warnings_helper.check_warnings():
+ warnings.simplefilter('ignore', DeprecationWarning)
+ self.assertEqual(unicode_encodedecimal('123'),
+ b'123')
+ self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
+ b'3.14')
+ self.assertEqual(unicode_encodedecimal(
+ "\N{EM SPACE}3.14\N{EN SPACE}"), b' 3.14 ')
+ self.assertRaises(UnicodeEncodeError,
+ unicode_encodedecimal, "123\u20ac", "strict")
+ self.assertRaisesRegex(
+ ValueError,
+ "^'decimal' codec can't encode character",
+ unicode_encodedecimal, "123\u20ac", "replace")
+
+ @support.cpython_only
+ @support.requires_legacy_unicode_capi
+ def test_transform_decimal(self):
+ from _testcapi import unicode_transformdecimaltoascii as transform_decimal
+ with warnings_helper.check_warnings():
+ warnings.simplefilter('ignore', DeprecationWarning)
+ self.assertEqual(transform_decimal('123'),
+ '123')
+ self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
+ '3.14')
+ self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
+ "\N{EM SPACE}3.14\N{EN SPACE}")
+ self.assertEqual(transform_decimal('123\u20ac'),
+ '123\u20ac')
+
+ @support.cpython_only
+ def test_pep393_utf8_caching_bug(self):
+ # Issue #25709: Problem with string concatenation and utf-8 cache
+ from _testcapi import getargs_s_hash
+ for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
+ s = ''
+ for i in range(5):
+ # Due to CPython specific optimization the 's' string can be
+ # resized in-place.
+ s += chr(k)
+ # Parsing with the "s#" format code calls indirectly
+ # PyUnicode_AsUTF8AndSize() which creates the UTF-8
+ # encoded string cached in the Unicode object.
+ self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
+ # Check that the second call returns the same result
+ self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 0feb61629174..f6a1651e76f7 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -15,7 +15,6 @@
import unicodedata
import unittest
import warnings
-from test.support import import_helper
from test.support import warnings_helper
from test import support, string_tests
from test.support.script_helper import assert_python_failure
@@ -2570,492 +2569,6 @@ def test_check_encoding_errors(self):
self.assertEqual(proc.rc, 10, proc)
-class CAPITest(unittest.TestCase):
-
- # Test PyUnicode_FromFormat()
- def test_from_format(self):
- import_helper.import_module('ctypes')
- from ctypes import (
- c_char_p,
- pythonapi, py_object, sizeof,
- c_int, c_long, c_longlong, c_ssize_t,
- c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
- name = "PyUnicode_FromFormat"
- _PyUnicode_FromFormat = getattr(pythonapi, name)
- _PyUnicode_FromFormat.argtypes = (c_char_p,)
- _PyUnicode_FromFormat.restype = py_object
-
- def PyUnicode_FromFormat(format, *args):
- cargs = tuple(
- py_object(arg) if isinstance(arg, str) else arg
- for arg in args)
- return _PyUnicode_FromFormat(format, *cargs)
-
- def check_format(expected, format, *args):
- text = PyUnicode_FromFormat(format, *args)
- self.assertEqual(expected, text)
-
- # ascii format, non-ascii argument
- check_format('ascii\x7f=unicode\xe9',
- b'ascii\x7f=%U', 'unicode\xe9')
-
- # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
- # raises an error
- self.assertRaisesRegex(ValueError,
- r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
- 'string, got a non-ASCII byte: 0xe9$',
- PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
-
- # test "%c"
- check_format('\uabcd',
- b'%c', c_int(0xabcd))
- check_format('\U0010ffff',
- b'%c', c_int(0x10ffff))
- with self.assertRaises(OverflowError):
- PyUnicode_FromFormat(b'%c', c_int(0x110000))
- # Issue #18183
- check_format('\U00010000\U00100000',
- b'%c%c', c_int(0x10000), c_int(0x100000))
-
- # test "%"
- check_format('%',
- b'%')
- check_format('%',
- b'%%')
- check_format('%s',
- b'%%s')
- check_format('[%]',
- b'[%%]')
- check_format('%abc',
- b'%%%s', b'abc')
-
- # truncated string
- check_format('abc',
- b'%.3s', b'abcdef')
- check_format('abc[\ufffd',
- b'%.5s', 'abc[\u20ac]'.encode('utf8'))
- check_format("'\\u20acABC'",
- b'%A', '\u20acABC')
- check_format("'\\u20",
- b'%.5A', '\u20acABCDEF')
- check_format("'\u20acABC'",
- b'%R', '\u20acABC')
- check_format("'\u20acA",
- b'%.3R', '\u20acABCDEF')
- check_format('\u20acAB',
- b'%.3S', '\u20acABCDEF')
- check_format('\u20acAB',
- b'%.3U', '\u20acABCDEF')
- check_format('\u20acAB',
- b'%.3V', '\u20acABCDEF', None)
- check_format('abc[\ufffd',
- b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
-
- # following tests comes from #7330
- # test width modifier and precision modifier with %S
- check_format("repr= abc",
- b'repr=%5S', 'abc')
- check_format("repr=ab",
- b'repr=%.2S', 'abc')
- check_format("repr= ab",
- b'repr=%5.2S', 'abc')
-
- # test width modifier and precision modifier with %R
- check_format("repr= 'abc'",
- b'repr=%8R', 'abc')
- check_format("repr='ab",
- b'repr=%.3R', 'abc')
- check_format("repr= 'ab",
- b'repr=%5.3R', 'abc')
-
- # test width modifier and precision modifier with %A
- check_format("repr= 'abc'",
- b'repr=%8A', 'abc')
- check_format("repr='ab",
- b'repr=%.3A', 'abc')
- check_format("repr= 'ab",
- b'repr=%5.3A', 'abc')
-
- # test width modifier and precision modifier with %s
- check_format("repr= abc",
- b'repr=%5s', b'abc')
- check_format("repr=ab",
- b'repr=%.2s', b'abc')
- check_format("repr= ab",
- b'repr=%5.2s', b'abc')
-
- # test width modifier and precision modifier with %U
- check_format("repr= abc",
- b'repr=%5U', 'abc')
- check_format("repr=ab",
- b'repr=%.2U', 'abc')
- check_format("repr= ab",
- b'repr=%5.2U', 'abc')
-
- # test width modifier and precision modifier with %V
- check_format("repr= abc",
- b'repr=%5V', 'abc', b'123')
- check_format("repr=ab",
- b'repr=%.2V', 'abc', b'123')
- check_format("repr= ab",
- b'repr=%5.2V', 'abc', b'123')
- check_format("repr= 123",
- b'repr=%5V', None, b'123')
- check_format("repr=12",
- b'repr=%.2V', None, b'123')
- check_format("repr= 12",
- b'repr=%5.2V', None, b'123')
-
- # test integer formats (%i, %d, %u)
- check_format('010',
- b'%03i', c_int(10))
- check_format('0010',
- b'%0.4i', c_int(10))
- check_format('-123',
- b'%i', c_int(-123))
- check_format('-123',
- b'%li', c_long(-123))
- check_format('-123',
- b'%lli', c_longlong(-123))
- check_format('-123',
- b'%zi', c_ssize_t(-123))
-
- check_format('-123',
- b'%d', c_int(-123))
- check_format('-123',
- b'%ld', c_long(-123))
- check_format('-123',
- b'%lld', c_longlong(-123))
- check_format('-123',
- b'%zd', c_ssize_t(-123))
-
- check_format('123',
- b'%u', c_uint(123))
- check_format('123',
- b'%lu', c_ulong(123))
- check_format('123',
- b'%llu', c_ulonglong(123))
- check_format('123',
- b'%zu', c_size_t(123))
-
- # test long output
- min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
- max_longlong = -min_longlong - 1
- check_format(str(min_longlong),
- b'%lld', c_longlong(min_longlong))
- check_format(str(max_longlong),
- b'%lld', c_longlong(max_longlong))
- max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
- check_format(str(max_ulonglong),
- b'%llu', c_ulonglong(max_ulonglong))
- PyUnicode_FromFormat(b'%p', c_void_p(-1))
-
- # test padding (width and/or precision)
- check_format('123'.rjust(10, '0'),
- b'%010i', c_int(123))
- check_format('123'.rjust(100),
- b'%100i', c_int(123))
- check_format('123'.rjust(100, '0'),
- b'%.100i', c_int(123))
- check_format('123'.rjust(80, '0').rjust(100),
- b'%100.80i', c_int(123))
-
- check_format('123'.rjust(10, '0'),
- b'%010u', c_uint(123))
- check_format('123'.rjust(100),
- b'%100u', c_uint(123))
- check_format('123'.rjust(100, '0'),
- b'%.100u', c_uint(123))
- check_format('123'.rjust(80, '0').rjust(100),
- b'%100.80u', c_uint(123))
-
- check_format('123'.rjust(10, '0'),
- b'%010x', c_int(0x123))
- check_format('123'.rjust(100),
- b'%100x', c_int(0x123))
- check_format('123'.rjust(100, '0'),
- b'%.100x', c_int(0x123))
- check_format('123'.rjust(80, '0').rjust(100),
- b'%100.80x', c_int(0x123))
-
- # test %A
- check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
- b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
-
- # test %V
- check_format('repr=abc',
- b'repr=%V', 'abc', b'xyz')
-
- # test %p
- # We cannot test the exact result,
- # because it returns a hex representation of a C pointer,
- # which is going to be different each time. But, we can test the format.
- p_format_regex = r'^0x[a-zA-Z0-9]{3,}$'
- p_format1 = PyUnicode_FromFormat(b'%p', 'abc')
- self.assertIsInstance(p_format1, str)
- self.assertRegex(p_format1, p_format_regex)
-
- p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz')
- self.assertIsInstance(p_format2, str)
- self.assertRegex(p_format2,
- r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}')
-
- # Extra args are ignored:
- p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz')
- self.assertIsInstance(p_format3, str)
- self.assertRegex(p_format3, p_format_regex)
-
- # Test string decode from parameter of %s using utf-8.
- # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
- # '\u4eba\u6c11'
- check_format('repr=\u4eba\u6c11',
- b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
-
- #Test replace error handler.
- check_format('repr=abc\ufffd',
- b'repr=%V', None, b'abc\xff')
-
- # not supported: copy the raw format string. these tests are just here
- # to check for crashes and should not be considered as specifications
- check_format('%s',
- b'%1%s', b'abc')
- check_format('%1abc',
- b'%1abc')
- check_format('%+i',
- b'%+i', c_int(10))
- check_format('%.%s',
- b'%.%s', b'abc')
-
- # Issue #33817: empty strings
- check_format('',
- b'')
- check_format('',
- b'%s', b'')
-
- # Test PyUnicode_AsWideChar()
- @support.cpython_only
- def test_aswidechar(self):
- from _testcapi import unicode_aswidechar
- import_helper.import_module('ctypes')
- from ctypes import c_wchar, sizeof
-
- wchar, size = unicode_aswidechar('abcdef', 2)
- self.assertEqual(size, 2)
- self.assertEqual(wchar, 'ab')
-
- wchar, size = unicode_aswidechar('abc', 3)
- self.assertEqual(size, 3)
- self.assertEqual(wchar, 'abc')
-
- wchar, size = unicode_aswidechar('abc', 4)
- self.assertEqual(size, 3)
- self.assertEqual(wchar, 'abc\0')
-
- wchar, size = unicode_aswidechar('abc', 10)
- self.assertEqual(size, 3)
- self.assertEqual(wchar, 'abc\0')
-
- wchar, size = unicode_aswidechar('abc\0def', 20)
- self.assertEqual(size, 7)
- self.assertEqual(wchar, 'abc\0def\0')
-
- nonbmp = chr(0x10ffff)
- if sizeof(c_wchar) == 2:
- buflen = 3
- nchar = 2
- else: # sizeof(c_wchar) == 4
- buflen = 2
- nchar = 1
- wchar, size = unicode_aswidechar(nonbmp, buflen)
- self.assertEqual(size, nchar)
- self.assertEqual(wchar, nonbmp + '\0')
-
- # Test PyUnicode_AsWideCharString()
- @support.cpython_only
- def test_aswidecharstring(self):
- from _testcapi import unicode_aswidecharstring
- import_helper.import_module('ctypes')
- from ctypes import c_wchar, sizeof
-
- wchar, size = unicode_aswidecharstring('abc')
- self.assertEqual(size, 3)
- self.assertEqual(wchar, 'abc\0')
-
- wchar, size = unicode_aswidecharstring('abc\0def')
- self.assertEqual(size, 7)
- self.assertEqual(wchar, 'abc\0def\0')
-
- nonbmp = chr(0x10ffff)
- if sizeof(c_wchar) == 2:
- nchar = 2
- else: # sizeof(c_wchar) == 4
- nchar = 1
- wchar, size = unicode_aswidecharstring(nonbmp)
- self.assertEqual(size, nchar)
- self.assertEqual(wchar, nonbmp + '\0')
-
- # Test PyUnicode_AsUCS4()
- @support.cpython_only
- def test_asucs4(self):
- from _testcapi import unicode_asucs4
- for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
- 'a\ud800b\udfffc', '\ud834\udd1e']:
- l = len(s)
- self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
- self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
- self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
- self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
- self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
- self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
- s = '\0'.join([s, s])
- self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
- self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
-
- # Test PyUnicode_AsUTF8()
- @support.cpython_only
- def test_asutf8(self):
- from _testcapi import unicode_asutf8
-
- bmp = '\u0100'
- bmp2 = '\uffff'
- nonbmp = chr(0x10ffff)
-
- self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
- self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
- self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
- self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
-
- # Test PyUnicode_AsUTF8AndSize()
- @support.cpython_only
- def test_asutf8andsize(self):
- from _testcapi import unicode_asutf8andsize
-
- bmp = '\u0100'
- bmp2 = '\uffff'
- nonbmp = chr(0x10ffff)
-
- self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
- self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
- self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
- self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
-
- # Test PyUnicode_FindChar()
- @support.cpython_only
- def test_findchar(self):
- from _testcapi import unicode_findchar
-
- for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
- for i, ch in enumerate(str):
- self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
- self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
-
- str = "!>_<!"
- self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
- self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
- # start < end
- self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
- self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
- # start >= end
- self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
- self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
- # negative
- self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
- self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
-
- # Test PyUnicode_CopyCharacters()
- @support.cpython_only
- def test_copycharacters(self):
- from _testcapi import unicode_copycharacters
-
- strings = [
- 'abcde', '\xa1\xa2\xa3\xa4\xa5',
- '\u4f60\u597d\u4e16\u754c\uff01',
- '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
- ]
-
- for idx, from_ in enumerate(strings):
- # wide -> narrow: exceed maxchar limitation
- for to in strings[:idx]:
- self.assertRaises(
- SystemError,
- unicode_copycharacters, to, 0, from_, 0, 5
- )
- # same kind
- for from_start in range(5):
- self.assertEqual(
- unicode_copycharacters(from_, 0, from_, from_start, 5),
- (from_[from_start:from_start+5].ljust(5, '\0'),
- 5-from_start)
- )
- for to_start in range(5):
- self.assertEqual(
- unicode_copycharacters(from_, to_start, from_, to_start, 5),
- (from_[to_start:to_start+5].rjust(5, '\0'),
- 5-to_start)
- )
- # narrow -> wide
- # Tests omitted since this creates invalid strings.
-
- s = strings[0]
- self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
- self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
- self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
- self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
- self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
- self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
- self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
-
- @support.cpython_only
- @support.requires_legacy_unicode_capi
- def test_encode_decimal(self):
- from _testcapi import unicode_encodedecimal
- with warnings_helper.check_warnings():
- warnings.simplefilter('ignore', DeprecationWarning)
- self.assertEqual(unicode_encodedecimal('123'),
- b'123')
- self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
- b'3.14')
- self.assertEqual(unicode_encodedecimal(
- "\N{EM SPACE}3.14\N{EN SPACE}"), b' 3.14 ')
- self.assertRaises(UnicodeEncodeError,
- unicode_encodedecimal, "123\u20ac", "strict")
- self.assertRaisesRegex(
- ValueError,
- "^'decimal' codec can't encode character",
- unicode_encodedecimal, "123\u20ac", "replace")
-
- @support.cpython_only
- @support.requires_legacy_unicode_capi
- def test_transform_decimal(self):
- from _testcapi import unicode_transformdecimaltoascii as transform_decimal
- with warnings_helper.check_warnings():
- warnings.simplefilter('ignore', DeprecationWarning)
- self.assertEqual(transform_decimal('123'),
- '123')
- self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
- '3.14')
- self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
- "\N{EM SPACE}3.14\N{EN SPACE}")
- self.assertEqual(transform_decimal('123\u20ac'),
- '123\u20ac')
-
- @support.cpython_only
- def test_pep393_utf8_caching_bug(self):
- # Issue #25709: Problem with string concatenation and utf-8 cache
- from _testcapi import getargs_s_hash
- for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
- s = ''
- for i in range(5):
- # Due to CPython specific optimization the 's' string can be
- # resized in-place.
- s += chr(k)
- # Parsing with the "s#" format code calls indirectly
- # PyUnicode_AsUTF8AndSize() which creates the UTF-8
- # encoded string cached in the Unicode object.
- self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
- # Check that the second call returns the same result
- self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
-
class StringModuleTest(unittest.TestCase):
def test_formatter_parser(self):
def parse(format):
More information about the Python-checkins mailing list