[pypy-commit] pypy code_page-utf8: test, fix encoding code_pages

Tue Nov 12 22:35:54 EST 2019

Author: Matti Picus <matti.picus at gmail.com>
Branch: code_page-utf8
Changeset: r98040:5db4b2b481b5
Date: 2019-11-12 22:01 -0500
http://bitbucket.org/pypy/pypy/changeset/5db4b2b481b5/

Log:	test, fix encoding code_pages

diff --git a/pypy/interpreter/unicodehelper_win32.py b/pypy/interpreter/unicodehelper_win32.py
--- a/pypy/interpreter/unicodehelper_win32.py
+++ b/pypy/interpreter/unicodehelper_win32.py
@@ -163,12 +163,18 @@
                 charsize = 1
             else:
                 chars[0] = Py_UNICODE_HIGH_SURROGATE(uni)
-                chars[0] = Py_UNICODE_LOW_SURROGATE(uni)
+                chars[1] = Py_UNICODE_LOW_SURROGATE(uni)
                 charsize = 2
                 # first get the size of the result
             outsize = WideCharToMultiByte(cp, flags, chars, charsize, None, 0,
                                            None, used_default_p)
+            
             if outsize == 0:
+                if rwin32.GetLastError_saved() == rwin32.ERROR_NO_UNICODE_TRANSLATION:
+                    r, pos, retype = errorhandler(errors, name,
+                                       "invalid character", s, pos, pos+1)
+                    res.append(r)
+                    continue
                 raise rwin32.lastSavedWindowsError()
             # If we used a default char, then we failed!
             if (used_default_p and rffi.cast(lltype.Bool, used_default_p[0])):
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -1,4 +1,5 @@
 import sys
+import pytest
 
 class AppTestCodecs:
     spaceconfig = {
@@ -525,6 +526,57 @@
             ):
             check_decode(1252, test)
 
+    def test_encode_65001(self):
+        tests = [
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
+            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
+            ('\udc80', 'strict', None),
+            ('\udc80', 'ignore', b''),
+            ('\udc80', 'replace', b'?'),
+            ('\udc80', 'backslashreplace', b'\\udc80'),
+            ('\udc80', 'namereplace', b'\\udc80'),
+            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
+        ]
+        for text, errors, expected in tests:
+            if expected is not None:
+                try:
+                    encoded = text.encode('cp65001', errors)
+                except UnicodeEncodeError as err:
+                    assert False, ('Unable to encode %a to cp65001 with '
+                              'errors=%r: %s' % (text, errors, err))
+                assert encoded ==expected, ('%a.encode("cp65001", %r)=%a != %a'
+                    % (text, errors, encoded, expected))
+            else:
+                raises(UnicodeEncodeError, text.encode, "cp65001", errors)
+
+    def test_decode_65001(self):
+        tests = [
+            (b'abc', 'strict', 'abc'),
+            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
+            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
+            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
+            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
+            # invalid bytes
+            (b'[\xff]', 'strict', None),
+            (b'[\xff]', 'ignore', '[]'),
+            (b'[\xff]', 'replace', '[\ufffd]'),
+            (b'[\xff]', 'surrogateescape', '[\udcff]'),
+            (b'[\xed\xb2\x80]', 'strict', None),
+            (b'[\xed\xb2\x80]', 'ignore', '[]'),
+            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
+        ]
+        for raw, errors, expected in tests:
+            if expected is not None:
+                try:
+                    decoded = raw.decode('cp65001', errors)
+                except UnicodeDecodeError as err:
+                    assert False, ('Unable to decode %a from cp65001 with '
+                              'errors=%r: %s' % (raw, errors, err))
+                assert decoded == expected, ('%a.decode("cp65001", %r)=%a != %a'
+                    % (raw, errors, decoded, expected))
+            else:
+                raises(UnicodeDecodeError, raw.decode, 'cp65001', errors)
 
 
 class AppTestPartialEvaluation: