[Python-checkins] bpo-34523: Py_DecodeLocale() use UTF-8 on Windows (GH-8998)

Victor Stinner webhook-mailer at python.org
Wed Aug 29 13:32:54 EDT 2018


https://github.com/python/cpython/commit/c5989cd87659acbfd4d19dc00dbe99c3a0fc9bd2
commit: c5989cd87659acbfd4d19dc00dbe99c3a0fc9bd2
branch: master
author: Victor Stinner <vstinner at redhat.com>
committer: GitHub <noreply at github.com>
date: 2018-08-29T19:32:47+02:00
summary:

bpo-34523: Py_DecodeLocale() use UTF-8 on Windows (GH-8998)

Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding on
Windows if Py_LegacyWindowsFSEncodingFlag is zero.

pymain_read_conf() now sets Py_LegacyWindowsFSEncodingFlag in its
loop, but restores its value at exit.

files:
A Misc/NEWS.d/next/C API/2018-08-29-18-48-47.bpo-34523.lLQ8rh.rst
M Doc/c-api/sys.rst
M Lib/test/test_embed.py
M Modules/main.c
M Python/fileutils.c

diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst
index 994509aa50f2..0eee35a1285c 100644
--- a/Doc/c-api/sys.rst
+++ b/Doc/c-api/sys.rst
@@ -109,6 +109,7 @@ Operating System Utilities
    Encoding, highest priority to lowest priority:
 
    * ``UTF-8`` on macOS and Android;
+   * ``UTF-8`` on Windows if :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero;
    * ``UTF-8`` if the Python UTF-8 mode is enabled;
    * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
      ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
@@ -140,6 +141,10 @@ Operating System Utilities
    .. versionchanged:: 3.7
       The function now uses the UTF-8 encoding in the UTF-8 mode.
 
+   .. versionchanged:: 3.8
+      The function now uses the UTF-8 encoding on Windows if
+      :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero.
+
 
 .. c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
 
@@ -150,6 +155,7 @@ Operating System Utilities
    Encoding, highest priority to lowest priority:
 
    * ``UTF-8`` on macOS and Android;
+   * ``UTF-8`` on Windows if :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero;
    * ``UTF-8`` if the Python UTF-8 mode is enabled;
    * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
      ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
@@ -169,9 +175,6 @@ Operating System Utilities
    Use the :c:func:`Py_DecodeLocale` function to decode the bytes string back
    to a wide character string.
 
-   .. versionchanged:: 3.7
-      The function now uses the UTF-8 encoding in the UTF-8 mode.
-
    .. seealso::
 
       The :c:func:`PyUnicode_EncodeFSDefault` and
@@ -180,7 +183,11 @@ Operating System Utilities
    .. versionadded:: 3.5
 
    .. versionchanged:: 3.7
-      The function now supports the UTF-8 mode.
+      The function now uses the UTF-8 encoding in the UTF-8 mode.
+
+   .. versionchanged:: 3.8
+      The function now uses the UTF-8 encoding on Windows if
+      :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero.
 
 
 .. _systemfunctions:
diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py
index b6311e4b334b..9155c40f405e 100644
--- a/Lib/test/test_embed.py
+++ b/Lib/test/test_embed.py
@@ -268,10 +268,10 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
         'dump_refs': 0,
         'malloc_stats': 0,
 
-        # None means that the default encoding is read at runtime:
-        # see get_locale_encoding().
+        # None means that the value is obtained by get_filesystem_encoding()
         'filesystem_encoding': None,
-        'filesystem_errors': sys.getfilesystemencodeerrors(),
+        'filesystem_errors': None,
+
         'utf8_mode': 0,
         'coerce_c_locale': 0,
         'coerce_c_locale_warn': 0,
@@ -294,7 +294,8 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
         'quiet': 0,
         'user_site_directory': 1,
         'buffered_stdio': 1,
-        # None means that check_config() gets the expected encoding at runtime
+
+        # None means that the value is obtained by get_stdio_encoding()
         'stdio_encoding': None,
         'stdio_errors': None,
 
@@ -303,7 +304,6 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
         '_frozen': 0,
     }
 
-
     def get_stdio_encoding(self, env):
         code = 'import sys; print(sys.stdout.encoding, sys.stdout.errors)'
         args = (sys.executable, '-c', code)
@@ -315,18 +315,12 @@ def get_stdio_encoding(self, env):
         out = proc.stdout.rstrip()
         return out.split()
 
-    def get_locale_encoding(self, isolated):
-        if sys.platform in ('win32', 'darwin') or support.is_android:
-            # Windows, macOS and Android use UTF-8
-            return "utf-8"
-
-        code = ('import codecs, locale, sys',
-                'locale.setlocale(locale.LC_CTYPE, "")',
-                'enc = locale.nl_langinfo(locale.CODESET)',
-                'enc = codecs.lookup(enc).name',
-                'print(enc)')
-        args = (sys.executable, '-c', '; '.join(code))
-        env = dict(os.environ)
+    def get_filesystem_encoding(self, isolated, env):
+        code = ('import codecs, locale, sys; '
+                'print(sys.getfilesystemencoding(), '
+                'sys.getfilesystemencodeerrors())')
+        args = (sys.executable, '-c', code)
+        env = dict(env)
         if not isolated:
             env['PYTHONCOERCECLOCALE'] = '0'
             env['PYTHONUTF8'] = '0'
@@ -336,7 +330,8 @@ def get_locale_encoding(self, isolated):
         if proc.returncode:
             raise Exception(f"failed to get the locale encoding: "
                             f"stdout={proc.stdout!r} stderr={proc.stderr!r}")
-        return proc.stdout.rstrip()
+        out = proc.stdout.rstrip()
+        return out.split()
 
     def check_config(self, testname, expected):
         expected = dict(self.DEFAULT_CONFIG, **expected)
@@ -356,8 +351,12 @@ def check_config(self, testname, expected):
                 expected['stdio_encoding'] = res[0]
             if expected['stdio_errors'] is None:
                 expected['stdio_errors'] = res[1]
-        if expected['filesystem_encoding'] is None:
-            expected['filesystem_encoding'] = self.get_locale_encoding(expected['isolated'])
+        if expected['filesystem_encoding'] is None or expected['filesystem_errors'] is None:
+            res = self.get_filesystem_encoding(expected['isolated'], env)
+            if expected['filesystem_encoding'] is None:
+                expected['filesystem_encoding'] = res[0]
+            if expected['filesystem_errors'] is None:
+                expected['filesystem_errors'] = res[1]
         for key, value in expected.items():
             expected[key] = str(value)
 
diff --git a/Misc/NEWS.d/next/C API/2018-08-29-18-48-47.bpo-34523.lLQ8rh.rst b/Misc/NEWS.d/next/C API/2018-08-29-18-48-47.bpo-34523.lLQ8rh.rst
new file mode 100644
index 000000000000..95368f1c6847
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2018-08-29-18-48-47.bpo-34523.lLQ8rh.rst	
@@ -0,0 +1,2 @@
+Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding on
+Windows if Py_LegacyWindowsFSEncodingFlag is zero.
diff --git a/Modules/main.c b/Modules/main.c
index 2e9e23b652f3..bf7290a54a45 100644
--- a/Modules/main.c
+++ b/Modules/main.c
@@ -1287,6 +1287,9 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config,
                  _PyCmdline *cmdline)
 {
     int init_utf8_mode = Py_UTF8Mode;
+#ifdef MS_WINDOWS
+    int init_legacy_encoding = Py_LegacyWindowsFSEncodingFlag;
+#endif
     _PyCoreConfig save_config = _PyCoreConfig_INIT;
     int res = -1;
 
@@ -1313,9 +1316,12 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config,
             goto done;
         }
 
-        /* bpo-34207: Py_DecodeLocale(), Py_EncodeLocale() and similar
-           functions depend on Py_UTF8Mode. */
+        /* bpo-34207: Py_DecodeLocale() and Py_EncodeLocale() depend
+           on Py_UTF8Mode and Py_LegacyWindowsFSEncodingFlag. */
         Py_UTF8Mode = config->utf8_mode;
+#ifdef MS_WINDOWS
+        Py_LegacyWindowsFSEncodingFlag = config->legacy_windows_fs_encoding;
+#endif
 
         if (pymain_init_cmdline_argv(pymain, config, cmdline) < 0) {
             goto done;
@@ -1380,6 +1386,9 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config,
 done:
     _PyCoreConfig_Clear(&save_config);
     Py_UTF8Mode = init_utf8_mode ;
+#ifdef MS_WINDOWS
+    Py_LegacyWindowsFSEncodingFlag = init_legacy_encoding;
+#endif
     return res;
 }
 
diff --git a/Python/fileutils.c b/Python/fileutils.c
index e756c260cdcc..9a3c334d43bf 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -499,9 +499,13 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
     return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
                             surrogateescape);
 #else
-    if (Py_UTF8Mode == 1) {
-        return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
-                                surrogateescape);
+    int use_utf8 = (Py_UTF8Mode == 1);
+#ifdef MS_WINDOWS
+    use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
+#endif
+    if (use_utf8) {
+        return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen,
+                                reason, surrogateescape);
     }
 
 #ifdef USE_FORCE_ASCII
@@ -661,7 +665,11 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
     return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
                             raw_malloc, surrogateescape);
 #else   /* __APPLE__ */
-    if (Py_UTF8Mode == 1) {
+    int use_utf8 = (Py_UTF8Mode == 1);
+#ifdef MS_WINDOWS
+    use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
+#endif
+    if (use_utf8) {
         return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
                                 raw_malloc, surrogateescape);
     }



More information about the Python-checkins mailing list