[Python-checkins] bpo-34485: Add _PyCoreConfig.stdio_encoding (GH-8881)

Victor Stinner webhook-mailer at python.org
Wed Aug 29 05:47:33 EDT 2018


https://github.com/python/cpython/commit/dfe0dc74536dfb6f331131d9b2b49557675bb6b7
commit: dfe0dc74536dfb6f331131d9b2b49557675bb6b7
branch: master
author: Victor Stinner <vstinner at redhat.com>
committer: GitHub <noreply at github.com>
date: 2018-08-29T11:47:29+02:00
summary:

bpo-34485: Add _PyCoreConfig.stdio_encoding (GH-8881)

* Add stdio_encoding and stdio_errors fields to _PyCoreConfig.
* Add unit tests on stdio_encoding and stdio_errors.

files:
M Include/coreconfig.h
M Include/pylifecycle.h
M Lib/test/test_embed.py
M Programs/_testembed.c
M Python/coreconfig.c
M Python/pylifecycle.c

diff --git a/Include/coreconfig.h b/Include/coreconfig.h
index b2799075f930..ffba306a9f8a 100644
--- a/Include/coreconfig.h
+++ b/Include/coreconfig.h
@@ -203,6 +203,18 @@ typedef struct {
        If set to -1 (default), it is set to !Py_UnbufferedStdioFlag. */
     int buffered_stdio;
 
+    /* Encoding of sys.stdin, sys.stdout and sys.stderr.
+       Value set from PYTHONIOENCODING environment variable and
+       Py_SetStandardStreamEncoding() function.
+       See also 'stdio_errors' attribute. */
+    char *stdio_encoding;
+
+    /* Error handler of sys.stdin and sys.stdout.
+       Value set from PYTHONIOENCODING environment variable and
+       Py_SetStandardStreamEncoding() function.
+       See also 'stdio_encoding' attribute. */
+    char *stdio_errors;
+
 #ifdef MS_WINDOWS
     /* If greater than 1, use the "mbcs" encoding instead of the UTF-8
        encoding for the filesystem encoding.
diff --git a/Include/pylifecycle.h b/Include/pylifecycle.h
index 20298277023f..b96db1e38b9f 100644
--- a/Include/pylifecycle.h
+++ b/Include/pylifecycle.h
@@ -179,6 +179,9 @@ PyAPI_FUNC(void) _Py_CoerceLegacyLocale(const _PyCoreConfig *config);
 PyAPI_FUNC(int) _Py_LegacyLocaleDetected(void);
 PyAPI_FUNC(char *) _Py_SetLocaleFromEnv(int category);
 #endif
+#ifdef Py_BUILD_CORE
+PyAPI_FUNC(int) _Py_IsLocaleCoercionTarget(const char *ctype_loc);
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py
index 3922447c645e..2ec9cf3686e4 100644
--- a/Lib/test/test_embed.py
+++ b/Lib/test/test_embed.py
@@ -288,13 +288,29 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
         'quiet': 0,
         'user_site_directory': 1,
         'buffered_stdio': 1,
+        # None means that check_config() gets the expected encoding at runtime
+        'stdio_encoding': None,
+        'stdio_errors': None,
 
         '_install_importlib': 1,
         '_check_hash_pycs_mode': 'default',
         '_frozen': 0,
     }
 
+    def get_stdio_encoding(self, env):
+        code = 'import sys; print(sys.stdout.encoding, sys.stdout.errors)'
+        args = (sys.executable, '-c', code)
+        proc = subprocess.run(args, env=env, text=True,
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.STDOUT)
+        if proc.returncode:
+            raise Exception(f"failed to get the stdio encoding: stdout={proc.stdout!r}")
+        out = proc.stdout.rstrip()
+        return out.split()
+
     def check_config(self, testname, expected):
+        expected = dict(self.DEFAULT_CONFIG, **expected)
+
         env = dict(os.environ)
         for key in list(env):
             if key.startswith('PYTHON'):
@@ -303,13 +319,19 @@ def check_config(self, testname, expected):
         # on the current locale
         env['PYTHONCOERCECLOCALE'] = '0'
         env['PYTHONUTF8'] = '0'
-        out, err = self.run_embedded_interpreter(testname, env=env)
-        # Ignore err
 
-        expected = dict(self.DEFAULT_CONFIG, **expected)
+        if expected['stdio_encoding'] is None or expected['stdio_errors'] is None:
+            res = self.get_stdio_encoding(env)
+            if expected['stdio_encoding'] is None:
+                expected['stdio_encoding'] = res[0]
+            if expected['stdio_errors'] is None:
+                expected['stdio_errors'] = res[1]
         for key, value in expected.items():
             expected[key] = str(value)
 
+        out, err = self.run_embedded_interpreter(testname, env=env)
+        # Ignore err
+
         config = {}
         for line in out.splitlines():
             key, value = line.split(' = ', 1)
@@ -331,7 +353,11 @@ def test_init_global_config(self):
             'verbose': 1,
             'quiet': 1,
             'buffered_stdio': 0,
+
             'utf8_mode': 1,
+            'stdio_encoding': 'utf-8',
+            'stdio_errors': 'surrogateescape',
+
             'user_site_directory': 0,
             '_frozen': 1,
         }
@@ -350,6 +376,8 @@ def test_init_from_config(self):
             'malloc_stats': 1,
 
             'utf8_mode': 1,
+            'stdio_encoding': 'iso8859-1',
+            'stdio_errors': 'replace',
 
             'pycache_prefix': 'conf_pycache_prefix',
             'program_name': './conf_program_name',
@@ -387,6 +415,8 @@ def test_init_env(self):
             'write_bytecode': 0,
             'verbose': 1,
             'buffered_stdio': 0,
+            'stdio_encoding': 'iso8859-1',
+            'stdio_errors': 'replace',
             'user_site_directory': 0,
             'faulthandler': 1,
             'dev_mode': 1,
diff --git a/Programs/_testembed.c b/Programs/_testembed.c
index d0c00cfc6cd4..d5694178b11b 100644
--- a/Programs/_testembed.c
+++ b/Programs/_testembed.c
@@ -374,6 +374,8 @@ dump_config(void)
     printf("user_site_directory = %i\n", config->user_site_directory);
     printf("buffered_stdio = %i\n", config->buffered_stdio);
     ASSERT_EQUAL(config->buffered_stdio, !Py_UnbufferedStdioFlag);
+    printf("stdio_encoding = %s\n", config->stdio_encoding);
+    printf("stdio_errors = %s\n", config->stdio_errors);
 
     /* FIXME: test legacy_windows_fs_encoding */
     /* FIXME: test legacy_windows_stdio */
@@ -532,6 +534,11 @@ static int test_init_from_config(void)
     Py_UnbufferedStdioFlag = 0;
     config.buffered_stdio = 0;
 
+    putenv("PYTHONIOENCODING=cp424");
+    Py_SetStandardStreamEncoding("ascii", "ignore");
+    config.stdio_encoding = "iso8859-1";
+    config.stdio_errors = "replace";
+
     putenv("PYTHONNOUSERSITE=");
     Py_NoUserSiteDirectory = 0;
     config.user_site_directory = 0;
@@ -569,6 +576,7 @@ static void test_init_env_putenvs(void)
     putenv("PYTHONNOUSERSITE=1");
     putenv("PYTHONFAULTHANDLER=1");
     putenv("PYTHONDEVMODE=1");
+    putenv("PYTHONIOENCODING=iso8859-1:replace");
     /* FIXME: test PYTHONWARNINGS */
     /* FIXME: test PYTHONEXECUTABLE */
     /* FIXME: test PYTHONHOME */
diff --git a/Python/coreconfig.c b/Python/coreconfig.c
index 99d703cab92d..00037d973d5d 100644
--- a/Python/coreconfig.c
+++ b/Python/coreconfig.c
@@ -1,6 +1,9 @@
 #include "Python.h"
 #include "internal/pystate.h"
 #include <locale.h>
+#ifdef HAVE_LANGINFO_H
+#  include <langinfo.h>
+#endif
 
 
 #define DECODE_LOCALE_ERR(NAME, LEN) \
@@ -89,8 +92,8 @@ _Py_wstrlist_copy(int len, wchar_t **list)
  * mechanism that attempts to figure out an appropriate IO encoding
  */
 
-char *_Py_StandardStreamEncoding = NULL;
-char *_Py_StandardStreamErrors = NULL;
+static char *_Py_StandardStreamEncoding = NULL;
+static char *_Py_StandardStreamErrors = NULL;
 
 int
 Py_SetStandardStreamEncoding(const char *encoding, const char *errors)
@@ -205,6 +208,9 @@ _PyCoreConfig_Clear(_PyCoreConfig *config)
     CLEAR(config->dll_path);
 #endif
     CLEAR(config->base_exec_prefix);
+
+    CLEAR(config->stdio_encoding);
+    CLEAR(config->stdio_errors);
 #undef CLEAR
 #undef CLEAR_WSTRLIST
 }
@@ -216,6 +222,15 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2)
     _PyCoreConfig_Clear(config);
 
 #define COPY_ATTR(ATTR) config->ATTR = config2->ATTR
+#define COPY_STR_ATTR(ATTR) \
+    do { \
+        if (config2->ATTR != NULL) { \
+            config->ATTR = _PyMem_RawStrdup(config2->ATTR); \
+            if (config->ATTR == NULL) { \
+                return -1; \
+            } \
+        } \
+    } while (0)
 #define COPY_WSTR_ATTR(ATTR) \
     do { \
         if (config2->ATTR != NULL) { \
@@ -287,6 +302,8 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2)
     COPY_ATTR(quiet);
     COPY_ATTR(user_site_directory);
     COPY_ATTR(buffered_stdio);
+    COPY_STR_ATTR(stdio_encoding);
+    COPY_STR_ATTR(stdio_errors);
 #ifdef MS_WINDOWS
     COPY_ATTR(legacy_windows_fs_encoding);
     COPY_ATTR(legacy_windows_stdio);
@@ -932,6 +949,161 @@ config_init_locale(_PyCoreConfig *config)
 }
 
 
+static const char *
+get_stdio_errors(const _PyCoreConfig *config)
+{
+#ifndef MS_WINDOWS
+    const char *loc = setlocale(LC_CTYPE, NULL);
+    if (loc != NULL) {
+        /* surrogateescape is the default in the legacy C and POSIX locales */
+        if (strcmp(loc, "C") == 0 || strcmp(loc, "POSIX") == 0) {
+            return "surrogateescape";
+        }
+
+#ifdef PY_COERCE_C_LOCALE
+        /* surrogateescape is the default in locale coercion target locales */
+        if (_Py_IsLocaleCoercionTarget(loc)) {
+            return "surrogateescape";
+        }
+#endif
+    }
+
+    return "strict";
+#else
+    /* On Windows, always use surrogateescape by default */
+    return "surrogateescape";
+#endif
+}
+
+
+_PyInitError
+_Py_get_locale_encoding(char **locale_encoding)
+{
+#ifdef MS_WINDOWS
+    char encoding[20];
+    PyOS_snprintf(encoding, sizeof(encoding), "cp%d", GetACP());
+#elif defined(__ANDROID__)
+    const char *encoding = "UTF-8";
+#else
+    const char *encoding = nl_langinfo(CODESET);
+    if (!encoding || encoding[0] == '\0') {
+        return _Py_INIT_USER_ERR("failed to get the locale encoding: "
+                                 "nl_langinfo(CODESET) failed");
+    }
+#endif
+    *locale_encoding = _PyMem_RawStrdup(encoding);
+    if (*locale_encoding == NULL) {
+        return _Py_INIT_NO_MEMORY();
+    }
+    return _Py_INIT_OK();
+}
+
+
+static _PyInitError
+config_init_stdio_encoding(_PyCoreConfig *config)
+{
+    /* If Py_SetStandardStreamEncoding() has been called, use these
+       parameters. */
+    if (config->stdio_encoding == NULL && _Py_StandardStreamEncoding != NULL) {
+        config->stdio_encoding = _PyMem_RawStrdup(_Py_StandardStreamEncoding);
+        if (config->stdio_encoding == NULL) {
+            return _Py_INIT_NO_MEMORY();
+        }
+    }
+
+    if (config->stdio_errors == NULL && _Py_StandardStreamErrors != NULL) {
+        config->stdio_errors = _PyMem_RawStrdup(_Py_StandardStreamErrors);
+        if (config->stdio_errors == NULL) {
+            return _Py_INIT_NO_MEMORY();
+        }
+    }
+
+    if (config->stdio_encoding != NULL && config->stdio_errors != NULL) {
+        return _Py_INIT_OK();
+    }
+
+    /* PYTHONIOENCODING environment variable */
+    const char *opt = _PyCoreConfig_GetEnv(config, "PYTHONIOENCODING");
+    if (opt) {
+        char *pythonioencoding = _PyMem_RawStrdup(opt);
+        if (pythonioencoding == NULL) {
+            return _Py_INIT_NO_MEMORY();
+        }
+
+        char *err = strchr(pythonioencoding, ':');
+        if (err) {
+            *err = '\0';
+            err++;
+            if (!err[0]) {
+                err = NULL;
+            }
+        }
+
+        /* Does PYTHONIOENCODING contain an encoding? */
+        if (pythonioencoding[0]) {
+            if (config->stdio_encoding == NULL) {
+                config->stdio_encoding = _PyMem_RawStrdup(pythonioencoding);
+                if (config->stdio_encoding == NULL) {
+                    PyMem_RawFree(pythonioencoding);
+                    return _Py_INIT_NO_MEMORY();
+                }
+            }
+
+            /* If the encoding is set but not the error handler,
+               use "strict" error handler by default.
+               PYTHONIOENCODING=latin1 behaves as
+               PYTHONIOENCODING=latin1:strict. */
+            if (!err) {
+                err = "strict";
+            }
+        }
+
+        if (config->stdio_errors == NULL && err != NULL) {
+            config->stdio_errors = _PyMem_RawStrdup(err);
+            if (config->stdio_errors == NULL) {
+                PyMem_RawFree(pythonioencoding);
+                return _Py_INIT_NO_MEMORY();
+            }
+        }
+
+        PyMem_RawFree(pythonioencoding);
+    }
+
+    /* UTF-8 Mode uses UTF-8/surrogateescape */
+    if (config->utf8_mode) {
+        if (config->stdio_encoding == NULL) {
+            config->stdio_encoding = _PyMem_RawStrdup("utf-8");
+            if (config->stdio_encoding == NULL) {
+                return _Py_INIT_NO_MEMORY();
+            }
+        }
+        if (config->stdio_errors == NULL) {
+            config->stdio_errors = _PyMem_RawStrdup("surrogateescape");
+            if (config->stdio_errors == NULL) {
+                return _Py_INIT_NO_MEMORY();
+            }
+        }
+    }
+
+    /* Choose the default encoding and error handler from the locale. */
+    if (config->stdio_encoding == NULL) {
+        _PyInitError err = _Py_get_locale_encoding(&config->stdio_encoding);
+        if (_Py_INIT_FAILED(err)) {
+            return err;
+        }
+    }
+    if (config->stdio_errors == NULL) {
+        const char *errors = get_stdio_errors(config);
+        config->stdio_errors = _PyMem_RawStrdup(errors);
+        if (config->stdio_errors == NULL) {
+            return _Py_INIT_NO_MEMORY();
+        }
+    }
+
+    return _Py_INIT_OK();
+}
+
+
 /* Read configuration settings from standard locations
  *
  * This function doesn't make any changes to the interpreter state - it
@@ -1044,6 +1216,11 @@ _PyCoreConfig_Read(_PyCoreConfig *config)
         config->argc = 0;
     }
 
+    err = config_init_stdio_encoding(config);
+    if (_Py_INIT_FAILED(err)) {
+        return err;
+    }
+
     assert(config->coerce_c_locale >= 0);
     assert(config->use_environment >= 0);
 
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 88403f4cbe5d..9f6757fe808d 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -184,27 +184,6 @@ get_codec_name(const char *encoding)
     return NULL;
 }
 
-static _PyInitError
-get_locale_encoding(char **locale_encoding)
-{
-#ifdef MS_WINDOWS
-    char encoding[20];
-    PyOS_snprintf(encoding, sizeof(encoding), "cp%d", GetACP());
-#elif defined(__ANDROID__)
-    const char *encoding = "UTF-8";
-#else
-    const char *encoding = nl_langinfo(CODESET);
-    if (!encoding || encoding[0] == '\0') {
-        return _Py_INIT_USER_ERR("failed to get the locale encoding: "
-                                 "nl_langinfo(CODESET) failed");
-    }
-#endif
-    *locale_encoding = _PyMem_RawStrdup(encoding);
-    if (*locale_encoding == NULL) {
-        return _Py_INIT_NO_MEMORY();
-    }
-    return _Py_INIT_OK();
-}
 
 static _PyInitError
 initimport(PyInterpreterState *interp, PyObject *sysmod)
@@ -340,35 +319,20 @@ static _LocaleCoercionTarget _TARGET_LOCALES[] = {
     {NULL}
 };
 
-static const char *
-get_stdio_errors(void)
-{
-#ifndef MS_WINDOWS
-    const char *ctype_loc = setlocale(LC_CTYPE, NULL);
-    if (ctype_loc != NULL) {
-        /* surrogateescape is the default in the legacy C and POSIX locales */
-        if (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0) {
-            return "surrogateescape";
-        }
 
-#ifdef PY_COERCE_C_LOCALE
-        /* surrogateescape is the default in locale coercion target locales */
-        const _LocaleCoercionTarget *target = NULL;
-        for (target = _TARGET_LOCALES; target->locale_name; target++) {
-            if (strcmp(ctype_loc, target->locale_name) == 0) {
-                return "surrogateescape";
-            }
+int
+_Py_IsLocaleCoercionTarget(const char *ctype_loc)
+{
+    const _LocaleCoercionTarget *target = NULL;
+    for (target = _TARGET_LOCALES; target->locale_name; target++) {
+        if (strcmp(ctype_loc, target->locale_name) == 0) {
+            return 1;
         }
-#endif
     }
-
-    return "strict";
-#else
-    /* On Windows, always use surrogateescape by default */
-    return "surrogateescape";
-#endif
+    return 0;
 }
 
+
 #ifdef PY_COERCE_C_LOCALE
 static const char C_LOCALE_COERCION_WARNING[] =
     "Python detected LC_CTYPE=C: LC_CTYPE coerced to %.20s (set another locale "
@@ -1533,8 +1497,10 @@ initfsencoding(PyInterpreterState *interp)
             Py_HasFileSystemDefaultEncoding = 1;
         }
         else {
+            extern _PyInitError _Py_get_locale_encoding(char **locale_encoding);
+
             char *locale_encoding;
-            _PyInitError err = get_locale_encoding(&locale_encoding);
+            _PyInitError err = _Py_get_locale_encoding(&locale_encoding);
             if (_Py_INIT_FAILED(err)) {
                 return err;
             }
@@ -1740,13 +1706,16 @@ init_sys_streams(PyInterpreterState *interp)
     PyObject *std = NULL;
     int fd;
     PyObject * encoding_attr;
-    char *pythonioencoding = NULL;
-    const char *encoding, *errors;
-    char *locale_encoding = NULL;
-    char *codec_name = NULL;
     _PyInitError res = _Py_INIT_OK();
-    extern char *_Py_StandardStreamEncoding;
-    extern char *_Py_StandardStreamErrors;
+    _PyCoreConfig *config = &interp->core_config;
+
+    char *codec_name = get_codec_name(config->stdio_encoding);
+    if (codec_name == NULL) {
+        return _Py_INIT_ERR("failed to get the Python codec name "
+                            "of the stdio encoding");
+    }
+    PyMem_RawFree(config->stdio_encoding);
+    config->stdio_encoding = codec_name;
 
     /* Hack to avoid a nasty recursion issue when Python is invoked
        in verbose mode: pre-import the Latin-1 and UTF-8 codecs */
@@ -1778,85 +1747,15 @@ init_sys_streams(PyInterpreterState *interp)
     }
     Py_DECREF(wrapper);
 
-    encoding = _Py_StandardStreamEncoding;
-    errors = _Py_StandardStreamErrors;
-    if (!encoding || !errors) {
-        char *opt = Py_GETENV("PYTHONIOENCODING");
-        if (opt && opt[0] != '\0') {
-            char *err;
-            pythonioencoding = _PyMem_Strdup(opt);
-            if (pythonioencoding == NULL) {
-                PyErr_NoMemory();
-                goto error;
-            }
-            err = strchr(pythonioencoding, ':');
-            if (err) {
-                *err = '\0';
-                err++;
-                if (!err[0]) {
-                    err = NULL;
-                }
-            }
-
-            /* Does PYTHONIOENCODING contain an encoding? */
-            if (pythonioencoding[0]) {
-                if (!encoding) {
-                    encoding = pythonioencoding;
-                }
-
-                /* If the encoding is set but not the error handler,
-                   use "strict" error handler by default.
-                   PYTHONIOENCODING=latin1 behaves as
-                   PYTHONIOENCODING=latin1:strict. */
-                if (!err) {
-                    err = "strict";
-                }
-            }
-
-            if (!errors && err != NULL) {
-                errors = err;
-            }
-        }
-
-        if (interp->core_config.utf8_mode) {
-            if (!encoding) {
-                encoding = "utf-8";
-            }
-            if (!errors) {
-                errors = "surrogateescape";
-            }
-        }
-
-        if (!errors) {
-            /* Choose the default error handler based on the current locale */
-            errors = get_stdio_errors();
-        }
-    }
-
-    if (encoding == NULL) {
-        _PyInitError err = get_locale_encoding(&locale_encoding);
-        if (_Py_INIT_FAILED(err)) {
-            return err;
-        }
-        encoding = locale_encoding;
-    }
-
-    codec_name = get_codec_name(encoding);
-    if (codec_name == NULL) {
-        PyErr_SetString(PyExc_RuntimeError,
-                        "failed to get the Python codec name "
-                        "of stdio encoding");
-        goto error;
-    }
-    encoding = codec_name;
-
     /* Set sys.stdin */
     fd = fileno(stdin);
     /* Under some conditions stdin, stdout and stderr may not be connected
      * and fileno() may point to an invalid file descriptor. For example
      * GUI apps don't have valid standard streams by default.
      */
-    std = create_stdio(iomod, fd, 0, "<stdin>", encoding, errors);
+    std = create_stdio(iomod, fd, 0, "<stdin>",
+                       config->stdio_encoding,
+                       config->stdio_errors);
     if (std == NULL)
         goto error;
     PySys_SetObject("__stdin__", std);
@@ -1865,7 +1764,9 @@ init_sys_streams(PyInterpreterState *interp)
 
     /* Set sys.stdout */
     fd = fileno(stdout);
-    std = create_stdio(iomod, fd, 1, "<stdout>", encoding, errors);
+    std = create_stdio(iomod, fd, 1, "<stdout>",
+                       config->stdio_encoding,
+                       config->stdio_errors);
     if (std == NULL)
         goto error;
     PySys_SetObject("__stdout__", std);
@@ -1875,7 +1776,9 @@ init_sys_streams(PyInterpreterState *interp)
 #if 1 /* Disable this if you have trouble debugging bootstrap stuff */
     /* Set sys.stderr, replaces the preliminary stderr */
     fd = fileno(stderr);
-    std = create_stdio(iomod, fd, 1, "<stderr>", encoding, "backslashreplace");
+    std = create_stdio(iomod, fd, 1, "<stderr>",
+                       config->stdio_encoding,
+                       "backslashreplace");
     if (std == NULL)
         goto error;
 
@@ -1911,9 +1814,6 @@ init_sys_streams(PyInterpreterState *interp)
 done:
     _Py_ClearStandardStreamEncoding();
 
-    PyMem_RawFree(locale_encoding);
-    PyMem_RawFree(codec_name);
-    PyMem_Free(pythonioencoding);
     Py_XDECREF(bimod);
     Py_XDECREF(iomod);
     return res;



More information about the Python-checkins mailing list