[Python-checkins] bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)

Victor Stinner webhook-mailer at python.org
Wed Dec 13 06:29:12 EST 2017


https://github.com/python/cpython/commit/91106cd9ff2f321c0f60fbaa09fd46c80aa5c266
commit: 91106cd9ff2f321c0f60fbaa09fd46c80aa5c266
branch: master
author: Victor Stinner <victor.stinner at gmail.com>
committer: GitHub <noreply at github.com>
date: 2017-12-13T12:29:09+01:00
summary:

bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)

* Add -X utf8 command line option, PYTHONUTF8 environment variable
  and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup: enable automatically the
  UTF-8 mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls
  _winapi.GetACP() to get the ANSI code page
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8
  mode. As a side effect, open() now uses the UTF-8 encoding by
  default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding
  in the UTF-8 Mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8
* Skip some tests relying on the current locale if the UTF-8 mode is
  enabled.
* Add test_utf8mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to
  return also the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now
  always copy flag values, rather than only copying if the new value
  is greater than the old value.

files:
A Lib/test/test_utf8_mode.py
A Misc/NEWS.d/next/Core and Builtins/2017-12-05-23-10-58.bpo-29240.qpJP5l.rst
M Doc/c-api/sys.rst
M Doc/library/locale.rst
M Doc/library/sys.rst
M Doc/using/cmdline.rst
M Doc/whatsnew/3.7.rst
M Include/fileobject.h
M Include/pystate.h
M Lib/_bootlocale.py
M Lib/encodings/__init__.py
M Lib/locale.py
M Lib/subprocess.py
M Lib/test/test_builtin.py
M Lib/test/test_c_locale_coercion.py
M Lib/test/test_codecs.py
M Lib/test/test_io.py
M Lib/test/test_sys.py
M Modules/_winapi.c
M Modules/clinic/_winapi.c.h
M Modules/main.c
M Objects/unicodeobject.c
M Programs/python.c
M Python/bltinmodule.c
M Python/fileutils.c
M Python/pylifecycle.c
M Python/sysmodule.c

diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst
index 95d9d657ce9..20bc7bd3df3 100644
--- a/Doc/c-api/sys.rst
+++ b/Doc/c-api/sys.rst
@@ -127,6 +127,9 @@ Operating System Utilities
 
    .. versionadded:: 3.5
 
+   .. versionchanged:: 3.7
+      The function now uses the UTF-8 encoding in the UTF-8 mode.
+
 
 .. c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
 
@@ -138,12 +141,15 @@ Operating System Utilities
    to free the memory. Return ``NULL`` on encoding error or memory allocation
    error
 
-   If error_pos is not ``NULL``, ``*error_pos`` is set to the index of the
-   invalid character on encoding error, or set to ``(size_t)-1`` otherwise.
+   If error_pos is not ``NULL``, ``*error_pos`` is set to ``(size_t)-1`` on
+   success, or set to the index of the invalid character on encoding error.
 
    Use the :c:func:`Py_DecodeLocale` function to decode the bytes string back
    to a wide character string.
 
+   .. versionchanged:: 3.7
+      The function now uses the UTF-8 encoding in the UTF-8 mode.
+
    .. seealso::
 
       The :c:func:`PyUnicode_EncodeFSDefault` and
@@ -151,6 +157,9 @@ Operating System Utilities
 
    .. versionadded:: 3.5
 
+   .. versionchanged:: 3.7
+      The function now supports the UTF-8 mode.
+
 
 .. _systemfunctions:
 
diff --git a/Doc/library/locale.rst b/Doc/library/locale.rst
index e8567a7b658..7da94a23964 100644
--- a/Doc/library/locale.rst
+++ b/Doc/library/locale.rst
@@ -316,6 +316,13 @@ The :mod:`locale` module defines the following exception and functions:
    preferences, so this function is not thread-safe. If invoking setlocale is not
    necessary or desired, *do_setlocale* should be set to ``False``.
 
+   On Android or in the UTF-8 mode (:option:`-X` ``utf8`` option), always
+   return ``'UTF-8'``; the locale and the *do_setlocale* argument are ignored.
+
+   .. versionchanged:: 3.7
+      The function now always returns ``UTF-8`` on Android or if the UTF-8 mode
+      is enabled.
+
 
 .. function:: normalize(localename)
 
diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst
index 9e47681804b..957d02b2a30 100644
--- a/Doc/library/sys.rst
+++ b/Doc/library/sys.rst
@@ -313,6 +313,9 @@ always available.
       has caught :exc:`SystemExit` (such as an error flushing buffered data
       in the standard streams), the exit status is changed to 120.
 
+   .. versionchanged:: 3.7
+      Added ``utf8_mode`` attribute for the new :option:`-X` ``utf8`` flag.
+
 
 .. data:: flags
 
@@ -335,6 +338,7 @@ always available.
    :const:`quiet`                :option:`-q`
    :const:`hash_randomization`   :option:`-R`
    :const:`dev_mode`             :option:`-X` ``dev``
+   :const:`utf8_mode`            :option:`-X` ``utf8``
    ============================= =============================
 
    .. versionchanged:: 3.2
@@ -347,7 +351,8 @@ always available.
       Removed obsolete ``division_warning`` attribute.
 
    .. versionchanged:: 3.7
-      Added ``dev_mode`` attribute for the new :option:`-X` ``dev`` flag.
+      Added ``dev_mode`` attribute for the new :option:`-X` ``dev`` flag
+      and ``utf8_mode`` attribute for the new :option:`-X` ``utf8`` flag.
 
 
 .. data:: float_info
@@ -492,6 +497,8 @@ always available.
    :func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that
    the correct encoding and errors mode are used.
 
+   * In the UTF-8 mode, the encoding is ``'utf-8'`` on any platform.
+
    * On Mac OS X, the encoding is ``'utf-8'``.
 
    * On Unix, the encoding is the locale encoding.
@@ -506,6 +513,10 @@ always available.
       Windows is no longer guaranteed to return ``'mbcs'``. See :pep:`529`
       and :func:`_enablelegacywindowsfsencoding` for more information.
 
+   .. versionchanged:: 3.7
+      Return ``'utf-8'`` in the UTF-8 mode.
+
+
 .. function:: getfilesystemencodeerrors()
 
    Return the name of the error mode used to convert between Unicode filenames
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
index e32f77e7ffc..5cb90717705 100644
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -439,6 +439,9 @@ Miscellaneous options
      * Set the :attr:`~sys.flags.dev_mode` attribute of :attr:`sys.flags` to
        ``True``
 
+   * ``-X utf8`` enables the UTF-8 mode, whereas ``-X utf8=0`` disables the
+     UTF-8 mode.
+
    It also allows passing arbitrary values and retrieving them through the
    :data:`sys._xoptions` dictionary.
 
@@ -455,7 +458,7 @@ Miscellaneous options
       The ``-X showalloccount`` option.
 
    .. versionadded:: 3.7
-      The ``-X importtime`` and ``-X dev`` options.
+      The ``-X importtime``, ``-X dev`` and ``-X utf8`` options.
 
 
 Options you shouldn't use
@@ -816,6 +819,14 @@ conflict.
 
    .. versionadded:: 3.7
 
+.. envvar:: PYTHONUTF8
+
+   If set to ``1``, enable the UTF-8 mode. If set to ``0``, disable the UTF-8
+   mode. Any other non-empty string causes an error.
+
+   .. versionadded:: 3.7
+
+
 Debug-mode variables
 ~~~~~~~~~~~~~~~~~~~~
 
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst
index 58bfaef282c..81a88a0c82e 100644
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@@ -185,6 +185,23 @@ resolution on Linux and Windows.
        PEP written and implemented by Victor Stinner
 
 
+PEP 540: Add a new UTF-8 mode
+-----------------------------
+
+Add a new UTF-8 mode to ignore the locale, use the UTF-8 encoding, and change
+:data:`sys.stdin` and :data:`sys.stdout` error handlers to ``surrogateescape``.
+This mode is enabled by default in the POSIX locale, but otherwise disabled by
+default.
+
+The new :option:`-X` ``utf8`` command line option and :envvar:`PYTHONUTF8`
+environment variable are added to control the UTF-8 mode.
+
+.. seealso::
+
+    :pep:`540` -- Add a new UTF-8 mode
+       PEP written and implemented by Victor Stinner
+
+
 New Development Mode: -X dev
 ----------------------------
 
@@ -353,6 +370,10 @@ Added another argument *monetary* in :meth:`format_string` of :mod:`locale`.
 If *monetary* is true, the conversion uses monetary thousands separator and
 grouping strings. (Contributed by Garvit in :issue:`10379`.)
 
+The :func:`locale.getpreferredencoding` function now always returns ``'UTF-8'``
+on Android or in the UTF-8 mode (:option:`-X` ``utf8`` option); the locale and
+the *do_setlocale* argument are ignored.
+
 math
 ----
 
diff --git a/Include/fileobject.h b/Include/fileobject.h
index 0b1678ee8da..89e8dd6a285 100644
--- a/Include/fileobject.h
+++ b/Include/fileobject.h
@@ -28,6 +28,10 @@ PyAPI_DATA(const char *) Py_FileSystemDefaultEncodeErrors;
 #endif
 PyAPI_DATA(int) Py_HasFileSystemDefaultEncoding;
 
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
+PyAPI_DATA(int) Py_UTF8Mode;
+#endif
+
 /* Internal API
 
    The std printer acts as a preliminary sys.stderr until the new io
diff --git a/Include/pystate.h b/Include/pystate.h
index d149aeb2aff..c7ea179cf7d 100644
--- a/Include/pystate.h
+++ b/Include/pystate.h
@@ -38,6 +38,7 @@ typedef struct {
     int show_alloc_count;   /* -X showalloccount */
     int dump_refs;          /* PYTHONDUMPREFS */
     int malloc_stats;       /* PYTHONMALLOCSTATS */
+    int utf8_mode;          /* -X utf8 or PYTHONUTF8 environment variable */
 } _PyCoreConfig;
 
 #define _PyCoreConfig_INIT (_PyCoreConfig){.use_hash_seed = -1}
diff --git a/Lib/_bootlocale.py b/Lib/_bootlocale.py
index 0c61b0d3a0f..3273a3b4225 100644
--- a/Lib/_bootlocale.py
+++ b/Lib/_bootlocale.py
@@ -9,6 +9,8 @@
 
 if sys.platform.startswith("win"):
     def getpreferredencoding(do_setlocale=True):
+        if sys.flags.utf8_mode:
+            return 'UTF-8'
         return _locale._getdefaultlocale()[1]
 else:
     try:
@@ -21,6 +23,8 @@ def getpreferredencoding(do_setlocale=True):
                 return 'UTF-8'
         else:
             def getpreferredencoding(do_setlocale=True):
+                if sys.flags.utf8_mode:
+                    return 'UTF-8'
                 # This path for legacy systems needs the more complex
                 # getdefaultlocale() function, import the full locale module.
                 import locale
@@ -28,6 +32,8 @@ def getpreferredencoding(do_setlocale=True):
     else:
         def getpreferredencoding(do_setlocale=True):
             assert not do_setlocale
+            if sys.flags.utf8_mode:
+                return 'UTF-8'
             result = _locale.nl_langinfo(_locale.CODESET)
             if not result and sys.platform == 'darwin':
                 # nl_langinfo can return an empty string
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index aa2fb7c2b93..025b7a8da3d 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -158,8 +158,9 @@ def search_function(encoding):
 if sys.platform == 'win32':
     def _alias_mbcs(encoding):
         try:
-            import _bootlocale
-            if encoding == _bootlocale.getpreferredencoding(False):
+            import _winapi
+            ansi_code_page = "cp%s" % _winapi.GetACP()
+            if encoding == ansi_code_page:
                 import encodings.mbcs
                 return encodings.mbcs.getregentry()
         except ImportError:
diff --git a/Lib/locale.py b/Lib/locale.py
index f1d157d6f9c..18079e73ad6 100644
--- a/Lib/locale.py
+++ b/Lib/locale.py
@@ -617,6 +617,8 @@ def resetlocale(category=LC_ALL):
     # On Win32, this will return the ANSI code page
     def getpreferredencoding(do_setlocale = True):
         """Return the charset that the user is likely using."""
+        if sys.flags.utf8_mode:
+            return 'UTF-8'
         import _bootlocale
         return _bootlocale.getpreferredencoding(False)
 else:
@@ -634,6 +636,8 @@ def getpreferredencoding(do_setlocale = True):
             def getpreferredencoding(do_setlocale = True):
                 """Return the charset that the user is likely using,
                 by looking at environment variables."""
+                if sys.flags.utf8_mode:
+                    return 'UTF-8'
                 res = getdefaultlocale()[1]
                 if res is None:
                     # LANG not set, default conservatively to ASCII
@@ -643,6 +647,8 @@ def getpreferredencoding(do_setlocale = True):
         def getpreferredencoding(do_setlocale = True):
             """Return the charset that the user is likely using,
             according to the system configuration."""
+            if sys.flags.utf8_mode:
+                return 'UTF-8'
             import _bootlocale
             if do_setlocale:
                 oldloc = setlocale(LC_CTYPE)
diff --git a/Lib/subprocess.py b/Lib/subprocess.py
index 301433cdf5d..65b4086dc61 100644
--- a/Lib/subprocess.py
+++ b/Lib/subprocess.py
@@ -280,7 +280,7 @@ def _args_from_interpreter_flags():
     if dev_mode:
         args.extend(('-X', 'dev'))
     for opt in ('faulthandler', 'tracemalloc', 'importtime',
-                'showalloccount', 'showrefcount'):
+                'showalloccount', 'showrefcount', 'utf8'):
         if opt in xoptions:
             value = xoptions[opt]
             if value is True:
diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py
index 0a61c054444..9329318706b 100644
--- a/Lib/test/test_builtin.py
+++ b/Lib/test/test_builtin.py
@@ -1022,6 +1022,7 @@ def test_open(self):
         self.assertRaises(ValueError, open, 'a\x00b')
         self.assertRaises(ValueError, open, b'a\x00b')
 
+    @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
     def test_open_default_encoding(self):
         old_environ = dict(os.environ)
         try:
diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py
index 2a22739fb0f..c0845d75a29 100644
--- a/Lib/test/test_c_locale_coercion.py
+++ b/Lib/test/test_c_locale_coercion.py
@@ -130,7 +130,7 @@ def get_child_details(cls, env_vars):
         that.
         """
         result, py_cmd = run_python_until_end(
-            "-c", cls.CHILD_PROCESS_SCRIPT,
+            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
             __isolated=True,
             **env_vars
         )
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index eb21a3915b9..a59a5e21358 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -5,6 +5,7 @@
 import sys
 import unittest
 import encodings
+from unittest import mock
 
 from test import support
 
@@ -3180,16 +3181,9 @@ def test_incremental(self):
     def test_mbcs_alias(self):
         # Check that looking up our 'default' codepage will return
         # mbcs when we don't have a more specific one available
-        import _bootlocale
-        def _get_fake_codepage(*a):
-            return 'cp123'
-        old_getpreferredencoding = _bootlocale.getpreferredencoding
-        _bootlocale.getpreferredencoding = _get_fake_codepage
-        try:
+        with mock.patch('_winapi.GetACP', return_value=123):
             codec = codecs.lookup('cp123')
             self.assertEqual(codec.name, 'mbcs')
-        finally:
-            _bootlocale.getpreferredencoding = old_getpreferredencoding
 
 
 class ASCIITest(unittest.TestCase):
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 6bb4127b095..66748317b5f 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -2580,6 +2580,7 @@ def test_reconfigure_line_buffering(self):
         t.reconfigure(line_buffering=None)
         self.assertEqual(t.line_buffering, True)
 
+    @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
     def test_default_encoding(self):
         old_environ = dict(os.environ)
         try:
@@ -2599,6 +2600,7 @@ def test_default_encoding(self):
             os.environ.update(old_environ)
 
     @support.cpython_only
+    @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
     def test_device_encoding(self):
         # Issue 15989
         import _testcapi
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index 6346094ad08..6933b41353b 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -527,7 +527,7 @@ def test_sys_flags(self):
                  "inspect", "interactive", "optimize", "dont_write_bytecode",
                  "no_user_site", "no_site", "ignore_environment", "verbose",
                  "bytes_warning", "quiet", "hash_randomization", "isolated",
-                 "dev_mode")
+                 "dev_mode", "utf8_mode")
         for attr in attrs:
             self.assertTrue(hasattr(sys.flags, attr), attr)
             attr_type = bool if attr == "dev_mode" else int
@@ -535,6 +535,8 @@ def test_sys_flags(self):
         self.assertTrue(repr(sys.flags))
         self.assertEqual(len(sys.flags), len(attrs))
 
+        self.assertIn(sys.flags.utf8_mode, {0, 1, 2})
+
     def assert_raise_on_new_sys_type(self, sys_attr):
         # Users are intentionally prevented from creating new instances of
         # sys.flags, sys.version_info, and sys.getwindowsversion.
@@ -710,8 +712,8 @@ def test_c_locale_surrogateescape(self):
         # have no any effect
         out = self.c_locale_get_error_handler(encoding=':')
         self.assertEqual(out,
-                         'stdin: surrogateescape\n'
-                         'stdout: surrogateescape\n'
+                         'stdin: strict\n'
+                         'stdout: strict\n'
                          'stderr: backslashreplace\n')
         out = self.c_locale_get_error_handler(encoding='')
         self.assertEqual(out,
diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py
new file mode 100644
index 00000000000..275a6ea8ed6
--- /dev/null
+++ b/Lib/test/test_utf8_mode.py
@@ -0,0 +1,206 @@
+"""
+Test the implementation of the PEP 540: the UTF-8 Mode.
+"""
+
+import locale
+import os
+import sys
+import textwrap
+import unittest
+from test.support.script_helper import assert_python_ok, assert_python_failure
+
+
+MS_WINDOWS = (sys.platform == 'win32')
+
+
+class UTF8ModeTests(unittest.TestCase):
+    # Override PYTHONUTF8 and PYTHONLEGACYWINDOWSFSENCODING environment
+    # variables by default
+    DEFAULT_ENV = {'PYTHONUTF8': '', 'PYTHONLEGACYWINDOWSFSENCODING': ''}
+
+    def posix_locale(self):
+        loc = locale.setlocale(locale.LC_CTYPE, None)
+        return (loc == 'C')
+
+    def get_output(self, *args, failure=False, **kw):
+        kw = dict(self.DEFAULT_ENV, **kw)
+        if failure:
+            out = assert_python_failure(*args, **kw)
+            out = out[2]
+        else:
+            out = assert_python_ok(*args, **kw)
+            out = out[1]
+        return out.decode().rstrip("\n\r")
+
+    @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
+    def test_posix_locale(self):
+        code = 'import sys; print(sys.flags.utf8_mode)'
+
+        out = self.get_output('-c', code, LC_ALL='C')
+        self.assertEqual(out, '1')
+
+    def test_xoption(self):
+        code = 'import sys; print(sys.flags.utf8_mode)'
+
+        out = self.get_output('-X', 'utf8', '-c', code)
+        self.assertEqual(out, '1')
+
+        # undocumented but accepted syntax: -X utf8=1
+        out = self.get_output('-X', 'utf8=1', '-c', code)
+        self.assertEqual(out, '1')
+
+        out = self.get_output('-X', 'utf8=0', '-c', code)
+        self.assertEqual(out, '0')
+
+        if MS_WINDOWS:
+            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
+            # and has the priority over -X utf8
+            out = self.get_output('-X', 'utf8', '-c', code,
+                                  PYTHONLEGACYWINDOWSFSENCODING='1')
+            self.assertEqual(out, '0')
+
+    def test_env_var(self):
+        code = 'import sys; print(sys.flags.utf8_mode)'
+
+        out = self.get_output('-c', code, PYTHONUTF8='1')
+        self.assertEqual(out, '1')
+
+        out = self.get_output('-c', code, PYTHONUTF8='0')
+        self.assertEqual(out, '0')
+
+        # -X utf8 has the priority over PYTHONUTF8
+        out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1')
+        self.assertEqual(out, '0')
+
+        if MS_WINDOWS:
+            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
+            # and has the priority over PYTHONUTF8
+            out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1',
+                                  PYTHONLEGACYWINDOWSFSENCODING='1')
+            self.assertEqual(out, '0')
+
+        # Cannot test with the POSIX locale, since the POSIX locale enables
+        # the UTF-8 mode
+        if not self.posix_locale():
+            # PYTHONUTF8 should be ignored if -E is used
+            out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
+            self.assertEqual(out, '0')
+
+        # invalid mode
+        out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
+        self.assertIn('invalid PYTHONUTF8 environment variable value',
+                      out.rstrip())
+
+    def test_filesystemencoding(self):
+        code = textwrap.dedent('''
+            import sys
+            print("{}/{}".format(sys.getfilesystemencoding(),
+                                 sys.getfilesystemencodeerrors()))
+        ''')
+
+        if MS_WINDOWS:
+            expected = 'utf-8/surrogatepass'
+        else:
+            expected = 'utf-8/surrogateescape'
+
+        out = self.get_output('-X', 'utf8', '-c', code)
+        self.assertEqual(out, expected)
+
+        if MS_WINDOWS:
+            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
+            # and has the priority over -X utf8 and PYTHONUTF8
+            out = self.get_output('-X', 'utf8', '-c', code,
+                                  PYTHONUTF8='strict',
+                                  PYTHONLEGACYWINDOWSFSENCODING='1')
+            self.assertEqual(out, 'mbcs/replace')
+
+    def test_stdio(self):
+        code = textwrap.dedent('''
+            import sys
+            print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
+            print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
+            print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
+        ''')
+
+        out = self.get_output('-X', 'utf8', '-c', code,
+                              PYTHONIOENCODING='')
+        self.assertEqual(out.splitlines(),
+                         ['stdin: utf-8/surrogateescape',
+                          'stdout: utf-8/surrogateescape',
+                          'stderr: utf-8/backslashreplace'])
+
+        # PYTHONIOENCODING has the priority over PYTHONUTF8
+        out = self.get_output('-X', 'utf8', '-c', code,
+                              PYTHONIOENCODING="latin1")
+        self.assertEqual(out.splitlines(),
+                         ['stdin: latin1/strict',
+                          'stdout: latin1/strict',
+                          'stderr: latin1/backslashreplace'])
+
+        out = self.get_output('-X', 'utf8', '-c', code,
+                              PYTHONIOENCODING=":namereplace")
+        self.assertEqual(out.splitlines(),
+                         ['stdin: UTF-8/namereplace',
+                          'stdout: UTF-8/namereplace',
+                          'stderr: UTF-8/backslashreplace'])
+
+    def test_io(self):
+        code = textwrap.dedent('''
+            import sys
+            filename = sys.argv[1]
+            with open(filename) as fp:
+                print(f"{fp.encoding}/{fp.errors}")
+        ''')
+        filename = __file__
+
+        out = self.get_output('-c', code, filename, PYTHONUTF8='1')
+        self.assertEqual(out, 'UTF-8/strict')
+
+    def _check_io_encoding(self, module, encoding=None, errors=None):
+        filename = __file__
+
+        # Encoding explicitly set
+        args = []
+        if encoding:
+            args.append(f'encoding={encoding!r}')
+        if errors:
+            args.append(f'errors={errors!r}')
+        code = textwrap.dedent('''
+            import sys
+            from %s import open
+            filename = sys.argv[1]
+            with open(filename, %s) as fp:
+                print(f"{fp.encoding}/{fp.errors}")
+        ''') % (module, ', '.join(args))
+        out = self.get_output('-c', code, filename,
+                              PYTHONUTF8='1')
+
+        if not encoding:
+            encoding = 'UTF-8'
+        if not errors:
+            errors = 'strict'
+        self.assertEqual(out, f'{encoding}/{errors}')
+
+    def check_io_encoding(self, module):
+        self._check_io_encoding(module, encoding="latin1")
+        self._check_io_encoding(module, errors="namereplace")
+        self._check_io_encoding(module,
+                                encoding="latin1", errors="namereplace")
+
+    def test_io_encoding(self):
+        self.check_io_encoding('io')
+
+    def test_io_encoding(self):
+        self.check_io_encoding('_pyio')
+
+    def test_locale_getpreferredencoding(self):
+        code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
+        out = self.get_output('-X', 'utf8', '-c', code)
+        self.assertEqual(out, 'UTF-8 UTF-8')
+
+        out = self.get_output('-X', 'utf8', '-c', code, LC_ALL='C')
+        self.assertEqual(out, 'UTF-8 UTF-8')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/Misc/NEWS.d/next/Core and Builtins/2017-12-05-23-10-58.bpo-29240.qpJP5l.rst b/Misc/NEWS.d/next/Core and Builtins/2017-12-05-23-10-58.bpo-29240.qpJP5l.rst
new file mode 100644
index 00000000000..dbd9d619411
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2017-12-05-23-10-58.bpo-29240.qpJP5l.rst	
@@ -0,0 +1 @@
+Add a new UTF-8 mode: implementation of the :pep:`540`.
diff --git a/Modules/_winapi.c b/Modules/_winapi.c
index 0a1d139cd0e..604c05d4ae1 100644
--- a/Modules/_winapi.c
+++ b/Modules/_winapi.c
@@ -1490,6 +1490,20 @@ _winapi_WriteFile_impl(PyObject *module, HANDLE handle, PyObject *buffer,
 }
 
 
+/*[clinic input]
+_winapi.GetACP
+
+Get the current Windows ANSI code page identifier.
+[clinic start generated code]*/
+
+static PyObject *
+_winapi_GetACP_impl(PyObject *module)
+/*[clinic end generated code: output=f7ee24bf705dbb88 input=1433c96d03a05229]*/
+{
+    return PyLong_FromUnsignedLong(GetACP());
+}
+
+
 static PyMethodDef winapi_functions[] = {
     _WINAPI_CLOSEHANDLE_METHODDEF
     _WINAPI_CONNECTNAMEDPIPE_METHODDEF
@@ -1515,6 +1529,7 @@ static PyMethodDef winapi_functions[] = {
     _WINAPI_WAITFORMULTIPLEOBJECTS_METHODDEF
     _WINAPI_WAITFORSINGLEOBJECT_METHODDEF
     _WINAPI_WRITEFILE_METHODDEF
+    _WINAPI_GETACP_METHODDEF
     {NULL, NULL}
 };
 
@@ -1595,14 +1610,14 @@ PyInit__winapi(void)
     WINAPI_CONSTANT(F_DWORD, WAIT_OBJECT_0);
     WINAPI_CONSTANT(F_DWORD, WAIT_ABANDONED_0);
     WINAPI_CONSTANT(F_DWORD, WAIT_TIMEOUT);
-    
+
     WINAPI_CONSTANT(F_DWORD, ABOVE_NORMAL_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, BELOW_NORMAL_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, HIGH_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, IDLE_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, NORMAL_PRIORITY_CLASS);
     WINAPI_CONSTANT(F_DWORD, REALTIME_PRIORITY_CLASS);
-    
+
     WINAPI_CONSTANT(F_DWORD, CREATE_NO_WINDOW);
     WINAPI_CONSTANT(F_DWORD, DETACHED_PROCESS);
     WINAPI_CONSTANT(F_DWORD, CREATE_DEFAULT_ERROR_MODE);
diff --git a/Modules/clinic/_winapi.c.h b/Modules/clinic/_winapi.c.h
index 01bba360714..e5781efb938 100644
--- a/Modules/clinic/_winapi.c.h
+++ b/Modules/clinic/_winapi.c.h
@@ -889,4 +889,22 @@ _winapi_WriteFile(PyObject *module, PyObject **args, Py_ssize_t nargs, PyObject
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=fba2ad7bf1a87e4a input=a9049054013a1b77]*/
+
+PyDoc_STRVAR(_winapi_GetACP__doc__,
+"GetACP($module, /)\n"
+"--\n"
+"\n"
+"Get the current Windows ANSI code page identifier.");
+
+#define _WINAPI_GETACP_METHODDEF    \
+    {"GetACP", (PyCFunction)_winapi_GetACP, METH_NOARGS, _winapi_GetACP__doc__},
+
+static PyObject *
+_winapi_GetACP_impl(PyObject *module);
+
+static PyObject *
+_winapi_GetACP(PyObject *module, PyObject *Py_UNUSED(ignored))
+{
+    return _winapi_GetACP_impl(module);
+}
+/*[clinic end generated code: output=fd91c1ec286f0bf3 input=a9049054013a1b77]*/
diff --git a/Modules/main.c b/Modules/main.c
index ac8a38c8f84..9ce111cea99 100644
--- a/Modules/main.c
+++ b/Modules/main.c
@@ -1114,50 +1114,32 @@ pymain_set_argv(_PyMain *pymain)
 }
 
 
-static void
-pymain_get_flag(int flag, int *value)
-{
-    if (flag) {
-        *value = flag;
-    }
-}
-
-static void
-pymain_set_flag(int *flag, int value)
-{
-    /* Helper to set flag variables from command line options
-    *   - uses the higher of the two values if they're both set
-    *   - otherwise leaves the flag unset
-    */
-    if (*flag < value) {
-        *flag = value;
-    }
-}
-
-
 /* Get Py_xxx global configuration variables */
 static void
 pymain_get_global_config(_PyMain *pymain)
 {
     _Py_CommandLineDetails *cmdline = &pymain->cmdline;
-    pymain_get_flag(Py_BytesWarningFlag, &cmdline->bytes_warning);
-    pymain_get_flag(Py_DebugFlag, &cmdline->debug);
-    pymain_get_flag(Py_InspectFlag, &cmdline->inspect);
-    pymain_get_flag(Py_InteractiveFlag, &cmdline->interactive);
-    pymain_get_flag(Py_IsolatedFlag, &cmdline->isolated);
-    pymain_get_flag(Py_OptimizeFlag, &cmdline->optimization_level);
-    pymain_get_flag(Py_DontWriteBytecodeFlag, &cmdline->dont_write_bytecode);
-    pymain_get_flag(Py_NoUserSiteDirectory, &cmdline->no_user_site_directory);
-    pymain_get_flag(Py_NoSiteFlag, &cmdline->no_site_import);
-    pymain_get_flag(Py_UnbufferedStdioFlag, &cmdline->use_unbuffered_io);
-    pymain_get_flag(Py_VerboseFlag, &cmdline->verbosity);
-    pymain_get_flag(Py_QuietFlag, &cmdline->quiet_flag);
+
+    cmdline->bytes_warning = Py_BytesWarningFlag;
+    cmdline->debug = Py_DebugFlag;
+    cmdline->inspect = Py_InspectFlag;
+    cmdline->interactive = Py_InteractiveFlag;
+    cmdline->isolated = Py_IsolatedFlag;
+    cmdline->optimization_level = Py_OptimizeFlag;
+    cmdline->dont_write_bytecode = Py_DontWriteBytecodeFlag;
+    cmdline->no_user_site_directory = Py_NoUserSiteDirectory;
+    cmdline->no_site_import = Py_NoSiteFlag;
+    cmdline->use_unbuffered_io = Py_UnbufferedStdioFlag;
+    cmdline->verbosity = Py_VerboseFlag;
+    cmdline->quiet_flag = Py_QuietFlag;
 #ifdef MS_WINDOWS
-    pymain_get_flag(Py_LegacyWindowsFSEncodingFlag, &cmdline->legacy_windows_fs_encoding);
-    pymain_get_flag(Py_LegacyWindowsStdioFlag, &cmdline->legacy_windows_stdio);
+    cmdline->legacy_windows_fs_encoding = Py_LegacyWindowsFSEncodingFlag;
+    cmdline->legacy_windows_stdio = Py_LegacyWindowsStdioFlag;
 #endif
+    cmdline->check_hash_pycs_mode = _Py_CheckHashBasedPycsMode ;
 
-    pymain_get_flag(Py_IgnoreEnvironmentFlag, &pymain->core_config.ignore_environment);
+    pymain->core_config.ignore_environment = Py_IgnoreEnvironmentFlag;
+    pymain->core_config.utf8_mode = Py_UTF8Mode;
 }
 
 
@@ -1166,26 +1148,27 @@ static void
 pymain_set_global_config(_PyMain *pymain)
 {
     _Py_CommandLineDetails *cmdline = &pymain->cmdline;
-    pymain_set_flag(&Py_BytesWarningFlag, cmdline->bytes_warning);
-    pymain_set_flag(&Py_DebugFlag, cmdline->debug);
-    pymain_set_flag(&Py_InspectFlag, cmdline->inspect);
-    pymain_set_flag(&Py_InteractiveFlag, cmdline->interactive);
-    pymain_set_flag(&Py_IsolatedFlag, cmdline->isolated);
-    pymain_set_flag(&Py_OptimizeFlag, cmdline->optimization_level);
-    pymain_set_flag(&Py_DontWriteBytecodeFlag, cmdline->dont_write_bytecode);
-    pymain_set_flag(&Py_NoUserSiteDirectory, cmdline->no_user_site_directory);
-    pymain_set_flag(&Py_NoSiteFlag, cmdline->no_site_import);
-    pymain_set_flag(&Py_UnbufferedStdioFlag, cmdline->use_unbuffered_io);
-    pymain_set_flag(&Py_VerboseFlag, cmdline->verbosity);
-    pymain_set_flag(&Py_QuietFlag, cmdline->quiet_flag);
-    if (cmdline->check_hash_pycs_mode)
-        _Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode;
+
+    Py_BytesWarningFlag = cmdline->bytes_warning;
+    Py_DebugFlag = cmdline->debug;
+    Py_InspectFlag = cmdline->inspect;
+    Py_InteractiveFlag = cmdline->interactive;
+    Py_IsolatedFlag = cmdline->isolated;
+    Py_OptimizeFlag = cmdline->optimization_level;
+    Py_DontWriteBytecodeFlag = cmdline->dont_write_bytecode;
+    Py_NoUserSiteDirectory = cmdline->no_user_site_directory;
+    Py_NoSiteFlag = cmdline->no_site_import;
+    Py_UnbufferedStdioFlag = cmdline->use_unbuffered_io;
+    Py_VerboseFlag = cmdline->verbosity;
+    Py_QuietFlag = cmdline->quiet_flag;
+    _Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode;
 #ifdef MS_WINDOWS
-    pymain_set_flag(&Py_LegacyWindowsFSEncodingFlag, cmdline->legacy_windows_fs_encoding);
-    pymain_set_flag(&Py_LegacyWindowsStdioFlag, cmdline->legacy_windows_stdio);
+    Py_LegacyWindowsFSEncodingFlag = cmdline->legacy_windows_fs_encoding;
+    Py_LegacyWindowsStdioFlag = cmdline->legacy_windows_stdio;
 #endif
 
-    pymain_set_flag(&Py_IgnoreEnvironmentFlag, pymain->core_config.ignore_environment);
+    Py_IgnoreEnvironmentFlag = pymain->core_config.ignore_environment;
+    Py_UTF8Mode = pymain->core_config.utf8_mode;
 }
 
 
@@ -1609,6 +1592,57 @@ _PyMainInterpreterConfig_ReadEnv(_PyMainInterpreterConfig *config)
 }
 
 
+static int
+pymain_init_utf8_mode(_PyMain *pymain)
+{
+    _PyCoreConfig *core_config = &pymain->core_config;
+
+#ifdef MS_WINDOWS
+    if (pymain->cmdline.legacy_windows_fs_encoding) {
+        core_config->utf8_mode = 0;
+        return 0;
+    }
+#endif
+
+    wchar_t *xopt = pymain_get_xoption(pymain, L"utf8");
+    if (xopt) {
+        wchar_t *sep = wcschr(xopt, L'=');
+        if (sep) {
+            xopt = sep + 1;
+            if (wcscmp(xopt, L"1") == 0) {
+                core_config->utf8_mode = 1;
+            }
+            else if (wcscmp(xopt, L"0") == 0) {
+                core_config->utf8_mode = 0;
+            }
+            else {
+                pymain->err = _Py_INIT_USER_ERR("invalid -X utf8 option value");
+                return -1;
+            }
+        }
+        else {
+            core_config->utf8_mode = 1;
+        }
+        return 0;
+    }
+
+    char *opt = pymain_get_env_var("PYTHONUTF8");
+    if (opt) {
+        if (strcmp(opt, "1") == 0) {
+            core_config->utf8_mode = 1;
+        }
+        else if (strcmp(opt, "0") == 0) {
+            core_config->utf8_mode = 0;
+        }
+        else {
+            pymain->err = _Py_INIT_USER_ERR("invalid PYTHONUTF8 environment "
+                                             "variable value");
+            return -1;
+        }
+        return 0;
+    }
+    return 0;
+}
 
 
 static int
@@ -1674,6 +1708,9 @@ pymain_parse_envvars(_PyMain *pymain)
         pymain->core_config.malloc_stats = 1;
     }
 
+    if (pymain_init_utf8_mode(pymain) < 0) {
+        return -1;
+    }
 
     return 0;
 }
@@ -1702,6 +1739,7 @@ pymain_parse_cmdline_envvars_impl(_PyMain *pymain)
     if (pymain_parse_envvars(pymain) < 0) {
         return -1;
     }
+    /* FIXME: if the utf8_mode value changed, parse the command line again */
 
     _PyInitError err = _PyMainInterpreterConfig_Read(&pymain->config);
     if (_Py_INIT_FAILED(err)) {
@@ -1730,6 +1768,7 @@ pymain_parse_cmdline_envvars(_PyMain *pymain)
 static int
 pymain_init_python(_PyMain *pymain)
 {
+
     pymain_set_global_config(pymain);
 
     pymain_init_stdio(pymain);
@@ -1788,6 +1827,7 @@ pymain_init(_PyMain *pymain)
         return -1;
     }
 
+    pymain->core_config.utf8_mode = Py_UTF8Mode;
     pymain->core_config._disable_importlib = 0;
     pymain->config.install_signal_handlers = 1;
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 8d4fea8ede1..c7480a0d87c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5079,16 +5079,17 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     return NULL;
 }
 
-#if defined(__APPLE__) || defined(__ANDROID__)
 
-/* Simplified UTF-8 decoder using surrogateescape error handler,
-   used to decode the command line arguments on Mac OS X and Android.
+/* UTF-8 decoder using the surrogateescape error handler.
 
-   Return a pointer to a newly allocated wide character string (use
-   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
+   On success, return a pointer to a newly allocated wide character string (use
+   PyMem_RawFree() to free the memory) and write the output length (in number
+   of wchar_t units) into *p_wlen (if p_wlen is set).
 
+   On memory allocation failure, return NULL and write (size_t)-1 into *p_wlen
+   (if p_wlen is set). */
 wchar_t*
-_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
+_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
 {
     const char *e;
     wchar_t *unicode;
@@ -5096,11 +5097,20 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
 
     /* Note: size will always be longer than the resulting Unicode
        character count */
-    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
+    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
+        if (p_wlen) {
+            *p_wlen = (size_t)-1;
+        }
         return NULL;
+    }
+
     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
-    if (!unicode)
+    if (!unicode) {
+        if (p_wlen) {
+            *p_wlen = (size_t)-1;
+        }
         return NULL;
+    }
 
     /* Unpack UTF-8 encoded data */
     e = s + size;
@@ -5130,10 +5140,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
         }
     }
     unicode[outpos] = L'\0';
+    if (p_wlen) {
+        *p_wlen = outpos;
+    }
     return unicode;
 }
 
-#endif /* __APPLE__ or __ANDROID__ */
 
 /* Primary internal function which creates utf8 encoded bytes objects.
 
diff --git a/Programs/python.c b/Programs/python.c
index 22d55bbc4ce..aef7122517a 100644
--- a/Programs/python.c
+++ b/Programs/python.c
@@ -17,6 +17,15 @@ wmain(int argc, wchar_t **argv)
 #else
 
 
+static void _Py_NO_RETURN
+fatal_error(const char *msg)
+{
+    fprintf(stderr, "Fatal Python error: %s\n", msg);
+    fflush(stderr);
+    exit(1);
+}
+
+
 int
 main(int argc, char **argv)
 {
@@ -28,9 +37,7 @@ main(int argc, char **argv)
 
     _PyInitError err = _PyRuntime_Initialize();
     if (_Py_INIT_FAILED(err)) {
-        fprintf(stderr, "Fatal Python error: %s\n", err.msg);
-        fflush(stderr);
-        exit(1);
+        fatal_error(err.msg);
     }
 
     /* Force default allocator, to be able to release memory above
@@ -40,7 +47,7 @@ main(int argc, char **argv)
     argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
     argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
     if (!argv_copy || !argv_copy2) {
-        fprintf(stderr, "out of memory\n");
+        fatal_error("out of memory");
         return 1;
     }
 
@@ -55,7 +62,7 @@ main(int argc, char **argv)
 
     oldloc = _PyMem_RawStrdup(setlocale(LC_ALL, NULL));
     if (!oldloc) {
-        fprintf(stderr, "out of memory\n");
+        fatal_error("out of memory");
         return 1;
     }
 
@@ -73,6 +80,7 @@ main(int argc, char **argv)
      * details.
      */
     if (_Py_LegacyLocaleDetected()) {
+        Py_UTF8Mode = 1;
         _Py_CoerceLegacyLocale();
     }
 
@@ -81,10 +89,7 @@ main(int argc, char **argv)
         argv_copy[i] = Py_DecodeLocale(argv[i], NULL);
         if (!argv_copy[i]) {
             PyMem_RawFree(oldloc);
-            fprintf(stderr, "Fatal Python error: "
-                            "unable to decode the command line argument #%i\n",
-                            i + 1);
-            return 1;
+            fatal_error("unable to decode the command line arguments");
         }
         argv_copy2[i] = argv_copy[i];
     }
diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c
index 81774dc5f8e..23d7aa45683 100644
--- a/Python/bltinmodule.c
+++ b/Python/bltinmodule.c
@@ -29,6 +29,9 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */
 int Py_HasFileSystemDefaultEncoding = 0;
 #endif
 const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape";
+/* UTF-8 mode (PEP 540): if non-zero, use the UTF-8 encoding, and change stdin
+   and stdout error handler to "surrogateescape". */
+int Py_UTF8Mode = 0;
 
 _Py_IDENTIFIER(__builtins__);
 _Py_IDENTIFIER(__dict__);
diff --git a/Python/fileutils.c b/Python/fileutils.c
index eab58c50561..03cc37958a0 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -20,9 +20,8 @@ extern int winerror_to_errno(int);
 #include <fcntl.h>
 #endif /* HAVE_FCNTL_H */
 
-#if defined(__APPLE__) || defined(__ANDROID__)
-extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
-#endif
+extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
+                                               size_t *p_wlen);
 
 #ifdef O_CLOEXEC
 /* Does open() support the O_CLOEXEC flag? Possible values:
@@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
 }
 #endif
 
-
-/* Decode a byte string from the locale encoding with the
-   surrogateescape error handler: undecodable bytes are decoded as characters
-   in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
-   character, escape the bytes using the surrogateescape error handler instead
-   of decoding them.
-
-   Return a pointer to a newly allocated wide character string, use
-   PyMem_RawFree() to free the memory. If size is not NULL, write the number of
-   wide characters excluding the null character into *size
-
-   Return NULL on decoding error or memory allocation error. If *size* is not
-   NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
-   decoding error.
-
-   Decoding errors should never happen, unless there is a bug in the C
-   library.
-
-   Use the Py_EncodeLocale() function to encode the character string back to a
-   byte string. */
-wchar_t*
-Py_DecodeLocale(const char* arg, size_t *size)
+static wchar_t*
+decode_locale(const char* arg, size_t *size)
 {
-#if defined(__APPLE__) || defined(__ANDROID__)
-    wchar_t *wstr;
-    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
-    if (size != NULL) {
-        if (wstr != NULL)
-            *size = wcslen(wstr);
-        else
-            *size = (size_t)-1;
-    }
-    return wstr;
-#else
     wchar_t *res;
     size_t argsize;
     size_t count;
@@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
     mbstate_t mbs;
 #endif
 
-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
-        force_ascii = check_force_ascii();
-
-    if (force_ascii) {
-        /* force ASCII encoding to workaround mbstowcs() issue */
-        res = decode_ascii_surrogateescape(arg, size);
-        if (res == NULL)
-            goto oom;
-        return res;
-    }
-#endif
-
 #ifdef HAVE_BROKEN_MBSTOWCS
     /* Some platforms have a broken implementation of
      * mbstowcs which does not count the characters that
@@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size)
         goto oom;
 #endif   /* HAVE_MBRTOWC */
     return res;
+
 oom:
-    if (size != NULL)
+    if (size != NULL) {
         *size = (size_t)-1;
+    }
     return NULL;
-#endif   /* __APPLE__ or __ANDROID__ */
 }
 
-/* Encode a wide character string to the locale encoding with the
-   surrogateescape error handler: surrogate characters in the range
-   U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
 
-   Return a pointer to a newly allocated byte string, use PyMem_Free() to free
-   the memory. Return NULL on encoding or memory allocation error.
+/* Decode a byte string from the locale encoding with the
+   surrogateescape error handler: undecodable bytes are decoded as characters
+   in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
+   character, escape the bytes using the surrogateescape error handler instead
+   of decoding them.
 
-   If error_pos is not NULL, *error_pos is set to the index of the invalid
-   character on encoding error, or set to (size_t)-1 otherwise.
+   Return a pointer to a newly allocated wide character string, use
+   PyMem_RawFree() to free the memory. If size is not NULL, write the number of
+   wide characters excluding the null character into *size
 
-   Use the Py_DecodeLocale() function to decode the bytes string back to a wide
-   character string. */
-char*
-Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
+   Return NULL on decoding error or memory allocation error. If *size* is not
+   NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
+   decoding error.
+
+   Decoding errors should never happen, unless there is a bug in the C
+   library.
+
+   Use the Py_EncodeLocale() function to encode the character string back to a
+   byte string. */
+wchar_t*
+Py_DecodeLocale(const char* arg, size_t *size)
 {
 #if defined(__APPLE__) || defined(__ANDROID__)
+    return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+#else
+    if (Py_UTF8Mode) {
+        return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+    }
+
+#ifndef MS_WINDOWS
+    if (force_ascii == -1)
+        force_ascii = check_force_ascii();
+
+    if (force_ascii) {
+        /* force ASCII encoding to workaround mbstowcs() issue */
+        wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
+        if (wstr == NULL) {
+            if (size != NULL) {
+                *size = (size_t)-1;
+            }
+            return NULL;
+        }
+        return wstr;
+    }
+#endif
+
+    return decode_locale(arg, size);
+#endif   /* __APPLE__ or __ANDROID__ */
+}
+
+static char*
+_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
+{
     Py_ssize_t len;
     PyObject *unicode, *bytes = NULL;
     char *cpath;
 
     unicode = PyUnicode_FromWideChar(text, wcslen(text));
-    if (unicode == NULL)
+    if (unicode == NULL) {
         return NULL;
+    }
 
     bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
     Py_DECREF(unicode);
     if (bytes == NULL) {
         PyErr_Clear();
-        if (error_pos != NULL)
+        if (error_pos != NULL) {
             *error_pos = (size_t)-1;
+        }
         return NULL;
     }
 
@@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
     if (cpath == NULL) {
         PyErr_Clear();
         Py_DECREF(bytes);
-        if (error_pos != NULL)
+        if (error_pos != NULL) {
             *error_pos = (size_t)-1;
+        }
         return NULL;
     }
     memcpy(cpath, PyBytes_AsString(bytes), len + 1);
     Py_DECREF(bytes);
     return cpath;
-#else   /* __APPLE__ */
+}
+
+static char*
+encode_locale(const wchar_t *text, size_t *error_pos)
+{
     const size_t len = wcslen(text);
     char *result = NULL, *bytes = NULL;
     size_t i, size, converted;
     wchar_t c, buf[2];
 
-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
-        force_ascii = check_force_ascii();
-
-    if (force_ascii)
-        return encode_ascii_surrogateescape(text, error_pos);
-#endif
-
     /* The function works in two steps:
        1. compute the length of the output buffer in bytes (size)
        2. outputs the bytes */
@@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
         bytes = result;
     }
     return result;
+}
+
+/* Encode a wide character string to the locale encoding with the
+   surrogateescape error handler: surrogate characters in the range
+   U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
+
+   Return a pointer to a newly allocated byte string, use PyMem_Free() to free
+   the memory. Return NULL on encoding or memory allocation error.
+
+   If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set
+   to the index of the invalid character on encoding error.
+
+   Use the Py_DecodeLocale() function to decode the bytes string back to a wide
+   character string. */
+char*
+Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
+{
+#if defined(__APPLE__) || defined(__ANDROID__)
+    return _Py_EncodeLocaleUTF8(text, error_pos);
+#else   /* __APPLE__ */
+    if (Py_UTF8Mode) {
+        return _Py_EncodeLocaleUTF8(text, error_pos);
+    }
+
+#ifndef MS_WINDOWS
+    if (force_ascii == -1)
+        force_ascii = check_force_ascii();
+
+    if (force_ascii)
+        return encode_ascii_surrogateescape(text, error_pos);
+#endif
+
+    return encode_locale(text, error_pos);
 #endif   /* __APPLE__ or __ANDROID__ */
 }
 
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index f284855f342..2bac23d1cb0 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -54,7 +54,7 @@ extern grammar _PyParser_Grammar; /* From graminit.c */
 static _PyInitError add_main_module(PyInterpreterState *interp);
 static _PyInitError initfsencoding(PyInterpreterState *interp);
 static _PyInitError initsite(void);
-static _PyInitError init_sys_streams(void);
+static _PyInitError init_sys_streams(PyInterpreterState *interp);
 static _PyInitError initsigs(void);
 static void call_py_exitfuncs(void);
 static void wait_for_thread_shutdown(void);
@@ -925,7 +925,7 @@ _Py_InitializeMainInterpreter(const _PyMainInterpreterConfig *config)
         return err;
     }
 
-    err = init_sys_streams();
+    err = init_sys_streams(interp);
     if (_Py_INIT_FAILED(err)) {
         return err;
     }
@@ -1410,7 +1410,7 @@ new_interpreter(PyThreadState **tstate_p)
             return err;
         }
 
-        err = init_sys_streams();
+        err = init_sys_streams(interp);
         if (_Py_INIT_FAILED(err)) {
             return err;
         }
@@ -1558,7 +1558,13 @@ initfsencoding(PyInterpreterState *interp)
         Py_FileSystemDefaultEncodeErrors = "surrogatepass";
     }
 #else
-    if (Py_FileSystemDefaultEncoding == NULL) {
+    if (Py_FileSystemDefaultEncoding == NULL &&
+        interp->core_config.utf8_mode)
+    {
+        Py_FileSystemDefaultEncoding = "utf-8";
+        Py_HasFileSystemDefaultEncoding = 1;
+    }
+    else if (Py_FileSystemDefaultEncoding == NULL) {
         Py_FileSystemDefaultEncoding = get_locale_encoding();
         if (Py_FileSystemDefaultEncoding == NULL) {
             return _Py_INIT_ERR("Unable to get the locale encoding");
@@ -1749,7 +1755,7 @@ create_stdio(PyObject* io,
 
 /* Initialize sys.stdin, stdout, stderr and builtins.open */
 static _PyInitError
-init_sys_streams(void)
+init_sys_streams(PyInterpreterState *interp)
 {
     PyObject *iomod = NULL, *wrapper;
     PyObject *bimod = NULL;
@@ -1794,10 +1800,10 @@ init_sys_streams(void)
     encoding = _Py_StandardStreamEncoding;
     errors = _Py_StandardStreamErrors;
     if (!encoding || !errors) {
-        pythonioencoding = Py_GETENV("PYTHONIOENCODING");
-        if (pythonioencoding) {
+        char *opt = Py_GETENV("PYTHONIOENCODING");
+        if (opt && opt[0] != '\0') {
             char *err;
-            pythonioencoding = _PyMem_Strdup(pythonioencoding);
+            pythonioencoding = _PyMem_Strdup(opt);
             if (pythonioencoding == NULL) {
                 PyErr_NoMemory();
                 goto error;
@@ -1814,7 +1820,12 @@ init_sys_streams(void)
                 encoding = pythonioencoding;
             }
         }
-        if (!errors && !(pythonioencoding && *pythonioencoding)) {
+        else if (interp->core_config.utf8_mode) {
+            encoding = "utf-8";
+            errors = "surrogateescape";
+        }
+
+        if (!errors && !pythonioencoding) {
             /* Choose the default error handler based on the current locale */
             errors = get_default_standard_stream_error_handler();
         }
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index f10099b5232..141e189d0b1 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1814,6 +1814,7 @@ static PyStructSequence_Field flags_fields[] = {
     {"hash_randomization",      "-R"},
     {"isolated",                "-I"},
     {"dev_mode",                "-X dev"},
+    {"utf8_mode",               "-X utf8"},
     {0}
 };
 
@@ -1821,7 +1822,7 @@ static PyStructSequence_Desc flags_desc = {
     "sys.flags",        /* name */
     flags__doc__,       /* doc */
     flags_fields,       /* fields */
-    14
+    15
 };
 
 static PyObject*
@@ -1853,8 +1854,9 @@ make_flags(void)
     SetFlag(Py_QuietFlag);
     SetFlag(Py_HashRandomizationFlag);
     SetFlag(Py_IsolatedFlag);
-#undef SetFlag
     PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(core_config->dev_mode));
+    SetFlag(Py_UTF8Mode);
+#undef SetFlag
 
     if (PyErr_Occurred()) {
         Py_DECREF(seq);



More information about the Python-checkins mailing list