[issue16455] sys.getfilesystemencoding() is not the locale encoding on FreeBSD and OpenSolaris when the locale is not set

Mon Nov 12 15:40:44 CET 2012

STINNER Victor added the comment:

Hijacking locale.getpreferredencoding() may be dangerous. I attached a
new patch, force_ascii.patch, which uses a different approach: be stricter
than mbstowcs() and force the ASCII encoding when all of these hold:
 - the LC_CTYPE locale is C
 - nl_langinfo(CODESET) is ASCII or an alias of ASCII
 - mbstowcs() is able to decode non-ASCII characters

2012/11/12 STINNER Victor <report at bugs.python.org>

>
> STINNER Victor added the comment:
>
> Some tests are failing with the patch:
>
> ======================================================================
> FAIL: test_undecodable_env (test.test_subprocess.POSIXProcessTestCase)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>   File "/usr/home/haypo/prog/python/default/Lib/test/test_subprocess.py",
> line 1606, in test_undecodable_env
>     self.assertEqual(stdout.decode('ascii'), ascii(value))
> AssertionError: "'abc\\xff'" != "'abc\\udcff'"
> - 'abc\xff'
> ?      ^
> + 'abc\udcff'
> ?      ^^^
>
> ======================================================================
> FAIL: test_strcoll_with_diacritic (test.test_locale.TestEnUSCollation)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>   File "/usr/home/haypo/prog/python/default/Lib/test/test_locale.py", line
> 364, in test_strcoll_with_diacritic
>     self.assertLess(locale.strcoll('\xe0', 'b'), 0)
> AssertionError: 126 not less than 0
>
> ======================================================================
> FAIL: test_strxfrm_with_diacritic (test.test_locale.TestEnUSCollation)
> ----------------------------------------------------------------------
> Traceback (most recent call last):
>   File "/usr/home/haypo/prog/python/default/Lib/test/test_locale.py", line
> 367, in test_strxfrm_with_diacritic
>     self.assertLess(locale.strxfrm('\xe0'), locale.strxfrm('b'))
> AssertionError: '\xe0' not less than 'b'
>
> ----------
>
> _______________________________________
> Python tracker <report at bugs.python.org>
> <http://bugs.python.org/issue16455>
> _______________________________________
>

----------
Added file: http://bugs.python.org/file27970/force_ascii.patch

_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue16455>
_______________________________________
-------------- next part --------------
diff -r 6a6ad09faad2 Python/fileutils.c

--- a/Python/fileutils.c	Mon Nov 12 01:23:51 2012 +0100
+++ b/Python/fileutils.c	Mon Nov 12 15:33:24 2012 +0100
@@ -4,6 +4,7 @@
 #endif
 
 #ifdef HAVE_LANGINFO_H
+#include <locale.h>
 #include <langinfo.h>
 #endif
 
@@ -39,6 +40,104 @@ PyObject *
 
 #ifdef HAVE_STAT
 
+/* Work around the FreeBSD and OpenIndiana locale encoding issue. On these
+   operating systems, nl_langinfo(CODESET) announces an alias of the ASCII
+   encoding, whereas mbstowcs() and wcstombs() functions use the ISO-8859-1
+   encoding. The problem is that os.fsencode() and os.fsdecode() use the
+   Python codec "ASCII". For example, if command line arguments are decoded
+   by mbstowcs() and encoded by os.fsencode(), we get a UnicodeEncodeError
+   instead of retrieving the original byte string.
+
+   The workaround is enabled if setlocale(LC_CTYPE, NULL) returns "C" and
+   nl_langinfo(CODESET) returns "ascii". The workaround is not used if
+   setlocale(LC_CTYPE, NULL) failed, or if nl_langinfo() or CODESET is not
+   available.
+
+   Values of locale_force_ascii:
+
+       1: the workaround is used, the ASCII codec is used instead of mbstowcs()
+          and wcstombs() functions
+       0: the workaround is not used
+      -1: unknown, need to call check_locale_force_ascii() to know the value
+*/
+static int locale_force_ascii = -1;
+
+extern char* _Py_GetLocaleEncoding(void);
+
+static int
+check_locale_force_ascii(void)
+{
+#ifdef MS_WINDOWS
+    return 0;
+#else
+    char *encoding, *loc;
+    int i;
+    unsigned char ch;
+    wchar_t wch;
+    size_t res;
+
+    return 1;
+
+    loc = setlocale(LC_CTYPE, NULL);
+    if (loc == NULL || strcmp(loc, "C") != 0) {
+        /* Failed to get the LC_CTYPE locale or it is different than C:
+         * don't use the workaround. */
+        return 0;
+    }
+
+    encoding = _Py_GetLocaleEncoding();
+    if (encoding == NULL) {
+        /* unknown encoding: consider that the encoding is not ASCII */
+        PyErr_Clear();
+        return 0;
+    }
+
+    if (strcmp(encoding, "ascii") != 0) {
+        free(encoding);
+        return 0;
+    }
+    free(encoding);
+
+    /* the locale is not set and nl_langinfo(CODESET) returns "ASCII"
+       (or an alias of the ASCII encoding). Check if the locale encoding
+       is really ASCII. */
+    for (i=0x80; i<0xff; i++) {
+        ch = (unsigned char)i;
+        res = mbstowcs(&wch, (char*)&ch, 1);
+        if (res == (size_t)-1) {
+            /* decoding a non-ASCII character from the locale encoding failed:
+               the encoding is really ASCII */
+            return 0;
+        }
+    }
+    return 1;
+#endif
+}
+
+static wchar_t*
+locale_decode_ascii(const char *arg, size_t *size)
+{
+    wchar_t *res;
+    unsigned char *in;
+    wchar_t *out;
+
+    res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
+    if (!res)
+        return NULL;
+
+    in = (unsigned char*)arg;
+    out = res;
+    while(*in)
+        if(*in < 128)
+            *out++ = *in++;
+        else
+            *out++ = 0xdc00 + *in++;
+    *out = 0;
+    if (size != NULL)
+        *size = out - res;
+    return res;
+}
+
 /* Decode a byte string from the locale encoding with the
    surrogateescape error handler (undecodable bytes are decoded as characters
    in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
@@ -60,20 +159,33 @@ wchar_t*
 _Py_char2wchar(const char* arg, size_t *size)
 {
     wchar_t *res;
+    size_t argsize;
+    size_t count;
+    unsigned char *in;
+    wchar_t *out;
+#ifdef HAVE_MBRTOWC
+    mbstate_t mbs;
+#endif
+
+    if (locale_force_ascii == -1)
+        locale_force_ascii = check_locale_force_ascii();
+
+    if (locale_force_ascii) {
+        /* force the ASCII encoding to work around the mbstowcs() issue */
+        res = locale_decode_ascii(arg, size);
+        if (res == NULL)
+            goto oom;
+        return res;
+    }
+
 #ifdef HAVE_BROKEN_MBSTOWCS
     /* Some platforms have a broken implementation of
      * mbstowcs which does not count the characters that
      * would result from conversion.  Use an upper bound.
      */
-    size_t argsize = strlen(arg);
+    argsize = strlen(arg);
 #else
-    size_t argsize = mbstowcs(NULL, arg, 0);
-#endif
-    size_t count;
-    unsigned char *in;
-    wchar_t *out;
-#ifdef HAVE_MBRTOWC
-    mbstate_t mbs;
+    argsize = mbstowcs(NULL, arg, 0);
 #endif
     if (argsize != (size_t)-1) {
         res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
@@ -144,24 +256,16 @@ wchar_t*
         argsize -= converted;
         out++;
     }
+    if (size != NULL)
+        *size = out - res;
 #else
     /* Cannot use C locale for escaping; manually escape as if charset
        is ASCII (i.e. escape all bytes > 128. This will still roundtrip
        correctly in the locale's charset, which must be an ASCII superset. */
-    res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
-    if (!res)
+    res = locale_decode_ascii(arg, size);
+    if (res == NULL)
         goto oom;
-    in = (unsigned char*)arg;
-    out = res;
-    while(*in)
-        if(*in < 128)
-            *out++ = *in++;
-        else
-            *out++ = 0xdc00 + *in++;
-    *out = 0;
 #endif
-    if (size != NULL)
-        *size = out - res;
     return res;
 oom:
     if (size != NULL)
@@ -169,6 +273,45 @@ oom:
     return NULL;
 }
 
+static char*
+locale_encode_ascii(const wchar_t *text, size_t *error_pos)
+{
+    char *result = NULL, *out;
+    size_t len, i;
+    wchar_t ch;
+
+    if (error_pos != NULL)
+        *error_pos = (size_t)-1;
+
+    len = wcslen(text);
+
+    result = PyMem_Malloc(len + 1);  /* +1 for NUL byte */
+    if (result == NULL)
+        return NULL;
+
+    out = result;
+    for (i=0; i<len; i++) {
+        ch = text[i];
+
+        if (ch <= 0x7f) {
+            /* ASCII character */
+            *out++ = (char)ch;
+        }
+        else if (0xdc80 <= ch && ch <= 0xdcff) {
+            /* UTF-8b surrogate */
+            *out++ = (char)(ch - 0xdc00);
+        }
+        else {
+            if (error_pos != NULL)
+                *error_pos = i;
+            PyMem_Free(result);
+            return NULL;
+        }
+    }
+    *out = '\0';
+    return result;
+}
+
 /* Encode a (wide) character string to the locale encoding with the
    surrogateescape error handler (characters in range U+DC80..U+DCFF are
    converted to bytes 0x80..0xFF).
@@ -191,6 +334,12 @@ char*
     if (error_pos != NULL)
         *error_pos = (size_t)-1;
 
+    if (locale_force_ascii == -1)
+        locale_force_ascii = check_locale_force_ascii();
+
+    if (locale_force_ascii)
+        return locale_encode_ascii(text, error_pos);
+
     /* The function works in two steps:
        1. compute the length of the output buffer in bytes (size)
        2. outputs the bytes */
@@ -231,7 +380,7 @@ char*
             }
         }
         if (result != NULL) {
-            *bytes = 0;
+            *bytes = '\0';
             break;
         }
 
diff -r 6a6ad09faad2 Python/pythonrun.c
--- a/Python/pythonrun.c	Mon Nov 12 01:23:51 2012 +0100
+++ b/Python/pythonrun.c	Mon Nov 12 15:33:24 2012 +0100
@@ -170,8 +170,8 @@ error:
     return NULL;
 }
 
-static char*
-get_locale_encoding(void)
+char*
+_Py_GetLocaleEncoding(void)
 {
 #ifdef MS_WINDOWS
     char codepage[100];
@@ -868,7 +868,7 @@ initfsencoding(PyInterpreterState *inter
 
     if (Py_FileSystemDefaultEncoding == NULL)
     {
-        Py_FileSystemDefaultEncoding = get_locale_encoding();
+        Py_FileSystemDefaultEncoding = _Py_GetLocaleEncoding();
         if (Py_FileSystemDefaultEncoding == NULL)
             Py_FatalError("Py_Initialize: Unable to get the locale encoding");