[Python-checkins] r87537 - in python/branches/py3k: Doc/library/struct.rst Doc/whatsnew/3.2.rst Lib/test/test_struct.py Lib/wave.py Misc/NEWS Modules/_struct.c

victor.stinner python-checkins at python.org
Tue Dec 28 14:26:42 CET 2010


Author: victor.stinner
Date: Tue Dec 28 14:26:42 2010
New Revision: 87537

Log:
Issue #10783: struct.pack() doesn't encode implicitly unicode to UTF-8

 * Replace "bytes" by "bytes object" in struct error messages
 * Document the API change in What's new in Python 3.2
 * Fix test_wave
 * Remove also ugly implicit conversions in test_struct

Modified:
   python/branches/py3k/Doc/library/struct.rst
   python/branches/py3k/Doc/whatsnew/3.2.rst
   python/branches/py3k/Lib/test/test_struct.py
   python/branches/py3k/Lib/wave.py
   python/branches/py3k/Misc/NEWS
   python/branches/py3k/Modules/_struct.c

Modified: python/branches/py3k/Doc/library/struct.rst
==============================================================================
--- python/branches/py3k/Doc/library/struct.rst	(original)
+++ python/branches/py3k/Doc/library/struct.rst	Tue Dec 28 14:26:42 2010
@@ -164,58 +164,53 @@
 +--------+--------------------------+--------------------+----------------+------------+
 | ``c``  | :c:type:`char`           | bytes of length 1  | 1              |            |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``b``  | :c:type:`signed char`    | integer            | 1              | \(1),\(4)  |
+| ``b``  | :c:type:`signed char`    | integer            | 1              | \(1),\(3)  |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``B``  | :c:type:`unsigned char`  | integer            | 1              | \(4)       |
+| ``B``  | :c:type:`unsigned char`  | integer            | 1              | \(3)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``?``  | :c:type:`_Bool`          | bool               | 1              | \(2)       |
+| ``?``  | :c:type:`_Bool`          | bool               | 1              | \(1)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``h``  | :c:type:`short`          | integer            | 2              | \(4)       |
+| ``h``  | :c:type:`short`          | integer            | 2              | \(3)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``H``  | :c:type:`unsigned short` | integer            | 2              | \(4)       |
+| ``H``  | :c:type:`unsigned short` | integer            | 2              | \(3)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``i``  | :c:type:`int`            | integer            | 4              | \(4)       |
+| ``i``  | :c:type:`int`            | integer            | 4              | \(3)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``I``  | :c:type:`unsigned int`   | integer            | 4              | \(4)       |
+| ``I``  | :c:type:`unsigned int`   | integer            | 4              | \(3)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``l``  | :c:type:`long`           | integer            | 4              | \(4)       |
+| ``l``  | :c:type:`long`           | integer            | 4              | \(3)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``L``  | :c:type:`unsigned long`  | integer            | 4              | \(4)       |
+| ``L``  | :c:type:`unsigned long`  | integer            | 4              | \(3)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``q``  | :c:type:`long long`      | integer            | 8              | \(3), \(4) |
+| ``q``  | :c:type:`long long`      | integer            | 8              | \(2), \(3) |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``Q``  | :c:type:`unsigned long   | integer            | 8              | \(3), \(4) |
+| ``Q``  | :c:type:`unsigned long   | integer            | 8              | \(2), \(3) |
 |        | long`                    |                    |                |            |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``f``  | :c:type:`float`          | float              | 4              | \(5)       |
+| ``f``  | :c:type:`float`          | float              | 4              | \(4)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``d``  | :c:type:`double`         | float              | 8              | \(5)       |
+| ``d``  | :c:type:`double`         | float              | 8              | \(4)       |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``s``  | :c:type:`char[]`         | bytes              |                | \(1)       |
+| ``s``  | :c:type:`char[]`         | bytes              |                |            |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``p``  | :c:type:`char[]`         | bytes              |                | \(1)       |
+| ``p``  | :c:type:`char[]`         | bytes              |                |            |
 +--------+--------------------------+--------------------+----------------+------------+
-| ``P``  | :c:type:`void \*`        | integer            |                | \(6)       |
+| ``P``  | :c:type:`void \*`        | integer            |                | \(5)       |
 +--------+--------------------------+--------------------+----------------+------------+
 
 Notes:
 
 (1)
-   The ``c``, ``s`` and ``p`` conversion codes operate on :class:`bytes`
-   objects, but packing with such codes also supports :class:`str` objects,
-   which are encoded using UTF-8.
-
-(2)
    The ``'?'`` conversion code corresponds to the :c:type:`_Bool` type defined by
    C99. If this type is not available, it is simulated using a :c:type:`char`. In
    standard mode, it is always represented by one byte.
 
-(3)
+(2)
    The ``'q'`` and ``'Q'`` conversion codes are available in native mode only if
    the platform C compiler supports C :c:type:`long long`, or, on Windows,
    :c:type:`__int64`.  They are always available in standard modes.
 
-(4)
+(3)
    When attempting to pack a non-integer using any of the integer conversion
    codes, if the non-integer has a :meth:`__index__` method then that method is
    called to convert the argument to an integer before packing.
@@ -223,12 +218,12 @@
    .. versionchanged:: 3.2
       Use of the :meth:`__index__` method for non-integers is new in 3.2.
 
-(5)
+(4)
    For the ``'f'`` and ``'d'`` conversion codes, the packed representation uses
    the IEEE 754 binary32 (for ``'f'``) or binary64 (for ``'d'``) format,
    regardless of the floating-point format used by the platform.
 
-(6)
+(5)
    The ``'P'`` format character is only available for the native byte ordering
    (selected as the default or with the ``'@'`` byte order character). The byte
    order character ``'='`` chooses to use little- or big-endian ordering based
@@ -310,9 +305,9 @@
 The ordering of format characters may have an impact on size since the padding
 needed to satisfy alignment requirements is different::
 
-    >>> pack('ci', '*', 0x12131415)
+    >>> pack('ci', b'*', 0x12131415)
     b'*\x00\x00\x00\x12\x13\x14\x15'
-    >>> pack('ic', 0x12131415, '*')
+    >>> pack('ic', 0x12131415, b'*')
     b'\x12\x13\x14\x15*'
     >>> calcsize('ci')
     8

Modified: python/branches/py3k/Doc/whatsnew/3.2.rst
==============================================================================
--- python/branches/py3k/Doc/whatsnew/3.2.rst	(original)
+++ python/branches/py3k/Doc/whatsnew/3.2.rst	Tue Dec 28 14:26:42 2010
@@ -1705,3 +1705,7 @@
 
   (Contributed by Georg Brandl and Mattias Brändström;
   `appspot issue 53094 <http://codereview.appspot.com/53094>`_.)
+
+* :func:`struct.pack` doesn't encode implicitly unicode to UTF-8 anymore: use
+  explicit conversion instead and replace unicode literals by bytes literals.
+

Modified: python/branches/py3k/Lib/test/test_struct.py
==============================================================================
--- python/branches/py3k/Lib/test/test_struct.py	(original)
+++ python/branches/py3k/Lib/test/test_struct.py	Tue Dec 28 14:26:42 2010
@@ -82,58 +82,52 @@
         # Test some of the new features in detail
         # (format, argument, big-endian result, little-endian result, asymmetric)
         tests = [
-            ('c', 'a', 'a', 'a', 0),
-            ('xc', 'a', '\0a', '\0a', 0),
-            ('cx', 'a', 'a\0', 'a\0', 0),
-            ('s', 'a', 'a', 'a', 0),
-            ('0s', 'helloworld', '', '', 1),
-            ('1s', 'helloworld', 'h', 'h', 1),
-            ('9s', 'helloworld', 'helloworl', 'helloworl', 1),
-            ('10s', 'helloworld', 'helloworld', 'helloworld', 0),
-            ('11s', 'helloworld', 'helloworld\0', 'helloworld\0', 1),
-            ('20s', 'helloworld', 'helloworld'+10*'\0', 'helloworld'+10*'\0', 1),
-            ('b', 7, '\7', '\7', 0),
-            ('b', -7, '\371', '\371', 0),
-            ('B', 7, '\7', '\7', 0),
-            ('B', 249, '\371', '\371', 0),
-            ('h', 700, '\002\274', '\274\002', 0),
-            ('h', -700, '\375D', 'D\375', 0),
-            ('H', 700, '\002\274', '\274\002', 0),
-            ('H', 0x10000-700, '\375D', 'D\375', 0),
-            ('i', 70000000, '\004,\035\200', '\200\035,\004', 0),
-            ('i', -70000000, '\373\323\342\200', '\200\342\323\373', 0),
-            ('I', 70000000, '\004,\035\200', '\200\035,\004', 0),
-            ('I', 0x100000000-70000000, '\373\323\342\200', '\200\342\323\373', 0),
-            ('l', 70000000, '\004,\035\200', '\200\035,\004', 0),
-            ('l', -70000000, '\373\323\342\200', '\200\342\323\373', 0),
-            ('L', 70000000, '\004,\035\200', '\200\035,\004', 0),
-            ('L', 0x100000000-70000000, '\373\323\342\200', '\200\342\323\373', 0),
-            ('f', 2.0, '@\000\000\000', '\000\000\000@', 0),
-            ('d', 2.0, '@\000\000\000\000\000\000\000',
-                       '\000\000\000\000\000\000\000@', 0),
-            ('f', -2.0, '\300\000\000\000', '\000\000\000\300', 0),
-            ('d', -2.0, '\300\000\000\000\000\000\000\000',
-                       '\000\000\000\000\000\000\000\300', 0),
-                ('?', 0, '\0', '\0', 0),
-                ('?', 3, '\1', '\1', 1),
-                ('?', True, '\1', '\1', 0),
-                ('?', [], '\0', '\0', 1),
-                ('?', (1,), '\1', '\1', 1),
+            ('c', b'a', b'a', b'a', 0),
+            ('xc', b'a', b'\0a', b'\0a', 0),
+            ('cx', b'a', b'a\0', b'a\0', 0),
+            ('s', b'a', b'a', b'a', 0),
+            ('0s', b'helloworld', b'', b'', 1),
+            ('1s', b'helloworld', b'h', b'h', 1),
+            ('9s', b'helloworld', b'helloworl', b'helloworl', 1),
+            ('10s', b'helloworld', b'helloworld', b'helloworld', 0),
+            ('11s', b'helloworld', b'helloworld\0', b'helloworld\0', 1),
+            ('20s', b'helloworld', b'helloworld'+10*b'\0', b'helloworld'+10*b'\0', 1),
+            ('b', 7, b'\7', b'\7', 0),
+            ('b', -7, b'\371', b'\371', 0),
+            ('B', 7, b'\7', b'\7', 0),
+            ('B', 249, b'\371', b'\371', 0),
+            ('h', 700, b'\002\274', b'\274\002', 0),
+            ('h', -700, b'\375D', b'D\375', 0),
+            ('H', 700, b'\002\274', b'\274\002', 0),
+            ('H', 0x10000-700, b'\375D', b'D\375', 0),
+            ('i', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
+            ('i', -70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
+            ('I', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
+            ('I', 0x100000000-70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
+            ('l', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
+            ('l', -70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
+            ('L', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
+            ('L', 0x100000000-70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
+            ('f', 2.0, b'@\000\000\000', b'\000\000\000@', 0),
+            ('d', 2.0, b'@\000\000\000\000\000\000\000',
+                       b'\000\000\000\000\000\000\000@', 0),
+            ('f', -2.0, b'\300\000\000\000', b'\000\000\000\300', 0),
+            ('d', -2.0, b'\300\000\000\000\000\000\000\000',
+                        b'\000\000\000\000\000\000\000\300', 0),
+            ('?', 0, b'\0', b'\0', 0),
+            ('?', 3, b'\1', b'\1', 1),
+            ('?', True, b'\1', b'\1', 0),
+            ('?', [], b'\0', b'\0', 1),
+            ('?', (1,), b'\1', b'\1', 1),
         ]
 
         for fmt, arg, big, lil, asy in tests:
-            big = bytes(big, "latin-1")
-            lil = bytes(lil, "latin-1")
             for (xfmt, exp) in [('>'+fmt, big), ('!'+fmt, big), ('<'+fmt, lil),
                                 ('='+fmt, ISBIGENDIAN and big or lil)]:
                 res = struct.pack(xfmt, arg)
                 self.assertEqual(res, exp)
                 self.assertEqual(struct.calcsize(xfmt), len(res))
                 rev = struct.unpack(xfmt, res)[0]
-                if isinstance(arg, str):
-                    # Strings are returned as bytes since you can't know the
-                    # encoding of the string when packed.
-                    arg = bytes(arg, 'latin1')
                 if rev != arg:
                     self.assertTrue(asy)
 
@@ -334,15 +328,14 @@
     def test_p_code(self):
         # Test p ("Pascal string") code.
         for code, input, expected, expectedback in [
-                ('p','abc', '\x00', b''),
-                ('1p', 'abc', '\x00', b''),
-                ('2p', 'abc', '\x01a', b'a'),
-                ('3p', 'abc', '\x02ab', b'ab'),
-                ('4p', 'abc', '\x03abc', b'abc'),
-                ('5p', 'abc', '\x03abc\x00', b'abc'),
-                ('6p', 'abc', '\x03abc\x00\x00', b'abc'),
-                ('1000p', 'x'*1000, '\xff' + 'x'*999, b'x'*255)]:
-            expected = bytes(expected, "latin-1")
+                ('p',  b'abc', b'\x00',            b''),
+                ('1p', b'abc', b'\x00',            b''),
+                ('2p', b'abc', b'\x01a',           b'a'),
+                ('3p', b'abc', b'\x02ab',          b'ab'),
+                ('4p', b'abc', b'\x03abc',         b'abc'),
+                ('5p', b'abc', b'\x03abc\x00',     b'abc'),
+                ('6p', b'abc', b'\x03abc\x00\x00', b'abc'),
+                ('1000p', b'x'*1000, b'\xff' + b'x'*999, b'x'*255)]:
             got = struct.pack(code, input)
             self.assertEqual(got, expected)
             (got,) = struct.unpack(code, got)
@@ -401,15 +394,11 @@
         s = struct.Struct(fmt)
         for cls in (bytes, bytearray):
             data = cls(test_string)
-            if not isinstance(data, (bytes, bytearray)):
-                bytes_data = bytes(data, 'latin1')
-            else:
-                bytes_data = data
             self.assertEqual(s.unpack_from(data), (b'abcd',))
             self.assertEqual(s.unpack_from(data, 2), (b'cd01',))
             self.assertEqual(s.unpack_from(data, 4), (b'0123',))
             for i in range(6):
-                self.assertEqual(s.unpack_from(data, i), (bytes_data[i:i+4],))
+                self.assertEqual(s.unpack_from(data, i), (data[i:i+4],))
             for i in range(6, len(test_string) + 1):
                 self.assertRaises(struct.error, s.unpack_from, data, i)
         for cls in (bytes, bytearray):

Modified: python/branches/py3k/Lib/wave.py
==============================================================================
--- python/branches/py3k/Lib/wave.py	(original)
+++ python/branches/py3k/Lib/wave.py	Tue Dec 28 14:26:42 2010
@@ -467,11 +467,11 @@
         self._datalength = self._nframes * self._nchannels * self._sampwidth
         self._form_length_pos = self._file.tell()
         self._file.write(struct.pack('<l4s4slhhllhh4s',
-            36 + self._datalength, 'WAVE', 'fmt ', 16,
+            36 + self._datalength, b'WAVE', b'fmt ', 16,
             WAVE_FORMAT_PCM, self._nchannels, self._framerate,
             self._nchannels * self._framerate * self._sampwidth,
             self._nchannels * self._sampwidth,
-            self._sampwidth * 8, 'data'))
+            self._sampwidth * 8, b'data'))
         self._data_length_pos = self._file.tell()
         self._file.write(struct.pack('<l', self._datalength))
         self._headerwritten = True

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Tue Dec 28 14:26:42 2010
@@ -18,6 +18,9 @@
 Library
 -------
 
+- Issue #10783: struct.pack() doesn't encode implicitly unicode to UTF-8
+  anymore.
+
 - Issue #10730: Add SVG mime types to mimetypes module.
 
 - Issue #10768: Make the Tkinter ScrolledText widget work again.

Modified: python/branches/py3k/Modules/_struct.c
==============================================================================
--- python/branches/py3k/Modules/_struct.c	(original)
+++ python/branches/py3k/Modules/_struct.c	Tue Dec 28 14:26:42 2010
@@ -462,14 +462,9 @@
 static int
 np_char(char *p, PyObject *v, const formatdef *f)
 {
-    if (PyUnicode_Check(v)) {
-        v = _PyUnicode_AsDefaultEncodedString(v, NULL);
-        if (v == NULL)
-            return -1;
-    }
     if (!PyBytes_Check(v) || PyBytes_Size(v) != 1) {
         PyErr_SetString(StructError,
-                        "char format requires bytes or string of length 1");
+                        "char format requires a bytes object of length 1");
         return -1;
     }
     *p = *PyBytes_AsString(v);
@@ -1345,7 +1340,7 @@
     if (!PyBytes_Check(o_format)) {
         Py_DECREF(o_format);
         PyErr_Format(PyExc_TypeError,
-                     "Struct() argument 1 must be bytes, not %.200s",
+                     "Struct() argument 1 must be a bytes object, not %.200s",
                      Py_TYPE(o_format)->tp_name);
         return -1;
     }
@@ -1423,7 +1418,7 @@
         return NULL;
     if (vbuf.len != soself->s_size) {
         PyErr_Format(StructError,
-                     "unpack requires a bytes argument of length %zd",
+                     "unpack requires a bytes object of length %zd",
                      soself->s_size);
         PyBuffer_Release(&vbuf);
         return NULL;
@@ -1503,15 +1498,10 @@
         if (e->format == 's') {
             int isstring;
             void *p;
-            if (PyUnicode_Check(v)) {
-                v = _PyUnicode_AsDefaultEncodedString(v, NULL);
-                if (v == NULL)
-                    return -1;
-            }
             isstring = PyBytes_Check(v);
             if (!isstring && !PyByteArray_Check(v)) {
                 PyErr_SetString(StructError,
-                                "argument for 's' must be a bytes or string");
+                                "argument for 's' must be a bytes object");
                 return -1;
             }
             if (isstring) {
@@ -1529,15 +1519,10 @@
         } else if (e->format == 'p') {
             int isstring;
             void *p;
-            if (PyUnicode_Check(v)) {
-                v = _PyUnicode_AsDefaultEncodedString(v, NULL);
-                if (v == NULL)
-                    return -1;
-            }
             isstring = PyBytes_Check(v);
             if (!isstring && !PyByteArray_Check(v)) {
                 PyErr_SetString(StructError,
-                                "argument for 'p' must be a bytes or string");
+                                "argument for 'p' must be a bytes object");
                 return -1;
             }
             if (isstring) {
@@ -1691,7 +1676,7 @@
     {NULL,       NULL}          /* sentinel */
 };
 
-PyDoc_STRVAR(s__doc__, 
+PyDoc_STRVAR(s__doc__,
 "Struct(fmt) --> compiled struct object\n"
 "\n"
 "Return a new Struct object which writes and reads binary data according to\n"


More information about the Python-checkins mailing list