[Python-checkins] r57146 - in python/trunk: Doc/c-api/concrete.rst Doc/library/codecs.rst Include/unicodeobject.h Lib/encodings/aliases.py Lib/encodings/utf_32.py Lib/encodings/utf_32_be.py Lib/encodings/utf_32_le.py Lib/test/test_codeccallbacks.py Lib/test/test_codecs.py Misc/NEWS Modules/_codecsmodule.c Objects/unicodeobject.c

walter.doerwald python-checkins at python.org
Fri Aug 17 18:41:58 CEST 2007


Author: walter.doerwald
Date: Fri Aug 17 18:41:28 2007
New Revision: 57146

Added:
   python/trunk/Lib/encodings/utf_32.py
   python/trunk/Lib/encodings/utf_32_be.py
   python/trunk/Lib/encodings/utf_32_le.py
Modified:
   python/trunk/Doc/c-api/concrete.rst
   python/trunk/Doc/library/codecs.rst
   python/trunk/Include/unicodeobject.h
   python/trunk/Lib/encodings/aliases.py
   python/trunk/Lib/test/test_codeccallbacks.py
   python/trunk/Lib/test/test_codecs.py
   python/trunk/Misc/NEWS
   python/trunk/Modules/_codecsmodule.c
   python/trunk/Objects/unicodeobject.c
Log:
Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.
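
A quick usage sketch (not part of the commit; it assumes an interpreter built
from this revision, i.e. 2.6 trunk or later): the new 'utf-32' codec writes a
BOM in native order and detects and strips it again on decoding.

    data = u"spam".encode("utf-32")
    assert len(data) == 4 + 4 * len(u"spam")   # BOM plus four bytes per character
    assert data.decode("utf-32") == u"spam"    # byte order detected from the BOM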


Modified: python/trunk/Doc/c-api/concrete.rst
==============================================================================
--- python/trunk/Doc/c-api/concrete.rst	(original)
+++ python/trunk/Doc/c-api/concrete.rst	Fri Aug 17 18:41:28 2007
@@ -1301,6 +1301,79 @@
    object.  Error handling is "strict".  Return *NULL* if an exception was raised
    by the codec.
 
+These are the UTF-32 codec APIs:
+
+.. % --- UTF-32 Codecs ------------------------------------------------------ */
+
+
+.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
+
+   Decode *size* bytes from a UTF-32 encoded buffer string and return the
+   corresponding Unicode object.  *errors* (if non-*NULL*) defines the error
+   handling. It defaults to "strict".
+
+   If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
+   order::
+
+      *byteorder == -1: little endian
+      *byteorder == 0:  native order
+      *byteorder == 1:  big endian
+
+   and then switches if the first four bytes of the input data are a byte order mark
+   (BOM) and the specified byte order is native order.  This BOM is not copied into
+   the resulting Unicode string.  After completion, *\*byteorder* is set to the
+   current byte order at the end of input data.
+
+   In a narrow build, codepoints outside the BMP will be decoded as surrogate pairs.
+
+   If *byteorder* is *NULL*, the codec starts in native order mode.
+
+   Return *NULL* if an exception was raised by the codec.
+
+   .. versionadded:: 2.6
+
+
+.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
+
+   If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
+   *consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
+   trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
+   by four) as an error. Those bytes will not be decoded and the number of bytes
+   that have been decoded will be stored in *consumed*.
+
+   .. versionadded:: 2.6
+
+
+.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
+
+   Return a Python bytes object holding the UTF-32 encoded value of the Unicode
+   data in *s*.  If *byteorder* is not ``0``, output is written according to the
+   following byte order::
+
+      byteorder == -1: little endian
+      byteorder == 0:  native byte order (writes a BOM mark)
+      byteorder == 1:  big endian
+
+   If *byteorder* is ``0``, the output string will always start with the Unicode BOM
+   mark (U+FEFF). In the other two modes, no BOM mark is prepended.
+
+   If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
+   as a single codepoint.
+
+   Return *NULL* if an exception was raised by the codec.
+
+   .. versionadded:: 2.6
+
+
+.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
+
+   Return a Python string using the UTF-32 encoding in native byte order. The
+   string always starts with a BOM mark.  Error handling is "strict".  Return
+   *NULL* if an exception was raised by the codec.
+
+   .. versionadded:: 2.6
+
+
 These are the UTF-16 codec APIs:
 
 .. % --- UTF-16 Codecs ------------------------------------------------------ */
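
The byte-order handling documented above can be exercised from Python through
the _codecs wrappers added further down in this commit; a minimal sketch
(assuming this revision), using codecs.utf_32_ex_decode, which reports the
byte order in effect at the end of the data:

    import codecs

    be = '\x00\x00\x00A'                       # U+0041, big-endian, no BOM
    assert codecs.utf_32_ex_decode(be, 'strict', 1, True) == (u'A', 4, 1)

    # In native mode (byteorder == 0) a leading BOM selects the byte order
    # and is not copied into the result.
    bom_le = '\xff\xfe\x00\x00' + 'A\x00\x00\x00'
    assert codecs.utf_32_ex_decode(bom_le, 'strict', 0, True) == (u'A', 8, -1)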

Modified: python/trunk/Doc/library/codecs.rst
==============================================================================
--- python/trunk/Doc/library/codecs.rst	(original)
+++ python/trunk/Doc/library/codecs.rst	Fri Aug 17 18:41:28 2007
@@ -1045,6 +1045,12 @@
 | shift_jisx0213  | shiftjisx0213, sjisx0213,      | Japanese                       |
 |                 | s_jisx0213                     |                                |
 +-----------------+--------------------------------+--------------------------------+
+| utf_32          | U32, utf32                     | all languages                  |
++-----------------+--------------------------------+--------------------------------+
+| utf_32_be       | UTF-32BE                       | all languages                  |
++-----------------+--------------------------------+--------------------------------+
+| utf_32_le       | UTF-32LE                       | all languages                  |
++-----------------+--------------------------------+--------------------------------+
 | utf_16          | U16, utf16                     | all languages                  |
 +-----------------+--------------------------------+--------------------------------+
 | utf_16_be       | UTF-16BE                       | all languages (BMP only)       |

Modified: python/trunk/Include/unicodeobject.h
==============================================================================
--- python/trunk/Include/unicodeobject.h	(original)
+++ python/trunk/Include/unicodeobject.h	Fri Aug 17 18:41:28 2007
@@ -145,6 +145,7 @@
 # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
 # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
+# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
 # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
 # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
 # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
@@ -159,6 +160,8 @@
 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
+# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
+# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
 # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
@@ -170,6 +173,7 @@
 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
+# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
@@ -223,6 +227,7 @@
 # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
 # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
+# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
 # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
 # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
 # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
@@ -237,6 +242,8 @@
 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
+# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
+# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
 # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
@@ -248,6 +255,7 @@
 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
+# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
@@ -701,6 +709,80 @@
     const char *errors		/* error handling */
     );
 
+/* --- UTF-32 Codecs ------------------------------------------------------ */
+
+/* Decodes length bytes from a UTF-32 encoded buffer string and returns
+   the corresponding Unicode object.
+
+   errors (if non-NULL) defines the error handling. It defaults
+   to "strict". 
+
+   If byteorder is non-NULL, the decoder starts decoding using the
+   given byte order:
+
+	*byteorder == -1: little endian
+	*byteorder == 0:  native order
+	*byteorder == 1:  big endian
+
+   In native mode, the first four bytes of the stream are checked for a
+   BOM mark. If found, the BOM mark is analysed, the byte order
+   adjusted and the BOM skipped.  In the other modes, no BOM mark
+   interpretation is done. After completion, *byteorder is set to the
+   current byte order at the end of input data.
+
+   If byteorder is NULL, the codec starts in native order mode.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
+    const char *string, 	/* UTF-32 encoded string */
+    Py_ssize_t length,	 	/* size of string */
+    const char *errors,		/* error handling */
+    int *byteorder		/* pointer to byteorder to use
+				   0=native;-1=LE,1=BE; updated on
+				   exit */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
+    const char *string, 	/* UTF-32 encoded string */
+    Py_ssize_t length,	 	/* size of string */
+    const char *errors,		/* error handling */
+    int *byteorder,		/* pointer to byteorder to use
+				   0=native;-1=LE,1=BE; updated on
+				   exit */
+    Py_ssize_t *consumed	/* bytes consumed */
+    );
+
+/* Returns a Python string using the UTF-32 encoding in native byte
+   order. The string always starts with a BOM mark.  */
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
+    PyObject *unicode	 	/* Unicode object */
+    );
+
+/* Returns a Python string object holding the UTF-32 encoded value of
+   the Unicode data.
+
+   If byteorder is not 0, output is written according to the following
+   byte order:
+
+   byteorder == -1: little endian
+   byteorder == 0:  native byte order (writes a BOM mark)
+   byteorder == 1:  big endian
+
+   If byteorder is 0, the output string will always start with the
+   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
+   prepended.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
+    const Py_UNICODE *data, 	/* Unicode char buffer */
+    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
+    const char *errors,		/* error handling */
+    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
+    );
+
 /* --- UTF-16 Codecs ------------------------------------------------------ */
 
 /* Decodes length bytes from a UTF-16 encoded buffer string and returns

Modified: python/trunk/Lib/encodings/aliases.py
==============================================================================
--- python/trunk/Lib/encodings/aliases.py	(original)
+++ python/trunk/Lib/encodings/aliases.py	Fri Aug 17 18:41:28 2007
@@ -490,6 +490,16 @@
     'unicodelittleunmarked' : 'utf_16_le',
     'utf_16le'           : 'utf_16_le',
 
+    # utf_32 codec
+    'u32'                : 'utf_32',
+    'utf32'              : 'utf_32',
+
+    # utf_32_be codec
+    'utf_32be'           : 'utf_32_be',
+
+    # utf_32_le codec
+    'utf_32le'           : 'utf_32_le',
+
     # utf_7 codec
     'u7'                 : 'utf_7',
     'utf7'               : 'utf_7',
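
A minimal sketch (assuming this revision) of what the new alias entries buy:
the usual spelling variants all resolve to the same codecs.

    import codecs

    assert codecs.lookup('U32').name == 'utf-32'
    assert codecs.lookup('utf32').name == 'utf-32'
    assert codecs.lookup('UTF-32BE').name == 'utf-32-be'
    assert codecs.lookup('UTF-32LE').name == 'utf-32-le'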

Added: python/trunk/Lib/encodings/utf_32.py
==============================================================================
--- (empty file)
+++ python/trunk/Lib/encodings/utf_32.py	Fri Aug 17 18:41:28 2007
@@ -0,0 +1,144 @@
+"""
+Python 'utf-32' Codec
+"""
+import codecs, sys
+
+### Codec APIs
+
+encode = codecs.utf_32_encode
+
+def decode(input, errors='strict'):
+    return codecs.utf_32_decode(input, errors, True)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    def __init__(self, errors='strict'):
+        codecs.IncrementalEncoder.__init__(self, errors)
+        self.encoder = None
+
+    def encode(self, input, final=False):
+        if self.encoder is None:
+            result = codecs.utf_32_encode(input, self.errors)[0]
+            if sys.byteorder == 'little':
+                self.encoder = codecs.utf_32_le_encode
+            else:
+                self.encoder = codecs.utf_32_be_encode
+            return result
+        return self.encoder(input, self.errors)[0]
+
+    def reset(self):
+        codecs.IncrementalEncoder.reset(self)
+        self.encoder = None
+
+    def getstate(self):
+        # state info we return to the caller:
+        # 0: stream is in natural order for this platform
+        # 2: endianness hasn't been determined yet
+        # (we're never writing in unnatural order)
+        return (2 if self.encoder is None else 0)
+
+    def setstate(self, state):
+        if state:
+            self.encoder = None
+        else:
+            if sys.byteorder == 'little':
+                self.encoder = codecs.utf_32_le_encode
+            else:
+                self.encoder = codecs.utf_32_be_encode
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    def __init__(self, errors='strict'):
+        codecs.BufferedIncrementalDecoder.__init__(self, errors)
+        self.decoder = None
+
+    def _buffer_decode(self, input, errors, final):
+        if self.decoder is None:
+            (output, consumed, byteorder) = \
+                codecs.utf_32_ex_decode(input, errors, 0, final)
+            if byteorder == -1:
+                self.decoder = codecs.utf_32_le_decode
+            elif byteorder == 1:
+                self.decoder = codecs.utf_32_be_decode
+            elif consumed >= 4:
+                raise UnicodeError("UTF-32 stream does not start with BOM")
+            return (output, consumed)
+        return self.decoder(input, self.errors, final)
+
+    def reset(self):
+        codecs.BufferedIncrementalDecoder.reset(self)
+        self.decoder = None
+
+    def getstate(self):
+        # additional state info from the base class must be None here,
+        # as it isn't passed along to the caller
+        state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
+        # additional state info we pass to the caller:
+        # 0: stream is in natural order for this platform
+        # 1: stream is in unnatural order
+        # 2: endianness hasn't been determined yet
+        if self.decoder is None:
+            return (state, 2)
+        addstate = int((sys.byteorder == "big") !=
+                       (self.decoder is codecs.utf_32_be_decode))
+        return (state, addstate)
+
+    def setstate(self, state):
+        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
+        codecs.BufferedIncrementalDecoder.setstate(self, state)
+        state = state[1]
+        if state == 0:
+            self.decoder = (codecs.utf_32_be_decode
+                            if sys.byteorder == "big"
+                            else codecs.utf_32_le_decode)
+        elif state == 1:
+            self.decoder = (codecs.utf_32_le_decode
+                            if sys.byteorder == "big"
+                            else codecs.utf_32_be_decode)
+        else:
+            self.decoder = None
+
+class StreamWriter(codecs.StreamWriter):
+    def __init__(self, stream, errors='strict'):
+        self.bom_written = False
+        codecs.StreamWriter.__init__(self, stream, errors)
+
+    def encode(self, input, errors='strict'):
+        self.bom_written = True
+        result = codecs.utf_32_encode(input, errors)
+        if sys.byteorder == 'little':
+            self.encode = codecs.utf_32_le_encode
+        else:
+            self.encode = codecs.utf_32_be_encode
+        return result
+
+class StreamReader(codecs.StreamReader):
+
+    def reset(self):
+        codecs.StreamReader.reset(self)
+        try:
+            del self.decode
+        except AttributeError:
+            pass
+
+    def decode(self, input, errors='strict'):
+        (object, consumed, byteorder) = \
+            codecs.utf_32_ex_decode(input, errors, 0, False)
+        if byteorder == -1:
+            self.decode = codecs.utf_32_le_decode
+        elif byteorder == 1:
+            self.decode = codecs.utf_32_be_decode
+        elif consumed >= 4:
+            raise UnicodeError("UTF-32 stream does not start with BOM")
+        return (object, consumed)
+
+### encodings module API
+
+def getregentry():
+    return codecs.CodecInfo(
+        name='utf-32',
+        encode=encode,
+        decode=decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+    )
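
A minimal sketch (assuming this revision) of the incremental encoder defined
above: it emits the BOM only on its first call and then switches to the
endian-specific encoder, which getstate() reflects.

    import codecs

    enc = codecs.getincrementalencoder('utf-32')()
    first = enc.encode(u'a')
    second = enc.encode(u'b')
    assert len(first) == 8 and len(second) == 4   # BOM + 'a', then just 'b'
    assert enc.getstate() == 0                    # byte order has been decided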

Added: python/trunk/Lib/encodings/utf_32_be.py
==============================================================================
--- (empty file)
+++ python/trunk/Lib/encodings/utf_32_be.py	Fri Aug 17 18:41:28 2007
@@ -0,0 +1,37 @@
+"""
+Python 'utf-32-be' Codec
+"""
+import codecs
+
+### Codec APIs
+
+encode = codecs.utf_32_be_encode
+
+def decode(input, errors='strict'):
+    return codecs.utf_32_be_decode(input, errors, True)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    def encode(self, input, final=False):
+        return codecs.utf_32_be_encode(input, self.errors)[0]
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    _buffer_decode = codecs.utf_32_be_decode
+
+class StreamWriter(codecs.StreamWriter):
+    encode = codecs.utf_32_be_encode
+
+class StreamReader(codecs.StreamReader):
+    decode = codecs.utf_32_be_decode
+
+### encodings module API
+
+def getregentry():
+    return codecs.CodecInfo(
+        name='utf-32-be',
+        encode=encode,
+        decode=decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+    )

Added: python/trunk/Lib/encodings/utf_32_le.py
==============================================================================
--- (empty file)
+++ python/trunk/Lib/encodings/utf_32_le.py	Fri Aug 17 18:41:28 2007
@@ -0,0 +1,37 @@
+"""
+Python 'utf-32-le' Codec
+"""
+import codecs
+
+### Codec APIs
+
+encode = codecs.utf_32_le_encode
+
+def decode(input, errors='strict'):
+    return codecs.utf_32_le_decode(input, errors, True)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    def encode(self, input, final=False):
+        return codecs.utf_32_le_encode(input, self.errors)[0]
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    _buffer_decode = codecs.utf_32_le_decode
+
+class StreamWriter(codecs.StreamWriter):
+    encode = codecs.utf_32_le_encode
+
+class StreamReader(codecs.StreamReader):
+    decode = codecs.utf_32_le_decode
+
+### encodings module API
+
+def getregentry():
+    return codecs.CodecInfo(
+        name='utf-32-le',
+        encode=encode,
+        decode=decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+    )
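
The endian-specific codecs above never write or expect a BOM; they differ
only in the order of the four bytes per code point. A short sketch (assuming
this revision):

    assert u'\u0100'.encode('utf-32-le') == '\x00\x01\x00\x00'
    assert u'\u0100'.encode('utf-32-be') == '\x00\x00\x01\x00'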

Modified: python/trunk/Lib/test/test_codeccallbacks.py
==============================================================================
--- python/trunk/Lib/test/test_codeccallbacks.py	(original)
+++ python/trunk/Lib/test/test_codeccallbacks.py	Fri Aug 17 18:41:28 2007
@@ -285,7 +285,8 @@
 
     def test_longstrings(self):
         # test long strings to check for memory overflow problems
-        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
+        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
+                   "backslashreplace"]
         # register the handlers under different names,
         # to prevent the codec from recognizing the name
         for err in errors:
@@ -293,7 +294,8 @@
         l = 1000
         errors += [ "test." + err for err in errors ]
         for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
-            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
+            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
+                        "utf-8", "utf-7", "utf-16", "utf-32"):
                 for err in errors:
                     try:
                         uni.encode(enc, err)

Modified: python/trunk/Lib/test/test_codecs.py
==============================================================================
--- python/trunk/Lib/test/test_codecs.py	(original)
+++ python/trunk/Lib/test/test_codecs.py	Fri Aug 17 18:41:28 2007
@@ -244,6 +244,137 @@
         self.assertEqual(reader.readline(), s5)
         self.assertEqual(reader.readline(), u"")
 
+class UTF32Test(ReadTest):
+    encoding = "utf-32"
+
+    spamle = ('\xff\xfe\x00\x00'
+              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
+              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
+    spambe = ('\x00\x00\xfe\xff'
+              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
+              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
+
+    def test_only_one_bom(self):
+        _,_,reader,writer = codecs.lookup(self.encoding)
+        # encode some stream
+        s = StringIO.StringIO()
+        f = writer(s)
+        f.write(u"spam")
+        f.write(u"spam")
+        d = s.getvalue()
+        # check whether there is exactly one BOM in it
+        self.assert_(d == self.spamle or d == self.spambe)
+        # try to read it back
+        s = StringIO.StringIO(d)
+        f = reader(s)
+        self.assertEquals(f.read(), u"spamspam")
+
+    def test_badbom(self):
+        s = StringIO.StringIO(4*"\xff")
+        f = codecs.getreader(self.encoding)(s)
+        self.assertRaises(UnicodeError, f.read)
+
+        s = StringIO.StringIO(8*"\xff")
+        f = codecs.getreader(self.encoding)(s)
+        self.assertRaises(UnicodeError, f.read)
+
+    def test_partial(self):
+        self.check_partial(
+            u"\x00\xff\u0100\uffff",
+            [
+                u"", # first byte of BOM read
+                u"", # second byte of BOM read
+                u"", # third byte of BOM read
+                u"", # fourth byte of BOM read => byteorder known
+                u"",
+                u"",
+                u"",
+                u"\x00",
+                u"\x00",
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100\uffff",
+            ]
+        )
+
+    def test_errors(self):
+        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
+                          "\xff", "strict", True)
+
+class UTF32LETest(ReadTest):
+    encoding = "utf-32-le"
+
+    def test_partial(self):
+        self.check_partial(
+            u"\x00\xff\u0100\uffff",
+            [
+                u"",
+                u"",
+                u"",
+                u"\x00",
+                u"\x00",
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100\uffff",
+            ]
+        )
+
+    def test_simple(self):
+        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
+
+    def test_errors(self):
+        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
+                          "\xff", "strict", True)
+
+class UTF32BETest(ReadTest):
+    encoding = "utf-32-be"
+
+    def test_partial(self):
+        self.check_partial(
+            u"\x00\xff\u0100\uffff",
+            [
+                u"",
+                u"",
+                u"",
+                u"\x00",
+                u"\x00",
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100\uffff",
+            ]
+        )
+
+    def test_simple(self):
+        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
+
+    def test_errors(self):
+        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
+                          "\xff", "strict", True)
+
 class UTF16Test(ReadTest):
     encoding = "utf-16"
 
@@ -1278,6 +1409,9 @@
 
 def test_main():
     test_support.run_unittest(
+        UTF32Test,
+        UTF32LETest,
+        UTF32BETest,
         UTF16Test,
         UTF16LETest,
         UTF16BETest,
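
The expected-output lists in test_partial come from feeding the encoded form
of u"\x00\xff\u0100\uffff" to the codec one byte at a time (check_partial is
defined in ReadTest, not shown in this diff). Roughly the same behaviour can
be seen with the incremental decoder; a sketch assuming this revision:

    import codecs

    dec = codecs.getincrementaldecoder('utf-32')()
    data = u'\x00\xff'.encode('utf-32')   # BOM plus two code points
    out = u''
    for byte in data:
        out += dec.decode(byte)           # u'' until four more bytes are complete
    assert out == u'\x00\xff'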

Modified: python/trunk/Misc/NEWS
==============================================================================
--- python/trunk/Misc/NEWS	(original)
+++ python/trunk/Misc/NEWS	Fri Aug 17 18:41:28 2007
@@ -243,6 +243,8 @@
 - GB18030 codec now can encode additional two-byte characters that
   are missing in GBK.
 
+- Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE.
+
 - Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot
   represent the result in a single character.
 

Modified: python/trunk/Modules/_codecsmodule.c
==============================================================================
--- python/trunk/Modules/_codecsmodule.c	(original)
+++ python/trunk/Modules/_codecsmodule.c	Fri Aug 17 18:41:28 2007
@@ -392,6 +392,126 @@
 }
 
 static PyObject *
+utf_32_decode(PyObject *self,
+	    PyObject *args)
+{
+    const char *data;
+    Py_ssize_t size;
+    const char *errors = NULL;
+    int byteorder = 0;
+    int final = 0;
+    Py_ssize_t consumed;
+    PyObject *decoded;
+
+    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode",
+			  &data, &size, &errors, &final))
+	return NULL;
+    if (size < 0) {
+	    PyErr_SetString(PyExc_ValueError, "negative argument");
+	    return 0;
+    }
+    consumed = size; /* This is overwritten unless final is true. */
+    decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
+					    final ? NULL : &consumed);
+    if (decoded == NULL)
+	return NULL;
+    return codec_tuple(decoded, consumed);
+}
+
+static PyObject *
+utf_32_le_decode(PyObject *self,
+		 PyObject *args)
+{
+    const char *data;
+    Py_ssize_t size;
+    const char *errors = NULL;
+    int byteorder = -1;
+    int final = 0;
+    Py_ssize_t consumed;
+    PyObject *decoded = NULL;
+
+    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode",
+			  &data, &size, &errors, &final))
+	return NULL;
+
+    if (size < 0) {
+          PyErr_SetString(PyExc_ValueError, "negative argument");
+          return 0;
+    }
+    consumed = size; /* This is overwritten unless final is true. */
+    decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
+	&byteorder, final ? NULL : &consumed);
+    if (decoded == NULL)
+	return NULL;
+    return codec_tuple(decoded, consumed);
+
+}
+
+static PyObject *
+utf_32_be_decode(PyObject *self,
+		 PyObject *args)
+{
+    const char *data;
+    Py_ssize_t size;
+    const char *errors = NULL;
+    int byteorder = 1;
+    int final = 0;
+    Py_ssize_t consumed;
+    PyObject *decoded = NULL;
+
+    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode",
+			  &data, &size, &errors, &final))
+	return NULL;
+    if (size < 0) {
+          PyErr_SetString(PyExc_ValueError, "negative argument");
+          return 0;
+    }
+    consumed = size; /* This is overwritten unless final is true. */
+    decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
+	&byteorder, final ? NULL : &consumed);
+    if (decoded == NULL)
+	return NULL;
+    return codec_tuple(decoded, consumed);
+}
+
+/* This non-standard version also provides access to the byteorder
+   parameter of the builtin UTF-32 codec.
+
+   It returns a tuple (unicode, bytesread, byteorder) with byteorder
+   being the value in effect at the end of data.
+
+*/
+
+static PyObject *
+utf_32_ex_decode(PyObject *self,
+		 PyObject *args)
+{
+    const char *data;
+    Py_ssize_t size;
+    const char *errors = NULL;
+    int byteorder = 0;
+    PyObject *unicode, *tuple;
+    int final = 0;
+    Py_ssize_t consumed;
+
+    if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode",
+			  &data, &size, &errors, &byteorder, &final))
+	return NULL;
+    if (size < 0) {
+	    PyErr_SetString(PyExc_ValueError, "negative argument");
+	    return 0;
+    }
+    consumed = size; /* This is overwritten unless final is true. */
+    unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
+					    final ? NULL : &consumed);
+    if (unicode == NULL)
+	return NULL;
+    tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
+    Py_DECREF(unicode);
+    return tuple;
+}
+
+static PyObject *
 unicode_escape_decode(PyObject *self,
 		     PyObject *args)
 {
@@ -683,6 +803,83 @@
     return v;
 }
 
+/* This version provides access to the byteorder parameter of the
+   builtin UTF-32 codecs as optional third argument. It defaults to 0
+   which means: use the native byte order and prepend the data with a
+   BOM mark.
+
+*/
+
+static PyObject *
+utf_32_encode(PyObject *self,
+	    PyObject *args)
+{
+    PyObject *str, *v;
+    const char *errors = NULL;
+    int byteorder = 0;
+
+    if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
+			  &str, &errors, &byteorder))
+	return NULL;
+
+    str = PyUnicode_FromObject(str);
+    if (str == NULL)
+	return NULL;
+    v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
+					  PyUnicode_GET_SIZE(str),
+					  errors,
+					  byteorder),
+		    PyUnicode_GET_SIZE(str));
+    Py_DECREF(str);
+    return v;
+}
+
+static PyObject *
+utf_32_le_encode(PyObject *self,
+		 PyObject *args)
+{
+    PyObject *str, *v;
+    const char *errors = NULL;
+
+    if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
+			  &str, &errors))
+	return NULL;
+
+    str = PyUnicode_FromObject(str);
+    if (str == NULL)
+	return NULL;
+    v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
+					     PyUnicode_GET_SIZE(str),
+					     errors,
+					     -1),
+		       PyUnicode_GET_SIZE(str));
+    Py_DECREF(str);
+    return v;
+}
+
+static PyObject *
+utf_32_be_encode(PyObject *self,
+		 PyObject *args)
+{
+    PyObject *str, *v;
+    const char *errors = NULL;
+
+    if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
+			  &str, &errors))
+	return NULL;
+
+    str = PyUnicode_FromObject(str);
+    if (str == NULL)
+	return NULL;
+    v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
+					  PyUnicode_GET_SIZE(str),
+					  errors,
+					  +1),
+		    PyUnicode_GET_SIZE(str));
+    Py_DECREF(str);
+    return v;
+}
+
 static PyObject *
 unicode_escape_encode(PyObject *self,
 		     PyObject *args)
@@ -901,6 +1098,13 @@
     {"utf_16_le_decode",	utf_16_le_decode,		METH_VARARGS},
     {"utf_16_be_decode",	utf_16_be_decode,		METH_VARARGS},
     {"utf_16_ex_decode",	utf_16_ex_decode,		METH_VARARGS},
+    {"utf_32_encode",		utf_32_encode,			METH_VARARGS},
+    {"utf_32_le_encode",	utf_32_le_encode,		METH_VARARGS},
+    {"utf_32_be_encode",	utf_32_be_encode,		METH_VARARGS},
+    {"utf_32_decode",		utf_32_decode,			METH_VARARGS},
+    {"utf_32_le_decode",	utf_32_le_decode,		METH_VARARGS},
+    {"utf_32_be_decode",	utf_32_be_decode,		METH_VARARGS},
+    {"utf_32_ex_decode",	utf_32_ex_decode,		METH_VARARGS},
     {"unicode_escape_encode",	unicode_escape_encode,		METH_VARARGS},
     {"unicode_escape_decode",	unicode_escape_decode,		METH_VARARGS},
     {"unicode_internal_encode",	unicode_internal_encode,	METH_VARARGS},
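
The decode wrappers above pass a consumed pointer only when final is false,
so a trailing incomplete sequence is either reported back for a later call
or, on a final call, handed to the error handler. A short sketch (assuming
this revision):

    import codecs

    partial = 'A\x00\x00\x00B\x00'                        # 6 bytes, little-endian
    assert codecs.utf_32_le_decode(partial, 'strict', False) == (u'A', 4)

    try:
        codecs.utf_32_le_decode(partial, 'strict', True)  # final: truncation is an error
    except UnicodeDecodeError:
        pass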

Modified: python/trunk/Objects/unicodeobject.c
==============================================================================
--- python/trunk/Objects/unicodeobject.c	(original)
+++ python/trunk/Objects/unicodeobject.c	Fri Aug 17 18:41:28 2007
@@ -1504,6 +1504,272 @@
 				NULL);
 }
 
+/* --- UTF-32 Codec ------------------------------------------------------- */
+
+PyObject *
+PyUnicode_DecodeUTF32(const char *s,
+		      Py_ssize_t size,
+		      const char *errors,
+		      int *byteorder)
+{
+    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
+}
+
+PyObject *
+PyUnicode_DecodeUTF32Stateful(const char *s,
+			      Py_ssize_t size,
+			      const char *errors,
+			      int *byteorder,
+			      Py_ssize_t *consumed)
+{
+    const char *starts = s;
+    Py_ssize_t startinpos;
+    Py_ssize_t endinpos;
+    Py_ssize_t outpos;
+    PyUnicodeObject *unicode;
+    Py_UNICODE *p;
+#ifndef Py_UNICODE_WIDE
+    int i, pairs;
+#else
+    const int pairs = 0;
+#endif
+    const unsigned char *q, *e;
+    int bo = 0;       /* assume native ordering by default */
+    const char *errmsg = "";
+    /* On narrow builds we split characters outside the BMP into two
+       codepoints => count how much extra space we need. */
+#ifndef Py_UNICODE_WIDE
+    for (i = pairs = 0; i < size/4; i++)
+	if (((Py_UCS4 *)s)[i] >= 0x10000)
+	    pairs++;
+#endif
+    /* Offsets from q for retrieving bytes in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int iorder[] = {0, 1, 2, 3};
+#else
+    int iorder[] = {3, 2, 1, 0};
+#endif
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    /* This might be one too many, because of a BOM */
+    unicode = _PyUnicode_New((size+3)/4+pairs);
+    if (!unicode)
+        return NULL;
+    if (size == 0)
+        return (PyObject *)unicode;
+
+    /* Unpack UTF-32 encoded data */
+    p = unicode->str;
+    q = (unsigned char *)s;
+    e = q + size;
+
+    if (byteorder)
+        bo = *byteorder;
+
+    /* Check for BOM marks (U+FEFF) in the input and adjust current
+       byte order setting accordingly. In native mode, the leading BOM
+       mark is skipped, in all other modes, it is copied to the output
+       stream as-is (giving a ZWNBSP character). */
+    if (bo == 0) {
+        if (size >= 4) {
+            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
+                                (q[iorder[1]] << 8) | q[iorder[0]];
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+	    if (bom == 0x0000FEFF) {
+		q += 4;
+		bo = -1;
+	    }
+	    else if (bom == 0xFFFE0000) {
+		q += 4;
+		bo = 1;
+	    }
+#else
+	    if (bom == 0x0000FEFF) {
+		q += 4;
+		bo = 1;
+	    }
+	    else if (bom == 0xFFFE0000) {
+		q += 4;
+		bo = -1;
+	    }
+#endif
+	}
+    }
+
+    if (bo == -1) {
+        /* force LE */
+        iorder[0] = 0;
+        iorder[1] = 1;
+        iorder[2] = 2;
+        iorder[3] = 3;
+    }
+    else if (bo == 1) {
+        /* force BE */
+        iorder[0] = 3;
+        iorder[1] = 2;
+        iorder[2] = 1;
+        iorder[3] = 0;
+    }
+
+    while (q < e) {
+	Py_UCS4 ch;
+	/* remaining bytes at the end? (size should be divisible by 4) */
+	if (e-q<4) {
+	    if (consumed)
+		break;
+	    errmsg = "truncated data";
+	    startinpos = ((const char *)q)-starts;
+	    endinpos = ((const char *)e)-starts;
+	    goto utf32Error;
+	    /* The remaining input chars are ignored if the callback
+	       chooses to skip the input */
+	}
+	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
+	     (q[iorder[1]] << 8) | q[iorder[0]];
+
+	if (ch >= 0x110000)
+	{
+	    errmsg = "codepoint not in range(0x110000)";
+	    startinpos = ((const char *)q)-starts;
+	    endinpos = startinpos+4;
+	    goto utf32Error;
+	}
+#ifndef Py_UNICODE_WIDE
+	if (ch >= 0x10000)
+	{
+	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
+	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
+	}
+	else
+#endif
+	    *p++ = ch;
+	q += 4;
+	continue;
+    utf32Error:
+	outpos = p-PyUnicode_AS_UNICODE(unicode);
+    if (unicode_decode_call_errorhandler(
+         errors, &errorHandler,
+         "utf32", errmsg,
+         starts, size, &startinpos, &endinpos, &exc, &s,
+         (PyObject **)&unicode, &outpos, &p))
+	    goto onError;
+    }
+
+    if (byteorder)
+        *byteorder = bo;
+
+    if (consumed)
+	*consumed = (const char *)q-starts;
+
+    /* Adjust length */
+    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
+        goto onError;
+
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)unicode;
+
+onError:
+    Py_DECREF(unicode);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
+PyObject *
+PyUnicode_EncodeUTF32(const Py_UNICODE *s,
+		      Py_ssize_t size,
+		      const char *errors,
+		      int byteorder)
+{
+    PyObject *v;
+    unsigned char *p;
+#ifndef Py_UNICODE_WIDE
+    int i, pairs;
+#else
+    const int pairs = 0;
+#endif
+    /* Offsets from p for storing byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int iorder[] = {0, 1, 2, 3};
+#else
+    int iorder[] = {3, 2, 1, 0};
+#endif
+
+#define STORECHAR(CH)                       \
+    do {                                    \
+        p[iorder[3]] = ((CH) >> 24) & 0xff; \
+        p[iorder[2]] = ((CH) >> 16) & 0xff; \
+        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
+        p[iorder[0]] = (CH) & 0xff;         \
+        p += 4;                             \
+    } while(0)
+
+    /* In narrow builds we can output surrogate pairs as one codepoint,
+       so we need less space. */
+#ifndef Py_UNICODE_WIDE
+    for (i = pairs = 0; i < size-1; i++)
+	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
+	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
+	    pairs++;
+#endif
+    v = PyString_FromStringAndSize(NULL,
+		  4 * (size - pairs + (byteorder == 0)));
+    if (v == NULL)
+        return NULL;
+
+    p = (unsigned char *)PyString_AS_STRING(v);
+    if (byteorder == 0)
+	STORECHAR(0xFEFF);
+    if (size == 0)
+        return v;
+
+    if (byteorder == -1) {
+        /* force LE */
+        iorder[0] = 0;
+        iorder[1] = 1;
+        iorder[2] = 2;
+        iorder[3] = 3;
+    }
+    else if (byteorder == 1) {
+        /* force BE */
+        iorder[0] = 3;
+        iorder[1] = 2;
+        iorder[2] = 1;
+        iorder[3] = 0;
+    }
+
+    while (size-- > 0) {
+	Py_UCS4 ch = *s++;
+#ifndef Py_UNICODE_WIDE
+	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
+	    Py_UCS4 ch2 = *s;
+	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+		s++;
+		size--;
+	    }
+	}
+#endif
+        STORECHAR(ch);
+    }
+    return v;
+#undef STORECHAR
+}
+
+PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
+{
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
+				 PyUnicode_GET_SIZE(unicode),
+				 NULL,
+				 0);
+}
+
 /* --- UTF-16 Codec ------------------------------------------------------- */
 
 PyObject *
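
A sketch of the narrow-build handling implemented above (assuming this
revision; the decode branch depends on how the interpreter was built): the
encoder joins a surrogate pair into one 32-bit code point, and on a narrow
(UCS-2) build the decoder splits non-BMP code points back into pairs.

    import sys

    encoded = u'\U00010203'.encode('utf-32-le')
    assert encoded == '\x03\x02\x01\x00'      # always a single 32-bit code point

    decoded = encoded.decode('utf-32-le')
    if sys.maxunicode == 0xFFFF:              # narrow build: surrogate pair
        assert decoded == u'\ud800\ude03' and len(decoded) == 2
    else:                                     # wide build: one character
        assert len(decoded) == 1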

