[Jython-checkins] jython: Codec unicode_internal now uses utf-32be.

jeff.allen jython-checkins at python.org
Mon Jan 14 10:08:21 CET 2013


http://hg.python.org/jython/rev/32b51334df9a
changeset:   6949:32b51334df9a
user:        Jeff Allen <ja...py at farowl.co.uk>
date:        Mon Jan 14 08:15:09 2013 +0000
summary:
  Codec unicode_internal now uses utf-32be.
CPython 3.3 deprecates this codec, which was tied anyway to the CPython
implementation of unicode strings. Some accommodations made in tests to
the Jython approach within test_codecs: now at 16 skips no errors.

files:
  Lib/test/test_codecs.py                |  11 +-
  src/org/python/core/Py.java            |   4 +-
  src/org/python/core/PySystemState.java |   1 +
  src/org/python/core/codecs.java        |   4 +-
  src/org/python/modules/_codecs.java    |  57 +++++++++----
  5 files changed, 54 insertions(+), 23 deletions(-)


diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -720,7 +720,9 @@
     def test_recoding(self):
         f = StringIO.StringIO()
         f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
-        f2.write(u"a")
+        # f2.write(u"a")
+        # Must be bytes in Jython (and probably should have been in CPython)
+        f2.write(b"\x00\x00\x00\x61")
         f2.close()
         # Python used to crash on this at exit because of a refcount
         # bug in _codecsmodule.c
@@ -847,7 +849,6 @@
         for uni, puny in punycode_testcases:
             self.assertEqual(uni, puny.decode("punycode"))
 
-@unittest.skipIf(test_support.is_jython, "FIXME: equates to UTF-32BE in Jython")
 class UnicodeInternalTest(unittest.TestCase):
     def test_bug1251300(self):
         # Decoding with unicode_internal used to not correctly handle "code
@@ -880,7 +881,11 @@
             try:
                 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
             except UnicodeDecodeError, ex:
-                self.assertEqual("unicode_internal", ex.encoding)
+                if test_support.is_jython:
+                    # Jython delegates internally to utf-32be and it shows here
+                    self.assertEqual("utf-32", ex.encoding)
+                else:
+                    self.assertEqual("unicode_internal", ex.encoding)
                 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                 self.assertEqual(4, ex.start)
                 self.assertEqual(8, ex.end)
diff --git a/src/org/python/core/Py.java b/src/org/python/core/Py.java
--- a/src/org/python/core/Py.java
+++ b/src/org/python/core/Py.java
@@ -76,8 +76,10 @@
     public static PyBoolean False;
     /** The Python boolean True **/
     public static PyBoolean True;
-    /** A zero-length Python string **/
+    /** A zero-length Python byte string **/
     public static PyString EmptyString;
+    /** A zero-length Python Unicode string **/
+    public static PyUnicode EmptyUnicode;
     /** A Python string containing '\n' **/
     public static PyString Newline;
     /** A Python unicode string containing '\n' **/
diff --git a/src/org/python/core/PySystemState.java b/src/org/python/core/PySystemState.java
--- a/src/org/python/core/PySystemState.java
+++ b/src/org/python/core/PySystemState.java
@@ -945,6 +945,7 @@
         Py.True = new PyBoolean(true);
 
         Py.EmptyString = new PyString("");
+        Py.EmptyUnicode = new PyUnicode("");
         Py.Newline = new PyString("\n");
         Py.UnicodeNewline = new PyUnicode("\n");
         Py.Space = new PyString(" ");
diff --git a/src/org/python/core/codecs.java b/src/org/python/core/codecs.java
--- a/src/org/python/core/codecs.java
+++ b/src/org/python/core/codecs.java
@@ -21,7 +21,7 @@
  * <p>
  * The class also contains the inner methods of the standard Unicode codecs, available for
  * transcoding of text at the Java level. These also are exposed through the <code>_codecs</code>
- * module. In CPython, the implementation are found in <code>Objects/unicodeobject.c</code>.
+ * module. In CPython, the implementations are found in <code>Objects/unicodeobject.c</code>.
  *
  * @since Jython 2.0
  */
@@ -249,7 +249,7 @@
             throw wrong_exception_type(exc);
         }
         PyObject end = exc.__getattr__("end");
-        return new PyTuple(Py.java2py(""), end);
+        return new PyTuple(Py.EmptyUnicode, end);
     }
 
     private static boolean isUnicodeError(PyObject exc) {
diff --git a/src/org/python/modules/_codecs.java b/src/org/python/modules/_codecs.java
--- a/src/org/python/modules/_codecs.java
+++ b/src/org/python/modules/_codecs.java
@@ -832,7 +832,7 @@
      * @param order byte order to use BE, LE or UNDEFINED (a BOM will be written)
      * @return tuple (encoded_bytes, unicode_consumed)
      */
-    public static PyTuple PyUnicode_EncodeUTF32(String unicode, String errors, ByteOrder order) {
+    private static PyTuple PyUnicode_EncodeUTF32(String unicode, String errors, ByteOrder order) {
 
         // We use a StringBuilder but we are really storing encoded bytes
         StringBuilder v = new StringBuilder(4 * (unicode.length() + 1));
@@ -1347,7 +1347,7 @@
          * Main codec loop consumes 4 bytes and emits one code point with each pass, until there are
          * fewer than 4 bytes left.
          */
-        for (; q < limit; q += 4) {
+        while (q < limit) {
             // Read 4 bytes in two 16-bit chunks according to byte order
             int hi, lo;
             hi = (bytes.charAt(q) << 8) | bytes.charAt(q + 1);
@@ -1356,12 +1356,14 @@
             if (hi == 0) {
                 // It's a BMP character so we can't go wrong
                 unicode.append((char)lo);
+                q += 4;
             } else {
                 // Code may be invalid: let the appendCodePoint method detect that
                 try {
                     unicode.appendCodePoint((hi << 16) + lo);
+                    q += 4;
                 } catch (IllegalArgumentException e) {
-                    q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32be", //
+                    q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32", //
                             bytes, q, q + 4, "codepoint not in range(0x110000)");
                 }
             }
@@ -1387,7 +1389,7 @@
          * Main codec loop consumes 4 bytes and emits one code point with each pass, until there are
          * fewer than 4 bytes left.
          */
-        for (; q < limit; q += 4) {
+        while (q < limit) {
             // Read 4 bytes in two 16-bit chunks according to byte order
             int hi, lo;
             hi = (bytes.charAt(q + 3) << 8) | bytes.charAt(q + 2);
@@ -1396,10 +1398,12 @@
             if (hi == 0) {
                 // It's a BMP character so we can't go wrong
                 unicode.append((char)lo);
+                q += 4;
             } else {
                 // Code may be invalid: let the appendCodePoint method detect that
                 try {
                     unicode.appendCodePoint((hi << 16) + lo);
+                    q += 4;
                 } catch (IllegalArgumentException e) {
                     q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32", //
                             bytes, q, q + 4, "codepoint not in range(0x110000)");
@@ -1447,31 +1451,50 @@
     }
 
     /* --- UnicodeInternal Codec ------------------------------------------ */
-    // XXX Should deprecate unicode-internal codec and delegate to UTF-32BE (when we have one)
+
     /*
      * This codec is supposed to deal with an encoded form equal to the internal representation of
      * the unicode object considered as bytes in memory. This was confusing in CPython as it varied
-     * with machine architecture (width and endian-ness). In Jython, the most compatible choice
-     * would be UTF-32BE since unicode objects report their length as if UCS-4 and
-     * sys.byteorder=='big'. The codec is deprecated in v3.3 as irrelevant, or impossible, in view
-     * of the flexible string representation (which Jython emulates in its own way).
+     * with machine architecture (width and endian-ness). In Jython, where both are fixed, the most
+     * compatible choice is UTF-32BE. The codec is deprecated in v3.3 as irrelevant, or impossible,
+     * in view of the flexible string representation (which Jython emulates in its own way).
      *
      * See http://mail.python.org/pipermail/python-dev/2011-November/114415.html
      */
-    public static PyTuple unicode_internal_encode(String str) {
-        return unicode_internal_encode(str, null);
+    /**
+     * Legacy method to encode given unicode in CPython wide-build internal format (equivalent
+     * UTF-32BE).
+     */
+    @Deprecated
+    public static PyTuple unicode_internal_encode(String unicode) {
+        return utf_32_be_encode(unicode, null);
     }
 
-    public static PyTuple unicode_internal_encode(String str, String errors) {
-        return encode_tuple(str, str.length());
+    /**
+     * Legacy method to encode given unicode in CPython wide-build internal format (equivalent
+     * UTF-32BE). There must be a multiple of 4 bytes.
+     */
+    @Deprecated
+    public static PyTuple unicode_internal_encode(String unicode, String errors) {
+        return utf_32_be_encode(unicode, errors);
     }
 
-    public static PyTuple unicode_internal_decode(String str) {
-        return unicode_internal_decode(str, null);
+    /**
+     * Legacy method to decode given bytes as if CPython wide-build internal format (equivalent
+     * UTF-32BE). There must be a multiple of 4 bytes.
+     */
+    @Deprecated
+    public static PyTuple unicode_internal_decode(String bytes) {
+        return utf_32_be_decode(bytes, null, true);
     }
 
-    public static PyTuple unicode_internal_decode(String str, String errors) {
-        return decode_tuple(str, str.length());
+    /**
+     * Legacy method to decode given bytes as if CPython wide-build internal format (equivalent
+     * UTF-32BE). There must be a multiple of 4 bytes.
+     */
+    @Deprecated
+    public static PyTuple unicode_internal_decode(String bytes, String errors) {
+        return utf_32_be_decode(bytes, errors, true);
     }
 
     /**

-- 
Repository URL: http://hg.python.org/jython


More information about the Jython-checkins mailing list