[Jython-checkins] jython: Codec unicode_internal now uses utf-32be.
jeff.allen
jython-checkins at python.org
Mon Jan 14 10:08:21 CET 2013
http://hg.python.org/jython/rev/32b51334df9a
changeset: 6949:32b51334df9a
user: Jeff Allen <ja...py at farowl.co.uk>
date: Mon Jan 14 08:15:09 2013 +0000
summary:
Codec unicode_internal now uses utf-32be.
CPython 3.3 deprecates this codec, which was tied anyway to the CPython
implementation of unicode strings. Some accommodations made in tests to
the Jython approach within test_codecs: now at 16 skips, no errors.
files:
Lib/test/test_codecs.py | 11 +-
src/org/python/core/Py.java | 4 +-
src/org/python/core/PySystemState.java | 1 +
src/org/python/core/codecs.java | 4 +-
src/org/python/modules/_codecs.java | 57 +++++++++----
5 files changed, 54 insertions(+), 23 deletions(-)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -720,7 +720,9 @@
def test_recoding(self):
f = StringIO.StringIO()
f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
- f2.write(u"a")
+ # f2.write(u"a")
+ # Must be bytes in Jython (and probably should have been in CPython)
+ f2.write(b"\x00\x00\x00\x61")
f2.close()
# Python used to crash on this at exit because of a refcount
# bug in _codecsmodule.c
@@ -847,7 +849,6 @@
for uni, puny in punycode_testcases:
self.assertEqual(uni, puny.decode("punycode"))
-@unittest.skipIf(test_support.is_jython, "FIXME: equates to UTF-32BE in Jython")
class UnicodeInternalTest(unittest.TestCase):
def test_bug1251300(self):
# Decoding with unicode_internal used to not correctly handle "code
@@ -880,7 +881,11 @@
try:
"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
except UnicodeDecodeError, ex:
- self.assertEqual("unicode_internal", ex.encoding)
+ if test_support.is_jython:
+ # Jython delegates internally to utf-32be and it shows here
+ self.assertEqual("utf-32", ex.encoding)
+ else:
+ self.assertEqual("unicode_internal", ex.encoding)
self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
self.assertEqual(4, ex.start)
self.assertEqual(8, ex.end)
diff --git a/src/org/python/core/Py.java b/src/org/python/core/Py.java
--- a/src/org/python/core/Py.java
+++ b/src/org/python/core/Py.java
@@ -76,8 +76,10 @@
public static PyBoolean False;
/** The Python boolean True **/
public static PyBoolean True;
- /** A zero-length Python string **/
+ /** A zero-length Python byte string **/
public static PyString EmptyString;
+ /** A zero-length Python Unicode string **/
+ public static PyUnicode EmptyUnicode;
/** A Python string containing '\n' **/
public static PyString Newline;
/** A Python unicode string containing '\n' **/
diff --git a/src/org/python/core/PySystemState.java b/src/org/python/core/PySystemState.java
--- a/src/org/python/core/PySystemState.java
+++ b/src/org/python/core/PySystemState.java
@@ -945,6 +945,7 @@
Py.True = new PyBoolean(true);
Py.EmptyString = new PyString("");
+ Py.EmptyUnicode = new PyUnicode("");
Py.Newline = new PyString("\n");
Py.UnicodeNewline = new PyUnicode("\n");
Py.Space = new PyString(" ");
diff --git a/src/org/python/core/codecs.java b/src/org/python/core/codecs.java
--- a/src/org/python/core/codecs.java
+++ b/src/org/python/core/codecs.java
@@ -21,7 +21,7 @@
* <p>
* The class also contains the inner methods of the standard Unicode codecs, available for
* transcoding of text at the Java level. These also are exposed through the <code>_codecs</code>
- * module. In CPython, the implementation are found in <code>Objects/unicodeobject.c</code>.
+ * module. In CPython, the implementations are found in <code>Objects/unicodeobject.c</code>.
*
* @since Jython 2.0
*/
@@ -249,7 +249,7 @@
throw wrong_exception_type(exc);
}
PyObject end = exc.__getattr__("end");
- return new PyTuple(Py.java2py(""), end);
+ return new PyTuple(Py.EmptyUnicode, end);
}
private static boolean isUnicodeError(PyObject exc) {
diff --git a/src/org/python/modules/_codecs.java b/src/org/python/modules/_codecs.java
--- a/src/org/python/modules/_codecs.java
+++ b/src/org/python/modules/_codecs.java
@@ -832,7 +832,7 @@
* @param order byte order to use BE, LE or UNDEFINED (a BOM will be written)
* @return tuple (encoded_bytes, unicode_consumed)
*/
- public static PyTuple PyUnicode_EncodeUTF32(String unicode, String errors, ByteOrder order) {
+ private static PyTuple PyUnicode_EncodeUTF32(String unicode, String errors, ByteOrder order) {
// We use a StringBuilder but we are really storing encoded bytes
StringBuilder v = new StringBuilder(4 * (unicode.length() + 1));
@@ -1347,7 +1347,7 @@
* Main codec loop consumes 4 bytes and emits one code point with each pass, until there are
* fewer than 4 bytes left.
*/
- for (; q < limit; q += 4) {
+ while (q < limit) {
// Read 4 bytes in two 16-bit chunks according to byte order
int hi, lo;
hi = (bytes.charAt(q) << 8) | bytes.charAt(q + 1);
@@ -1356,12 +1356,14 @@
if (hi == 0) {
// It's a BMP character so we can't go wrong
unicode.append((char)lo);
+ q += 4;
} else {
// Code may be invalid: let the appendCodePoint method detect that
try {
unicode.appendCodePoint((hi << 16) + lo);
+ q += 4;
} catch (IllegalArgumentException e) {
- q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32be", //
+ q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32", //
bytes, q, q + 4, "codepoint not in range(0x110000)");
}
}
@@ -1387,7 +1389,7 @@
* Main codec loop consumes 4 bytes and emits one code point with each pass, until there are
* fewer than 4 bytes left.
*/
- for (; q < limit; q += 4) {
+ while (q < limit) {
// Read 4 bytes in two 16-bit chunks according to byte order
int hi, lo;
hi = (bytes.charAt(q + 3) << 8) | bytes.charAt(q + 2);
@@ -1396,10 +1398,12 @@
if (hi == 0) {
// It's a BMP character so we can't go wrong
unicode.append((char)lo);
+ q += 4;
} else {
// Code may be invalid: let the appendCodePoint method detect that
try {
unicode.appendCodePoint((hi << 16) + lo);
+ q += 4;
} catch (IllegalArgumentException e) {
q = codecs.insertReplacementAndGetResume(unicode, errors, "utf-32", //
bytes, q, q + 4, "codepoint not in range(0x110000)");
@@ -1447,31 +1451,50 @@
}
/* --- UnicodeInternal Codec ------------------------------------------ */
- // XXX Should deprecate unicode-internal codec and delegate to UTF-32BE (when we have one)
+
/*
* This codec is supposed to deal with an encoded form equal to the internal representation of
* the unicode object considered as bytes in memory. This was confusing in CPython as it varied
- * with machine architecture (width and endian-ness). In Jython, the most compatible choice
- * would be UTF-32BE since unicode objects report their length as if UCS-4 and
- * sys.byteorder=='big'. The codec is deprecated in v3.3 as irrelevant, or impossible, in view
- * of the flexible string representation (which Jython emulates in its own way).
+ * with machine architecture (width and endian-ness). In Jython, where both are fixed, the most
+ * compatible choice is UTF-32BE. The codec is deprecated in v3.3 as irrelevant, or impossible,
+ * in view of the flexible string representation (which Jython emulates in its own way).
*
* See http://mail.python.org/pipermail/python-dev/2011-November/114415.html
*/
- public static PyTuple unicode_internal_encode(String str) {
- return unicode_internal_encode(str, null);
+ /**
+ * Legacy method to encode given unicode in CPython wide-build internal format (equivalent
+ * UTF-32BE).
+ */
+ @Deprecated
+ public static PyTuple unicode_internal_encode(String unicode) {
+ return utf_32_be_encode(unicode, null);
}
- public static PyTuple unicode_internal_encode(String str, String errors) {
- return encode_tuple(str, str.length());
+ /**
+ * Legacy method to encode given unicode in CPython wide-build internal format (equivalent
+ * UTF-32BE). There must be a multiple of 4 bytes.
+ */
+ @Deprecated
+ public static PyTuple unicode_internal_encode(String unicode, String errors) {
+ return utf_32_be_encode(unicode, errors);
}
- public static PyTuple unicode_internal_decode(String str) {
- return unicode_internal_decode(str, null);
+ /**
+ * Legacy method to decode given bytes as if CPython wide-build internal format (equivalent
+ * UTF-32BE). There must be a multiple of 4 bytes.
+ */
+ @Deprecated
+ public static PyTuple unicode_internal_decode(String bytes) {
+ return utf_32_be_decode(bytes, null, true);
}
- public static PyTuple unicode_internal_decode(String str, String errors) {
- return decode_tuple(str, str.length());
+ /**
+ * Legacy method to decode given bytes as if CPython wide-build internal format (equivalent
+ * UTF-32BE). There must be a multiple of 4 bytes.
+ */
+ @Deprecated
+ public static PyTuple unicode_internal_decode(String bytes, String errors) {
+ return utf_32_be_decode(bytes, errors, true);
}
/**
--
Repository URL: http://hg.python.org/jython
More information about the Jython-checkins
mailing list