[Jython-checkins] jython: Fixed test failures in charmap codec and UTF-16 use of PyUnicode.
jeff.allen
jython-checkins at python.org
Sat Jan 5 18:56:56 CET 2013
http://hg.python.org/jython/rev/bdf4b0b5f5f7
changeset: 6936:bdf4b0b5f5f7
user: Jeff Allen <ja...py at farowl.co.uk>
date: Sat Dec 29 01:06:41 2012 +0000
summary:
Fixed test failures in charmap codec and UTF-16 use of PyUnicode.
Added a lot of commentary and made arguments optional in charmap.
Changed return type of UTF-16 decode to PyUnicode (because it is).
Now scoring fail/error/skip = 3/2/45 in test_codecs.
files:
src/org/python/modules/_codecs.java | 233 +++++++++++++--
1 files changed, 191 insertions(+), 42 deletions(-)
diff --git a/src/org/python/modules/_codecs.java b/src/org/python/modules/_codecs.java
--- a/src/org/python/modules/_codecs.java
+++ b/src/org/python/modules/_codecs.java
@@ -49,6 +49,7 @@
}
private static PyTuple decode_tuple_str(String s, int len) {
+ // XXX should this be PyUnicode(s) ?
return new PyTuple(new PyString(s), Py.newInteger(len));
}
@@ -117,51 +118,118 @@
}
/* --- Character Mapping Codec --------------------------------------- */
- public static PyTuple charmap_decode(String str, String errors, PyObject mapping) {
- return charmap_decode(str, errors, mapping, false);
+
+ /**
+ * Equivalent to <code>charmap_decode(bytes, errors, null)</code>. This method is here so the
+ * error and mapping arguments can be optional at the Python level.
+ *
+ * @param bytes sequence of bytes to decode
+ * @return decoded string and number of bytes consumed
+ */
+ public static PyTuple charmap_decode(String bytes) {
+ return charmap_decode(bytes, null, null);
}
- public static PyTuple charmap_decode(String str, String errors, PyObject mapping,
+ /**
+ * Equivalent to <code>charmap_decode(bytes, errors, null)</code>. This method is here so the
+ * error argument can be optional at the Python level.
+ *
+ * @param bytes sequence of bytes to decode
+ * @param errors error policy
+ * @return decoded string and number of bytes consumed
+ */
+ public static PyTuple charmap_decode(String bytes, String errors) {
+ return charmap_decode(bytes, errors, null);
+ }
+
+ /**
+ * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
+ * be indexed by the byte values (as unsigned integers). If the mapping is null or None, decode
+ * with latin-1 (essentially treating bytes as character codes directly).
+ *
+ * @param bytes sequence of bytes to decode
+ * @param errors error policy
+ * @param mapping to convert bytes to characters
+ * @return decoded string and number of bytes consumed
+ */
+ public static PyTuple charmap_decode(String str, String errors, PyObject mapping) {
+ if (mapping == null || mapping == Py.None) {
+ // Default to Latin-1
+ return latin_1_decode(str, errors);
+ } else {
+ return charmap_decode(str, errors, mapping, false);
+ }
+ }
+
+ /**
+ * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
+ * be indexed by the byte values (as unsigned integers).
+ *
+ * @param bytes sequence of bytes to decode
+ * @param errors error policy
+ * @param mapping to convert bytes to characters
+ * @param ignoreUnmapped if true, pass unmapped byte values as character codes [0..256)
+ * @return decoded string and number of bytes consumed
+ */
+ public static PyTuple charmap_decode(String bytes, String errors, PyObject mapping,
boolean ignoreUnmapped) {
+ // XXX bytes: would prefer to accept any object with buffer API
+ int size = bytes.length();
+ StringBuilder v = new StringBuilder(size);
- int size = str.length();
- StringBuilder v = new StringBuilder(size);
for (int i = 0; i < size; i++) {
- char ch = str.charAt(i);
- if (ch > 0xFF) {
- i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, //
+
+ // Process the i.th input byte
+ int b = bytes.charAt(i);
+ if (b > 0xff) {
+ i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
i, i + 1, "ordinal not in range(255)") - 1;
continue;
}
- PyObject w = Py.newInteger(ch);
+
+ // Map the byte to an output character code (or possibly string)
+ PyObject w = Py.newInteger(b);
PyObject x = mapping.__finditem__(w);
+
+ // Apply to the output
if (x == null) {
+ // Error case: mapping not found
if (ignoreUnmapped) {
- v.append(ch);
+ v.appendCodePoint(b);
} else {
- i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, //
+ i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
i, i + 1, "no mapping found") - 1;
}
- continue;
- }
- /* Apply mapping */
- if (x instanceof PyInteger) {
+
+ } else if (x instanceof PyInteger) {
+ // Mapping was to an int: treat as character code
int value = ((PyInteger)x).getValue();
if (value < 0 || value > PySystemState.maxunicode) {
throw Py.TypeError("character mapping must return "
+ "integer greater than 0 and less than sys.maxunicode");
}
- v.append((char)value);
+ v.appendCodePoint(value);
+
} else if (x == Py.None) {
- i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, //
+ i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
i, i + 1, "character maps to <undefined>") - 1;
+
} else if (x instanceof PyString) {
- v.append(x.toString());
+ String s = x.toString();
+ if (s.charAt(0) == 0xfffe) {
+ // Invalid indicates "undefined" see C-API PyUnicode_DecodeCharmap()
+ i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
+ i, i + 1, "character maps to <undefined>") - 1;
+ } else {
+ v.append(s);
+ }
+
} else {
/* wrong return value */
throw Py.TypeError("character mapping must return " + "integer, None or str");
}
}
+
return decode_tuple(v.toString(), size);
}
@@ -198,79 +266,163 @@
return new PyUnicode(buf.toString());
}
- public static PyTuple charmap_encode(String str, String errors, PyObject mapping) {
- // Default to Latin-1
- if (mapping == null) {
- return latin_1_encode(str, errors);
- }
- return charmap_encode_internal(str, errors, mapping, new StringBuilder(str.length()), true);
+ /**
+ * Equivalent to <code>charmap_encode(str, null, null)</code>. This method is here so the error
+ * and mapping arguments can be optional at the Python level.
+ *
+ * @param str to be encoded
+ * @return (encoded data, size(str)) as a pair
+ */
+ public static PyTuple charmap_encode(String str) {
+ return charmap_encode(str, null, null);
}
+ /**
+ * Equivalent to <code>charmap_encode(str, errors, null)</code>. This method is here so the
+ * mapping can be optional at the Python level.
+ *
+ * @param str to be encoded
+ * @param errors error policy name (e.g. "ignore")
+ * @return (encoded data, size(str)) as a pair
+ */
+ public static PyTuple charmap_encode(String str, String errors) {
+ return charmap_encode(str, errors, null);
+ }
+
+ /**
+ * Encoder based on an optional character mapping. This mapping is either an
+ * <code>EncodingMap</code> of 256 entries, or an arbitrary container indexable with integers
+ * using <code>__finditem__</code> and yielding byte strings. If the mapping is null, latin-1
+ * (effectively a mapping of character code to the numerically-equal byte) is used
+ *
+ * @param str to be encoded
+ * @param errors error policy name (e.g. "ignore")
+ * @param mapping from character code to output byte (or string)
+ * @return (encoded data, size(str)) as a pair
+ */
+ public static PyTuple charmap_encode(String str, String errors, PyObject mapping) {
+ if (mapping == null || mapping == Py.None) {
+ // Default to Latin-1
+ return latin_1_encode(str, errors);
+ } else {
+ return charmap_encode_internal(str, errors, mapping, new StringBuilder(str.length()),
+ true);
+ }
+ }
+
+ /**
+ * Helper to implement the several variants of <code>charmap_encode</code>, given an optional
+ * mapping. This mapping is either an <code>EncodingMap</code> of 256 entries, or an arbitrary
+ * container indexable with integers using <code>__finditem__</code> and yielding byte strings.
+ *
+ * @param str to be encoded
+ * @param errors error policy name (e.g. "ignore")
+ * @param mapping from character code to output byte (or string)
+ * @param v to contain the encoded bytes
+ * @param letLookupHandleError
+ * @return (encoded data, size(str)) as a pair
+ */
private static PyTuple charmap_encode_internal(String str, String errors, PyObject mapping,
StringBuilder v, boolean letLookupHandleError) {
+
EncodingMap encodingMap = mapping instanceof EncodingMap ? (EncodingMap)mapping : null;
int size = str.length();
+
for (int i = 0; i < size; i++) {
+
+ // Map the i.th character of str to some value
char ch = str.charAt(i);
PyObject x;
if (encodingMap != null) {
+ // The mapping given was an EncodingMap [0,256) => on-negative int
int result = encodingMap.lookup(ch);
- if (result == -1) {
- x = null;
- } else {
- x = Py.newInteger(result);
- }
+ x = (result == -1) ? null : Py.newInteger(result);
} else {
+ // The mapping was a map or similar: non-negative int -> object
x = mapping.__finditem__(Py.newInteger(ch));
}
+
+ // And map this object to an output character
if (x == null) {
+ // Error during lookup
if (letLookupHandleError) {
+ // Some kind of substitute can be placed in the output
i = handleBadMapping(str, errors, mapping, v, size, i);
} else {
+ // Hard error
throw Py.UnicodeEncodeError("charmap", str, i, i + 1,
"character maps to <undefined>");
}
+
} else if (x instanceof PyInteger) {
+ // Look-up had integer result: output as byte value
int value = ((PyInteger)x).getValue();
if (value < 0 || value > 255) {
throw Py.TypeError("character mapping must be in range(256)");
}
v.append((char)value);
+
} else if (x instanceof PyString && !(x instanceof PyUnicode)) {
+ // Look-up had str or unicode result: output as Java String
+ // XXX: (Py3k) Look-up had bytes or str result: output as ... this is a problem
v.append(x.toString());
+
} else if (x instanceof PyNone) {
i = handleBadMapping(str, errors, mapping, v, size, i);
+
} else {
/* wrong return value */
throw Py.TypeError("character mapping must return " + "integer, None or str");
}
}
+
return encode_tuple(v.toString(), size);
}
+ /**
+ * Helper for {@link #charmap_encode_internal(String, String, PyObject, StringBuilder, boolean)}
+ * called when we need some kind of substitute in the output for an invalid input.
+ *
+ * @param str to be encoded
+ * @param errors error policy name (e.g. "ignore")
+ * @param mapping from character code to output byte (or string)
+ * @param v to contain the encoded bytes
+ * @param size of str
+ * @param i index in str of current (and problematic) character
+ * @return index of last character of problematic section
+ */
private static int handleBadMapping(String str, String errors, PyObject mapping,
StringBuilder v, int size, int i) {
+
+ // If error policy specified, execute it
if (errors != null) {
+
if (errors.equals(codecs.IGNORE)) {
return i;
+
} else if (errors.equals(codecs.REPLACE)) {
- charmap_encode_internal("?", errors, mapping, v, false);
+ String replStr = "?";
+ charmap_encode_internal(replStr, errors, mapping, v, false);
return i;
+
} else if (errors.equals(codecs.XMLCHARREFREPLACE)) {
- charmap_encode_internal(codecs.xmlcharrefreplace(i, i + 1, str).toString(), errors,
- mapping, v, false);
+ String replStr = codecs.xmlcharrefreplace(i, i + 1, str).toString();
+ charmap_encode_internal(replStr, errors, mapping, v, false);
return i;
+
} else if (errors.equals(codecs.BACKSLASHREPLACE)) {
- charmap_encode_internal(codecs.backslashreplace(i, i + 1, str).toString(), errors,
- mapping, v, false);
+ String replStr = codecs.backslashreplace(i, i + 1, str).toString();
+ charmap_encode_internal(replStr, errors, mapping, v, false);
return i;
}
}
- PyObject replacement =
- codecs.encoding_error(errors, "charmap", str, i, i + 1,
- "character maps to <undefined>");
+
+ // Default behaviour (error==null or does not match known case)
+ String msg = "character maps to <undefined>";
+ PyObject replacement = codecs.encoding_error(errors, "charmap", str, i, i + 1, msg);
String replStr = replacement.__getitem__(0).toString();
charmap_encode_internal(replStr, errors, mapping, v, false);
+
return codecs.calcNewPosition(size, replacement) - 1;
}
@@ -423,8 +575,8 @@
int[] bo = new int[] {0};
int[] consumed = final_ ? null : new int[1];
String decoded = decode_UTF16(str, errors, bo, consumed);
- return new PyTuple(Py.newString(decoded),
- Py.newInteger(final_ ? str.length() : consumed[0]), Py.newInteger(bo[0]));
+ return new PyTuple(new PyUnicode(decoded), Py.newInteger(final_ ? str.length()
+ : consumed[0]), Py.newInteger(bo[0]));
}
private static String decode_UTF16(String str, String errors, int[] byteorder) {
@@ -562,11 +714,8 @@
public static class EncodingMap extends PyObject {
char[] level1;
-
char[] level23;
-
int count2;
-
int count3;
private EncodingMap(char[] level1, char[] level23, int count2, int count3) {
--
Repository URL: http://hg.python.org/jython
More information about the Jython-checkins
mailing list