[Jython-checkins] jython: Fixed test failures in charmap codec and UTF-16 use of PyUnicode.

jeff.allen jython-checkins at python.org
Sat Jan 5 18:56:56 CET 2013


http://hg.python.org/jython/rev/bdf4b0b5f5f7
changeset:   6936:bdf4b0b5f5f7
user:        Jeff Allen <ja...py at farowl.co.uk>
date:        Sat Dec 29 01:06:41 2012 +0000
summary:
  Fixed test failures in charmap codec and UTF-16 use of PyUnicode.
Added a lot of commentary and made arguments optional in charmap.
Changed return type of UTF-16 decode to PyUnicode (because it is).
Now scoring fail/error/skip = 3/2/45 in test_codecs.

files:
  src/org/python/modules/_codecs.java |  233 +++++++++++++--
  1 files changed, 191 insertions(+), 42 deletions(-)


diff --git a/src/org/python/modules/_codecs.java b/src/org/python/modules/_codecs.java
--- a/src/org/python/modules/_codecs.java
+++ b/src/org/python/modules/_codecs.java
@@ -49,6 +49,7 @@
     }
 
     private static PyTuple decode_tuple_str(String s, int len) {
+        // XXX should this be PyUnicode(s) ?
         return new PyTuple(new PyString(s), Py.newInteger(len));
     }
 
@@ -117,51 +118,118 @@
     }
 
     /* --- Character Mapping Codec --------------------------------------- */
-    public static PyTuple charmap_decode(String str, String errors, PyObject mapping) {
-        return charmap_decode(str, errors, mapping, false);
+
+    /**
+     * Equivalent to <code>charmap_decode(bytes, null, null)</code>. This method is here so the
+     * error and mapping arguments can be optional at the Python level.
+     *
+     * @param bytes sequence of bytes to decode
+     * @return decoded string and number of bytes consumed
+     */
+    public static PyTuple charmap_decode(String bytes) {
+        return charmap_decode(bytes, null, null);
     }
 
-    public static PyTuple charmap_decode(String str, String errors, PyObject mapping,
+    /**
+     * Equivalent to <code>charmap_decode(bytes, errors, null)</code>. This method is here so the
+     * mapping argument can be optional at the Python level.
+     *
+     * @param bytes sequence of bytes to decode
+     * @param errors error policy
+     * @return decoded string and number of bytes consumed
+     */
+    public static PyTuple charmap_decode(String bytes, String errors) {
+        return charmap_decode(bytes, errors, null);
+    }
+
+    /**
+     * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
+     * be indexed by the byte values (as unsigned integers). If the mapping is null or None, decode
+     * with latin-1 (essentially treating bytes as character codes directly).
+     *
+     * @param str sequence of bytes to decode
+     * @param errors error policy
+     * @param mapping to convert bytes to characters
+     * @return decoded string and number of bytes consumed
+     */
+    public static PyTuple charmap_decode(String str, String errors, PyObject mapping) {
+        if (mapping == null || mapping == Py.None) {
+            // Default to Latin-1
+            return latin_1_decode(str, errors);
+        } else {
+            return charmap_decode(str, errors, mapping, false);
+        }
+    }
+
+    /**
+     * Decode a sequence of bytes into Unicode characters via a mapping supplied as a container to
+     * be indexed by the byte values (as unsigned integers).
+     *
+     * @param bytes sequence of bytes to decode
+     * @param errors error policy
+     * @param mapping to convert bytes to characters
+     * @param ignoreUnmapped if true, pass unmapped byte values as character codes [0..256)
+     * @return decoded string and number of bytes consumed
+     */
+    public static PyTuple charmap_decode(String bytes, String errors, PyObject mapping,
             boolean ignoreUnmapped) {
+        // XXX bytes: would prefer to accept any object with buffer API
+        int size = bytes.length();
+        StringBuilder v = new StringBuilder(size);
 
-        int size = str.length();
-        StringBuilder v = new StringBuilder(size);
         for (int i = 0; i < size; i++) {
-            char ch = str.charAt(i);
-            if (ch > 0xFF) {
-                i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, //
+
+            // Process the i.th input byte
+            int b = bytes.charAt(i);
+            if (b > 0xff) {
+                i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                         i, i + 1, "ordinal not in range(255)") - 1;
                 continue;
             }
-            PyObject w = Py.newInteger(ch);
+
+            // Map the byte to an output character code (or possibly string)
+            PyObject w = Py.newInteger(b);
             PyObject x = mapping.__finditem__(w);
+
+            // Apply to the output
             if (x == null) {
+                // Error case: mapping not found
                 if (ignoreUnmapped) {
-                    v.append(ch);
+                    v.appendCodePoint(b);
                 } else {
-                    i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, //
+                    i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                             i, i + 1, "no mapping found") - 1;
                 }
-                continue;
-            }
-            /* Apply mapping */
-            if (x instanceof PyInteger) {
+
+            } else if (x instanceof PyInteger) {
+                // Mapping was to an int: treat as character code
                 int value = ((PyInteger)x).getValue();
                 if (value < 0 || value > PySystemState.maxunicode) {
                     throw Py.TypeError("character mapping must return "
                             + "integer greater than 0 and less than sys.maxunicode");
                 }
-                v.append((char)value);
+                v.appendCodePoint(value);
+
             } else if (x == Py.None) {
-                i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, //
+                i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
                         i, i + 1, "character maps to <undefined>") - 1;
+
             } else if (x instanceof PyString) {
-                v.append(x.toString());
+                String s = x.toString();
+                if (s.charAt(0) == 0xfffe) {
+                    // U+FFFE is invalid and indicates "undefined": see C-API PyUnicode_DecodeCharmap()
+                    i = codecs.insertReplacementAndGetResume(v, errors, "charmap", bytes, //
+                            i, i + 1, "character maps to <undefined>") - 1;
+                } else {
+                    v.append(s);
+                }
+
             } else {
                 /* wrong return value */
                 throw Py.TypeError("character mapping must return " + "integer, None or str");
             }
         }
+
         return decode_tuple(v.toString(), size);
     }
 
@@ -198,79 +266,163 @@
         return new PyUnicode(buf.toString());
     }
 
-    public static PyTuple charmap_encode(String str, String errors, PyObject mapping) {
-        // Default to Latin-1
-        if (mapping == null) {
-            return latin_1_encode(str, errors);
-        }
-        return charmap_encode_internal(str, errors, mapping, new StringBuilder(str.length()), true);
+    /**
+     * Equivalent to <code>charmap_encode(str, null, null)</code>. This method is here so the error
+     * and mapping arguments can be optional at the Python level.
+     *
+     * @param str to be encoded
+     * @return (encoded data, size(str)) as a pair
+     */
+    public static PyTuple charmap_encode(String str) {
+        return charmap_encode(str, null, null);
     }
 
+    /**
+     * Equivalent to <code>charmap_encode(str, errors, null)</code>. This method is here so the
+     * mapping can be optional at the Python level.
+     *
+     * @param str to be encoded
+     * @param errors error policy name (e.g. "ignore")
+     * @return (encoded data, size(str)) as a pair
+     */
+    public static PyTuple charmap_encode(String str, String errors) {
+        return charmap_encode(str, errors, null);
+    }
+
+    /**
+     * Encoder based on an optional character mapping. This mapping is either an
+     * <code>EncodingMap</code> of 256 entries, or an arbitrary container indexable with integers
+     * using <code>__finditem__</code> and yielding byte strings. If the mapping is null, latin-1
+     * (effectively a mapping of character code to the numerically-equal byte) is used
+     *
+     * @param str to be encoded
+     * @param errors error policy name (e.g. "ignore")
+     * @param mapping from character code to output byte (or string)
+     * @return (encoded data, size(str)) as a pair
+     */
+    public static PyTuple charmap_encode(String str, String errors, PyObject mapping) {
+        if (mapping == null || mapping == Py.None) {
+            // Default to Latin-1
+            return latin_1_encode(str, errors);
+        } else {
+            return charmap_encode_internal(str, errors, mapping, new StringBuilder(str.length()),
+                    true);
+        }
+    }
+
+    /**
+     * Helper to implement the several variants of <code>charmap_encode</code>, given an optional
+     * mapping. This mapping is either an <code>EncodingMap</code> of 256 entries, or an arbitrary
+     * container indexable with integers using <code>__finditem__</code> and yielding byte strings.
+     *
+     * @param str to be encoded
+     * @param errors error policy name (e.g. "ignore")
+     * @param mapping from character code to output byte (or string)
+     * @param v to contain the encoded bytes
+     * @param letLookupHandleError if false, a failed look-up raises UnicodeEncodeError at once
+     * @return (encoded data, size(str)) as a pair
+     */
     private static PyTuple charmap_encode_internal(String str, String errors, PyObject mapping,
             StringBuilder v, boolean letLookupHandleError) {
+
         EncodingMap encodingMap = mapping instanceof EncodingMap ? (EncodingMap)mapping : null;
         int size = str.length();
+
         for (int i = 0; i < size; i++) {
+
+            // Map the i.th character of str to some value
             char ch = str.charAt(i);
             PyObject x;
             if (encodingMap != null) {
+                // The mapping given was an EncodingMap [0,256) => non-negative int
                 int result = encodingMap.lookup(ch);
-                if (result == -1) {
-                    x = null;
-                } else {
-                    x = Py.newInteger(result);
-                }
+                x = (result == -1) ? null : Py.newInteger(result);
             } else {
+                // The mapping was a map or similar: non-negative int -> object
                 x = mapping.__finditem__(Py.newInteger(ch));
             }
+
+            // And map this object to an output character
             if (x == null) {
+                // Error during lookup
                 if (letLookupHandleError) {
+                    // Some kind of substitute can be placed in the output
                     i = handleBadMapping(str, errors, mapping, v, size, i);
                 } else {
+                    // Hard error
                     throw Py.UnicodeEncodeError("charmap", str, i, i + 1,
                             "character maps to <undefined>");
                 }
+
             } else if (x instanceof PyInteger) {
+                // Look-up had integer result: output as byte value
                 int value = ((PyInteger)x).getValue();
                 if (value < 0 || value > 255) {
                     throw Py.TypeError("character mapping must be in range(256)");
                 }
                 v.append((char)value);
+
             } else if (x instanceof PyString && !(x instanceof PyUnicode)) {
+                // Look-up had str (not unicode) result: output as Java String
+                // XXX: (Py3k) Look-up had bytes or str result: output as ... this is a problem
                 v.append(x.toString());
+
             } else if (x instanceof PyNone) {
                 i = handleBadMapping(str, errors, mapping, v, size, i);
+
             } else {
                 /* wrong return value */
                 throw Py.TypeError("character mapping must return " + "integer, None or str");
             }
         }
+
         return encode_tuple(v.toString(), size);
     }
 
+    /**
+     * Helper for {@link #charmap_encode_internal(String, String, PyObject, StringBuilder, boolean)}
+     * called when we need some kind of substitute in the output for an invalid input.
+     *
+     * @param str to be encoded
+     * @param errors error policy name (e.g. "ignore")
+     * @param mapping from character code to output byte (or string)
+     * @param v to contain the encoded bytes
+     * @param size of str
+     * @param i index in str of current (and problematic) character
+     * @return index of last character of problematic section
+     */
     private static int handleBadMapping(String str, String errors, PyObject mapping,
             StringBuilder v, int size, int i) {
+
+        // If error policy specified, execute it
         if (errors != null) {
+
             if (errors.equals(codecs.IGNORE)) {
                 return i;
+
             } else if (errors.equals(codecs.REPLACE)) {
-                charmap_encode_internal("?", errors, mapping, v, false);
+                String replStr = "?";
+                charmap_encode_internal(replStr, errors, mapping, v, false);
                 return i;
+
             } else if (errors.equals(codecs.XMLCHARREFREPLACE)) {
-                charmap_encode_internal(codecs.xmlcharrefreplace(i, i + 1, str).toString(), errors,
-                        mapping, v, false);
+                String replStr = codecs.xmlcharrefreplace(i, i + 1, str).toString();
+                charmap_encode_internal(replStr, errors, mapping, v, false);
                 return i;
+
             } else if (errors.equals(codecs.BACKSLASHREPLACE)) {
-                charmap_encode_internal(codecs.backslashreplace(i, i + 1, str).toString(), errors,
-                        mapping, v, false);
+                String replStr = codecs.backslashreplace(i, i + 1, str).toString();
+                charmap_encode_internal(replStr, errors, mapping, v, false);
                 return i;
             }
         }
-        PyObject replacement =
-                codecs.encoding_error(errors, "charmap", str, i, i + 1,
-                        "character maps to <undefined>");
+
+        // Default behaviour (errors==null or does not match known case)
+        String msg = "character maps to <undefined>";
+        PyObject replacement = codecs.encoding_error(errors, "charmap", str, i, i + 1, msg);
         String replStr = replacement.__getitem__(0).toString();
         charmap_encode_internal(replStr, errors, mapping, v, false);
+
         return codecs.calcNewPosition(size, replacement) - 1;
     }
 
@@ -423,8 +575,8 @@
         int[] bo = new int[] {0};
         int[] consumed = final_ ? null : new int[1];
         String decoded = decode_UTF16(str, errors, bo, consumed);
-        return new PyTuple(Py.newString(decoded),
-                Py.newInteger(final_ ? str.length() : consumed[0]), Py.newInteger(bo[0]));
+        return new PyTuple(new PyUnicode(decoded), Py.newInteger(final_ ? str.length()
+                : consumed[0]), Py.newInteger(bo[0]));
     }
 
     private static String decode_UTF16(String str, String errors, int[] byteorder) {
@@ -562,11 +714,8 @@
     public static class EncodingMap extends PyObject {
 
         char[] level1;
-
         char[] level23;
-
         int count2;
-
         int count3;
 
         private EncodingMap(char[] level1, char[] level23, int count2, int count3) {

-- 
Repository URL: http://hg.python.org/jython


More information about the Jython-checkins mailing list