[Jython-checkins] jython: Implement our own limited base64 encode/decode. (Fixes #2663)

Sun Jan 6 04:31:34 EST 2019

https://hg.python.org/jython/rev/79cd4168b63d
changeset:   8214:79cd4168b63d
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Sun Jan 06 09:02:01 2019 +0000
summary:
  Implement our own limited base64 encode/decode. (Fixes #2663)

The DatatypeConverter class we used to solve a problem with large
modules was removed at Java 9 SE but the replacement Base64 class does
not exist in Java 7 which we'd like to support. Also tidies some
formatting and comments.

files:
  Lib/test/test_large_method_bytecode_jy.py |    2 -
  NEWS                                      |    1 +
  src/org/python/compiler/Module.java       |  290 ++++++---
  src/org/python/core/BytecodeLoader.java   |   83 ++-
  4 files changed, 270 insertions(+), 106 deletions(-)

diff --git a/Lib/test/test_large_method_bytecode_jy.py b/Lib/test/test_large_method_bytecode_jy.py
--- a/Lib/test/test_large_method_bytecode_jy.py
+++ b/Lib/test/test_large_method_bytecode_jy.py
@@ -16,7 +16,6 @@
 import unittest
 from test import test_support
 
-@unittest.skipIf(test_support.get_java_version() >= (9,), "Fails on Java 9+, see #2663")
 class large_method_tests(unittest.TestCase):
     '''Tests some oversized functions and methods.
     '''
@@ -53,7 +52,6 @@
         '''
         self.assertEqual(large_methods.small_function(), 'small 10')
 
-@unittest.skipIf(test_support.get_java_version() >= (9,), "Fails on Java 9+, issue #2663")
 class large_module_tests(unittest.TestCase):
     '''Tests a module with oversized main-code.
     So the whole module is represented as a single PyBytecode object.
diff --git a/NEWS b/NEWS
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,7 @@
 
 Development tip
   Bugs fixed
+    - [ 2663 ] Remove dependency on javax.xml.bind.DatatypeConverter
     - [ 2726 ] os.uname() throws IllegalArgumentException on Windows (Chinese localisation)
     - [ 2719 ] Divergence of __str__ and __repr__ from CPython
     - [ 2714 ] Locale and java version failures in test_os_jy
diff --git a/src/org/python/compiler/Module.java b/src/org/python/compiler/Module.java
--- a/src/org/python/compiler/Module.java
+++ b/src/org/python/compiler/Module.java
@@ -858,151 +858,239 @@
          bytes not directly suitable as String-values. cStringIO does not use Base64 or
          something, but rather supports only string-compatible data.
         */
-        // so we use Java-reflection...
+        // so we use Java-serialization...
 
         // serialize the object
         ByteArrayOutputStream bo = new ByteArrayOutputStream();
         ObjectOutputStream so = new ObjectOutputStream(bo);
         so.writeObject(btcode);
         so.flush();
-        String code_str = DatatypeConverter.printBase64Binary(bo.toByteArray());
+        // From Java 8 use: String code_str = Base64.getEncoder().encodeToString(bo.toByteArray());
+        String code_str = base64encodeToString(bo.toByteArray());
         so.close();
         bo.close();
         return code_str;
     }
 
+    /**
+     * Implement a simplified base64 encoding compatible with the decoding in BytecodeLoader. This
+     * encoder adds no '=' padding or line-breaks. Equivalent to
+     * {@code binascii.b2a_base64(bytes).rstrip('=\n')}.
+     * 
+     * @param data to encode
+     * @return the string encoding the data
+     */
+    private static String base64encodeToString(byte[] data) {
+
+        final int N = data.length;
+        int tail = N % 3;
+
+        StringBuilder chars = new StringBuilder(((N / 3) + 1) * 4);
+
+        // Process bytes in blocks of three
+        int b = 0, quantum;
+        while (b <= N - 3) {
+            // Process [b:b+3]
+            quantum = ((data[b++] & 0xff) << 16) + ((data[b++] & 0xff) << 8) + (data[b++] & 0xff);
+            chars.append(base64enc[quantum >> 18]);
+            chars.append(base64enc[(quantum >> 12) & 0x3f]);
+            chars.append(base64enc[(quantum >> 6) & 0x3f]);
+            chars.append(base64enc[quantum & 0x3f]);
+        }
+
+        // Process the tail bytes
+        if (tail >= 1) {
+            quantum = ((data[b++] & 0xff) << 8);
+            if (tail == 2) {
+                quantum += data[b++] & 0xff;
+            }
+            chars.append(base64enc[quantum >> 10]);
+            chars.append(base64enc[(quantum >> 4) & 0x3f]);
+            if (tail == 2) {
+                chars.append(base64enc[(quantum << 2) & 0x3f]);
+            }
+        }
+
+        return chars.toString();
+    }
+
+    /** Look-up table for {@link #base64encodeToString(byte[])}. */
+    private static final char[] base64enc =
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();
+
     private static final int maxLiteral = 65535;
 
     /**
-     * This method stores Python-Bytecode in String literals.
-     * While Java supports rather long strings, constrained only by
-     * int-addressing of arrays, it supports only up to 65535 characters
-     * in literals (not sure how escape-sequences are counted).
-     * To circumvent this limitation, the code is automatically splitted
-     * into several literals with the following naming-scheme.
-     *
-     * - The marker-interface 'ContainsPyBytecode' indicates that a class
-     *   contains (static final) literals of the following scheme:
-     * - a prefix of '___' indicates a bytecode-containing string literal
-     * - a number indicating the number of parts follows
-     * - '0_' indicates that no splitting occurred
-     * - otherwise another number follows, naming the index of the literal
-     * - indexing starts at 0
-     *
+     * This method stores Base64 encoded Python byte code in one or more String literals.
+     * <p>
+     * While Java String objects are limited only by the address range of arrays, the class file
+     * standard only supports literals representable in at most 65535 bytes of modified UTF-8. This
+     * method is used only with base64 Strings (therefore ASCII without nulls) and so each character
+     * occupies exactly 1 byte in the class file after encoding to UTF-8.
+     * <p>
+     * To work within the 65535 byte limitation, the {@code code_str} is split into several literals
+     * with the following naming-scheme:
+     * <ul>
+     * <li>The marker-interface 'ContainsPyBytecode' indicates that a class contains (static final)
+     * literals of the following scheme:
+     * <li>a prefix of '___' indicates a bytecode-containing string literal
+     * <li>a number indicating the number of parts follows
+     * <li>'0_' indicates that no splitting occurred
+     * <li>otherwise another number follows, naming the index of the literal
+     * <li>indexing starts at 0
+     * </ul>
      * Examples:
-     * ___0_method1   contains bytecode for method1
-     * ___2_0_method2 contains first part of method2's bytecode
-     * ___2_1_method2 contains second part of method2's bytecode
-     *
-     * Note that this approach is provisional. In future, Jython might contain
-     * the bytecode directly as bytecode-objects. The current approach was
-     * feasible with far less complicated JVM bytecode-manipulation, but needs
-     * special treatment after class-loading.
+     * <ul>
+     * <li>{@code ___0_method1} contains bytecode for method1
+     * <li>{@code ___2_0_method2} contains first part of method2's bytecode
+     * <li>{@code ___2_1_method2} contains second part of method2's bytecode
+     * </ul>
+     * Note that this approach is provisional. In future, Jython might contain the bytecode directly
+     * as bytecode-objects. The current approach was feasible with far less complicated JVM
+     * bytecode-manipulation, but needs special treatment after class-loading.
+     * 
+     * @param name of the method or function being generated
+     * @param code_str Base64 encoded CPython byte code
+     * @param module currently being defined as a class file
+     * @throws java.io.IOException
      */
     private static void insert_code_str_to_classfile(String name, String code_str, Module module)
             throws java.io.IOException {
-        // We might need to split the code into several literals.
-        if (code_str.length() > maxLiteral) {
-            int splits = code_str.length()/maxLiteral;
-            if (code_str.length()%maxLiteral > 0) {
+        if (code_str.length() <= maxLiteral) {
+            // This can go as a single literal
+            module.classfile.addFinalStringLiteral("___0_" + name, code_str);
+        } else {
+            // We need to split the code into several literals.
+            int splits = code_str.length() / maxLiteral;
+            if (code_str.length() % maxLiteral > 0) {
                 ++splits;
             }
             int pos = 0, i = 0;
-            for (; pos+maxLiteral <= code_str.length(); ++i) {
-                module.classfile.addFinalStringLiteral(
-                        "___"+splits+"_"+i+"_"+name,
-                        code_str.substring(pos, pos+maxLiteral));
+            for (; pos + maxLiteral <= code_str.length(); ++i) {
+                module.classfile.addFinalStringLiteral("___" + splits + "_" + i + "_" + name,
+                        code_str.substring(pos, pos + maxLiteral));
                 pos += maxLiteral;
             }
             if (i < splits) {
-                module.classfile.addFinalStringLiteral(
-                        "___"+splits+"_"+i+"_"+name,
+                module.classfile.addFinalStringLiteral("___" + splits + "_" + i + "_" + name,
                         code_str.substring(pos));
             }
-        } else {
-            module.classfile.addFinalStringLiteral("___0_"+name, code_str);
         }
     }
 
+    /**
+     * Create and write a Python module as a Java class file.
+     * 
+     * @param node AST of the module to write
+     * @param ostream stream onto which to write it
+     * @param name
+     * @param filename
+     * @param linenumbers
+     * @param printResults
+     * @param cflags
+     * @param mtime
+     * @throws Exception
+     */
     public static void compile(mod node, OutputStream ostream, String name, String filename,
             boolean linenumbers, boolean printResults, CompilerFlags cflags, long mtime)
             throws Exception {
+
         try {
             Module module = new Module(name, filename, linenumbers, mtime);
             _module_init(node, module, printResults, cflags);
             module.write(ostream);
+
         } catch (MethodTooLargeException re) {
-                PyBytecode btcode = loadPyBytecode(filename, true);
-                int thresh = 22000;
-                // No idea, how to determine at this point if a method is oversized, so we just try
-                // a threshold regarding Python code-length, while JVM restriction is actually about
-                // Java bytecode length. Anyway; given that code-lengths are strongly related, this
-                // should work well enough.
+            PyBytecode btcode = loadPyBytecode(filename, true);
+            int thresh = 22000;
+            /*
+             * No idea, how to determine at this point if a method is oversized, so we just try a
+             * threshold regarding Python code-length, while JVM restriction is actually about Java
+             * bytecode length. Anyway; given that code-lengths are strongly related, this should
+             * work well enough.
+             */
+            while (true) { // Always enjoy to write a line like this :)
+                try {
+                    List<PyBytecode> largest_m_codes = new ArrayList<>();
+                    Stack<PyBytecode> buffer = new Stack<>();
+                    // HashSet<PyBytecode> allCodes = new HashSet<>();
+                    buffer.push(btcode);
+                    // allCodes.add(btcode);
 
-                while (true) { // Always enjoy to write a line like this :)
-                    try {
-                        List<PyBytecode> largest_m_codes = new ArrayList<>();
-                        Stack<PyBytecode> buffer = new Stack<>();
-                        //HashSet<PyBytecode> allCodes = new HashSet<>();
-                        buffer.push(btcode);
-                        //allCodes.add(btcode);
-                        while (!buffer.isEmpty()) {
-                            // Probably this cannot yield cycles, so cycle-proof stuff
-                            // is out-commented for now. (everything regarding 'allCodes')
-                            PyBytecode bcode = buffer.pop();
-                            if (bcode.co_code.length > thresh) {
-                                largest_m_codes.add(bcode);
-                            } else {
-                                // If a function needs to be represented as CPython bytecode, we create
-                                // all inner PyCode-items (classes, functions, methods) also as CPython
-                                // bytecode implicitly, so no need to look at them individually.
-                                // Maybe we can later optimize this such that inner methods can be
-                                // JVM-bytecode as well (if not oversized themselves).
-                                for (PyObject item: bcode.co_consts) {
-                                    if (item instanceof PyBytecode /*&& !allCodes.contains(item)*/) {
-                                        PyBytecode mpbc = (PyBytecode) item;
-                                        buffer.push(mpbc);
-                                        //allCodes.add(mpbc);
-                                    }
+                    while (!buffer.isEmpty()) {
+                        /*
+                         * Probably this cannot yield cycles, so cycle-proof stuff is out-commented
+                         * for now. (everything regarding 'allCodes')
+                         */
+                        PyBytecode bcode = buffer.pop();
+                        if (bcode.co_code.length > thresh) {
+                            largest_m_codes.add(bcode);
+                        } else {
+                            /*
+                             * If a function needs to be represented as CPython bytecode, we create
+                             * all inner PyCode-items (classes, functions, methods) also as CPython
+                             * bytecode implicitly, so no need to look at them individually. Maybe
+                             * we can later optimize this such that inner methods can be
+                             * JVM-bytecode as well (if not oversized themselves).
+                             */
+                            for (PyObject item : bcode.co_consts) {
+                                if (item instanceof PyBytecode /* && !allCodes.contains(item) */) {
+                                    PyBytecode mpbc = (PyBytecode) item;
+                                    buffer.push(mpbc);
+                                    // allCodes.add(mpbc);
                                 }
                             }
                         }
-                        Module module = new Module(name, filename, linenumbers, mtime);
-                        module.oversized_methods = new Hashtable<>(largest_m_codes.size());
-                        int ov_id = 0;
-                        String name_id;
-                        for (PyBytecode largest_m_code: largest_m_codes) {
-                            if (!PyCodeConstant.isJavaIdentifier(largest_m_code.co_name)) {
-                                name_id = "f$_"+ov_id++;
-                            } else {
-                                name_id = largest_m_code.co_name+"$_"+ov_id++;
-                            }
-                            if (largest_m_code.co_name.equals("<module>")) {
-                                // In Jython's opinion module begins at line 0
-                                // (while CPython reports line 1)
-                                module.oversized_methods.put(
-                                        largest_m_code.co_name+0, name_id);
-                            } else {
-                                module.oversized_methods.put(
-                                        largest_m_code.co_name+largest_m_code.co_firstlineno, name_id);
-                            }
-                            String code_str = serializePyBytecode(largest_m_code);
-                            insert_code_str_to_classfile(name_id, code_str, module);
+                    }
+
+                    Module module = new Module(name, filename, linenumbers, mtime);
+
+                    module.oversized_methods = new Hashtable<>(largest_m_codes.size());
+                    int ov_id = 0;
+                    String name_id;
+
+                    for (PyBytecode largest_m_code : largest_m_codes) {
+                        if (!PyCodeConstant.isJavaIdentifier(largest_m_code.co_name)) {
+                            name_id = "f$_" + ov_id++;
+                        } else {
+                            name_id = largest_m_code.co_name + "$_" + ov_id++;
+                        }
+                        if (largest_m_code.co_name.equals("<module>")) {
+                            /*
+                             * In Jython's opinion module begins at line 0 (while CPython reports
+                             * line 1)
+                             */
+                            module.oversized_methods.put(largest_m_code.co_name + 0, name_id);
+                        } else {
+                            module.oversized_methods.put(
+                                    largest_m_code.co_name + largest_m_code.co_firstlineno,
+                                    name_id);
                         }
-                        module.classfile.addInterface(p(org.python.core.ContainsPyBytecode.class));
-                        _module_init(node, module, printResults, cflags);
-                        module.write(ostream);
-                        break;
-                    } catch (MethodTooLargeException e) {
-                        thresh -= 100;
+
+                        String code_str = serializePyBytecode(largest_m_code);
+                        insert_code_str_to_classfile(name_id, code_str, module);
                     }
-                    if (thresh == 10000) { /* This value should be well feasible by JVM-bytecode,
-                                              so something else must be wrong. */
-                        throw new RuntimeException(
-                                "For unknown reason, too large method code couldn't be resolved"+
-                                "\nby PyBytecode-approach:\n"+filename);
-                    }
+
+                    module.classfile.addInterface(p(org.python.core.ContainsPyBytecode.class));
+
+                    _module_init(node, module, printResults, cflags);
+                    module.write(ostream);
+
+                    break;
+
+                } catch (MethodTooLargeException e) {
+                    thresh -= 1000;
                 }
+                if (thresh < 10000) {
+                    /*
+                     * This value should be well feasible by JVM-bytecode, so something else must be
+                     * wrong.
+                     */
+                    throw new RuntimeException(
+                            "For unknown reason, too large method code couldn't be resolved"
+                                    + "\nby PyBytecode-approach:\n" + filename);
+                }
+            }
         }
     }
 
diff --git a/src/org/python/core/BytecodeLoader.java b/src/org/python/core/BytecodeLoader.java
--- a/src/org/python/core/BytecodeLoader.java
+++ b/src/org/python/core/BytecodeLoader.java
@@ -77,9 +77,9 @@
     }
 
     private static PyCode parseSerializedCode(String code_str)
-            throws IOException, ClassNotFoundException
-    {
-        byte[] b = DatatypeConverter.parseBase64Binary(code_str);
+            throws IOException, ClassNotFoundException {
+        // From Java 8 use: byte[] b = Base64.getDecoder().decode(code_str);
+        byte[] b = base64decode(code_str);
         ByteArrayInputStream bi = new ByteArrayInputStream(b);
         ObjectInputStream si = new ObjectInputStream(bi);
         PyBytecode meth_code = (PyBytecode) si.readObject();
@@ -89,6 +89,83 @@
     }
 
     /**
+     * Implement a restricted form of base64 decoding compatible with the encoding in Module. This
+     * decoder treats characters outside the set of 64 necessary to encode data as errors, including
+     * the pad "=". As a result, the length of the argument exactly determines the size of array
+     * returned.
+     * 
+     * @param src to decode
+     * @return a new byte array
+     * @throws IllegalArgumentException if src has an invalid character or impossible length.
+     */
+    private static byte[] base64decode(String src) throws IllegalArgumentException {
+
+        // Length L is a multiple of 4 plus 0, 2 or 3 tail characters (bearing 0, 8, or 16 bits)
+        final int L = src.length();
+        final int tail = L % 4; // 0 to 3 where 1 (an extra 6 bits) is invalid.
+        if (tail == 1) {
+            throw new IllegalArgumentException("Input length invalid (4n+1)");
+        }
+
+        // src encodes exactly this many bytes:
+        final int N = (L / 4) * 3 + (tail > 0 ? tail - 1 : 0);
+        byte[] data = new byte[N];
+
+        // Work through src in blocks of 4
+        int s = 0, b = 0, quantum;
+        while (s <= L - 4) {
+            // Process src[s:s+4]
+            quantum = (base64CharToBits(src.charAt(s++)) << 18)
+                    + (base64CharToBits(src.charAt(s++)) << 12)
+                    + (base64CharToBits(src.charAt(s++)) << 6) + base64CharToBits(src.charAt(s++));
+            data[b++] = (byte) (quantum >> 16);
+            data[b++] = (byte) (quantum >> 8);
+            data[b++] = (byte) quantum;
+        }
+
+        // Now deal with 2 or 3 tail characters, generating one or two bytes.
+        if (tail >= 2) {
+            // Repeat the loop body, but everything is 8 bits to the right.
+            quantum = (base64CharToBits(src.charAt(s++)) << 10)
+                    + (base64CharToBits(src.charAt(s++)) << 4);
+            data[b++] = (byte) (quantum >> 8);
+            if (tail == 3) {
+                quantum += (base64CharToBits(src.charAt(s++)) >> 2);
+                data[b++] = (byte) quantum;
+            }
+        }
+
+        return data;
+    }
+
+    /**
+     * Helper for {@link #base64decode(String)}, converting one character.
+     * @param c to convert
+     * @return value 0..63
+     * @throws IllegalArgumentException if not a base64 character
+     */
+    private static int base64CharToBits(char c) throws IllegalArgumentException {
+        if (c >= 'a') {
+            if (c <= 'z') {
+                return c - ('a' - 26);
+            }
+        } else if (c >= 'A') {
+            if (c <= 'Z') {
+                return c - 'A';
+            }
+        } else if (c >= '0') {
+            if (c <= '9') {
+                return c + (52 - '0');
+            }
+        } else if (c == '+') {
+            return 62;
+        } else if (c == '/') {
+            return 63;
+        }
+        throw new IllegalArgumentException("Invalid character " + c);
+    }
+
+    /**
      * This method looks for Python-Bytecode stored in String literals.
      * While Java supports rather long strings, constrained only by
      * int-addressing of arrays, it supports only up to 65535 characters

-- 
Repository URL: https://hg.python.org/jython