[Jython-checkins] jython (merge default -> default): Merge str bytes check to trunk

jeff.allen jython-checkins at python.org
Mon Dec 1 00:25:22 CET 2014


https://hg.python.org/jython/rev/849ec9c291db
changeset:   7425:849ec9c291db
parent:      7423:6aa434d5dc01
parent:      7424:f0c63b42e552
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Sun Nov 30 23:25:03 2014 +0000
summary:
  Merge str bytes check to trunk

files:
  Lib/test/test_str_jy.py                       |  11 +-
  src/org/python/core/Py.java                   |   9 +-
  src/org/python/core/PyJavaType.java           |   6 +-
  src/org/python/core/PyString.java             |  70 ++++++++-
  src/org/python/core/PyUnicode.java            |   3 +-
  tests/java/org/python/core/BaseBytesTest.java |  24 +--
  6 files changed, 90 insertions(+), 33 deletions(-)


diff --git a/Lib/test/test_str_jy.py b/Lib/test/test_str_jy.py
--- a/Lib/test/test_str_jy.py
+++ b/Lib/test/test_str_jy.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from test import test_support
+import java.lang
 import unittest
 
 class WrappedStrCmpTest(unittest.TestCase):
@@ -23,12 +24,18 @@
         ABC = Wrapper('ABC')
         self.assertEquals(1, d[ABC])
 
-class IntToStrTest(unittest.TestCase):
+class StrConstructorTest(unittest.TestCase):
 
     def test_int_to_string_format(self):
         # 0.001 comes out as 0.0010
         self.assertEquals(str(0.001), "0.001")
 
+    def test_unicode_resistance(self):
+        # Issue 2037: prevent byte/str elements > 255
+        self.assertRaises(UnicodeEncodeError, str, java.lang.String(u"caf\xe9 noir"))
+        self.assertRaises(UnicodeEncodeError, str, java.lang.String(u"abc\u0111efgh"))
+
+
 class StringSlicingTest(unittest.TestCase):
 
     def test_out_of_bounds(self):
@@ -165,7 +172,7 @@
 def test_main():
     test_support.run_unittest(
         WrappedStrCmpTest,
-        IntToStrTest,
+        StrConstructorTest,
         StringSlicingTest,
         FormatTest,
         DisplayTest,
diff --git a/src/org/python/core/Py.java b/src/org/python/core/Py.java
--- a/src/org/python/core/Py.java
+++ b/src/org/python/core/Py.java
@@ -1652,7 +1652,7 @@
 
     static {
         for (char j = 0; j < 256; j++) {
-            letters[j] = new PyString(new Character(j).toString());
+            letters[j] = new PyString(j);
         }
     }
 
@@ -1667,11 +1667,8 @@
     static final PyString makeCharacter(int codepoint, boolean toUnicode) {
         if (toUnicode) {
             return new PyUnicode(codepoint);
-        } else if (codepoint > 65536) {
-            throw new IllegalArgumentException(String.format("Codepoint > 65536 (%d) requires "
-                                                             + "toUnicode argument", codepoint));
-        } else if (codepoint > 256) {
-            return new PyString((char)codepoint);
+        } else if (codepoint > 255) {
+            throw new IllegalArgumentException("Cannot create PyString with non-byte value");
         }
         return letters[codepoint];
     }
diff --git a/src/org/python/core/PyJavaType.java b/src/org/python/core/PyJavaType.java
--- a/src/org/python/core/PyJavaType.java
+++ b/src/org/python/core/PyJavaType.java
@@ -611,8 +611,12 @@
             addMethod(new PyBuiltinMethodNarrow("__repr__") {
                 @Override
                 public PyObject __call__() {
+                    /*
+                     * java.lang.Object.toString returns Unicode: preserve as a PyUnicode, then let
+                     * the repr() built-in decide how to handle it. (Also applies to __str__.)
+                     */
                     String toString = self.getJavaProxy().toString();
-                    return toString == null ? Py.EmptyString : Py.newString(toString);
+                    return toString == null ? Py.EmptyUnicode : Py.newUnicode(toString);
                 }
             });
             addMethod(new PyBuiltinMethodNarrow("__unicode__") {
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -42,10 +42,19 @@
         this(TYPE, "");
     }
 
+    /**
+     * Fundamental constructor for <code>PyString</code> objects when the client provides a Java
+     * <code>String</code>, necessitating that we range check the characters.
+     *
+     * @param subType the actual type being constructed
+     * @param string a Java String to be wrapped
+     */
     public PyString(PyType subType, String string) {
         super(subType);
         if (string == null) {
             throw new IllegalArgumentException("Cannot create PyString from null!");
+        } else if (!isBytes(string)) {
+            throw new IllegalArgumentException("Cannot create PyString with non-byte value");
         }
         this.string = string;
     }
@@ -63,6 +72,40 @@
     }
 
     /**
+     * Determine whether a string consists entirely of characters in the range 0 to 255. Only such
+     * characters are allowed in the <code>PyString</code> (<code>str</code>) type, when it is not a
+     * {@link PyUnicode}.
+     *
+     * @return true if and only if every character has a code less than 256
+     */
+    private static boolean isBytes(String s) {
+        int k = s.length();
+        if (k == 0) {
+            return true;
+        } else {
+            // Bitwise-or the character codes together in order to test once.
+            char c = 0;
+            // Blocks of 8 to reduce loop tests
+            while (k > 8) {
+                c |= s.charAt(--k);
+                c |= s.charAt(--k);
+                c |= s.charAt(--k);
+                c |= s.charAt(--k);
+                c |= s.charAt(--k);
+                c |= s.charAt(--k);
+                c |= s.charAt(--k);
+                c |= s.charAt(--k);
+            }
+            // Now the rest
+            while (k > 0) {
+                c |= s.charAt(--k);
+            }
+            // We require there to be no bits set from 0x100 upwards
+            return c < 0x100;
+        }
+    }
+
+    /**
      * Creates a PyString from an already interned String. Just means it won't be reinterned if used
      * in a place that requires interned Strings.
      */
@@ -88,16 +131,25 @@
             String[] keywords) {
         ArgParser ap = new ArgParser("str", args, keywords, new String[] {"object"}, 0);
         PyObject S = ap.getPyObject(0, null);
+        // Get the textual representation of the object into str/bytes form
+        String str;
+        if (S == null) {
+            str = "";
+        } else {
+            // Let the object tell us its representation: this may be str or unicode.
+            S = S.__str__();
+            if (S instanceof PyUnicode) {
+                // Encoding will raise UnicodeEncodeError if not 7-bit clean.
+                str = codecs.encode((PyUnicode)S, null, null);
+            } else {
+                // Must be str/bytes, and should be 8-bit clean already.
+                str = S.toString();
+            }
+        }
         if (new_.for_type == subtype) {
-            if (S == null) {
-                return new PyString("");
-            }
-            return new PyString(S.__str__().toString());
+            return new PyString(str);
         } else {
-            if (S == null) {
-                return new PyStringDerived(subtype, "");
-            }
-            return new PyStringDerived(subtype, S.__str__().toString());
+            return new PyStringDerived(subtype, str);
         }
     }
 
@@ -4606,7 +4658,7 @@
 
                 default:
                     throw Py.ValueError("unsupported format character '"
-                            + codecs.encode(Py.newString(spec.type), null, "replace") + "' (0x"
+                            + codecs.encode(Py.newUnicode(spec.type), null, "replace") + "' (0x"
                             + Integer.toHexString(spec.type) + ") at index " + (index - 1));
             }
 
diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java
--- a/src/org/python/core/PyUnicode.java
+++ b/src/org/python/core/PyUnicode.java
@@ -114,7 +114,8 @@
      * @param isBasic true if it is known that only BMP characters are present.
      */
     private PyUnicode(PyType subtype, String string, boolean isBasic) {
-        super(subtype, string);
+        super(subtype, "");
+        this.string = string;
         translator = isBasic ? BASIC : this.chooseIndexTranslator();
     }
 
diff --git a/tests/java/org/python/core/BaseBytesTest.java b/tests/java/org/python/core/BaseBytesTest.java
--- a/tests/java/org/python/core/BaseBytesTest.java
+++ b/tests/java/org/python/core/BaseBytesTest.java
@@ -304,22 +304,18 @@
         // Need interpreter for exceptions to be formed properly
         interp = new PythonInterpreter();
         // A scary set of objects
-        final PyObject[] brantub = {Py.None,
-                                    new PyInteger(-1),
-                                    new PyLong(0x80000000L),
-                                    new PyString("\u00A0\u0100\u00A2\u00A3\u00A4"),
-                                    new PyString("\u00A0\u00A0\u1000\u00A3\u00A4"),
-                                    new PyXRange(3, -2, -1),
-                                    new PyXRange(250, 257)};
+        final PyObject[] brantub = {Py.None, new PyInteger(-1), //
+                new PyLong(0x80000000L), //
+                new PyXRange(3, -2, -1), //
+                new PyXRange(250, 257) //
+                };
         // The PyException types we should obtain
         final PyObject[] boobyPrize = {Py.TypeError, // None
-                                       Py.ValueError, // -1
-                                       Py.OverflowError, // 0x80000000L
-                                       Py.ValueError, // \u0100 byte
-                                       Py.ValueError, // \u1000 byte
-                                       Py.ValueError, // -1 in iterable
-                                       Py.ValueError // 256 in iterable
-        };
+                Py.ValueError, // -1
+                Py.OverflowError, // 0x80000000L
+                Py.ValueError, // -1 in iterable
+                Py.ValueError // 256 in iterable
+                };
         // Work down the lists
         for (int dip = 0; dip < brantub.length; dip++) {
             PyObject aRef = boobyPrize[dip];

-- 
Repository URL: https://hg.python.org/jython


More information about the Jython-checkins mailing list