[Jython-checkins] jython (merge default -> default): Merge str bytes check to trunk
jeff.allen
jython-checkins at python.org
Mon Dec 1 00:25:22 CET 2014
https://hg.python.org/jython/rev/849ec9c291db
changeset: 7425:849ec9c291db
parent: 7423:6aa434d5dc01
parent: 7424:f0c63b42e552
user: Jeff Allen <ja.py at farowl.co.uk>
date: Sun Nov 30 23:25:03 2014 +0000
summary:
Merge str bytes check to trunk
files:
Lib/test/test_str_jy.py | 11 +-
src/org/python/core/Py.java | 9 +-
src/org/python/core/PyJavaType.java | 6 +-
src/org/python/core/PyString.java | 70 ++++++++-
src/org/python/core/PyUnicode.java | 3 +-
tests/java/org/python/core/BaseBytesTest.java | 24 +--
6 files changed, 90 insertions(+), 33 deletions(-)
diff --git a/Lib/test/test_str_jy.py b/Lib/test/test_str_jy.py
--- a/Lib/test/test_str_jy.py
+++ b/Lib/test/test_str_jy.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from test import test_support
+import java.lang
import unittest
class WrappedStrCmpTest(unittest.TestCase):
@@ -23,12 +24,18 @@
ABC = Wrapper('ABC')
self.assertEquals(1, d[ABC])
-class IntToStrTest(unittest.TestCase):
+class StrConstructorTest(unittest.TestCase):
def test_int_to_string_format(self):
# 0.001 comes out as 0.0010
self.assertEquals(str(0.001), "0.001")
+ def test_unicode_resistance(self):
+ # Issue 2037: prevent byte/str elements > 255
+ self.assertRaises(UnicodeEncodeError, str, java.lang.String(u"caf\xe9 noir"))
+ self.assertRaises(UnicodeEncodeError, str, java.lang.String(u"abc\u0111efgh"))
+
+
class StringSlicingTest(unittest.TestCase):
def test_out_of_bounds(self):
@@ -165,7 +172,7 @@
def test_main():
test_support.run_unittest(
WrappedStrCmpTest,
- IntToStrTest,
+ StrConstructorTest,
StringSlicingTest,
FormatTest,
DisplayTest,
diff --git a/src/org/python/core/Py.java b/src/org/python/core/Py.java
--- a/src/org/python/core/Py.java
+++ b/src/org/python/core/Py.java
@@ -1652,7 +1652,7 @@
static {
for (char j = 0; j < 256; j++) {
- letters[j] = new PyString(new Character(j).toString());
+ letters[j] = new PyString(j);
}
}
@@ -1667,11 +1667,8 @@
static final PyString makeCharacter(int codepoint, boolean toUnicode) {
if (toUnicode) {
return new PyUnicode(codepoint);
- } else if (codepoint > 65536) {
- throw new IllegalArgumentException(String.format("Codepoint > 65536 (%d) requires "
- + "toUnicode argument", codepoint));
- } else if (codepoint > 256) {
- return new PyString((char)codepoint);
+ } else if (codepoint > 255) {
+ throw new IllegalArgumentException("Cannot create PyString with non-byte value");
}
return letters[codepoint];
}
diff --git a/src/org/python/core/PyJavaType.java b/src/org/python/core/PyJavaType.java
--- a/src/org/python/core/PyJavaType.java
+++ b/src/org/python/core/PyJavaType.java
@@ -611,8 +611,12 @@
addMethod(new PyBuiltinMethodNarrow("__repr__") {
@Override
public PyObject __call__() {
+ /*
+ * java.lang.Object.toString returns Unicode: preserve as a PyUnicode, then let
+ * the repr() built-in decide how to handle it. (Also applies to __str__.)
+ */
String toString = self.getJavaProxy().toString();
- return toString == null ? Py.EmptyString : Py.newString(toString);
+ return toString == null ? Py.EmptyUnicode : Py.newUnicode(toString);
}
});
addMethod(new PyBuiltinMethodNarrow("__unicode__") {
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -42,10 +42,19 @@
this(TYPE, "");
}
+ /**
+ * Fundamental constructor for <code>PyString</code> objects when the client provides a Java
+ * <code>String</code>, necessitating that we range check the characters.
+ *
+ * @param subType the actual type being constructed
+ * @param string a Java String to be wrapped
+ */
public PyString(PyType subType, String string) {
super(subType);
if (string == null) {
throw new IllegalArgumentException("Cannot create PyString from null!");
+ } else if (!isBytes(string)) {
+ throw new IllegalArgumentException("Cannot create PyString with non-byte value");
}
this.string = string;
}
@@ -63,6 +72,40 @@
}
/**
+ * Determine whether a string consists entirely of characters in the range 0 to 255. Only such
+ * characters are allowed in the <code>PyString</code> (<code>str</code>) type, when it is not a
+ * {@link PyUnicode}.
+ *
+ * @return true if and only if every character has a code less than 256
+ */
+ private static boolean isBytes(String s) {
+ int k = s.length();
+ if (k == 0) {
+ return true;
+ } else {
+ // Bitwise-or the character codes together in order to test once.
+ char c = 0;
+ // Blocks of 8 to reduce loop tests
+ while (k > 8) {
+ c |= s.charAt(--k);
+ c |= s.charAt(--k);
+ c |= s.charAt(--k);
+ c |= s.charAt(--k);
+ c |= s.charAt(--k);
+ c |= s.charAt(--k);
+ c |= s.charAt(--k);
+ c |= s.charAt(--k);
+ }
+ // Now the rest
+ while (k > 0) {
+ c |= s.charAt(--k);
+ }
+ // We require there to be no bits set from 0x100 upwards
+ return c < 0x100;
+ }
+ }
+
+ /**
* Creates a PyString from an already interned String. Just means it won't be reinterned if used
* in a place that requires interned Strings.
*/
@@ -88,16 +131,25 @@
String[] keywords) {
ArgParser ap = new ArgParser("str", args, keywords, new String[] {"object"}, 0);
PyObject S = ap.getPyObject(0, null);
+ // Get the textual representation of the object into str/bytes form
+ String str;
+ if (S == null) {
+ str = "";
+ } else {
+ // Let the object tell us its representation: this may be str or unicode.
+ S = S.__str__();
+ if (S instanceof PyUnicode) {
+ // Encoding will raise UnicodeEncodeError if not 7-bit clean.
+ str = codecs.encode((PyUnicode)S, null, null);
+ } else {
+ // Must be str/bytes, and should be 8-bit clean already.
+ str = S.toString();
+ }
+ }
if (new_.for_type == subtype) {
- if (S == null) {
- return new PyString("");
- }
- return new PyString(S.__str__().toString());
+ return new PyString(str);
} else {
- if (S == null) {
- return new PyStringDerived(subtype, "");
- }
- return new PyStringDerived(subtype, S.__str__().toString());
+ return new PyStringDerived(subtype, str);
}
}
@@ -4606,7 +4658,7 @@
default:
throw Py.ValueError("unsupported format character '"
- + codecs.encode(Py.newString(spec.type), null, "replace") + "' (0x"
+ + codecs.encode(Py.newUnicode(spec.type), null, "replace") + "' (0x"
+ Integer.toHexString(spec.type) + ") at index " + (index - 1));
}
diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java
--- a/src/org/python/core/PyUnicode.java
+++ b/src/org/python/core/PyUnicode.java
@@ -114,7 +114,8 @@
* @param isBasic true if it is known that only BMP characters are present.
*/
private PyUnicode(PyType subtype, String string, boolean isBasic) {
- super(subtype, string);
+ super(subtype, "");
+ this.string = string;
translator = isBasic ? BASIC : this.chooseIndexTranslator();
}
diff --git a/tests/java/org/python/core/BaseBytesTest.java b/tests/java/org/python/core/BaseBytesTest.java
--- a/tests/java/org/python/core/BaseBytesTest.java
+++ b/tests/java/org/python/core/BaseBytesTest.java
@@ -304,22 +304,18 @@
// Need interpreter for exceptions to be formed properly
interp = new PythonInterpreter();
// A scary set of objects
- final PyObject[] brantub = {Py.None,
- new PyInteger(-1),
- new PyLong(0x80000000L),
- new PyString("\u00A0\u0100\u00A2\u00A3\u00A4"),
- new PyString("\u00A0\u00A0\u1000\u00A3\u00A4"),
- new PyXRange(3, -2, -1),
- new PyXRange(250, 257)};
+ final PyObject[] brantub = {Py.None, new PyInteger(-1), //
+ new PyLong(0x80000000L), //
+ new PyXRange(3, -2, -1), //
+ new PyXRange(250, 257) //
+ };
// The PyException types we should obtain
final PyObject[] boobyPrize = {Py.TypeError, // None
- Py.ValueError, // -1
- Py.OverflowError, // 0x80000000L
- Py.ValueError, // \u0100 byte
- Py.ValueError, // \u1000 byte
- Py.ValueError, // -1 in iterable
- Py.ValueError // 256 in iterable
- };
+ Py.ValueError, // -1
+ Py.OverflowError, // 0x80000000L
+ Py.ValueError, // -1 in iterable
+ Py.ValueError // 256 in iterable
+ };
// Work down the lists
for (int dip = 0; dip < brantub.length; dip++) {
PyObject aRef = boobyPrize[dip];
--
Repository URL: https://hg.python.org/jython
More information about the Jython-checkins mailing list