[Jython-checkins] jython: Add index translation delegate to PyUnicode, fixes #2100 at O(N) cost.
jeff.allen
jython-checkins at python.org
Wed Sep 17 00:55:24 CEST 2014
http://hg.python.org/jython/rev/191a9854396d
changeset: 7380:191a9854396d
user: Jeff Allen <ja.py at farowl.co.uk>
date: Sat Sep 06 13:21:12 2014 +0100
summary:
Add index translation delegate to PyUnicode, fixes #2100 at O(N) cost.
Access operations [i], slice and find operations take a time proportional to the
length of the string, when it contains supplementary characters.
files:
Lib/test/test_unicode_jy.py | 11 +-
src/org/python/core/PyString.java | 246 ++++++++++------
src/org/python/core/PyUnicode.java | 221 ++++++++++++--
3 files changed, 335 insertions(+), 143 deletions(-)
diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py
--- a/Lib/test/test_unicode_jy.py
+++ b/Lib/test/test_unicode_jy.py
@@ -290,7 +290,6 @@
for m in self.material:
check_slice(m)
- @unittest.skip("Fails on Jython 2.7b3 issue #2100")
def test_find(self):
# Test map from internal find result to code point index
# Fails in Jython 2.7b3
@@ -304,6 +303,7 @@
i = u.find(c, start)
if i < 0: break
self.assertEqual(u[i], c)
+ self.assertGreaterEqual(i, start)
start = i + 1
def check_find_str(m, t):
@@ -325,7 +325,6 @@
m.insert(t)
check_find_str(m, t)
- @unittest.skip("Fails on Jython 2.7b3 issue #2100")
def test_rfind(self):
# Test map from internal rfind result to code point index
# Fails in Jython 2.7b3
@@ -336,9 +335,11 @@
end = m.size
u = m.text
while True:
- end = u.rfind(c, 0, end)
- if end < 0: break
- self.assertEqual(u[end], c)
+ i = u.rfind(c, 0, end)
+ if i < 0: break
+ self.assertLess(i, end)
+ self.assertEqual(u[i], c)
+ end = i
def check_rfind_str(m, t):
# Check rfind returns correct index for string target
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -72,6 +72,17 @@
return str;
}
+ /**
+ * Determine whether the string consists entirely of basic-plane characters. For a
+ * {@link PyString}, of course, it is always <code>true</code>, but this is useful in cases
+ * where either a <code>PyString</code> or a {@link PyUnicode} is acceptable.
+ *
+ * @return true
+ */
+ public boolean isBasicPlane() {
+ return true;
+ }
+
@ExposedNew
static PyObject str_new(PyNewWrapper new_, boolean init, PyType subtype, PyObject[] args,
String[] keywords) {
@@ -153,7 +164,7 @@
return str___str__();
}
- public @ExposedMethod(doc = BuiltinDocs.str___str___doc)
+ @ExposedMethod(doc = BuiltinDocs.str___str___doc)
final PyString str___str__() {
if (getClass() == PyString.class) {
return this;
@@ -673,6 +684,14 @@
return new PyString(str);
}
+ /**
+ * Create an instance of the same type as this object, from the Java String given as argument.
+ * This is to be overridden in a subclass to return its own type.
+ *
+ * @param string UTF-16 string encoding the characters (as Java).
+ * @param isBasic is ignored in <code>PyString</code> (effectively true).
+ * @return
+ */
protected PyString createInstance(String str, boolean isBasic) {
// ignore isBasic, doesn't apply to PyString, just PyUnicode
return new PyString(str);
@@ -2353,59 +2372,28 @@
* @param endObj end of slice.
* @return count of occurrences
*/
- protected final int _count_old(String sub, PyObject startObj, PyObject endObj) {
-// xxx
+ protected final int _count(String sub, PyObject startObj, PyObject endObj) {
+
// Interpret the slice indices as concrete values
int[] indices = translateIndices(startObj, endObj);
int subLen = sub.length();
if (subLen == 0) {
- // Special case counting the occurrences of an empty string
- if (indices[2] > getString().length()) {
+ // Special case counting the occurrences of an empty string.
+ int start = indices[2], end = indices[3], n = __len__();
+ if (end < 0 || end < start || start > n) {
+ // Slice is reversed or does not overlap the string.
return 0;
} else {
- return indices[1] - indices[0] + 1;
+ // Count of '' is one more than number of characters in overlap.
+ return Math.min(end, n) - Math.max(start, 0) + 1;
}
} else {
+
// Skip down this string finding occurrences of sub
- int start = indices[0], end = indices[1], count = 0;
- while (true) {
- int index = getString().indexOf(sub, start);
- if (index < 0) {
- break; // not found
- } else {
- // Found at index. Next search begins at end of this instance, at:
- start = index + subLen;
- if (start <= end) {
- count += 1; // ... and the instance found fits within this string.
- } else {
- break; // ... but the instance found overlaps the end, so is not valid.
- }
- }
- }
- return count;
- }
- }
-
- protected final int _count(String sub, PyObject startObj, PyObject endObj) {
-
- // Interpret the slice indices as concrete values
- int[] indices = translateIndices(startObj, endObj);
- int subLen = sub.length();
-
- if (subLen == 0) {
- // Special case counting the occurrences of an empty string
- if (indices[2] > getString().length()) {
- return 0;
- } else {
- return indices[1] - indices[0] + 1;
- }
-
- } else {
-
- // Skip down this string finding occurrences of sub
- int start = indices[0], limit = indices[1] - subLen, count = 0;
+ int start = indices[0], end = indices[1];
+ int limit = end - subLen, count = 0;
while (start <= limit) {
int index = getString().indexOf(sub, start);
@@ -2496,17 +2484,36 @@
* {@link PyUnicode#unicode_find(PyObject, PyObject, PyObject)}.
*
* @param sub substring to find.
- * @param start start of slice.
- * @param end end of slice.
+ * @param startObj start of slice.
+ * @param endObj end of slice.
* @return index of <code>sub</code> in this object or -1 if not found.
*/
- protected final int _find(String sub, PyObject start, PyObject end) {
- int[] indices = translateIndices(start, end);
- int index = getString().indexOf(sub, indices[0]);
- if (index < indices[2] || index > indices[1]) {
- return -1;
+ protected final int _find(String sub, PyObject startObj, PyObject endObj) {
+ // Interpret the slice indices as concrete values
+ int[] indices = translateIndices(startObj, endObj);
+ int subLen = sub.length();
+
+ if (subLen == 0) {
+ // Special case: an empty string may be found anywhere, ...
+ int start = indices[2], end = indices[3];
+ if (end < 0 || end < start || start > __len__()) {
+ // ... except ln a reverse slice or beyond the end of the string,
+ return -1;
+ } else {
+ // ... and will be reported at the start of the overlap.
+ return indices[0];
+ }
+
+ } else {
+ // General case: search for first match then check against slice.
+ int start = indices[0], end = indices[1];
+ int found = getString().indexOf(sub, start);
+ if (found >= 0 && found + subLen <= end) {
+ return found;
+ } else {
+ return -1;
+ }
}
- return index;
}
/**
@@ -2582,17 +2589,36 @@
* {@link PyUnicode#unicode_rfind(PyObject, PyObject, PyObject)}.
*
* @param sub substring to find.
- * @param start start of slice.
- * @param end end of slice.
+ * @param startObj start of slice.
+ * @param endObj end of slice.
* @return index of <code>sub</code> in this object or -1 if not found.
*/
- protected final int _rfind(String sub, PyObject start, PyObject end) {
- int[] indices = translateIndices(start, end);
- int index = getString().lastIndexOf(sub, indices[1] - sub.length());
- if (index < indices[2]) {
- return -1;
+ protected final int _rfind(String sub, PyObject startObj, PyObject endObj) {
+ // Interpret the slice indices as concrete values
+ int[] indices = translateIndices(startObj, endObj);
+ int subLen = sub.length();
+
+ if (subLen == 0) {
+ // Special case: an empty string may be found anywhere, ...
+ int start = indices[2], end = indices[3];
+ if (end < 0 || end < start || start > __len__()) {
+ // ... except ln a reverse slice or beyond the end of the string,
+ return -1;
+ } else {
+ // ... and will be reported at the end of the overlap.
+ return indices[1];
+ }
+
+ } else {
+ // General case: search for first match then check against slice.
+ int start = indices[0], end = indices[1];
+ int found = getString().lastIndexOf(sub, end - subLen);
+ if (found >= start) {
+ return found;
+ } else {
+ return -1;
+ }
}
- return index;
}
public double atof() {
@@ -3284,51 +3310,80 @@
}
/**
- * Turns the possibly negative Python slice start and end into valid indices into this string.
+ * Many of the string methods deal with slices specified using Python slice semantics:
+ * endpoints, which are <code>PyObject</code>s, may be <code>null</code> or <code>None</code>
+ * (meaning default to one end or the other) or may be negative (meaning "from the end").
+ * Meanwhile, the implementation methods need integer indices, both within the array, and
+ * <code>0<=start<=end<=N</code> the length of the array.
+ * <p>
+ * This method first translates the Python slice <code>startObj</code> and <code>endObj</code>
+ * according to the slice semantics for null and negative values, and stores these in elements 2
+ * and 3 of the result. Then, since the end points of the range may lie outside this sequence's
+ * bounds (in either direction) it reduces them to the nearest points satisfying
+ * <code>0<=start<=end<=N</code>, and stores these in elements [0] and [1] of the
+ * result.
*
- * @return a 3 element array of indices into this string describing a substring from [0] to [1].
- * [0] <= [1], [0] >= 0 and [1] <= string.length(). The third element contains the
- * unadjusted start value (or nearest int).
+ * @param startObj Python start of slice
+ * @param endObj Python end of slice
+ * @return a 4 element array of two range-safe indices, and two original indices.
*/
- protected int[] translateIndices(PyObject start, PyObject end) {
- int iStart, iStartUnadjusted, iEnd;
- int n = getString().length();
-
- // Make sure the slice end decodes to something in range
- if (end == null || end == Py.None) {
- iEnd = n;
+ protected int[] translateIndices(PyObject startObj, PyObject endObj) {
+ int start, end;
+ int n = __len__();
+ int[] result = new int[4];
+
+ // Decode the start using slice semantics
+ if (startObj == null || startObj == Py.None) {
+ start = 0;
+ // result[2] = 0 already
} else {
- // Convert to int but limit to Integer.MIN_VALUE <= iEnd <= Integer.MAX_VALUE
- iEnd = end.asIndex(null);
- if (iEnd > n) {
- iEnd = n;
- } else if (iEnd < 0) {
- iEnd = n + iEnd;
- if (iEnd < 0) {
- iEnd = 0;
+ // Convert to int but limit to Integer.MIN_VALUE <= start <= Integer.MAX_VALUE
+ start = startObj.asIndex(null);
+ if (start < 0) {
+ // Negative value means "from the end"
+ start = n + start;
+ }
+ result[2] = start;
+ }
+
+ // Decode the end using slice semantics
+ if (endObj == null || endObj == Py.None) {
+ result[1] = result[3] = end = n;
+ } else {
+ // Convert to int but limit to Integer.MIN_VALUE <= end <= Integer.MAX_VALUE
+ end = endObj.asIndex(null);
+ if (end < 0) {
+ // Negative value means "from the end"
+ result[3] = end = end + n;
+ // Ensure end is safe for String.substring(start,end).
+ if (end < 0) {
+ end = 0;
+ // result[1] = 0 already
+ } else {
+ result[1] = end;
+ }
+ } else {
+ result[3] = end;
+ // Ensure end is safe for String.substring(start,end).
+ if (end > n) {
+ result[1] = end = n;
+ } else {
+ result[1] = end;
}
}
}
- // Make sure the slice start decodes to something in range
- if (start == null || start == Py.None) {
- iStartUnadjusted = iStart = 0;
+ // Ensure start is safe for String.substring(start,end).
+ if (start < 0) {
+ start = 0;
+ // result[0] = 0 already
+ } else if (start > end) {
+ result[0] = start = end;
} else {
- // Convert to int but limit to Integer.MIN_VALUE <= iStart <= Integer.MAX_VALUE
- iStartUnadjusted = iStart = start.asIndex(null);
- if (iStart > iEnd) {
- iStart = iEnd;
- } else if (iStart < 0) {
- iStart = n + iStart;
- if (iStart > iEnd) {
- iStart = iEnd;
- } else if (iStart < 0) {
- iStart = 0;
- }
- }
+ result[0] = start;
}
- return new int[] {iStart, iEnd, iStartUnadjusted};
+ return result;
}
/**
@@ -3847,7 +3902,8 @@
* @param keywords naming the keyword arguments.
* @return the object designated or <code>null</code>.
*/
- private PyObject getFieldObject(String fieldName, boolean bytes, PyObject[] args, String[] keywords) {
+ private PyObject getFieldObject(String fieldName, boolean bytes, PyObject[] args,
+ String[] keywords) {
FieldNameIterator iterator = new FieldNameIterator(fieldName, bytes);
PyObject head = iterator.pyHead();
PyObject obj = null;
diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java
--- a/src/org/python/core/PyUnicode.java
+++ b/src/org/python/core/PyUnicode.java
@@ -22,31 +22,44 @@
@ExposedType(name = "unicode", base = PyBaseString.class, doc = BuiltinDocs.unicode_doc)
public class PyUnicode extends PyString implements Iterable {
- private enum Plane {
+ /**
+ * Nearly every significant method comes in two versions: one applicable when the string
+ * contains only basic plane characters, and one that is correct when supplementary characters
+ * are also present. Set this constant <code>true</code> to treat all strings as containing
+ * supplementary characters, so that these versions will be exercised in tests.
+ */
+ private static final boolean DEBUG_NON_BMP_METHODS = false;
- UNKNOWN, BASIC, ASTRAL
- }
-
- private volatile Plane plane = Plane.UNKNOWN;
- private volatile int codePointCount = -1;
public static final PyType TYPE = PyType.fromClass(PyUnicode.class);
// for PyJavaClass.init()
public PyUnicode() {
- this(TYPE, "");
+ this(TYPE, "", true);
}
+ /**
+ * Construct a PyUnicode interpreting the Java String argument as UTF-16.
+ *
+ * @param string UTF-16 string encoding the characters (as Java).
+ */
public PyUnicode(String string) {
- this(TYPE, string);
+ this(TYPE, string, false);
}
+ /**
+ * Construct a PyUnicode interpreting the Java String argument as UTF-16. If it is known that
+ * the string contains no supplementary characters, argument isBasic may be set true by the
+ * caller. If it is false, the PyUnicode will scan the string to find out.
+ *
+ * @param string UTF-16 string encoding the characters (as Java).
+ * @param isBasic true if it is known that only BMP characters are present.
+ */
public PyUnicode(String string, boolean isBasic) {
- this(TYPE, string);
- plane = isBasic ? Plane.BASIC : Plane.UNKNOWN;
+ this(TYPE, string, isBasic);
}
public PyUnicode(PyType subtype, String string) {
- super(subtype, string);
+ this(subtype, string, false);
}
public PyUnicode(PyString pystring) {
@@ -54,12 +67,13 @@
}
public PyUnicode(PyType subtype, PyString pystring) {
- this(subtype, pystring instanceof PyUnicode ? pystring.string : pystring.decode()
- .toString());
+ this(subtype, //
+ pystring instanceof PyUnicode ? pystring.string : pystring.decode().toString(), //
+ pystring.isBasicPlane());
}
public PyUnicode(char c) {
- this(TYPE, String.valueOf(c));
+ this(TYPE, String.valueOf(c), true);
}
public PyUnicode(int codepoint) {
@@ -90,6 +104,20 @@
this(ucs4.iterator());
}
+ /**
+ * Fundamental all-features constructor on which the others depend. If it is known that the
+ * string contains no supplementary characters, argument isBasic may be set true by the caller.
+ * If it is false, the PyUnicode will scan the string to find out.
+ *
+ * @param subtype actual type to create.
+ * @param string UTF-16 string encoding the characters (as Java).
+ * @param isBasic true if it is known that only BMP characters are present.
+ */
+ private PyUnicode(PyType subtype, String string, boolean isBasic) {
+ super(subtype, string);
+ translator = isBasic ? BASIC : this.chooseIndexTranslator();
+ }
+
@Override
public int[] toCodePoints() {
int n = getCodePointCount();
@@ -101,6 +129,115 @@
return codePoints;
}
+ // ------------------------------------------------------------------------------------------
+ // Index translation for Unicode beyond the BMP
+ // ------------------------------------------------------------------------------------------
+
+ /**
+ * Index translation between code point index (as seen by Python) and UTF-16 index (as used in
+ * the Java String.
+ */
+ private interface IndexTranslator {
+
+ /** Number of supplementary characters (hence point code length may be found). */
+ public int suppCount();
+
+ /** Translate a UTF-16 code unit index to its equivalent code point index. */
+ public int codePointIndex(int utf16Index);
+
+ /** Translate a code point index to its equivalent UTF-16 code unit index. */
+ public int utf16Index(int codePointIndex);
+ }
+
+ /**
+ * The instance of index translation in use in this string. It will be set to either
+ * {@link #BASIC} or and instance of {@link #Supplementary}.
+ */
+ private final IndexTranslator translator;
+
+ /**
+ * A singleton provides the translation service (which is a pass-through) for all BMP strings.
+ */
+ static final IndexTranslator BASIC = new IndexTranslator() {
+
+ @Override
+ public int suppCount() {
+ return 0;
+ }
+
+ @Override
+ public int codePointIndex(int u) {
+ return u;
+ }
+
+ @Override
+ public int utf16Index(int i) {
+ return i;
+ }
+ };
+
+ /**
+ * A class of index translation that uses the features provided by the Java String.
+ */
+ private class Supplementary implements IndexTranslator {
+
+ /** The number of supplementary character in this string. */
+ private final int supp;
+
+ Supplementary(int supp) {
+ this.supp = supp;
+ }
+
+ @Override
+ public int codePointIndex(int u) {
+ return string.codePointCount(0, u);
+ }
+
+ @Override
+ public int utf16Index(int i) {
+ return string.offsetByCodePoints(0, i);
+ }
+
+ @Override
+ public int suppCount() {
+ return supp;
+ }
+ }
+
+ /**
+ * Choose an {@link IndexTranslator} implementation for efficient working, according to the
+ * contents of the {@link PyString#string}.
+ *
+ * @return chosen <code>IndexTranslator</code>
+ */
+ private IndexTranslator chooseIndexTranslator() {
+ int n = string.length();
+ int s = n - string.codePointCount(0, n);
+ if (DEBUG_NON_BMP_METHODS) {
+ return new Supplementary(s);
+ } else {
+ return s == 0 ? BASIC : new Supplementary(s);
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ * <p>
+ * In the <code>PyUnicode</code> version, the arguments are code point indices, such as are
+ * received from the Python caller, while the first two elements of the returned array have been
+ * translated to UTF-16 indices in the implementation string.
+ */
+ @Override
+ protected int[] translateIndices(PyObject start, PyObject end) {
+ int[] indices = super.translateIndices(start, end);
+ indices[0] = translator.utf16Index(indices[0]);
+ indices[1] = translator.utf16Index(indices[1]);
+ // indices[2] and [3] remain Unicode indices (and may be out of bounds) relative to len()
+ return indices;
+ }
+
+ // ------------------------------------------------------------------------------------------
+
// modified to know something about codepoints; we just need to return the
// corresponding substring; darn UTF16!
// TODO: we could avoid doing this unnecessary copy
@@ -122,29 +259,22 @@
return uni;
}
+ /**
+ * {@inheritDoc}
+ *
+ * @return true if the string consists only of BMP characters
+ */
+ @Override
public boolean isBasicPlane() {
- if (plane == Plane.BASIC) {
- return true;
- } else if (plane == Plane.UNKNOWN) {
- plane = (getString().length() == getCodePointCount()) ? Plane.BASIC : Plane.ASTRAL;
+ if (DEBUG_NON_BMP_METHODS) {
+ return false;
+ } else {
+ return translator.suppCount() == 0;
}
- return plane == Plane.BASIC;
}
-// RETAIN THE BELOW CODE, it facilitates testing astral support more completely
-
-// public boolean isBasicPlane() {
-// return false;
-// }
-
-// END RETAIN
-
public int getCodePointCount() {
- if (codePointCount >= 0) {
- return codePointCount;
- }
- codePointCount = getString().codePointCount(0, getString().length());
- return codePointCount;
+ return string.length() - translator.suppCount();
}
@ExposedNew
@@ -193,12 +323,13 @@
return new PyUnicode(str);
}
- // Unicode ops consisting of basic strings can only produce basic strings;
- // this may not be the case for astral ones - they also might be basic, in
- // case of deletes. So optimize by providing a tainting mechanism.
+ /**
+ * @param string UTF-16 string encoding the characters (as Java).
+ * @param isBasic true if it is known that only BMP characters are present.
+ */
@Override
- protected PyString createInstance(String str, boolean isBasic) {
- return new PyUnicode(str, isBasic);
+ protected PyString createInstance(String string, boolean isBasic) {
+ return new PyUnicode(string, isBasic);
}
@Override
@@ -443,10 +574,12 @@
private PyUnicode coerceToUnicode(PyObject o) {
if (o instanceof PyUnicode) {
return (PyUnicode)o;
+ } else if (o instanceof PyString) {
+ return new PyUnicode(((PyString)o).getString(), true);
} else if (o instanceof BufferProtocol) {
- // PyString or PyByteArray, PyMemoryView, Py2kBuffer ...
+ // PyByteArray, PyMemoryView, Py2kBuffer ...
try (PyBuffer buf = ((BufferProtocol)o).getBuffer(PyBUF.FULL_RO)) {
- return new PyUnicode(buf.toString());
+ return new PyUnicode(buf.toString(), true);
}
} else {
// o is some type not allowed:
@@ -998,7 +1131,7 @@
@Override
protected PyString fromSubstring(int begin, int end) {
assert (isBasicPlane()); // can only be used on a codepath from str_ equivalents
- return new PyUnicode(getString().substring(begin, end));
+ return new PyUnicode(getString().substring(begin, end), true);
}
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_index_doc)
@@ -1021,7 +1154,7 @@
if (isBasicPlane()) {
return _count(sub.getString(), start, end);
}
- int[] indices = translateIndices(start, end);
+ int[] indices = super.translateIndices(start, end); // do not convert to utf-16 indices.
int count = 0;
for (Iterator<Integer> mainIter = newSubsequenceIterator(indices[0], indices[1], 1); mainIter
.hasNext();) {
@@ -1043,12 +1176,14 @@
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_find_doc)
final int unicode_find(PyObject subObj, PyObject start, PyObject end) {
- return _find(coerceToUnicode(subObj).getString(), start, end);
+ int found = _find(coerceToUnicode(subObj).getString(), start, end);
+ return found < 0 ? -1 : translator.codePointIndex(found);
}
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_rfind_doc)
final int unicode_rfind(PyObject subObj, PyObject start, PyObject end) {
- return _rfind(coerceToUnicode(subObj).getString(), start, end);
+ int found = _rfind(coerceToUnicode(subObj).getString(), start, end);
+ return found < 0 ? -1 : translator.codePointIndex(found);
}
private static String padding(int n, int pad) {
--
Repository URL: http://hg.python.org/jython
More information about the Jython-checkins
mailing list