[Jython-checkins] jython: Add index translation delegate to PyUnicode, fixes #2100 at O(N) cost.

Wed Sep 17 00:55:24 CEST 2014

http://hg.python.org/jython/rev/191a9854396d
changeset:   7380:191a9854396d
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Sat Sep 06 13:21:12 2014 +0100
summary:
  Add index translation delegate to PyUnicode, fixes #2100 at O(N) cost.
Access operations [i], slice and find operations take a time proportional to the
length of the string, when it contains supplementary characters.

files:
  Lib/test/test_unicode_jy.py        |   11 +-
  src/org/python/core/PyString.java  |  246 ++++++++++------
  src/org/python/core/PyUnicode.java |  221 ++++++++++++--
  3 files changed, 335 insertions(+), 143 deletions(-)

diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py
--- a/Lib/test/test_unicode_jy.py
+++ b/Lib/test/test_unicode_jy.py
@@ -290,7 +290,6 @@
         for m in self.material:
             check_slice(m)
 
-    @unittest.skip("Fails on Jython 2.7b3 issue #2100")
     def test_find(self):
         # Test map from internal find result to code point index
         # Fails in Jython 2.7b3
@@ -304,6 +303,7 @@
                     i = u.find(c, start)
                     if i < 0: break
                     self.assertEqual(u[i], c)
+                    self.assertGreaterEqual(i, start)
                     start = i + 1
 
         def check_find_str(m, t):
@@ -325,7 +325,6 @@
                 m.insert(t)
                 check_find_str(m, t)
 
-    @unittest.skip("Fails on Jython 2.7b3 issue #2100")
     def test_rfind(self):
         # Test map from internal rfind result to code point index
         # Fails in Jython 2.7b3
@@ -336,9 +335,11 @@
                 end = m.size
                 u = m.text
                 while True:
-                    end = u.rfind(c, 0, end)
-                    if end < 0: break
-                    self.assertEqual(u[end], c)
+                    i = u.rfind(c, 0, end)
+                    if i < 0: break
+                    self.assertLess(i, end)
+                    self.assertEqual(u[i], c)
+                    end = i
 
         def check_rfind_str(m, t):
             # Check rfind returns correct index for string target
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -72,6 +72,17 @@
         return str;
     }
 
+    /**
+     * Determine whether the string consists entirely of basic-plane characters. For a
+     * {@link PyString}, of course, it is always <code>true</code>, but this is useful in cases
+     * where either a <code>PyString</code> or a {@link PyUnicode} is acceptable.
+     *
+     * @return true
+     */
+    public boolean isBasicPlane() {
+        return true;
+    }
+
     @ExposedNew
     static PyObject str_new(PyNewWrapper new_, boolean init, PyType subtype, PyObject[] args,
             String[] keywords) {
@@ -153,7 +164,7 @@
         return str___str__();
     }
 
-    public @ExposedMethod(doc = BuiltinDocs.str___str___doc)
+    @ExposedMethod(doc = BuiltinDocs.str___str___doc)
     final PyString str___str__() {
         if (getClass() == PyString.class) {
             return this;
@@ -673,6 +684,14 @@
         return new PyString(str);
     }
 
+    /**
+     * Create an instance of the same type as this object, from the Java String given as argument.
+     * This is to be overridden in a subclass to return its own type.
+     *
+     * @param string UTF-16 string encoding the characters (as Java).
+     * @param isBasic is ignored in <code>PyString</code> (effectively true).
+     * @return
+     */
     protected PyString createInstance(String str, boolean isBasic) {
         // ignore isBasic, doesn't apply to PyString, just PyUnicode
         return new PyString(str);
@@ -2353,59 +2372,28 @@
      * @param endObj end of slice.
      * @return count of occurrences
      */
-    protected final int _count_old(String sub, PyObject startObj, PyObject endObj) {
-// xxx
+    protected final int _count(String sub, PyObject startObj, PyObject endObj) {
+
         // Interpret the slice indices as concrete values
         int[] indices = translateIndices(startObj, endObj);
         int subLen = sub.length();
 
         if (subLen == 0) {
-            // Special case counting the occurrences of an empty string
-            if (indices[2] > getString().length()) {
+            // Special case counting the occurrences of an empty string.
+            int start = indices[2], end = indices[3], n = __len__();
+            if (end < 0 || end < start || start > n) {
+                // Slice is reversed or does not overlap the string.
                 return 0;
             } else {
-                return indices[1] - indices[0] + 1;
+                // Count of '' is one more than number of characters in overlap.
+                return Math.min(end, n) - Math.max(start, 0) + 1;
             }
 
         } else {
+
             // Skip down this string finding occurrences of sub
-            int start = indices[0], end = indices[1], count = 0;
-            while (true) {
-                int index = getString().indexOf(sub, start);
-                if (index < 0) {
-                    break; // not found
-                } else {
-                    // Found at index. Next search begins at end of this instance, at:
-                    start = index + subLen;
-                    if (start <= end) {
-                        count += 1; // ... and the instance found fits within this string.
-                    } else {
-                        break; // ... but the instance found overlaps the end, so is not valid.
-                    }
-                }
-            }
-            return count;
-        }
-    }
-
-    protected final int _count(String sub, PyObject startObj, PyObject endObj) {
-
-        // Interpret the slice indices as concrete values
-        int[] indices = translateIndices(startObj, endObj);
-        int subLen = sub.length();
-
-        if (subLen == 0) {
-            // Special case counting the occurrences of an empty string
-            if (indices[2] > getString().length()) {
-                return 0;
-            } else {
-                return indices[1] - indices[0] + 1;
-            }
-
-        } else {
-
-            // Skip down this string finding occurrences of sub
-            int start = indices[0], limit = indices[1] - subLen, count = 0;
+            int start = indices[0], end = indices[1];
+            int limit = end - subLen, count = 0;
 
             while (start <= limit) {
                 int index = getString().indexOf(sub, start);
@@ -2496,17 +2484,36 @@
      * {@link PyUnicode#unicode_find(PyObject, PyObject, PyObject)}.
      *
      * @param sub substring to find.
-     * @param start start of slice.
-     * @param end end of slice.
+     * @param startObj start of slice.
+     * @param endObj end of slice.
      * @return index of <code>sub</code> in this object or -1 if not found.
      */
-    protected final int _find(String sub, PyObject start, PyObject end) {
-        int[] indices = translateIndices(start, end);
-        int index = getString().indexOf(sub, indices[0]);
-        if (index < indices[2] || index > indices[1]) {
-            return -1;
+    protected final int _find(String sub, PyObject startObj, PyObject endObj) {
+        // Interpret the slice indices as concrete values
+        int[] indices = translateIndices(startObj, endObj);
+        int subLen = sub.length();
+
+        if (subLen == 0) {
+            // Special case: an empty string may be found anywhere, ...
+            int start = indices[2], end = indices[3];
+            if (end < 0 || end < start || start > __len__()) {
+                // ... except ln a reverse slice or beyond the end of the string,
+                return -1;
+            } else {
+                // ... and will be reported at the start of the overlap.
+                return indices[0];
+            }
+
+        } else {
+            // General case: search for first match then check against slice.
+            int start = indices[0], end = indices[1];
+            int found = getString().indexOf(sub, start);
+            if (found >= 0 && found + subLen <= end) {
+                return found;
+            } else {
+                return -1;
+            }
         }
-        return index;
     }
 
     /**
@@ -2582,17 +2589,36 @@
      * {@link PyUnicode#unicode_rfind(PyObject, PyObject, PyObject)}.
      *
      * @param sub substring to find.
-     * @param start start of slice.
-     * @param end end of slice.
+     * @param startObj start of slice.
+     * @param endObj end of slice.
      * @return index of <code>sub</code> in this object or -1 if not found.
      */
-    protected final int _rfind(String sub, PyObject start, PyObject end) {
-        int[] indices = translateIndices(start, end);
-        int index = getString().lastIndexOf(sub, indices[1] - sub.length());
-        if (index < indices[2]) {
-            return -1;
+    protected final int _rfind(String sub, PyObject startObj, PyObject endObj) {
+        // Interpret the slice indices as concrete values
+        int[] indices = translateIndices(startObj, endObj);
+        int subLen = sub.length();
+
+        if (subLen == 0) {
+            // Special case: an empty string may be found anywhere, ...
+            int start = indices[2], end = indices[3];
+            if (end < 0 || end < start || start > __len__()) {
+                // ... except ln a reverse slice or beyond the end of the string,
+                return -1;
+            } else {
+                // ... and will be reported at the end of the overlap.
+                return indices[1];
+            }
+
+        } else {
+            // General case: search for first match then check against slice.
+            int start = indices[0], end = indices[1];
+            int found = getString().lastIndexOf(sub, end - subLen);
+            if (found >= start) {
+                return found;
+            } else {
+                return -1;
+            }
         }
-        return index;
     }
 
     public double atof() {
@@ -3284,51 +3310,80 @@
     }
 
     /**
-     * Turns the possibly negative Python slice start and end into valid indices into this string.
+     * Many of the string methods deal with slices specified using Python slice semantics:
+     * endpoints, which are <code>PyObject</code>s, may be <code>null</code> or <code>None</code>
+     * (meaning default to one end or the other) or may be negative (meaning "from the end").
+     * Meanwhile, the implementation methods need integer indices, both within the array, and
+     * <code>0<=start<=end<=N</code> the length of the array.
+     * <p>
+     * This method first translates the Python slice <code>startObj</code> and <code>endObj</code>
+     * according to the slice semantics for null and negative values, and stores these in elements 2
+     * and 3 of the result. Then, since the end points of the range may lie outside this sequence's
+     * bounds (in either direction) it reduces them to the nearest points satisfying
+     * <code>0<=start<=end<=N</code>, and stores these in elements [0] and [1] of the
+     * result.
      *
-     * @return a 3 element array of indices into this string describing a substring from [0] to [1].
-     *         [0] <= [1], [0] >= 0 and [1] <= string.length(). The third element contains the
-     *         unadjusted start value (or nearest int).
+     * @param startObj Python start of slice
+     * @param endObj Python end of slice
+     * @return a 4 element array of two range-safe indices, and two original indices.
      */
-    protected int[] translateIndices(PyObject start, PyObject end) {
-        int iStart, iStartUnadjusted, iEnd;
-        int n = getString().length();
-
-        // Make sure the slice end decodes to something in range
-        if (end == null || end == Py.None) {
-            iEnd = n;
+    protected int[] translateIndices(PyObject startObj, PyObject endObj) {
+        int start, end;
+        int n = __len__();
+        int[] result = new int[4];
+
+        // Decode the start using slice semantics
+        if (startObj == null || startObj == Py.None) {
+            start = 0;
+            // result[2] = 0 already
         } else {
-            // Convert to int but limit to Integer.MIN_VALUE <= iEnd <= Integer.MAX_VALUE
-            iEnd = end.asIndex(null);
-            if (iEnd > n) {
-                iEnd = n;
-            } else if (iEnd < 0) {
-                iEnd = n + iEnd;
-                if (iEnd < 0) {
-                    iEnd = 0;
+            // Convert to int but limit to Integer.MIN_VALUE <= start <= Integer.MAX_VALUE
+            start = startObj.asIndex(null);
+            if (start < 0) {
+                // Negative value means "from the end"
+                start = n + start;
+            }
+            result[2] = start;
+        }
+
+        // Decode the end using slice semantics
+        if (endObj == null || endObj == Py.None) {
+            result[1] = result[3] = end = n;
+        } else {
+            // Convert to int but limit to Integer.MIN_VALUE <= end <= Integer.MAX_VALUE
+            end = endObj.asIndex(null);
+            if (end < 0) {
+                // Negative value means "from the end"
+                result[3] = end = end + n;
+                // Ensure end is safe for String.substring(start,end).
+                if (end < 0) {
+                    end = 0;
+                    // result[1] = 0 already
+                } else {
+                    result[1] = end;
+                }
+            } else {
+                result[3] = end;
+                // Ensure end is safe for String.substring(start,end).
+                if (end > n) {
+                    result[1] = end = n;
+                } else {
+                    result[1] = end;
                 }
             }
         }
 
-        // Make sure the slice start decodes to something in range
-        if (start == null || start == Py.None) {
-            iStartUnadjusted = iStart = 0;
+        // Ensure start is safe for String.substring(start,end).
+        if (start < 0) {
+            start = 0;
+            // result[0] = 0 already
+        } else if (start > end) {
+            result[0] = start = end;
         } else {
-            // Convert to int but limit to Integer.MIN_VALUE <= iStart <= Integer.MAX_VALUE
-            iStartUnadjusted = iStart = start.asIndex(null);
-            if (iStart > iEnd) {
-                iStart = iEnd;
-            } else if (iStart < 0) {
-                iStart = n + iStart;
-                if (iStart > iEnd) {
-                    iStart = iEnd;
-                } else if (iStart < 0) {
-                    iStart = 0;
-                }
-            }
+            result[0] = start;
         }
 
-        return new int[] {iStart, iEnd, iStartUnadjusted};
+        return result;
     }
 
     /**
@@ -3847,7 +3902,8 @@
      * @param keywords naming the keyword arguments.
      * @return the object designated or <code>null</code>.
      */
-    private PyObject getFieldObject(String fieldName, boolean bytes, PyObject[] args, String[] keywords) {
+    private PyObject getFieldObject(String fieldName, boolean bytes, PyObject[] args,
+            String[] keywords) {
         FieldNameIterator iterator = new FieldNameIterator(fieldName, bytes);
         PyObject head = iterator.pyHead();
         PyObject obj = null;
diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java
--- a/src/org/python/core/PyUnicode.java
+++ b/src/org/python/core/PyUnicode.java
@@ -22,31 +22,44 @@
 @ExposedType(name = "unicode", base = PyBaseString.class, doc = BuiltinDocs.unicode_doc)
 public class PyUnicode extends PyString implements Iterable {
 
-    private enum Plane {
+    /**
+     * Nearly every significant method comes in two versions: one applicable when the string
+     * contains only basic plane characters, and one that is correct when supplementary characters
+     * are also present. Set this constant <code>true</code> to treat all strings as containing
+     * supplementary characters, so that these versions will be exercised in tests.
+     */
+    private static final boolean DEBUG_NON_BMP_METHODS = false;
 
-        UNKNOWN, BASIC, ASTRAL
-    }
-
-    private volatile Plane plane = Plane.UNKNOWN;
-    private volatile int codePointCount = -1;
     public static final PyType TYPE = PyType.fromClass(PyUnicode.class);
 
     // for PyJavaClass.init()
     public PyUnicode() {
-        this(TYPE, "");
+        this(TYPE, "", true);
     }
 
+    /**
+     * Construct a PyUnicode interpreting the Java String argument as UTF-16.
+     *
+     * @param string UTF-16 string encoding the characters (as Java).
+     */
     public PyUnicode(String string) {
-        this(TYPE, string);
+        this(TYPE, string, false);
     }
 
+    /**
+     * Construct a PyUnicode interpreting the Java String argument as UTF-16. If it is known that
+     * the string contains no supplementary characters, argument isBasic may be set true by the
+     * caller. If it is false, the PyUnicode will scan the string to find out.
+     *
+     * @param string UTF-16 string encoding the characters (as Java).
+     * @param isBasic true if it is known that only BMP characters are present.
+     */
     public PyUnicode(String string, boolean isBasic) {
-        this(TYPE, string);
-        plane = isBasic ? Plane.BASIC : Plane.UNKNOWN;
+        this(TYPE, string, isBasic);
     }
 
     public PyUnicode(PyType subtype, String string) {
-        super(subtype, string);
+        this(subtype, string, false);
     }
 
     public PyUnicode(PyString pystring) {
@@ -54,12 +67,13 @@
     }
 
     public PyUnicode(PyType subtype, PyString pystring) {
-        this(subtype, pystring instanceof PyUnicode ? pystring.string : pystring.decode()
-                .toString());
+        this(subtype, //
+                pystring instanceof PyUnicode ? pystring.string : pystring.decode().toString(), //
+                pystring.isBasicPlane());
     }
 
     public PyUnicode(char c) {
-        this(TYPE, String.valueOf(c));
+        this(TYPE, String.valueOf(c), true);
     }
 
     public PyUnicode(int codepoint) {
@@ -90,6 +104,20 @@
         this(ucs4.iterator());
     }
 
+    /**
+     * Fundamental all-features constructor on which the others depend. If it is known that the
+     * string contains no supplementary characters, argument isBasic may be set true by the caller.
+     * If it is false, the PyUnicode will scan the string to find out.
+     *
+     * @param subtype actual type to create.
+     * @param string UTF-16 string encoding the characters (as Java).
+     * @param isBasic true if it is known that only BMP characters are present.
+     */
+    private PyUnicode(PyType subtype, String string, boolean isBasic) {
+        super(subtype, string);
+        translator = isBasic ? BASIC : this.chooseIndexTranslator();
+    }
+
     @Override
     public int[] toCodePoints() {
         int n = getCodePointCount();
@@ -101,6 +129,115 @@
         return codePoints;
     }
 
+    // ------------------------------------------------------------------------------------------
+    // Index translation for Unicode beyond the BMP
+    // ------------------------------------------------------------------------------------------
+
+    /**
+     * Index translation between code point index (as seen by Python) and UTF-16 index (as used in
+     * the Java String.
+     */
+    private interface IndexTranslator {
+
+        /** Number of supplementary characters (hence point code length may be found). */
+        public int suppCount();
+
+        /** Translate a UTF-16 code unit index to its equivalent code point index. */
+        public int codePointIndex(int utf16Index);
+
+        /** Translate a code point index to its equivalent UTF-16 code unit index. */
+        public int utf16Index(int codePointIndex);
+    }
+
+    /**
+     * The instance of index translation in use in this string. It will be set to either
+     * {@link #BASIC} or and instance of {@link #Supplementary}.
+     */
+    private final IndexTranslator translator;
+
+    /**
+     * A singleton provides the translation service (which is a pass-through) for all BMP strings.
+     */
+    static final IndexTranslator BASIC = new IndexTranslator() {
+
+        @Override
+        public int suppCount() {
+            return 0;
+        }
+
+        @Override
+        public int codePointIndex(int u) {
+            return u;
+        }
+
+        @Override
+        public int utf16Index(int i) {
+            return i;
+        }
+    };
+
+    /**
+     * A class of index translation that uses the features provided by the Java String.
+     */
+    private class Supplementary implements IndexTranslator {
+
+        /** The number of supplementary character in this string. */
+        private final int supp;
+
+        Supplementary(int supp) {
+            this.supp = supp;
+        }
+
+        @Override
+        public int codePointIndex(int u) {
+            return string.codePointCount(0, u);
+        }
+
+        @Override
+        public int utf16Index(int i) {
+            return string.offsetByCodePoints(0, i);
+        }
+
+        @Override
+        public int suppCount() {
+            return supp;
+        }
+    }
+
+    /**
+     * Choose an {@link IndexTranslator} implementation for efficient working, according to the
+     * contents of the {@link PyString#string}.
+     *
+     * @return chosen <code>IndexTranslator</code>
+     */
+    private IndexTranslator chooseIndexTranslator() {
+        int n = string.length();
+        int s = n - string.codePointCount(0, n);
+        if (DEBUG_NON_BMP_METHODS) {
+            return new Supplementary(s);
+        } else {
+            return s == 0 ? BASIC : new Supplementary(s);
+        }
+    }
+
+    /**
+     * {@inheritDoc}
+     * <p>
+     * In the <code>PyUnicode</code> version, the arguments are code point indices, such as are
+     * received from the Python caller, while the first two elements of the returned array have been
+     * translated to UTF-16 indices in the implementation string.
+     */
+    @Override
+    protected int[] translateIndices(PyObject start, PyObject end) {
+        int[] indices = super.translateIndices(start, end);
+        indices[0] = translator.utf16Index(indices[0]);
+        indices[1] = translator.utf16Index(indices[1]);
+        // indices[2] and [3] remain Unicode indices (and may be out of bounds) relative to len()
+        return indices;
+    }
+
+    // ------------------------------------------------------------------------------------------
+
     // modified to know something about codepoints; we just need to return the
     // corresponding substring; darn UTF16!
     // TODO: we could avoid doing this unnecessary copy
@@ -122,29 +259,22 @@
         return uni;
     }
 
+    /**
+     * {@inheritDoc}
+     *
+     * @return true if the string consists only of BMP characters
+     */
+    @Override
     public boolean isBasicPlane() {
-        if (plane == Plane.BASIC) {
-            return true;
-        } else if (plane == Plane.UNKNOWN) {
-            plane = (getString().length() == getCodePointCount()) ? Plane.BASIC : Plane.ASTRAL;
+        if (DEBUG_NON_BMP_METHODS) {
+            return false;
+        } else {
+            return translator.suppCount() == 0;
         }
-        return plane == Plane.BASIC;
     }
 
-// RETAIN THE BELOW CODE, it facilitates testing astral support more completely
-
-// public boolean isBasicPlane() {
-// return false;
-// }
-
-// END RETAIN
-
     public int getCodePointCount() {
-        if (codePointCount >= 0) {
-            return codePointCount;
-        }
-        codePointCount = getString().codePointCount(0, getString().length());
-        return codePointCount;
+        return string.length() - translator.suppCount();
     }
 
     @ExposedNew
@@ -193,12 +323,13 @@
         return new PyUnicode(str);
     }
 
-    // Unicode ops consisting of basic strings can only produce basic strings;
-    // this may not be the case for astral ones - they also might be basic, in
-    // case of deletes. So optimize by providing a tainting mechanism.
+    /**
+     * @param string UTF-16 string encoding the characters (as Java).
+     * @param isBasic true if it is known that only BMP characters are present.
+     */
     @Override
-    protected PyString createInstance(String str, boolean isBasic) {
-        return new PyUnicode(str, isBasic);
+    protected PyString createInstance(String string, boolean isBasic) {
+        return new PyUnicode(string, isBasic);
     }
 
     @Override
@@ -443,10 +574,12 @@
     private PyUnicode coerceToUnicode(PyObject o) {
         if (o instanceof PyUnicode) {
             return (PyUnicode)o;
+        } else if (o instanceof PyString) {
+            return new PyUnicode(((PyString)o).getString(), true);
         } else if (o instanceof BufferProtocol) {
-            // PyString or PyByteArray, PyMemoryView, Py2kBuffer ...
+            // PyByteArray, PyMemoryView, Py2kBuffer ...
             try (PyBuffer buf = ((BufferProtocol)o).getBuffer(PyBUF.FULL_RO)) {
-                return new PyUnicode(buf.toString());
+                return new PyUnicode(buf.toString(), true);
             }
         } else {
             // o is some type not allowed:
@@ -998,7 +1131,7 @@
     @Override
     protected PyString fromSubstring(int begin, int end) {
         assert (isBasicPlane()); // can only be used on a codepath from str_ equivalents
-        return new PyUnicode(getString().substring(begin, end));
+        return new PyUnicode(getString().substring(begin, end), true);
     }
 
     @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_index_doc)
@@ -1021,7 +1154,7 @@
         if (isBasicPlane()) {
             return _count(sub.getString(), start, end);
         }
-        int[] indices = translateIndices(start, end);
+        int[] indices = super.translateIndices(start, end); // do not convert to utf-16 indices.
         int count = 0;
         for (Iterator<Integer> mainIter = newSubsequenceIterator(indices[0], indices[1], 1); mainIter
                 .hasNext();) {
@@ -1043,12 +1176,14 @@
 
     @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_find_doc)
     final int unicode_find(PyObject subObj, PyObject start, PyObject end) {
-        return _find(coerceToUnicode(subObj).getString(), start, end);
+        int found = _find(coerceToUnicode(subObj).getString(), start, end);
+        return found < 0 ? -1 : translator.codePointIndex(found);
     }
 
     @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_rfind_doc)
     final int unicode_rfind(PyObject subObj, PyObject start, PyObject end) {
-        return _rfind(coerceToUnicode(subObj).getString(), start, end);
+        int found = _rfind(coerceToUnicode(subObj).getString(), start, end);
+        return found < 0 ? -1 : translator.codePointIndex(found);
     }
 
     private static String padding(int n, int pad) {

-- 
Repository URL: http://hg.python.org/jython