[Jython-checkins] jython (merge default -> default): Merge recent work to trunk (mostly PyUnicode).

Wed Sep 17 00:55:29 CEST 2014

http://hg.python.org/jython/rev/776cae0189ed
changeset:   7387:776cae0189ed
parent:      7375:4ed83fcdb13e
parent:      7386:4187e256ae1e
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Tue Sep 16 23:55:09 2014 +0100
summary:
  Merge recent work to trunk (mostly PyUnicode).

files:
  Lib/test/test_bytes_jy.py                          |   44 +-
  Lib/test/test_unicode_jy.py                        |  249 ++++
  src/org/python/core/PyArray.java                   |   83 +-
  src/org/python/core/PyBUF.java                     |   10 +
  src/org/python/core/PyBuffer.java                  |   53 +-
  src/org/python/core/PyByteArray.java               |   30 +-
  src/org/python/core/PyString.java                  |  253 ++-
  src/org/python/core/PyUnicode.java                 |  563 ++++++++-
  src/org/python/core/buffer/BaseBuffer.java         |   86 +-
  src/org/python/core/buffer/SimpleBuffer.java       |    9 +
  src/org/python/core/buffer/SimpleStringBuffer.java |   11 +-
  src/org/python/modules/_io/PyFileIO.java           |    6 +-
  src/org/python/modules/posix/PosixModule.java      |    9 +-
  tests/java/javatests/ProxyDeserialization.java     |    0 
  tests/python/unicode_index_times.py                |  405 +++++++
  15 files changed, 1591 insertions(+), 220 deletions(-)

diff --git a/Lib/test/test_bytes_jy.py b/Lib/test/test_bytes_jy.py
--- a/Lib/test/test_bytes_jy.py
+++ b/Lib/test/test_bytes_jy.py
@@ -12,11 +12,51 @@
         class Sub(bytearray): pass
         s = Sub("abc123")
         self.assertEqual(len(s), 6)
- 
+
+
+class SimpleOperationsTest(unittest.TestCase):
+    # Things the CPython library did not test throughly enough
+
+    def test_irepeat(self) :
+
+        def check_irepeat(a, n) :
+            # Check in-place multiplication (repeats)
+            b = bytearray(a)
+            b *= n
+            self.assertEquals(b, bytearray(a*n))
+
+        def irepeat_export(a, n) :
+            # In-place multiplication with export mostly raises BufferError
+            b = bytearray(a)
+            with memoryview(b) as m:
+                b *= n
+            # If it doesn't raise, it gets the right answer
+            self.assertEquals(b, bytearray(a*n))
+
+        for a in [b'', b'a', b'hello'] :
+            check_irepeat(a, 7)
+            check_irepeat(a, 1)
+            check_irepeat(a, 0)
+            check_irepeat(a, -1) # -ve treated as 0
+
+        # Resizing with exports should raise an exception
+        self.assertRaises(BufferError, irepeat_export, b'a', 5)
+        self.assertRaises(BufferError, irepeat_export, b'hello', 3)
+        self.assertRaises(BufferError, irepeat_export, b'hello', 0)
+        self.assertRaises(BufferError, irepeat_export, b'hello', -1)
+
+        # These don't raise an exception (CPython 2.7.6, 3.4.1)
+        irepeat_export(b'a', 1)
+        irepeat_export(b'hello', 1)
+        for n in range(-1, 3) :
+            irepeat_export(b'', n)
+
 
 def test_main():
     test.test_support.run_unittest(
-        ByteArraySubclassTest)
+            ByteArraySubclassTest,
+            SimpleOperationsTest,
+        )
 
 
 if __name__ == "__main__":
diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py
--- a/Lib/test/test_unicode_jy.py
+++ b/Lib/test/test_unicode_jy.py
@@ -3,12 +3,15 @@
 
 Made for Jython.
 """
+import itertools
+import random
 import re
 import string
 import sys
 import unittest
 from StringIO import StringIO
 from test import test_support
+from java.lang import StringBuilder
 
 class UnicodeTestCase(unittest.TestCase):
 
@@ -155,6 +158,251 @@
             u'f')
 
 
+class UnicodeMaterial(object):
+    ''' Object holding a list of single characters and a unicode string
+        that is their concatenation. The sequence is created from a
+        background sequence of basic plane characters and random
+        replacement with supplementary plane characters (those with
+        point code>0xffff).
+    '''
+
+    base = tuple(u'abcdefghijklmnopqrstuvwxyz')
+    supp = tuple(map(unichr, range(0x10000, 0x1000c)))
+    used = sorted(set(base+supp))
+
+    def __init__(self, size=20, pred=None, ran=None):
+        ''' Create size chars choosing an SP char at i where
+            pred(ran, i)==True where ran is an instance of
+            random.Random supplied in the constructor or created
+            locally (if ran==None).
+        '''
+
+        # Generators for the BMP and SP characters
+        base = itertools.cycle(UnicodeMaterial.base)
+        supp = itertools.cycle(UnicodeMaterial.supp)
+
+        # Each instance gets a random generator
+        if ran is None:
+            ran = random.Random()
+        self.random = ran
+
+        if pred is None:
+            pred = lambda ran, j : ran.random() < DEFAULT_RATE
+
+        # Generate the list
+        r = list()
+        for i in range(size):
+            if pred(self.random, i):
+                c = supp.next()
+            else:
+                c = base.next()
+            r.append(c)
+
+        # The list and its concatenation are our material
+        self.ref = r
+        self.size = len(r)
+        self.text = u''.join(r)
+        self.target = u''
+
+    def __len__(self):
+        return self.size
+
+    def insert(self, target, p=None):
+        ''' Insert target string at position p (or middle), truncating if
+            that would make the material any longer
+        '''
+        if p is None:
+            p = max(0, (self.size-len(target)) // 2)
+
+        n = 0
+        for t in target:
+            if p+n >= self.size:
+                break;
+            self.ref[p+n] = t
+            n += 1
+
+        self.target = target[:n]
+        self.text = u''.join(self.ref)
+
+
+class UnicodeIndexMixTest(unittest.TestCase):
+    # Test indexing where there may be more than one code unit per code point.
+    # See Jython Issue #2100.
+
+    # Functions defining particular distributions of SP codes
+    #
+    def evenly(self, rate=0.2):
+        'Evenly distributed at given rate'
+        def f(ran, i):
+            return ran.random() < rate
+        return f
+
+    def evenly_before(self, k, rate=0.2):
+        'Evenly distributed on i<k at given rate'
+        def f(ran, i):
+            return i < k and ran.random() < rate
+        return f
+
+    def evenly_from(self, k, rate=0.2):
+        'Evenly distributed on i>=k at given rate'
+        def f(ran, i):
+            return i >= k and ran.random() < rate
+        return f
+
+    def at(self, places):
+        'Only at specified places'
+        def f(ran, i):
+            return i in places
+        return f
+
+    def setUp(self):
+        ran = random.Random(1234)  # ensure repeatable
+        mat = list()
+        mat.append(UnicodeMaterial(10, self.at([2]), ran))
+        mat.append(UnicodeMaterial(10, self.at([2, 5]), ran))
+        mat.append(UnicodeMaterial(50, self.evenly(), ran))
+        mat.append(UnicodeMaterial(200, self.evenly_before(70), ran))
+        mat.append(UnicodeMaterial(200, self.evenly_from(130), ran))
+        mat.append(UnicodeMaterial(1000, self.evenly(), ran))
+
+        self.material = mat
+
+
+    def test_getitem(self):
+        # Test map from to code point index to internal representation
+        # Fails in Jython 2.7b3
+
+        def check_getitem(m):
+            # Check indexing the string returns the expected point code
+            for i in xrange(m.size):
+                self.assertEqual(m.text[i], m.ref[i])
+
+        for m in self.material:
+            check_getitem(m)
+
+    def test_slice(self):
+        # Test indexing gets the slice ends correct.
+        # Passes in Jython 2.7b3, but may be touched by #2100 changes.
+
+        def check_slice(m):
+            # Check a range of slices against slices of the reference.
+            n = 1
+            while n <= m.size:
+                for i in range(m.size - n):
+                    exp = u''.join(m.ref[i:i+n])
+                    self.assertEqual(m.text[i:i+n], exp)
+                n *= 3
+
+        for m in self.material:
+            check_slice(m)
+
+    def test_find(self):
+        # Test map from internal find result to code point index
+        # Fails in Jython 2.7b3
+
+        def check_find(ref):
+            # Check find returns indexes for single point codes
+            for c in set(m.used):
+                start = 0
+                u = m.text
+                while start < m.size:
+                    i = u.find(c, start)
+                    if i < 0: break
+                    self.assertEqual(u[i], c)
+                    self.assertGreaterEqual(i, start)
+                    start = i + 1
+
+        def check_find_str(m, t):
+            # Check find returns correct index for string target
+            i = m.text.find(t)
+            self.assertEqual(list(t), m.ref[i:i+len(t)])
+
+        targets = [
+            u"this",
+            u"ab\U00010041de", 
+            u"\U00010041\U00010042\U00010042xx",
+            u"xx\U00010041\U00010042\U00010043yy",
+        ]
+
+        for m in self.material:
+            check_find(m)
+            for t in targets:
+                # Insert in middle then try to find it
+                m.insert(t)
+                check_find_str(m, t)
+
+    def test_rfind(self):
+        # Test map from internal rfind result to code point index
+        # Fails in Jython 2.7b3
+
+        def check_rfind(ref):
+            # Check rfind returns indexes for single point codes
+            for c in set(m.used):
+                end = m.size
+                u = m.text
+                while True:
+                    i = u.rfind(c, 0, end)
+                    if i < 0: break
+                    self.assertLess(i, end)
+                    self.assertEqual(u[i], c)
+                    end = i
+
+        def check_rfind_str(m, t):
+            # Check rfind returns correct index for string target
+            i = m.text.rfind(t)
+            self.assertEqual(list(t), m.ref[i:i+len(t)])
+
+        targets = [
+            u"this",
+            u"ab\U00010041de", 
+            u"\U00010041\U00010042\U00010042xx",
+            u"xx\U00010041\U00010042\U00010043yy",
+        ]
+
+        for m in self.material:
+            check_rfind(m)
+            for t in targets:
+                # Insert in middle then try to find it
+                m.insert(t)
+                check_rfind_str(m, t)
+
+    def test_surrogate_validation(self):
+
+        def insert_sb(text, c1, c2):
+            # Insert code points c1, c2 in the text, as a Java StringBuilder
+            sb = StringBuilder()
+            # c1 at the quarter point
+            p1 = len(mat) // 4
+            for c in mat.text[:p1]:
+                sb.appendCodePoint(ord(c))
+            sb.appendCodePoint(c1)
+            # c2 at the three-quarter point
+            p2 = 3 * p1
+            for c in mat.text[p1:p2]:
+                sb.appendCodePoint(ord(c))
+            sb.appendCodePoint(c2)
+            # Rest of text
+            for c in mat.text[p2:]:
+                sb.appendCodePoint(ord(c))
+            return sb
+
+        # Test that lone surrogates are rejected
+        for surr in [0xdc81, 0xdc00, 0xdfff, 0xd800, 0xdbff]:
+            for mat in self.material:
+
+                # Java StringBuilder with two private-use characters:
+                sb = insert_sb(mat.text, 0xe000, 0xf000)
+                # Check this is acceptable
+                #print repr(unicode(sb))
+                self.assertEqual(len(unicode(sb)), len(mat)+2)
+
+                # Java StringBuilder with private-use and lone surrogate:
+                sb = insert_sb(mat.text, 0xe000, surr)
+                # Check this is detected
+                #print repr(unicode(sb))
+                self.assertRaises(ValueError, unicode, sb)
+
+
 class UnicodeFormatTestCase(unittest.TestCase):
 
     def test_unicode_mapping(self):
@@ -601,6 +849,7 @@
 def test_main():
     test_support.run_unittest(
                 UnicodeTestCase,
+                UnicodeIndexMixTest,
                 UnicodeFormatTestCase,
                 UnicodeStdIOTestCase,
                 UnicodeFormatStrTest,
diff --git a/src/org/python/core/PyArray.java b/src/org/python/core/PyArray.java
--- a/src/org/python/core/PyArray.java
+++ b/src/org/python/core/PyArray.java
@@ -1,7 +1,6 @@
 // Copyright (c) Corporation for National Research Initiatives
 package org.python.core;
 
-import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
@@ -11,6 +10,7 @@
 import java.io.OutputStream;
 import java.lang.ref.WeakReference;
 import java.lang.reflect.Array;
+import java.nio.ByteBuffer;
 
 import org.python.core.buffer.BaseBuffer;
 import org.python.core.buffer.SimpleStringBuffer;
@@ -1130,19 +1130,17 @@
                 frombytesInternal(StringUtil.toBytes(s));
 
             } else {
-                // Access the bytes
+                // Access the bytes through the abstract API of the BufferProtocol
                 try (PyBuffer pybuf = ((BufferProtocol)input).getBuffer(PyBUF.STRIDED_RO)) {
-                    // Provide argument as stream of bytes for fromstream method
                     if (pybuf.getNdim() == 1) {
                         if (pybuf.getStrides()[0] == 1) {
-                            // Data are contiguous in a byte[]
-                            PyBuffer.Pointer b = pybuf.getBuf();
-                            frombytesInternal(b.storage, b.offset, pybuf.getLen());
+                            // Data are contiguous in the buffer
+                            frombytesInternal(pybuf.getNIOByteBuffer());
                         } else {
                             // As frombytesInternal only knows contiguous bytes, make a copy.
                             byte[] copy = new byte[pybuf.getLen()];
                             pybuf.copyTo(copy, 0);
-                            frombytesInternal(copy);
+                            frombytesInternal(ByteBuffer.wrap(copy));
                         }
                     } else {
                         // Currently don't support n-dimensional sources
@@ -1158,39 +1156,30 @@
     }
 
     /**
-     * Common code supporting Java and Python versions of <code>.fromstring()</code>
-     *
-     * @param input string of bytes encoding the array data
-     */
-    private final void fromstringInternal(String input) {
-        frombytesInternal(StringUtil.toBytes(input));
-    }
-
-    /**
      * Common code supporting Java and Python versions of <code>.fromstring()</code> or
      * <code>.frombytes()</code> (Python 3.2+ name).
      *
      * @param bytes array containing the new array data in machine encoding
      */
     private final void frombytesInternal(byte[] bytes) {
-        frombytesInternal(bytes, 0, bytes.length);
+        frombytesInternal(ByteBuffer.wrap(bytes));
     }
 
     /**
-     * Common code supporting Java and Python versions of <code>.fromstring()</code> or
-     * <code>.frombytes()</code> (Python 3.2+ name).
+     * Copy into this array, the remaining bytes of a ByteBuffer (from the current position to the
+     * limit). This is common code supporting Java and Python versions of <code>.fromstring()</code>
+     * or <code>.frombytes()</code> (Python 3.2+ name).
      *
-     * @param bytes array containing the new array data in machine encoding
-     * @param offset of the first byte to read
-     * @param count of bytes to read
+     * @param bytes buffer containing the new array data in machine encoding
      */
-    private final void frombytesInternal(byte[] bytes, int offset, int count) {
+    private final void frombytesInternal(ByteBuffer bytes) {
 
         // Access the bytes
         int origsize = delegate.getSize();
 
         // Check validity wrt array itemsize
         int itemsize = getStorageSize();
+        int count = bytes.remaining();
         if ((count % itemsize) != 0) {
             throw Py.ValueError("string length not a multiple of item size");
         }
@@ -1201,8 +1190,8 @@
         try {
 
             // Provide argument as stream of bytes for fromstream method
-            ByteArrayInputStream bis = new ByteArrayInputStream(bytes, offset, count);
-            fromStream(bis);
+            InputStream is = new ByteBufferBackedInputStream(bytes);
+            fromStream(is);
 
         } catch (EOFException e) {
             // stubbed catch for fromStream throws
@@ -2117,4 +2106,48 @@
         }
     }
 
+    /**
+     * Wrap a <code>ByteBuffer</code> in an InputStream. Reference: <a
+     * href=http://stackoverflow.com/questions/4332264/wrapping-a-bytebuffer-with-an-inputstream>
+     * Stackoverflow question 4332264</a>.
+     */
+    private class ByteBufferBackedInputStream extends InputStream {
+
+        ByteBuffer buf;
+
+        public ByteBufferBackedInputStream(ByteBuffer buf) {
+            this.buf = buf;
+        }
+
+        /**
+         * Return the number of bytes remaining in the underlying buffer.
+         */
+        @Override
+        public int available() throws IOException {
+            return buf.remaining();
+        }
+
+
+        @Override
+        public int read() {
+            return buf.hasRemaining() ? buf.get() & 0xff : -1;
+        }
+
+        @Override
+        public int read(byte[] bytes, int off, int len) {
+            int n = buf.remaining();
+            if (n >= len) {
+                // There are enough bytes remaining to satisfy the request.
+                buf.get(bytes, off, len);
+                return len;
+            } else if (n > 0) {
+                // There are some bytes remaining: truncate request.
+                buf.get(bytes, off, n);
+                return n;
+            } else {
+                // Signal that there are no bytes left
+                return -1;
+            }
+        }
+    }
 }
diff --git a/src/org/python/core/PyBUF.java b/src/org/python/core/PyBUF.java
--- a/src/org/python/core/PyBUF.java
+++ b/src/org/python/core/PyBUF.java
@@ -213,6 +213,16 @@
      */
     static final int FULL_RO = INDIRECT | FORMAT;
 
+    /* Constants for additional feature(s), not standard for CPython */
+
+    /**
+     * A constant used by the consumer in its call to {@link BufferProtocol#getBuffer(int)} to
+     * specify that it expects to access the buffer contents directly as an array (rather than
+     * through the purely abstract part of the API). <code>getBuffer</code> will raise an exception
+     * if the exporter cannot expose its storage as Java array.
+     */
+    static final int AS_ARRAY = 0x10000000;
+
     /* Constants for readability, not standard for CPython */
 
     /**
diff --git a/src/org/python/core/PyBuffer.java b/src/org/python/core/PyBuffer.java
--- a/src/org/python/core/PyBuffer.java
+++ b/src/org/python/core/PyBuffer.java
@@ -1,5 +1,7 @@
 package org.python.core;
 
+import java.nio.ByteBuffer;
+
 /**
  * The Jython buffer API for access to a byte array within an exporting object. This interface is
  * the counterpart of the CPython <code>Py_buffer</code> struct. Several concrete types implement
@@ -237,10 +239,49 @@
      */
     public PyBuffer getBufferSlice(int flags, int start, int length, int stride);
 
+    // java.nio access to actual storage
+    //
+
+    /**
+     * Obtain a {@link java.nio.ByteBuffer} giving access to the bytes that hold the data being
+     * exported to the consumer. For a one-dimensional contiguous buffer, assuming the following
+     * client code where <code>obj</code> has type <code>BufferProtocol</code>:
+     *
+     * <pre>
+     * PyBuffer a = obj.getBuffer(PyBUF.SIMPLE);
+     * int itemsize = a.getItemsize();
+     * ByteBuffer bb = a.getNIOBuffer();
+     * </pre>
+     *
+     * the item with index <code>bb.pos()+k</code> is in the buffer <code>bb</code> at positions
+     * <code>bb.pos()+k*itemsize</code> to <code>bb.pos()+(k+1)*itemsize - 1</code> inclusive. And
+     * if <code>itemsize==1</code>, the item is simply the byte at position <code>bb.pos()+k</code>.
+     * The buffer limit is set to the first byte beyond the valid data. A block read or write will
+     * therefore access the contents sequentially.
+     * <p>
+     * If the buffer is multidimensional or non-contiguous (strided), the buffer position is still
+     * the (first byte of) the item at index <code>[0]</code> or <code>[0,...,0]</code>, and the
+     * limit is one item beyond the valid data. However, it is necessary to navigate <code>bb</code>
+     * using the <code>shape</code>, <code>strides</code> and maybe <code>suboffsets</code> provided
+     * by the API.
+     *
+     * @return a ByteBuffer equivalent to the exported data contents.
+     */
+    ByteBuffer getNIOByteBuffer();
+
     // Direct access to actual storage
     //
 
     /**
+     * Determine whether the exporter is able to offer direct access to the exported storage as a
+     * Java byte array (through the API that involves class {@link Pointer}), or only supports the
+     * abstract API. See also {@link PyBUF#AS_ARRAY}.
+     *
+     * @return true if array access is supported, false if it is not.
+     */
+    boolean hasArray();
+
+    /**
      * A class that references a <code>byte[]</code> array and a particular offset within it, as the
      * return type for methods that give direct access to byte-oriented data exported by a Python
      * object. In some contexts the consumer will be entitled to make changes to the contents of
@@ -270,11 +311,13 @@
      * Return a structure describing the slice of a byte array that holds the data being exported to
      * the consumer. For a one-dimensional contiguous buffer, assuming the following client code
      * where <code>obj</code> has type <code>BufferProtocol</code>:
+     *
      * <pre>
-     * PyBuffer a = obj.getBuffer();
+     * PyBuffer a = obj.getBuffer(PyBUF.SIMPLE);
      * int itemsize = a.getItemsize();
      * PyBuffer.Pointer b = a.getBuf();
      * </pre>
+     *
      * the item with index <code>k</code> is in the array <code>b.storage</code> at index
      * <code>[b.offset + k*itemsize]</code> to <code>[b.offset + (k+1)*itemsize - 1]</code>
      * inclusive. And if <code>itemsize==1</code>, the item is simply the byte
@@ -293,12 +336,14 @@
      * Return a structure describing the position in a byte array of a single item from the data
      * being exported to the consumer. For a one-dimensional contiguous buffer, assuming the
      * following client code where <code>obj</code> has type <code>BufferProtocol</code>:
+     *
      * <pre>
      * int k = ... ;
-     * PyBuffer a = obj.getBuffer();
+     * PyBuffer a = obj.getBuffer(PyBUF.FULL);
      * int itemsize = a.getItemsize();
      * PyBuffer.Pointer b = a.getPointer(k);
      * </pre>
+     *
      * the item with index <code>k</code> is in the array <code>b.storage</code> at index
      * <code>[b.offset]</code> to <code>[b.offset + itemsize - 1]</code> inclusive. And if
      * <code>itemsize==1</code>, the item is simply the byte <code>b.storage[b.offset]</code>
@@ -317,13 +362,15 @@
      * being exported to the consumer, in the case that array may be multi-dimensional. For a
      * 3-dimensional contiguous buffer, assuming the following client code where <code>obj</code>
      * has type <code>BufferProtocol</code>:
+     *
      * <pre>
      * int i, j, k;
      * // ... calculation that assigns i, j, k
-     * PyBuffer a = obj.getBuffer();
+     * PyBuffer a = obj.getBuffer(PyBUF.FULL);
      * int itemsize = a.getItemsize();
      * PyBuffer.Pointer b = a.getPointer(i,j,k);
      * </pre>
+     *
      * the item with index <code>[i,j,k]</code> is in the array <code>b.storage</code> at index
      * <code>[b.offset]</code> to <code>[b.offset + itemsize - 1]</code> inclusive. And if
      * <code>itemsize==1</code>, the item is simply the byte <code>b.storage[b.offset]</code>
diff --git a/src/org/python/core/PyByteArray.java b/src/org/python/core/PyByteArray.java
--- a/src/org/python/core/PyByteArray.java
+++ b/src/org/python/core/PyByteArray.java
@@ -352,7 +352,33 @@
      * @param count the number of times to repeat this.
      */
     protected synchronized void irepeat(int count) {
-        this.setStorage(repeatImpl(count));
+        // There are several special cases
+        if (size == 0 || count == 1) {
+            // No resize, so no check (consistent with CPython)
+            // Value is unchanged.
+
+        } else if (count <= 0) {
+            // Treat as if count == 0.
+            resizeCheck();
+            this.setStorage(emptyStorage);
+
+        } else {
+            // Open up space (remembering the original size)
+            int orginalSize = size;
+            storageExtend(orginalSize * (count - 1));
+            if (orginalSize == 1) {
+                // Do it efficiently for single bytes
+                byte b = storage[offset];
+                for (int i = 1, p = offset + 1; i < count; i++) {
+                    storage[p++] = b;
+                }
+            } else {
+                // General case
+                for (int i = 1, p = offset + orginalSize; i < count; i++, p += orginalSize) {
+                    System.arraycopy(storage, offset, storage, p, orginalSize);
+                }
+            }
+        }
     }
 
     /**
@@ -924,7 +950,7 @@
         return bytearray___imul__(n);
     }
 
-    @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.bytearray___mul___doc)
+    @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.bytearray___imul___doc)
     final PyObject bytearray___imul__(PyObject n) {
         if (!n.isIndex()) {
             return null;
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -72,6 +72,17 @@
         return str;
     }
 
+    /**
+     * Determine whether the string consists entirely of basic-plane characters. For a
+     * {@link PyString}, of course, it is always <code>true</code>, but this is useful in cases
+     * where either a <code>PyString</code> or a {@link PyUnicode} is acceptable.
+     *
+     * @return true
+     */
+    public boolean isBasicPlane() {
+        return true;
+    }
+
     @ExposedNew
     static PyObject str_new(PyNewWrapper new_, boolean init, PyType subtype, PyObject[] args,
             String[] keywords) {
@@ -144,6 +155,13 @@
         return pybuf;
     }
 
+    /**
+     * Return a substring of this object as a Java String.
+     *
+     * @param start the beginning index, inclusive.
+     * @param end the ending index, exclusive.
+     * @return the specified substring.
+     */
     public String substring(int start, int end) {
         return getString().substring(start, end);
     }
@@ -153,7 +171,7 @@
         return str___str__();
     }
 
-    public @ExposedMethod(doc = BuiltinDocs.str___str___doc)
+    @ExposedMethod(doc = BuiltinDocs.str___str___doc)
     final PyString str___str__() {
         if (getClass() == PyString.class) {
             return this;
@@ -673,6 +691,14 @@
         return new PyString(str);
     }
 
+    /**
+     * Create an instance of the same type as this object, from the Java String given as argument.
+     * This is to be overridden in a subclass to return its own type.
+     *
+     * @param string UTF-16 string encoding the characters (as Java).
+     * @param isBasic is ignored in <code>PyString</code> (effectively true).
+     * @return
+     */
     protected PyString createInstance(String str, boolean isBasic) {
         // ignore isBasic, doesn't apply to PyString, just PyUnicode
         return new PyString(str);
@@ -2353,59 +2379,28 @@
      * @param endObj end of slice.
      * @return count of occurrences
      */
-    protected final int _count_old(String sub, PyObject startObj, PyObject endObj) {
-// xxx
+    protected final int _count(String sub, PyObject startObj, PyObject endObj) {
+
         // Interpret the slice indices as concrete values
         int[] indices = translateIndices(startObj, endObj);
         int subLen = sub.length();
 
         if (subLen == 0) {
-            // Special case counting the occurrences of an empty string
-            if (indices[2] > getString().length()) {
+            // Special case counting the occurrences of an empty string.
+            int start = indices[2], end = indices[3], n = __len__();
+            if (end < 0 || end < start || start > n) {
+                // Slice is reversed or does not overlap the string.
                 return 0;
             } else {
-                return indices[1] - indices[0] + 1;
+                // Count of '' is one more than number of characters in overlap.
+                return Math.min(end, n) - Math.max(start, 0) + 1;
             }
 
         } else {
+
             // Skip down this string finding occurrences of sub
-            int start = indices[0], end = indices[1], count = 0;
-            while (true) {
-                int index = getString().indexOf(sub, start);
-                if (index < 0) {
-                    break; // not found
-                } else {
-                    // Found at index. Next search begins at end of this instance, at:
-                    start = index + subLen;
-                    if (start <= end) {
-                        count += 1; // ... and the instance found fits within this string.
-                    } else {
-                        break; // ... but the instance found overlaps the end, so is not valid.
-                    }
-                }
-            }
-            return count;
-        }
-    }
-
-    protected final int _count(String sub, PyObject startObj, PyObject endObj) {
-
-        // Interpret the slice indices as concrete values
-        int[] indices = translateIndices(startObj, endObj);
-        int subLen = sub.length();
-
-        if (subLen == 0) {
-            // Special case counting the occurrences of an empty string
-            if (indices[2] > getString().length()) {
-                return 0;
-            } else {
-                return indices[1] - indices[0] + 1;
-            }
-
-        } else {
-
-            // Skip down this string finding occurrences of sub
-            int start = indices[0], limit = indices[1] - subLen, count = 0;
+            int start = indices[0], end = indices[1];
+            int limit = end - subLen, count = 0;
 
             while (start <= limit) {
                 int index = getString().indexOf(sub, start);
@@ -2496,17 +2491,36 @@
      * {@link PyUnicode#unicode_find(PyObject, PyObject, PyObject)}.
      *
      * @param sub substring to find.
-     * @param start start of slice.
-     * @param end end of slice.
+     * @param startObj start of slice.
+     * @param endObj end of slice.
      * @return index of <code>sub</code> in this object or -1 if not found.
      */
-    protected final int _find(String sub, PyObject start, PyObject end) {
-        int[] indices = translateIndices(start, end);
-        int index = getString().indexOf(sub, indices[0]);
-        if (index < indices[2] || index > indices[1]) {
-            return -1;
+    protected final int _find(String sub, PyObject startObj, PyObject endObj) {
+        // Interpret the slice indices as concrete values
+        int[] indices = translateIndices(startObj, endObj);
+        int subLen = sub.length();
+
+        if (subLen == 0) {
+            // Special case: an empty string may be found anywhere, ...
+            int start = indices[2], end = indices[3];
+            if (end < 0 || end < start || start > __len__()) {
+                // ... except ln a reverse slice or beyond the end of the string,
+                return -1;
+            } else {
+                // ... and will be reported at the start of the overlap.
+                return indices[0];
+            }
+
+        } else {
+            // General case: search for first match then check against slice.
+            int start = indices[0], end = indices[1];
+            int found = getString().indexOf(sub, start);
+            if (found >= 0 && found + subLen <= end) {
+                return found;
+            } else {
+                return -1;
+            }
         }
-        return index;
     }
 
     /**
@@ -2582,17 +2596,36 @@
      * {@link PyUnicode#unicode_rfind(PyObject, PyObject, PyObject)}.
      *
      * @param sub substring to find.
-     * @param start start of slice.
-     * @param end end of slice.
+     * @param startObj start of slice.
+     * @param endObj end of slice.
      * @return index of <code>sub</code> in this object or -1 if not found.
      */
-    protected final int _rfind(String sub, PyObject start, PyObject end) {
-        int[] indices = translateIndices(start, end);
-        int index = getString().lastIndexOf(sub, indices[1] - sub.length());
-        if (index < indices[2]) {
-            return -1;
+    protected final int _rfind(String sub, PyObject startObj, PyObject endObj) {
+        // Interpret the slice indices as concrete values
+        int[] indices = translateIndices(startObj, endObj);
+        int subLen = sub.length();
+
+        if (subLen == 0) {
+            // Special case: an empty string may be found anywhere, ...
+            int start = indices[2], end = indices[3];
+            if (end < 0 || end < start || start > __len__()) {
+                // ... except ln a reverse slice or beyond the end of the string,
+                return -1;
+            } else {
+                // ... and will be reported at the end of the overlap.
+                return indices[1];
+            }
+
+        } else {
+            // General case: search for first match then check against slice.
+            int start = indices[0], end = indices[1];
+            int found = getString().lastIndexOf(sub, end - subLen);
+            if (found >= start) {
+                return found;
+            } else {
+                return -1;
+            }
         }
-        return index;
     }
 
     public double atof() {
@@ -3284,51 +3317,80 @@
     }
 
     /**
-     * Turns the possibly negative Python slice start and end into valid indices into this string.
+     * Many of the string methods deal with slices specified using Python slice semantics:
+     * endpoints, which are <code>PyObject</code>s, may be <code>null</code> or <code>None</code>
+     * (meaning default to one end or the other) or may be negative (meaning "from the end").
+     * Meanwhile, the implementation methods need integer indices, both within the array, and
+     * <code>0<=start<=end<=N</code> the length of the array.
+     * <p>
+     * This method first translates the Python slice <code>startObj</code> and <code>endObj</code>
+     * according to the slice semantics for null and negative values, and stores these in elements 2
+     * and 3 of the result. Then, since the end points of the range may lie outside this sequence's
+     * bounds (in either direction) it reduces them to the nearest points satisfying
+     * <code>0<=start<=end<=N</code>, and stores these in elements [0] and [1] of the
+     * result.
      *
-     * @return a 3 element array of indices into this string describing a substring from [0] to [1].
-     *         [0] <= [1], [0] >= 0 and [1] <= string.length(). The third element contains the
-     *         unadjusted start value (or nearest int).
+     * @param startObj Python start of slice
+     * @param endObj Python end of slice
+     * @return a 4 element array of two range-safe indices, and two original indices.
      */
-    protected int[] translateIndices(PyObject start, PyObject end) {
-        int iStart, iStartUnadjusted, iEnd;
-        int n = getString().length();
-
-        // Make sure the slice end decodes to something in range
-        if (end == null || end == Py.None) {
-            iEnd = n;
+    protected int[] translateIndices(PyObject startObj, PyObject endObj) {
+        int start, end;
+        int n = __len__();
+        int[] result = new int[4];
+
+        // Decode the start using slice semantics
+        if (startObj == null || startObj == Py.None) {
+            start = 0;
+            // result[2] = 0 already
         } else {
-            // Convert to int but limit to Integer.MIN_VALUE <= iEnd <= Integer.MAX_VALUE
-            iEnd = end.asIndex(null);
-            if (iEnd > n) {
-                iEnd = n;
-            } else if (iEnd < 0) {
-                iEnd = n + iEnd;
-                if (iEnd < 0) {
-                    iEnd = 0;
+            // Convert to int but limit to Integer.MIN_VALUE <= start <= Integer.MAX_VALUE
+            start = startObj.asIndex(null);
+            if (start < 0) {
+                // Negative value means "from the end"
+                start = n + start;
+            }
+            result[2] = start;
+        }
+
+        // Decode the end using slice semantics
+        if (endObj == null || endObj == Py.None) {
+            result[1] = result[3] = end = n;
+        } else {
+            // Convert to int but limit to Integer.MIN_VALUE <= end <= Integer.MAX_VALUE
+            end = endObj.asIndex(null);
+            if (end < 0) {
+                // Negative value means "from the end"
+                result[3] = end = end + n;
+                // Ensure end is safe for String.substring(start,end).
+                if (end < 0) {
+                    end = 0;
+                    // result[1] = 0 already
+                } else {
+                    result[1] = end;
+                }
+            } else {
+                result[3] = end;
+                // Ensure end is safe for String.substring(start,end).
+                if (end > n) {
+                    result[1] = end = n;
+                } else {
+                    result[1] = end;
                 }
             }
         }
 
-        // Make sure the slice start decodes to something in range
-        if (start == null || start == Py.None) {
-            iStartUnadjusted = iStart = 0;
+        // Ensure start is safe for String.substring(start,end).
+        if (start < 0) {
+            start = 0;
+            // result[0] = 0 already
+        } else if (start > end) {
+            result[0] = start = end;
         } else {
-            // Convert to int but limit to Integer.MIN_VALUE <= iStart <= Integer.MAX_VALUE
-            iStartUnadjusted = iStart = start.asIndex(null);
-            if (iStart > iEnd) {
-                iStart = iEnd;
-            } else if (iStart < 0) {
-                iStart = n + iStart;
-                if (iStart > iEnd) {
-                    iStart = iEnd;
-                } else if (iStart < 0) {
-                    iStart = 0;
-                }
-            }
+            result[0] = start;
         }
 
-        return new int[] {iStart, iEnd, iStartUnadjusted};
+        return result;
     }
 
     /**
@@ -3847,7 +3909,8 @@
      * @param keywords naming the keyword arguments.
      * @return the object designated or <code>null</code>.
      */
-    private PyObject getFieldObject(String fieldName, boolean bytes, PyObject[] args, String[] keywords) {
+    private PyObject getFieldObject(String fieldName, boolean bytes, PyObject[] args,
+            String[] keywords) {
         FieldNameIterator iterator = new FieldNameIterator(fieldName, bytes);
         PyObject head = iterator.pyHead();
         PyObject obj = null;
diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java
--- a/src/org/python/core/PyUnicode.java
+++ b/src/org/python/core/PyUnicode.java
@@ -22,31 +22,44 @@
 @ExposedType(name = "unicode", base = PyBaseString.class, doc = BuiltinDocs.unicode_doc)
 public class PyUnicode extends PyString implements Iterable {
 
-    private enum Plane {
+    /**
+     * Nearly every significant method comes in two versions: one applicable when the string
+     * contains only basic plane characters, and one that is correct when supplementary characters
+     * are also present. Set this constant <code>true</code> to treat all strings as containing
+     * supplementary characters, so that these versions will be exercised in tests.
+     */
+    private static final boolean DEBUG_NON_BMP_METHODS = false;
 
-        UNKNOWN, BASIC, ASTRAL
-    }
-
-    private volatile Plane plane = Plane.UNKNOWN;
-    private volatile int codePointCount = -1;
     public static final PyType TYPE = PyType.fromClass(PyUnicode.class);
 
     // for PyJavaClass.init()
     public PyUnicode() {
-        this(TYPE, "");
+        this(TYPE, "", true);
     }
 
+    /**
+     * Construct a PyUnicode interpreting the Java String argument as UTF-16.
+     *
+     * @param string UTF-16 string encoding the characters (as Java).
+     */
     public PyUnicode(String string) {
-        this(TYPE, string);
+        this(TYPE, string, false);
     }
 
+    /**
+     * Construct a PyUnicode interpreting the Java String argument as UTF-16. If it is known that
+     * the string contains no supplementary characters, argument isBasic may be set true by the
+     * caller. If it is false, the PyUnicode will scan the string to find out.
+     *
+     * @param string UTF-16 string encoding the characters (as Java).
+     * @param isBasic true if it is known that only BMP characters are present.
+     */
     public PyUnicode(String string, boolean isBasic) {
-        this(TYPE, string);
-        plane = isBasic ? Plane.BASIC : Plane.UNKNOWN;
+        this(TYPE, string, isBasic);
     }
 
     public PyUnicode(PyType subtype, String string) {
-        super(subtype, string);
+        this(subtype, string, false);
     }
 
     public PyUnicode(PyString pystring) {
@@ -54,12 +67,13 @@
     }
 
     public PyUnicode(PyType subtype, PyString pystring) {
-        this(subtype, pystring instanceof PyUnicode ? pystring.string : pystring.decode()
-                .toString());
+        this(subtype, //
+                pystring instanceof PyUnicode ? pystring.string : pystring.decode().toString(), //
+                pystring.isBasicPlane());
     }
 
     public PyUnicode(char c) {
-        this(TYPE, String.valueOf(c));
+        this(TYPE, String.valueOf(c), true);
     }
 
     public PyUnicode(int codepoint) {
@@ -90,6 +104,20 @@
         this(ucs4.iterator());
     }
 
+    /**
+     * Fundamental all-features constructor on which the others depend. If it is known that the
+     * string contains no supplementary characters, argument isBasic may be set true by the caller.
+     * If it is false, the PyUnicode will scan the string to find out.
+     *
+     * @param subtype actual type to create.
+     * @param string UTF-16 string encoding the characters (as Java).
+     * @param isBasic true if it is known that only BMP characters are present.
+     */
+    private PyUnicode(PyType subtype, String string, boolean isBasic) {
+        super(subtype, string);
+        translator = isBasic ? BASIC : this.chooseIndexTranslator();
+    }
+
     @Override
     public int[] toCodePoints() {
         int n = getCodePointCount();
@@ -101,15 +129,428 @@
         return codePoints;
     }
 
-    // modified to know something about codepoints; we just need to return the
-    // corresponding substring; darn UTF16!
-    // TODO: we could avoid doing this unnecessary copy
+    // ------------------------------------------------------------------------------------------
+    // Index translation for Unicode beyond the BMP
+    // ------------------------------------------------------------------------------------------
+
+    /**
+     * Index translation between code point index (as seen by Python) and UTF-16 index (as used in
+     * the Java String.
+     */
+    private interface IndexTranslator {
+
+        /** Number of supplementary characters (hence point code length may be found). */
+        public int suppCount();
+
+        /** Translate a UTF-16 code unit index to its equivalent code point index. */
+        public int codePointIndex(int utf16Index);
+
+        /** Translate a code point index to its equivalent UTF-16 code unit index. */
+        public int utf16Index(int codePointIndex);
+    }
+
+    /**
+     * The instance of index translation in use in this string. It will be set to either
+     * {@link #BASIC} or and instance of {@link #Supplementary}.
+     */
+    private final IndexTranslator translator;
+
+    /**
+     * A singleton provides the translation service (which is a pass-through) for all BMP strings.
+     */
+    static final IndexTranslator BASIC = new IndexTranslator() {
+
+        @Override
+        public int suppCount() {
+            return 0;
+        }
+
+        @Override
+        public int codePointIndex(int u) {
+            return u;
+        }
+
+        @Override
+        public int utf16Index(int i) {
+            return i;
+        }
+    };
+
+    /**
+     * A class of index translation that uses the cumulative count so far of supplementary
+     * characters, tabulated in blocks of a standard size. The count is then used as an offset
+     * between the code point index and the corresponding point in the UTF-16 representation.
+     */
+    private final class Supplementary implements IndexTranslator {
+
+        /** Tabulates cumulative count so far of supplementary characters, by blocks of size M. */
+        final int[] count;
+
+        /** Configure the block size M, as this power of 2. */
+        static final int LOG2M = 4;
+        /** The block size used for indexing (power of 2). */
+        static final int M = 1 << LOG2M;
+        /** A mask used to separate the block number and offset in the block. */
+        static final int MASK = M - 1;
+
+        /**
+         * The constructor works on a count array prepared by
+         * {@link PyUnicode#getSupplementaryCounts(String)}.
+         */
+        Supplementary(int[] count) {
+            this.count = count;
+        }
+
+        @Override
+        public int codePointIndex(int u) {
+            /*
+             * Let the desired result be j such that utf16Index(j) = u. As we have only a forward
+             * index of the string, we have to conduct a search. In principle, we bound j by a pair
+             * of values (j1,j2) such that j1<=j<j2, and in successive iterations, we shorten the
+             * range until a unique j is found. In the first part of the search, we work in terms of
+             * block numbers, that is, indexes into the count array. We have j<u, and so j<k2*M
+             * where:
+             */
+            int k2 = (u >> LOG2M) + 1;
+            // The count of supplementary characters before the start of block k2 is:
+            int c2 = count[k2 - 1];
+            /*
+             * Since the count array is non-decreasing, and j < k2*M, we have u-j <= count[k2-1].
+             * That is, j >= k1*M, where:
+             */
+            int k1 = Math.max(0, u - c2) >> LOG2M;
+            // The count of supplementary characters before the start of block k1 is:
+            int c1 = (k1 == 0) ? 0 : count[k1 - 1];
+
+            /*
+             * Now, j (to be found) is in an unknown block k, where k1<=k<k2. We make a binary
+             * search, maintaining the inequalities, but moving the end points. When k2=k1+1, we
+             * know that j must be in block k1.
+             */
+            while (true) {
+                if (c2 == c1) {
+                    // We can stop: the region contains no supplementary characters so j is:
+                    return u - c1;
+                }
+                // Choose a candidate k in the middle of the range
+                int k = (k1 + k2) / 2;
+                if (k == k1) {
+                    // We must have k2=k1+1, so j is in block k1
+                    break;
+                } else {
+                    // kx divides the range: is j<kx*M?
+                    int c = count[k - 1];
+                    if ((k << LOG2M) + c > u) {
+                        // k*M+c > u therefore j is not in block k but to its left.
+                        k2 = k;
+                        c2 = c;
+                    } else {
+                        // k*M+c <= u therefore j must be in block k, or to its right.
+                        k1 = k;
+                        c1 = c;
+                    }
+                }
+            }
+
+            /*
+             * At this point, j is known to be in block k1 (and k2=k1+1). c1 is the number of
+             * supplementary characters to the left of code point index k1*M and c2 is the number of
+             * supplementary characters to the left of code point index (k1+1)*M. We have to search
+             * this block sequentially. The current position in the UTF-16 is:
+             */
+            int p = (k1 << LOG2M) + c1;
+            while (p < u) {
+                if (Character.isHighSurrogate(string.charAt(p++))) {
+                    // c1 tracks the number of supplementary characters to the left of p
+                    c1 += 1;
+                    if (c1 == c2) {
+                        // We have found all supplementary characters in the block.
+                        break;
+                    }
+                    // Skip the trailing surrogate.
+                    p++;
+                }
+            }
+            // c1 is the number of supplementary characters to the left of u, so the result j is:
+            return u - c1;
+        }
+
+        @Override
+        public int utf16Index(int i) {
+            // The code point index i lies in the k-th block where:
+            int k = i >> LOG2M;
+            // The offset for the code point index k*M is exactly
+            int d = (k == 0) ? 0 : count[k - 1];
+            // The offset for the code point index (k+1)*M is exactly
+            int e = count[k];
+            if (d == e) {
+                /*
+                 * The offset for the code point index (k+1)*M is the same, and since this is a
+                 * non-decreasing function of k, it is also the value for i.
+                 */
+                return i + d;
+            } else {
+                /*
+                 * The offset for the code point index (k+1)*M is different (higher). We must scan
+                 * along until we have found all the supplementary characters that precede i,
+                 * starting the scan at code point index k*M.
+                 */
+                for (int q = i & ~MASK; q < i; q++) {
+                    if (Character.isHighSurrogate(string.charAt(q + d))) {
+                        d += 1;
+                        if (d == e) {
+                            /*
+                             * We have found all the supplementary characters in this block, so we
+                             * must have found all those to the left of i.
+                             */
+                            break;
+                        }
+                    }
+                }
+
+                // d counts all the supplementary characters to the left of i.
+                return i + d;
+            }
+        }
+
+        @Override
+        public int suppCount() {
+            // The last element of the count array is the total number of supplementary characters.
+            return count[count.length - 1];
+        }
+    }
+
+    /**
+     * Generate the table that is used by the class {@link Supplementary} to accelerate access to
+     * the the implementation string. The method returns <code>null</code> if the string passed
+     * contains no surrogate pairs, in which case we'll use {@link #BASIC} as the translator. This
+     * method is sensitive to {@link #DEBUG_NON_BMP_METHODS} which if true will prevent it returning
+     * null, hance we will always use a {@link Supplementary} {@link #translator}.
+     *
+     * @param string to index
+     * @return the index (counts) or null if basic plane
+     */
+    private static int[] getSupplementaryCounts(final String string) {
+
+        final int n = string.length();
+        int p; // Index of the current UTF-16 code unit.
+
+        /*
+         * We scan to the first surrogate code unit, in a simple loop. If we hit the end before we
+         * find one, no count array will be necessary and we'll use BASIC. If we find a surrogate it
+         * may be half a supplementary character, or a lone surrogate: we'll find out later.
+         */
+        for (p = 0; p < n; p++) {
+            if (Character.isSurrogate(string.charAt(p))) {
+                break;
+            }
+        }
+
+        if (p == n && !DEBUG_NON_BMP_METHODS) {
+            // There are no supplementary characters so the 1:1 translator is fine.
+            return null;
+
+        } else {
+            /*
+             * We have to do this properly, using a scheme in which code point indexes are
+             * efficiently translatable to UTF-16 indexes through a table called here count[]. In
+             * this array, count[k] contains the total number of supplementary characters up to the
+             * end of the k.th block, that is, to the left of code point (k+1)M. We have to fill
+             * this array by scanning the string.
+             */
+            int q = p; // The current code point index (q = p+s).
+            int k = q >> Supplementary.LOG2M; // The block number k = q/M.
+
+            /*
+             * When addressing with a code point index q<=L (the length in code points) we will
+             * index the count array with k = q/M. We have q<=L<=n, therefore q/M <= n/M, the
+             * maximum valid k is 1 + n/M. A q>=L should raise IndexOutOfBoundsException, but it
+             * doesn't matter whether that's from indexing this array, or the string later.
+             */
+            int[] count = new int[1 + (n >> Supplementary.LOG2M)];
+
+            /*
+             * To get the generation of count[] going efficiently, we need to advance the next whole
+             * block. The next loop will complete processing of the block containing the first
+             * supplementary character. Note that in all these loops, if we exit because p reaches a
+             * limit, the count for the last partial block is known from p-q and we take care of
+             * that right at the end of this method. The limit of these loops is n-1, so if we spot
+             * a lead surrogate, the we may access the low-surrogate confident that p+1<n.
+             */
+            while (p < n - 1) {
+
+                // Catch supplementary characters and lone surrogate code units.
+                p += calcAdvance(string, p);
+                // Advance the code point index
+                q += 1;
+
+                // Was that the last in a block?
+                if ((q & Supplementary.MASK) == 0) {
+                    count[k++] = p - q;
+                    break;
+                }
+            }
+
+            /*
+             * If the string is long enough, we can work in whole blocks of M, and there are fewer
+             * things to track. We can't know the number of blocks in advance, but we know there is
+             * at least one whole block to go when p+2*M<n.
+             */
+            while (p + 2 * Supplementary.M < n) {
+
+                for (int i = 0; i < Supplementary.M; i++) {
+                    // Catch supplementary characters and lone surrogate code units.
+                    p += calcAdvance(string, p);
+                }
+
+                // Advance the code point index one whole block
+                q += Supplementary.M;
+
+                // The number of supplementary characters to the left of code point index k*M is:
+                count[k++] = p - q;
+            }
+
+            /*
+             * Process the remaining UTF-16 code units, except possibly the last.
+             */
+            while (p < n - 1) {
+
+                // Catch supplementary characters and lone surrogate code units.
+                p += calcAdvance(string, p);
+                // Advance the code point index
+                q += 1;
+
+                // Was that the last in a block?
+                if ((q & Supplementary.MASK) == 0) {
+                    count[k++] = p - q;
+                }
+            }
+
+            /*
+             * There may be just one UTF-16 unit left (if the last thing processed was not a
+             * surrogate pair).
+             */
+            if (p < n) {
+                // We are at the last UTF-16 unit in string. Any surrogate here is an error.
+                char c = string.charAt(p++);
+                if (Character.isSurrogate(c)) {
+                    throw unpairedSurrogate(p - 1, c);
+                }
+                // Advance the code point index
+                q += 1;
+            }
+
+            /*
+             * There may still be some elements of count[] we haven't set, so we fill to the end
+             * with the total count. This also takes care of an incomplete final block.
+             */
+            int total = p - q;
+            while (k < count.length) {
+                count[k++] = total;
+            }
+
+            return count;
+        }
+    }
+
+    /**
+     * Called at each code point index, returns 2 if this is a surrogate pair, 1 otherwise, and
+     * detects lone surrogates as an error. The return is the amount to advance the UTF-16 index. An
+     * exception is raised if at <code>p</code> we find a lead surrogate without a trailing one
+     * following, or a trailing surrogate directly. It should not be called on the final code unit,
+     * when <code>p==string.length()-1</code>, since it may check the next code unit as well.
+     *
+     * @param string of UTF-16 code units
+     * @param p index into that string
+     * @return 2 if a surrogate pair stands at <code>p</code>, 1 if not
+     * @throws PyException(ValueError) if a lone surrogate stands at <code>p</code>.
+     */
+    private static int calcAdvance(String string, int p) throws PyException {
+
+        // Catch supplementary characters and lone surrogate code units.
+        char c = string.charAt(p);
+
+        if (c >= Character.MIN_SURROGATE) {
+            if (c < Character.MIN_LOW_SURROGATE) {
+                // This is a lead surrogate.
+                if (Character.isLowSurrogate(string.charAt(p + 1))) {
+                    // Required trailing surrogate follows, so step over both.
+                    return 2;
+                } else {
+                    // Required trailing surrogate missing.
+                    throw unpairedSurrogate(p, c);
+                }
+
+            } else if (c <= Character.MAX_SURROGATE) {
+                // This is a lone trailing surrogate
+                throw unpairedSurrogate(p, c);
+
+            } // else this is a private use or special character in 0xE000 to 0xFFFF.
+
+        }
+        return 1;
+    }
+
+    /**
+     * Return a ready-to-throw exception indicating an unpaired surrogate.
+     *
+     * @param p index within that sequence of the problematic code unit
+     * @param c the code unit
+     * @return an exception
+     */
+    private static PyException unpairedSurrogate(int p, int c) {
+        String fmt = "unpaired surrogate %#4x at code unit %d";
+        String msg = String.format(fmt, c, p);
+        return Py.ValueError(msg);
+    }
+
+    /**
+     * Choose an {@link IndexTranslator} implementation for efficient working, according to the
+     * contents of the {@link PyString#string}.
+     *
+     * @return chosen <code>IndexTranslator</code>
+     */
+    private IndexTranslator chooseIndexTranslator() {
+        int[] count = getSupplementaryCounts(string);
+        if (DEBUG_NON_BMP_METHODS) {
+            return new Supplementary(count);
+        } else {
+            return count == null ? BASIC : new Supplementary(count);
+        }
+    }
+
+    /**
+     * {@inheritDoc}
+     * <p>
+     * In the <code>PyUnicode</code> version, the arguments are code point indices, such as are
+     * received from the Python caller, while the first two elements of the returned array have been
+     * translated to UTF-16 indices in the implementation string.
+     */
+    @Override
+    protected int[] translateIndices(PyObject start, PyObject end) {
+        int[] indices = super.translateIndices(start, end);
+        indices[0] = translator.utf16Index(indices[0]);
+        indices[1] = translator.utf16Index(indices[1]);
+        // indices[2] and [3] remain Unicode indices (and may be out of bounds) relative to len()
+        return indices;
+    }
+
+    // ------------------------------------------------------------------------------------------
+
+    /**
+     * {@inheritDoc} The indices are code point indices, not UTF-16 (<code>char</code>) indices. For
+     * example:
+     *
+     * <pre>
+     * PyUnicode u = new PyUnicode("..\ud800\udc02\ud800\udc03...");
+     * // (Python) u = u'..\U00010002\U00010003...'
+     *
+     * String s = u.substring(2, 4);  // = "\ud800\udc02\ud800\udc03" (Java)
+     * </pre>
+     */
     @Override
     public String substring(int start, int end) {
-        if (isBasicPlane()) {
-            return super.substring(start, end);
-        }
-        return new PyUnicode(newSubsequenceIterator(start, end, 1)).getString();
+        return super.substring(translator.utf16Index(start), translator.utf16Index(end));
     }
 
     /**
@@ -122,29 +563,18 @@
         return uni;
     }
 
+    /**
+     * {@inheritDoc}
+     *
+     * @return true if the string consists only of BMP characters
+     */
+    @Override
     public boolean isBasicPlane() {
-        if (plane == Plane.BASIC) {
-            return true;
-        } else if (plane == Plane.UNKNOWN) {
-            plane = (getString().length() == getCodePointCount()) ? Plane.BASIC : Plane.ASTRAL;
-        }
-        return plane == Plane.BASIC;
+        return translator == BASIC;
     }
 
-// RETAIN THE BELOW CODE, it facilitates testing astral support more completely
-
-// public boolean isBasicPlane() {
-// return false;
-// }
-
-// END RETAIN
-
     public int getCodePointCount() {
-        if (codePointCount >= 0) {
-            return codePointCount;
-        }
-        codePointCount = getString().codePointCount(0, getString().length());
-        return codePointCount;
+        return string.length() - translator.suppCount();
     }
 
     @ExposedNew
@@ -193,12 +623,13 @@
         return new PyUnicode(str);
     }
 
-    // Unicode ops consisting of basic strings can only produce basic strings;
-    // this may not be the case for astral ones - they also might be basic, in
-    // case of deletes. So optimize by providing a tainting mechanism.
+    /**
+     * @param string UTF-16 string encoding the characters (as Java).
+     * @param isBasic true if it is known that only BMP characters are present.
+     */
     @Override
-    protected PyString createInstance(String str, boolean isBasic) {
-        return new PyUnicode(str, isBasic);
+    protected PyString createInstance(String string, boolean isBasic) {
+        return new PyUnicode(string, isBasic);
     }
 
     @Override
@@ -295,37 +726,19 @@
 
     @Override
     protected PyObject pyget(int i) {
-        if (isBasicPlane()) {
-            return Py.makeCharacter(getString().charAt(i), true);
-        }
-
-        int k = 0;
-        while (i > 0) {
-            int W1 = getString().charAt(k);
-            if (W1 >= 0xD800 && W1 < 0xDC00) {
-                k += 2;
-            } else {
-                k += 1;
-            }
-            i--;
-        }
-        int codepoint = getString().codePointAt(k);
+        int codepoint = getString().codePointAt(translator.utf16Index(i));
         return Py.makeCharacter(codepoint, true);
     }
 
     private class SubsequenceIteratorImpl implements Iterator {
 
-        private int current, k, start, stop, step;
+        private int current, k, stop, step;
 
         SubsequenceIteratorImpl(int start, int stop, int step) {
-            k = 0;
             current = start;
-            this.start = start;
+            k = translator.utf16Index(current);
             this.stop = stop;
             this.step = step;
-            for (int i = 0; i < start; i++) {
-                nextCodePoint();
-            }
         }
 
         SubsequenceIteratorImpl() {
@@ -443,10 +856,12 @@
     private PyUnicode coerceToUnicode(PyObject o) {
         if (o instanceof PyUnicode) {
             return (PyUnicode)o;
+        } else if (o instanceof PyString) {
+            return new PyUnicode(((PyString)o).getString(), true);
         } else if (o instanceof BufferProtocol) {
-            // PyString or PyByteArray, PyMemoryView, Py2kBuffer ...
+            // PyByteArray, PyMemoryView, Py2kBuffer ...
             try (PyBuffer buf = ((BufferProtocol)o).getBuffer(PyBUF.FULL_RO)) {
-                return new PyUnicode(buf.toString());
+                return new PyUnicode(buf.toString(), true);
             }
         } else {
             // o is some type not allowed:
@@ -998,7 +1413,7 @@
     @Override
     protected PyString fromSubstring(int begin, int end) {
         assert (isBasicPlane()); // can only be used on a codepath from str_ equivalents
-        return new PyUnicode(getString().substring(begin, end));
+        return new PyUnicode(getString().substring(begin, end), true);
     }
 
     @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_index_doc)
@@ -1021,7 +1436,7 @@
         if (isBasicPlane()) {
             return _count(sub.getString(), start, end);
         }
-        int[] indices = translateIndices(start, end);
+        int[] indices = super.translateIndices(start, end); // do not convert to utf-16 indices.
         int count = 0;
         for (Iterator<Integer> mainIter = newSubsequenceIterator(indices[0], indices[1], 1); mainIter
                 .hasNext();) {
@@ -1043,12 +1458,14 @@
 
     @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_find_doc)
     final int unicode_find(PyObject subObj, PyObject start, PyObject end) {
-        return _find(coerceToUnicode(subObj).getString(), start, end);
+        int found = _find(coerceToUnicode(subObj).getString(), start, end);
+        return found < 0 ? -1 : translator.codePointIndex(found);
     }
 
     @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_rfind_doc)
     final int unicode_rfind(PyObject subObj, PyObject start, PyObject end) {
-        return _rfind(coerceToUnicode(subObj).getString(), start, end);
+        int found = _rfind(coerceToUnicode(subObj).getString(), start, end);
+        return found < 0 ? -1 : translator.codePointIndex(found);
     }
 
     private static String padding(int n, int pad) {
@@ -1228,13 +1645,11 @@
 
     @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_startswith_doc)
     final boolean unicode_startswith(PyObject prefix, PyObject start, PyObject end) {
-        // FIXME: slice indexing logic incorrect when this is ASTRAL
         return str_startswith(prefix, start, end);
     }
 
     @ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_endswith_doc)
     final boolean unicode_endswith(PyObject suffix, PyObject start, PyObject end) {
-        // FIXME: slice indexing logic incorrect when this is ASTRAL
         return str_endswith(suffix, start, end);
     }
 
diff --git a/src/org/python/core/buffer/BaseBuffer.java b/src/org/python/core/buffer/BaseBuffer.java
--- a/src/org/python/core/buffer/BaseBuffer.java
+++ b/src/org/python/core/buffer/BaseBuffer.java
@@ -1,5 +1,7 @@
 package org.python.core.buffer;
 
+import java.nio.ByteBuffer;
+
 import org.python.core.BufferProtocol;
 import org.python.core.Py;
 import org.python.core.PyBUF;
@@ -126,17 +128,17 @@
      * Construct an instance of BaseBuffer in support of a sub-class, specifying the 'feature
      * flags', or at least a starting set to be adjusted later. These are the features of the buffer
      * exported, not the flags that form the consumer's request. The buffer will be read-only unless
-     * {@link PyBUF#WRITABLE} is set in the feature flags. {@link PyBUF#FORMAT} is implicitly added
-     * to the feature flags. The navigation arrays are all null, awaiting action by the sub-class
-     * constructor. To complete initialisation, the sub-class normally must assign: the buffer (
-     * {@link #storage}, {@link #index0}), and the navigation arrays ({@link #shape},
-     * {@link #strides}), and call {@link #checkRequestFlags(int)} passing the consumer's request
-     * flags.
+     * {@link PyBUF#WRITABLE} is set in the feature flags. {@link PyBUF#FORMAT} and
+     * {@link PyBUF#AS_ARRAY} are implicitly added to the feature flags. The navigation arrays are
+     * all null, awaiting action by the sub-class constructor. To complete initialisation, the
+     * sub-class normally must assign: the buffer ( {@link #storage}, {@link #index0}), and the
+     * navigation arrays ({@link #shape}, {@link #strides}), and call
+     * {@link #checkRequestFlags(int)} passing the consumer's request flags.
      *
      * @param featureFlags bit pattern that specifies the actual features allowed/required
      */
     protected BaseBuffer(int featureFlags) {
-        setFeatureFlags(featureFlags | FORMAT);
+        setFeatureFlags(featureFlags | FORMAT | AS_ARRAY);
     }
 
     /**
@@ -213,6 +215,12 @@
     }
 
     @Override
+    public boolean hasArray() {
+        // AS_ARRAY is a non-navigational flag, so is inverted in gFeatureFlags
+        return (gFeatureFlags & AS_ARRAY) != 0;
+    }
+
+    @Override
     public int getNdim() {
         return shape.length;
     }
@@ -297,7 +305,7 @@
      * accessors. The default implementation here is suited to N-dimensional arrays.
      *
      * @param indices of the item from the consumer
-     * @return index relative to item x[0,...,0] in actual storage
+     * @return corresponding absolute index in storage
      */
     protected int calcIndex(int... indices) throws IndexOutOfBoundsException {
         final int N = checkDimension(indices);
@@ -313,6 +321,57 @@
     }
 
     /**
+     * Calculate the absolute byte index in the storage array of the last item of the exported data
+     * (if we are not using indirection). This is the greatest value attained by
+     * {@link #calcIndex(int...)}. The first byte not used will be one <code>itemsize</code> more
+     * than the returned value.
+     *
+     * @return greatest absolute index in storage
+     */
+    protected int calcGreatestIndex() throws IndexOutOfBoundsException {
+        final int N = shape.length;
+        // If all the strides are positive, the maximal value is found from:
+        // index = index0 + sum(k=0,N-1) (shape[k]-1)*strides[k]
+        // but in general, for any k where strides[k]<=0, the term should be zero.
+        int index = index0;
+        if (N > 0) {
+            int[] strides = getStrides();
+            for (int k = 0; k < N; k++) {
+                int stride = strides[k];
+                if (stride > 0) {
+                    index += (shape[k] - 1) * stride;
+                }
+            }
+        }
+        return index;
+    }
+
+    /**
+     * Calculate the absolute byte index in the storage array of the first item of the exported data
+     * (if we are not using indirection). This is the least value attained by
+     * {@link #calcIndex(int...)}.
+     *
+     * @return least absolute index in storage
+     */
+    protected int calcLeastIndex() throws IndexOutOfBoundsException {
+        final int N = shape.length;
+        // If all the strides are positive, the maximal value is just index0,
+        // but in general, we must allow strides[k]<=0 for some k:
+        // index = index0 + sum(k=0,N-1) (strides[k]<0) ? (shape[k]-1)*strides[k] : 0
+        int index = index0;
+        if (N > 0) {
+            int[] strides = getStrides();
+            for (int k = 0; k < N; k++) {
+                int stride = strides[k];
+                if (stride < 0) {
+                    index += (shape[k] - 1) * stride;
+                }
+            }
+        }
+        return index;
+    }
+
+    /**
      * {@inheritDoc}
      * <p>
      * The default implementation in <code>BaseBuffer</code> deals with the general one-dimensional
@@ -540,6 +599,15 @@
     // @Override public PyBuffer getBufferSlice(int flags, int start, int length, int stride) {}
 
     @Override
+    public ByteBuffer getNIOByteBuffer() {
+        // Determine the limit of the buffer just beyond the last item.
+        int length = calcGreatestIndex() + getItemsize() - index0;
+        ByteBuffer b = ByteBuffer.wrap(storage, index0, length);
+        // Return as read-only if it is.
+        return isReadonly() ? b.asReadOnlyBuffer() : b;
+    }
+
+    @Override
     public Pointer getBuf() {
         return new Pointer(storage, index0);
     }
@@ -664,6 +732,8 @@
             return bufferRequires("shape array");
         } else if ((syndrome & WRITABLE) != 0) {
             return bufferIsNot("writable");
+        } else if ((syndrome & AS_ARRAY) != 0) {
+            return bufferIsNot("accessible as a Java array");
         } else if ((syndrome & C_CONTIGUOUS) != 0) {
             return bufferIsNot("C-contiguous");
         } else if ((syndrome & F_CONTIGUOUS) != 0) {
diff --git a/src/org/python/core/buffer/SimpleBuffer.java b/src/org/python/core/buffer/SimpleBuffer.java
--- a/src/org/python/core/buffer/SimpleBuffer.java
+++ b/src/org/python/core/buffer/SimpleBuffer.java
@@ -1,5 +1,7 @@
 package org.python.core.buffer;
 
+import java.nio.ByteBuffer;
+
 import org.python.core.PyBuffer;
 import org.python.core.PyException;
 import org.python.core.util.StringUtil;
@@ -229,6 +231,13 @@
     }
 
     @Override
+    public ByteBuffer getNIOByteBuffer() {
+        // Simplify for one-dimensional contiguous bytes
+        ByteBuffer b = ByteBuffer.wrap(storage, index0, shape[0]);
+        return isReadonly() ? b.asReadOnlyBuffer() : b;
+    }
+
+    @Override
     public Pointer getPointer(int index) throws IndexOutOfBoundsException {
         return new Pointer(storage, index0 + index);
     }
diff --git a/src/org/python/core/buffer/SimpleStringBuffer.java b/src/org/python/core/buffer/SimpleStringBuffer.java
--- a/src/org/python/core/buffer/SimpleStringBuffer.java
+++ b/src/org/python/core/buffer/SimpleStringBuffer.java
@@ -1,5 +1,7 @@
 package org.python.core.buffer;
 
+import java.nio.ByteBuffer;
+
 import org.python.core.PyBuffer;
 import org.python.core.util.StringUtil;
 
@@ -112,12 +114,19 @@
             return getBufferSlice(flags, start, length);
         } else {
             // Force creation of the actual byte array from the String.
-            getBuf();
+            ensureHaveBytes();
             // Now we are effectively a SimpleBuffer, return the strided view.
             return super.getBufferSlice(flags, start, length, stride);
         }
     }
 
+    @Override
+    public ByteBuffer getNIOByteBuffer() {
+        // Force creation of the actual byte array from the String.
+        ensureHaveBytes();
+        return super.getNIOByteBuffer().asReadOnlyBuffer();
+    }
+
     /**
      * This method creates an actual byte array from the underlying String if none yet exists.
      */
diff --git a/src/org/python/modules/_io/PyFileIO.java b/src/org/python/modules/_io/PyFileIO.java
--- a/src/org/python/modules/_io/PyFileIO.java
+++ b/src/org/python/modules/_io/PyFileIO.java
@@ -251,8 +251,7 @@
             PyBuffer pybuf = writablePyBuffer(buf);
 
             try {
-                PyBuffer.Pointer bp = pybuf.getBuf();
-                ByteBuffer byteBuffer = ByteBuffer.wrap(bp.storage, bp.offset, pybuf.getLen());
+                ByteBuffer byteBuffer = pybuf.getNIOByteBuffer();
                 synchronized (ioDelegate) {
                     count = ioDelegate.readinto(byteBuffer);
                 }
@@ -293,8 +292,7 @@
 
             try {
                 // Access the data as a java.nio.ByteBuffer [pos:limit] within possibly larger array
-                PyBuffer.Pointer bp = pybuf.getBuf();
-                ByteBuffer byteBuffer = ByteBuffer.wrap(bp.storage, bp.offset, pybuf.getLen());
+                ByteBuffer byteBuffer = pybuf.getNIOByteBuffer();
                 synchronized (ioDelegate) {
                     count = ioDelegate.write(byteBuffer);
                 }
diff --git a/src/org/python/modules/posix/PosixModule.java b/src/org/python/modules/posix/PosixModule.java
--- a/src/org/python/modules/posix/PosixModule.java
+++ b/src/org/python/modules/posix/PosixModule.java
@@ -29,7 +29,6 @@
 import org.python.core.Py;
 import org.python.core.PyBUF;
 import org.python.core.PyBuffer;
-import org.python.core.PyBuffer.Pointer;
 import org.python.core.PyBuiltinFunctionNarrow;
 import org.python.core.PyDictionary;
 import org.python.core.PyException;
@@ -826,10 +825,8 @@
     public static int write(PyObject fd, BufferProtocol bytes) {
         // Get a buffer view: we can cope with N-dimensional data, but not strided data.
         try (PyBuffer buf = bytes.getBuffer(PyBUF.ND)) {
-            // Get the array and offset of the first real byte.
-            Pointer p = buf.getBuf();
-            // Make a ByteBuffer of that array, setting the position and limit to the real data.
-            ByteBuffer bb = ByteBuffer.wrap(p.storage, p.offset, buf.getLen());
+            // Get a ByteBuffer of that data, setting the position and limit to the real data.
+            ByteBuffer bb =  buf.getNIOByteBuffer();
             try {
                 // Write the data (returning the count of bytes).
                 return FileDescriptors.get(fd).write(bb);
@@ -902,7 +899,7 @@
     }
 
     /**
-     * Return a path as a String from a PyObject 
+     * Return a path as a String from a PyObject
      *
      * @param path a PyObject, raising a TypeError if an invalid path type
      * @return a String path
diff --git a/tests/java/javatests/ProxyDeserialization.java b/tests/java/ProxyDeserialization.java
rename from tests/java/javatests/ProxyDeserialization.java
rename to tests/java/ProxyDeserialization.java
diff --git a/tests/python/unicode_index_times.py b/tests/python/unicode_index_times.py
new file mode 100644
--- /dev/null
+++ b/tests/python/unicode_index_times.py
@@ -0,0 +1,405 @@
+# -*- coding: utf-8 -*-
+# unicode speed tests for access operations and find
+# This is part of an effort to supply the Jython implementation of
+# unicode with efficient, correct translations between the visible index
+# of a character and and the index in the implementation array of the
+# UTF-16 code unit(s) that represent it. See also test.test_unicode_jy,
+# and Jython issue #2100. The presence of characters with Unicode
+# code points > 0xffff (supplementary characters), that it takes two
+# code units to represent, makes this non-trivial.
+#
+# The program defines a variety of test strings of differing lengths
+# and distribution of supplementary characters, then times some basic
+# operations involving index translation: of retrieval, slicing and
+# find, to provide an average time per operation. It runs several trials
+# of each test case (a combination of an operation and the test material)
+# and reports the shortest trial, using the same strategy as the timeit
+# module).
+#
+# It was difficult to get repeatable times from the test under Jython,
+# because the JIT compiler (?) is non-deterministic. It proved
+# impossible using a strategy that would run the same test case multiple
+# times in succession. The approach eventually taken was to run the all
+# the test cases once, then repeat this sequence several times, and
+# report the minimum time of each case in these widely separated trials.
+# This strategy provides stable results with the default JIT behaviour.
+#
+# Two interesting options are to run with the # JIT compiler disabled:
+#   $ jython -J-Xint tests/python/unicode_index_times.py
+#
+# or with it continuously enabled:
+#   $ jython -J-Xcomp tests/python/unicode_index_times.py
+#
+from __future__ import print_function
+import itertools
+import random
+import sys
+import time
+
+if sys.platform == "win32" or sys.platform.startswith("java"):
+    # On Windows and Jython, the best timer is time.clock()
+    timer = time.clock
+else:
+    # On most other platforms the best timer is time.time()
+    timer = time.time
+
+if sys.version_info[0] > 2:
+    unichr = chr
+else:
+    def next(it) : return it.next()
+
+# Proportion of characters that are supplementary (if not explicit)
+DEFAULT_RATE = 0.2  # 20%
+
+# We will test performance with these sizes
+SHORT = 10
+MEDIUM = 100
+LONG = 1000
+HUGE = 10000
+
+
+class UnicodeMaterial(object):
+    ''' Object holding a list of single characters and a unicode string
+        that is their concatenation. The sequence is created from a
+        background sequence of basic plane characters and random
+        replacement with supplementary plane characters (those with
+        point code>0xffff).
+    '''
+
+    base = tuple(u'abcdefghijklmnopqrstuvwxyz')
+    if sys.maxunicode < 0x10000:
+        print("Narrow build: all characters from BMP", file=sys.stderr)
+        supp = tuple(map(unichr, range(0x20, 0x2c)))
+    else:
+        # Wide build: we have real supplementary characters
+        supp = tuple(map(unichr, range(0x10000, 0x1000c)))
+    used = sorted(set(base+supp))
+
+    def __init__(self, size=20, pred=None, ran=None):
+        ''' Create size chars choosing an SP char at i where
+            pred(ran, i)==True where ran is an instance of
+            random.Random supplied in the constructor or created
+            locally (if ran==None).
+        '''
+
+        # Generators for the BMP and SP characters
+        base = itertools.cycle(UnicodeMaterial.base)
+        supp = itertools.cycle(UnicodeMaterial.supp)
+
+        # Each instance gets a random generator
+        if ran is None:
+            ran = random.Random()
+        self.random = ran
+
+        if pred is None:
+            pred = lambda ran, j : ran.random() < DEFAULT_RATE
+
+        # Generate the list
+        r = list()
+        for i in range(size):
+            if pred(self.random, i):
+                c = next(supp)
+            else:
+                c = next(base)
+            r.append(c)
+
+        # The list and its concatenation are our material
+        self.ref = r
+        self.size = len(r)
+        self.text = u''.join(r)
+        self.target = u''
+
+    def __len__(self):
+        return self.size
+
+    def insert(self, target, p=None):
+        ''' Insert target string at position p (or middle), truncating if
+            that would make the material any longer
+        '''
+        if p is None:
+            p = max(0, (self.size-len(target)) // 2)
+
+        n = 0
+        for t in target:
+            if p+n >= self.size:
+                break;
+            self.ref[p+n] = t
+            n += 1
+
+        self.target = target[:n]
+        self.text = u''.join(self.ref)
+
+
+class UnicodeActions(UnicodeMaterial):
+    ''' Provides test material with loops for timing.'''
+
+    def __init__(self, size=20, pred=None, ran=None):
+        super(UnicodeActions, self).__init__(size, pred, ran)
+        if self.size <= 0:
+            raise ValueError("The timings don't work for zero length")
+        self.used =  UnicodeMaterial.used
+        self.trash = None
+        # String to find (note 'abcde' in base: nice for false alarms)
+        self.insert(u"abide")
+
+
+    def repeat_getitem(self, mincount):
+        ''' Access each code point in sequence repeatedly so that at
+            least mincount operations have been peformed, and return the
+            actual number of operations.
+        '''
+        n = self.size
+        t = self.text
+        opcount = 0
+        dummy = None
+        while opcount < mincount:
+            # Access each point code
+            i = 0
+            while i < n:
+                # Avoid optimising away the call
+                dummy = t[i]
+                i += 1
+            opcount += n
+        self.trash = dummy
+        return opcount
+
+    def repeat_slice(self, mincount):
+        ''' Extract a slice at each feasible position and length,
+            repeating enough times to do mincount operations, and
+            return the actual number of operations.
+        '''
+        n = self.size
+        t = self.text
+        opcount = 0
+        dummy = None
+
+        while opcount < mincount:
+            m = 1
+            while m <= n:
+                starts = n - m + 1
+                for i in range(starts):
+                    dummy = t[i:i+m]
+                    #print(i, i+m, step, dummy)
+                opcount += starts
+                m *= 5
+
+        return opcount
+
+    def repeat_slice_step(self, mincount):
+        ''' Extract a slice at each feasible position and length,
+            and using different sizes for the step,
+            repeating enough times to do mincount operations, and
+            return the actual number of operations.
+        '''
+        n = self.size
+        t = self.text
+        opcount = 0
+        dummy = None
+        steps = [3, -1, 10]
+
+        while opcount < mincount:
+            for step in steps:
+                if step > 0:
+                    m = 1
+                    while m <= n:
+                        starts = n - m + 1
+                        for i in range(starts):
+                            dummy = t[i:i+m:step]
+                            #print(i, i+m, step, dummy)
+                        opcount += starts
+                        m *= 5
+                else:
+                    m = 1
+                    while m <= n:
+                        starts = n - m + 1
+                        for i in range(starts):
+                            dummy = t[i+m:i:step]
+                            #print(i+m, i, step, dummy)
+                        opcount += starts
+                        m *= 5
+                    
+        return opcount
+
+    def repeat_find_char(self, mincount):
+        ''' Do an incremental find of each code point, repeating
+            enough times to do mincount finds, and return the actual
+            number of operations.
+        '''
+        opcount = 0
+        n = self.size
+        findop = self.text.find
+        dummy = 0
+
+        while opcount < mincount:
+            # The text is searched for every code c.
+            for c in self.used:
+                # ... and every occurrence is found.
+                start = 0
+                while start < n:
+                    i = findop(c, start)
+                    if i < 0: break
+                    # Avoid optimising away the call
+                    dummy += i
+                    start = i + 1
+
+            # Every character in the text was a hit exactly once.
+            # And every character was also a miss, except for
+            # the character that ends the text. So we did:
+            opcount += n + len(self.used) - 1
+
+        self.trash = dummy
+        return opcount
+
+    def repeat_find_str(self, mincount):
+        ''' Find the target string within the material, repeating
+            enough times to do mincount finds, and return the actual
+            number of operations.
+        '''
+        opcount = 0
+        s = self.target
+        findop = self.text.find
+        dummy = 0
+
+        while opcount < mincount:
+            dummy += findop(s)
+            opcount += 1
+
+        self.trash = dummy
+        return opcount
+
+    def repeat_rfind_char(self, mincount):
+        ''' Do an incremental rfind of each code point, repeating
+            enough times to do mincount finds, and return the actual
+            number of operations.
+        '''
+        opcount = 0
+        n = self.size
+        findop = self.text.rfind
+
+        while opcount < mincount:
+            # The text is searched for every code c.
+            for c in self.used:
+                # ... and every occurrence is found.
+                end = n
+                while end >= 0:
+                    end = findop(c, 0, end)
+
+            # Every character in the text was a hit exactly once.
+            # And every character was also a miss. So we did:
+            opcount += n + len(self.used)
+
+        return opcount
+
+    def repeat_rfind_str(self, mincount):
+        ''' Find the target string within the material, repeating
+            enough times to do mincount finds, and return the actual
+            number of operations.
+        '''
+        opcount = 0
+        s = self.target
+        findop = self.text.rfind
+        dummy = 0
+
+        while opcount < mincount:
+            dummy += findop(s)
+            opcount += 1
+
+        self.trash = dummy
+        return opcount
+
+
+def time_per_op(op, mincount):
+    ''' Repeat the given operation at least mincount times and
+        return the time per operation in microseconds.
+    '''
+    t = timer()
+    opcount = op(mincount)
+    return (timer() - t) * 1e6 / opcount
+
+# Functions defining particular distributions of SP codes
+#
+def evenly(rate=DEFAULT_RATE):
+    'Evenly distributed at given rate'
+    def f(ran, i):
+        return ran.random() < rate
+    return f
+
+def evenly_before(k, rate=DEFAULT_RATE):
+    'Evenly distributed on i<k at given rate'
+    def f(ran, i):
+        return i < k and ran.random() < rate
+    return f
+
+def evenly_from(k, rate=DEFAULT_RATE):
+    'Evenly distributed on i>=k at given rate'
+    def f(ran, i):
+        return i >= k and ran.random() < rate
+    return f
+
+def time_all():
+
+    setups = [
+        #("empty", UnicodeActions(0)),
+        ("single bmp", UnicodeActions(1, (lambda ran, i : False))),
+        ("single", UnicodeActions(1, (lambda ran, i : True))),
+        ("short bmp", UnicodeActions(SHORT, (lambda ran, i : False))),
+        ("short 50%", UnicodeActions(SHORT, evenly(0.5))),
+        ("medium bmp", UnicodeActions(MEDIUM, (lambda ran, i : False))),
+        ("medium 10%", UnicodeActions(MEDIUM, evenly(0.1))),
+        ("long bmp", UnicodeActions(LONG, (lambda ran, i : False))),
+        ("long 1%", UnicodeActions(LONG, evenly(0.01))),
+        ("long 10%", UnicodeActions(LONG, evenly(0.1))),
+        ("long 10% L", UnicodeActions(LONG, evenly_before(LONG/4, 0.4))),
+        ("long 10% H", UnicodeActions(LONG, evenly_from(LONG-(LONG/4), 0.4))),
+        ("long 50%", UnicodeActions(LONG, evenly(0.5))),
+        ("huge bmp", UnicodeActions(HUGE, (lambda ran, i : False))),
+        ("huge 10%", UnicodeActions(HUGE, evenly(0.1))),
+    ]
+
+    ops = [
+        ("[i]", "repeat_getitem"),
+        ("[i:i+n]", "repeat_slice"),
+        ("[i:i+n:k]", "repeat_slice_step"),
+        ("find(c)", "repeat_find_char"),
+        ("find(s)", "repeat_find_str"),
+        ("rfind(c)", "repeat_rfind_char"),
+        ("rfind(s)", "repeat_rfind_str"),
+    ]
+
+
+    print("{:12s}{:>6s}".format("time (us)", "len"), end='')
+    for title, _ in ops:
+        print("{:>10s}".format(title), end='')
+    print()
+
+    mintime = dict()
+    def save_mintime(k, t):
+        mintime[k] = min(t, mintime.get(k, 1e9))
+
+    trashcan = []
+
+    # Cycle through the cases repeatedly.
+    for k in range(5):
+        for name, material in setups:
+            for _, opname in ops:
+                # For each case, keep the minimum time
+                op = material.__getattribute__(opname)
+                t = time_per_op(op, 1000)
+                save_mintime((name, opname), t)
+
+            if k == 0:
+                trashcan.append(material.trash)
+
+    # Cycle through the cases again to print them.
+    for name, material in setups:
+        print("{:12s}{:6d}".format(name, material.size), end='')
+        for _, opname in ops:
+            t = mintime[(name, opname)]
+            print("{:10.2f}".format(t), end='')
+        print()
+
+    print("y =", trashcan)
+
+if __name__ == "__main__":
+
+    time_all()

-- 
Repository URL: http://hg.python.org/jython