[Jython-checkins] jython (merge default -> default): Merge recent work to trunk (mostly PyUnicode).
jeff.allen
jython-checkins at python.org
Wed Sep 17 00:55:29 CEST 2014
http://hg.python.org/jython/rev/776cae0189ed
changeset: 7387:776cae0189ed
parent: 7375:4ed83fcdb13e
parent: 7386:4187e256ae1e
user: Jeff Allen <ja.py at farowl.co.uk>
date: Tue Sep 16 23:55:09 2014 +0100
summary:
Merge recent work to trunk (mostly PyUnicode).
files:
Lib/test/test_bytes_jy.py | 44 +-
Lib/test/test_unicode_jy.py | 249 ++++
src/org/python/core/PyArray.java | 83 +-
src/org/python/core/PyBUF.java | 10 +
src/org/python/core/PyBuffer.java | 53 +-
src/org/python/core/PyByteArray.java | 30 +-
src/org/python/core/PyString.java | 253 ++-
src/org/python/core/PyUnicode.java | 563 ++++++++-
src/org/python/core/buffer/BaseBuffer.java | 86 +-
src/org/python/core/buffer/SimpleBuffer.java | 9 +
src/org/python/core/buffer/SimpleStringBuffer.java | 11 +-
src/org/python/modules/_io/PyFileIO.java | 6 +-
src/org/python/modules/posix/PosixModule.java | 9 +-
tests/java/javatests/ProxyDeserialization.java | 0
tests/python/unicode_index_times.py | 405 +++++++
15 files changed, 1591 insertions(+), 220 deletions(-)
diff --git a/Lib/test/test_bytes_jy.py b/Lib/test/test_bytes_jy.py
--- a/Lib/test/test_bytes_jy.py
+++ b/Lib/test/test_bytes_jy.py
@@ -12,11 +12,51 @@
class Sub(bytearray): pass
s = Sub("abc123")
self.assertEqual(len(s), 6)
-
+
+
+class SimpleOperationsTest(unittest.TestCase):
+ # Things the CPython library did not test throughly enough
+
+ def test_irepeat(self) :
+
+ def check_irepeat(a, n) :
+ # Check in-place multiplication (repeats)
+ b = bytearray(a)
+ b *= n
+ self.assertEquals(b, bytearray(a*n))
+
+ def irepeat_export(a, n) :
+ # In-place multiplication with export mostly raises BufferError
+ b = bytearray(a)
+ with memoryview(b) as m:
+ b *= n
+ # If it doesn't raise, it gets the right answer
+ self.assertEquals(b, bytearray(a*n))
+
+ for a in [b'', b'a', b'hello'] :
+ check_irepeat(a, 7)
+ check_irepeat(a, 1)
+ check_irepeat(a, 0)
+ check_irepeat(a, -1) # -ve treated as 0
+
+ # Resizing with exports should raise an exception
+ self.assertRaises(BufferError, irepeat_export, b'a', 5)
+ self.assertRaises(BufferError, irepeat_export, b'hello', 3)
+ self.assertRaises(BufferError, irepeat_export, b'hello', 0)
+ self.assertRaises(BufferError, irepeat_export, b'hello', -1)
+
+ # These don't raise an exception (CPython 2.7.6, 3.4.1)
+ irepeat_export(b'a', 1)
+ irepeat_export(b'hello', 1)
+ for n in range(-1, 3) :
+ irepeat_export(b'', n)
+
def test_main():
test.test_support.run_unittest(
- ByteArraySubclassTest)
+ ByteArraySubclassTest,
+ SimpleOperationsTest,
+ )
if __name__ == "__main__":
diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py
--- a/Lib/test/test_unicode_jy.py
+++ b/Lib/test/test_unicode_jy.py
@@ -3,12 +3,15 @@
Made for Jython.
"""
+import itertools
+import random
import re
import string
import sys
import unittest
from StringIO import StringIO
from test import test_support
+from java.lang import StringBuilder
class UnicodeTestCase(unittest.TestCase):
@@ -155,6 +158,251 @@
u'f')
+class UnicodeMaterial(object):
+ ''' Object holding a list of single characters and a unicode string
+ that is their concatenation. The sequence is created from a
+ background sequence of basic plane characters and random
+ replacement with supplementary plane characters (those with
+ point code>0xffff).
+ '''
+
+ base = tuple(u'abcdefghijklmnopqrstuvwxyz')
+ supp = tuple(map(unichr, range(0x10000, 0x1000c)))
+ used = sorted(set(base+supp))
+
+ def __init__(self, size=20, pred=None, ran=None):
+ ''' Create size chars choosing an SP char at i where
+ pred(ran, i)==True where ran is an instance of
+ random.Random supplied in the constructor or created
+ locally (if ran==None).
+ '''
+
+ # Generators for the BMP and SP characters
+ base = itertools.cycle(UnicodeMaterial.base)
+ supp = itertools.cycle(UnicodeMaterial.supp)
+
+ # Each instance gets a random generator
+ if ran is None:
+ ran = random.Random()
+ self.random = ran
+
+ if pred is None:
+ pred = lambda ran, j : ran.random() < DEFAULT_RATE
+
+ # Generate the list
+ r = list()
+ for i in range(size):
+ if pred(self.random, i):
+ c = supp.next()
+ else:
+ c = base.next()
+ r.append(c)
+
+ # The list and its concatenation are our material
+ self.ref = r
+ self.size = len(r)
+ self.text = u''.join(r)
+ self.target = u''
+
+ def __len__(self):
+ return self.size
+
+ def insert(self, target, p=None):
+ ''' Insert target string at position p (or middle), truncating if
+ that would make the material any longer
+ '''
+ if p is None:
+ p = max(0, (self.size-len(target)) // 2)
+
+ n = 0
+ for t in target:
+ if p+n >= self.size:
+ break;
+ self.ref[p+n] = t
+ n += 1
+
+ self.target = target[:n]
+ self.text = u''.join(self.ref)
+
+
+class UnicodeIndexMixTest(unittest.TestCase):
+ # Test indexing where there may be more than one code unit per code point.
+ # See Jython Issue #2100.
+
+ # Functions defining particular distributions of SP codes
+ #
+ def evenly(self, rate=0.2):
+ 'Evenly distributed at given rate'
+ def f(ran, i):
+ return ran.random() < rate
+ return f
+
+ def evenly_before(self, k, rate=0.2):
+ 'Evenly distributed on i<k at given rate'
+ def f(ran, i):
+ return i < k and ran.random() < rate
+ return f
+
+ def evenly_from(self, k, rate=0.2):
+ 'Evenly distributed on i>=k at given rate'
+ def f(ran, i):
+ return i >= k and ran.random() < rate
+ return f
+
+ def at(self, places):
+ 'Only at specified places'
+ def f(ran, i):
+ return i in places
+ return f
+
+ def setUp(self):
+ ran = random.Random(1234) # ensure repeatable
+ mat = list()
+ mat.append(UnicodeMaterial(10, self.at([2]), ran))
+ mat.append(UnicodeMaterial(10, self.at([2, 5]), ran))
+ mat.append(UnicodeMaterial(50, self.evenly(), ran))
+ mat.append(UnicodeMaterial(200, self.evenly_before(70), ran))
+ mat.append(UnicodeMaterial(200, self.evenly_from(130), ran))
+ mat.append(UnicodeMaterial(1000, self.evenly(), ran))
+
+ self.material = mat
+
+
+ def test_getitem(self):
+ # Test map from to code point index to internal representation
+ # Fails in Jython 2.7b3
+
+ def check_getitem(m):
+ # Check indexing the string returns the expected point code
+ for i in xrange(m.size):
+ self.assertEqual(m.text[i], m.ref[i])
+
+ for m in self.material:
+ check_getitem(m)
+
+ def test_slice(self):
+ # Test indexing gets the slice ends correct.
+ # Passes in Jython 2.7b3, but may be touched by #2100 changes.
+
+ def check_slice(m):
+ # Check a range of slices against slices of the reference.
+ n = 1
+ while n <= m.size:
+ for i in range(m.size - n):
+ exp = u''.join(m.ref[i:i+n])
+ self.assertEqual(m.text[i:i+n], exp)
+ n *= 3
+
+ for m in self.material:
+ check_slice(m)
+
+ def test_find(self):
+ # Test map from internal find result to code point index
+ # Fails in Jython 2.7b3
+
+ def check_find(ref):
+ # Check find returns indexes for single point codes
+ for c in set(m.used):
+ start = 0
+ u = m.text
+ while start < m.size:
+ i = u.find(c, start)
+ if i < 0: break
+ self.assertEqual(u[i], c)
+ self.assertGreaterEqual(i, start)
+ start = i + 1
+
+ def check_find_str(m, t):
+ # Check find returns correct index for string target
+ i = m.text.find(t)
+ self.assertEqual(list(t), m.ref[i:i+len(t)])
+
+ targets = [
+ u"this",
+ u"ab\U00010041de",
+ u"\U00010041\U00010042\U00010042xx",
+ u"xx\U00010041\U00010042\U00010043yy",
+ ]
+
+ for m in self.material:
+ check_find(m)
+ for t in targets:
+ # Insert in middle then try to find it
+ m.insert(t)
+ check_find_str(m, t)
+
+ def test_rfind(self):
+ # Test map from internal rfind result to code point index
+ # Fails in Jython 2.7b3
+
+ def check_rfind(ref):
+ # Check rfind returns indexes for single point codes
+ for c in set(m.used):
+ end = m.size
+ u = m.text
+ while True:
+ i = u.rfind(c, 0, end)
+ if i < 0: break
+ self.assertLess(i, end)
+ self.assertEqual(u[i], c)
+ end = i
+
+ def check_rfind_str(m, t):
+ # Check rfind returns correct index for string target
+ i = m.text.rfind(t)
+ self.assertEqual(list(t), m.ref[i:i+len(t)])
+
+ targets = [
+ u"this",
+ u"ab\U00010041de",
+ u"\U00010041\U00010042\U00010042xx",
+ u"xx\U00010041\U00010042\U00010043yy",
+ ]
+
+ for m in self.material:
+ check_rfind(m)
+ for t in targets:
+ # Insert in middle then try to find it
+ m.insert(t)
+ check_rfind_str(m, t)
+
+ def test_surrogate_validation(self):
+
+ def insert_sb(text, c1, c2):
+ # Insert code points c1, c2 in the text, as a Java StringBuilder
+ sb = StringBuilder()
+ # c1 at the quarter point
+ p1 = len(mat) // 4
+ for c in mat.text[:p1]:
+ sb.appendCodePoint(ord(c))
+ sb.appendCodePoint(c1)
+ # c2 at the three-quarter point
+ p2 = 3 * p1
+ for c in mat.text[p1:p2]:
+ sb.appendCodePoint(ord(c))
+ sb.appendCodePoint(c2)
+ # Rest of text
+ for c in mat.text[p2:]:
+ sb.appendCodePoint(ord(c))
+ return sb
+
+ # Test that lone surrogates are rejected
+ for surr in [0xdc81, 0xdc00, 0xdfff, 0xd800, 0xdbff]:
+ for mat in self.material:
+
+ # Java StringBuilder with two private-use characters:
+ sb = insert_sb(mat.text, 0xe000, 0xf000)
+ # Check this is acceptable
+ #print repr(unicode(sb))
+ self.assertEqual(len(unicode(sb)), len(mat)+2)
+
+ # Java StringBuilder with private-use and lone surrogate:
+ sb = insert_sb(mat.text, 0xe000, surr)
+ # Check this is detected
+ #print repr(unicode(sb))
+ self.assertRaises(ValueError, unicode, sb)
+
+
class UnicodeFormatTestCase(unittest.TestCase):
def test_unicode_mapping(self):
@@ -601,6 +849,7 @@
def test_main():
test_support.run_unittest(
UnicodeTestCase,
+ UnicodeIndexMixTest,
UnicodeFormatTestCase,
UnicodeStdIOTestCase,
UnicodeFormatStrTest,
diff --git a/src/org/python/core/PyArray.java b/src/org/python/core/PyArray.java
--- a/src/org/python/core/PyArray.java
+++ b/src/org/python/core/PyArray.java
@@ -1,7 +1,6 @@
// Copyright (c) Corporation for National Research Initiatives
package org.python.core;
-import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
@@ -11,6 +10,7 @@
import java.io.OutputStream;
import java.lang.ref.WeakReference;
import java.lang.reflect.Array;
+import java.nio.ByteBuffer;
import org.python.core.buffer.BaseBuffer;
import org.python.core.buffer.SimpleStringBuffer;
@@ -1130,19 +1130,17 @@
frombytesInternal(StringUtil.toBytes(s));
} else {
- // Access the bytes
+ // Access the bytes through the abstract API of the BufferProtocol
try (PyBuffer pybuf = ((BufferProtocol)input).getBuffer(PyBUF.STRIDED_RO)) {
- // Provide argument as stream of bytes for fromstream method
if (pybuf.getNdim() == 1) {
if (pybuf.getStrides()[0] == 1) {
- // Data are contiguous in a byte[]
- PyBuffer.Pointer b = pybuf.getBuf();
- frombytesInternal(b.storage, b.offset, pybuf.getLen());
+ // Data are contiguous in the buffer
+ frombytesInternal(pybuf.getNIOByteBuffer());
} else {
// As frombytesInternal only knows contiguous bytes, make a copy.
byte[] copy = new byte[pybuf.getLen()];
pybuf.copyTo(copy, 0);
- frombytesInternal(copy);
+ frombytesInternal(ByteBuffer.wrap(copy));
}
} else {
// Currently don't support n-dimensional sources
@@ -1158,39 +1156,30 @@
}
/**
- * Common code supporting Java and Python versions of <code>.fromstring()</code>
- *
- * @param input string of bytes encoding the array data
- */
- private final void fromstringInternal(String input) {
- frombytesInternal(StringUtil.toBytes(input));
- }
-
- /**
* Common code supporting Java and Python versions of <code>.fromstring()</code> or
* <code>.frombytes()</code> (Python 3.2+ name).
*
* @param bytes array containing the new array data in machine encoding
*/
private final void frombytesInternal(byte[] bytes) {
- frombytesInternal(bytes, 0, bytes.length);
+ frombytesInternal(ByteBuffer.wrap(bytes));
}
/**
- * Common code supporting Java and Python versions of <code>.fromstring()</code> or
- * <code>.frombytes()</code> (Python 3.2+ name).
+ * Copy into this array, the remaining bytes of a ByteBuffer (from the current position to the
+ * limit). This is common code supporting Java and Python versions of <code>.fromstring()</code>
+ * or <code>.frombytes()</code> (Python 3.2+ name).
*
- * @param bytes array containing the new array data in machine encoding
- * @param offset of the first byte to read
- * @param count of bytes to read
+ * @param bytes buffer containing the new array data in machine encoding
*/
- private final void frombytesInternal(byte[] bytes, int offset, int count) {
+ private final void frombytesInternal(ByteBuffer bytes) {
// Access the bytes
int origsize = delegate.getSize();
// Check validity wrt array itemsize
int itemsize = getStorageSize();
+ int count = bytes.remaining();
if ((count % itemsize) != 0) {
throw Py.ValueError("string length not a multiple of item size");
}
@@ -1201,8 +1190,8 @@
try {
// Provide argument as stream of bytes for fromstream method
- ByteArrayInputStream bis = new ByteArrayInputStream(bytes, offset, count);
- fromStream(bis);
+ InputStream is = new ByteBufferBackedInputStream(bytes);
+ fromStream(is);
} catch (EOFException e) {
// stubbed catch for fromStream throws
@@ -2117,4 +2106,48 @@
}
}
+ /**
+ * Wrap a <code>ByteBuffer</code> in an InputStream. Reference: <a
+ * href=http://stackoverflow.com/questions/4332264/wrapping-a-bytebuffer-with-an-inputstream>
+ * Stackoverflow question 4332264</a>.
+ */
+ private class ByteBufferBackedInputStream extends InputStream {
+
+ ByteBuffer buf;
+
+ public ByteBufferBackedInputStream(ByteBuffer buf) {
+ this.buf = buf;
+ }
+
+ /**
+ * Return the number of bytes remaining in the underlying buffer.
+ */
+ @Override
+ public int available() throws IOException {
+ return buf.remaining();
+ }
+
+
+ @Override
+ public int read() {
+ return buf.hasRemaining() ? buf.get() & 0xff : -1;
+ }
+
+ @Override
+ public int read(byte[] bytes, int off, int len) {
+ int n = buf.remaining();
+ if (n >= len) {
+ // There are enough bytes remaining to satisfy the request.
+ buf.get(bytes, off, len);
+ return len;
+ } else if (n > 0) {
+ // There are some bytes remaining: truncate request.
+ buf.get(bytes, off, n);
+ return n;
+ } else {
+ // Signal that there are no bytes left
+ return -1;
+ }
+ }
+ }
}
diff --git a/src/org/python/core/PyBUF.java b/src/org/python/core/PyBUF.java
--- a/src/org/python/core/PyBUF.java
+++ b/src/org/python/core/PyBUF.java
@@ -213,6 +213,16 @@
*/
static final int FULL_RO = INDIRECT | FORMAT;
+ /* Constants for additional feature(s), not standard for CPython */
+
+ /**
+ * A constant used by the consumer in its call to {@link BufferProtocol#getBuffer(int)} to
+ * specify that it expects to access the buffer contents directly as an array (rather than
+ * through the purely abstract part of the API). <code>getBuffer</code> will raise an exception
+ * if the exporter cannot expose its storage as Java array.
+ */
+ static final int AS_ARRAY = 0x10000000;
+
/* Constants for readability, not standard for CPython */
/**
diff --git a/src/org/python/core/PyBuffer.java b/src/org/python/core/PyBuffer.java
--- a/src/org/python/core/PyBuffer.java
+++ b/src/org/python/core/PyBuffer.java
@@ -1,5 +1,7 @@
package org.python.core;
+import java.nio.ByteBuffer;
+
/**
* The Jython buffer API for access to a byte array within an exporting object. This interface is
* the counterpart of the CPython <code>Py_buffer</code> struct. Several concrete types implement
@@ -237,10 +239,49 @@
*/
public PyBuffer getBufferSlice(int flags, int start, int length, int stride);
+ // java.nio access to actual storage
+ //
+
+ /**
+ * Obtain a {@link java.nio.ByteBuffer} giving access to the bytes that hold the data being
+ * exported to the consumer. For a one-dimensional contiguous buffer, assuming the following
+ * client code where <code>obj</code> has type <code>BufferProtocol</code>:
+ *
+ * <pre>
+ * PyBuffer a = obj.getBuffer(PyBUF.SIMPLE);
+ * int itemsize = a.getItemsize();
+ * ByteBuffer bb = a.getNIOBuffer();
+ * </pre>
+ *
+ * the item with index <code>bb.pos()+k</code> is in the buffer <code>bb</code> at positions
+ * <code>bb.pos()+k*itemsize</code> to <code>bb.pos()+(k+1)*itemsize - 1</code> inclusive. And
+ * if <code>itemsize==1</code>, the item is simply the byte at position <code>bb.pos()+k</code>.
+ * The buffer limit is set to the first byte beyond the valid data. A block read or write will
+ * therefore access the contents sequentially.
+ * <p>
+ * If the buffer is multidimensional or non-contiguous (strided), the buffer position is still
+ * the (first byte of) the item at index <code>[0]</code> or <code>[0,...,0]</code>, and the
+ * limit is one item beyond the valid data. However, it is necessary to navigate <code>bb</code>
+ * using the <code>shape</code>, <code>strides</code> and maybe <code>suboffsets</code> provided
+ * by the API.
+ *
+ * @return a ByteBuffer equivalent to the exported data contents.
+ */
+ ByteBuffer getNIOByteBuffer();
+
// Direct access to actual storage
//
/**
+ * Determine whether the exporter is able to offer direct access to the exported storage as a
+ * Java byte array (through the API that involves class {@link Pointer}), or only supports the
+ * abstract API. See also {@link PyBUF#AS_ARRAY}.
+ *
+ * @return true if array access is supported, false if it is not.
+ */
+ boolean hasArray();
+
+ /**
* A class that references a <code>byte[]</code> array and a particular offset within it, as the
* return type for methods that give direct access to byte-oriented data exported by a Python
* object. In some contexts the consumer will be entitled to make changes to the contents of
@@ -270,11 +311,13 @@
* Return a structure describing the slice of a byte array that holds the data being exported to
* the consumer. For a one-dimensional contiguous buffer, assuming the following client code
* where <code>obj</code> has type <code>BufferProtocol</code>:
+ *
* <pre>
- * PyBuffer a = obj.getBuffer();
+ * PyBuffer a = obj.getBuffer(PyBUF.SIMPLE);
* int itemsize = a.getItemsize();
* PyBuffer.Pointer b = a.getBuf();
* </pre>
+ *
* the item with index <code>k</code> is in the array <code>b.storage</code> at index
* <code>[b.offset + k*itemsize]</code> to <code>[b.offset + (k+1)*itemsize - 1]</code>
* inclusive. And if <code>itemsize==1</code>, the item is simply the byte
@@ -293,12 +336,14 @@
* Return a structure describing the position in a byte array of a single item from the data
* being exported to the consumer. For a one-dimensional contiguous buffer, assuming the
* following client code where <code>obj</code> has type <code>BufferProtocol</code>:
+ *
* <pre>
* int k = ... ;
- * PyBuffer a = obj.getBuffer();
+ * PyBuffer a = obj.getBuffer(PyBUF.FULL);
* int itemsize = a.getItemsize();
* PyBuffer.Pointer b = a.getPointer(k);
* </pre>
+ *
* the item with index <code>k</code> is in the array <code>b.storage</code> at index
* <code>[b.offset]</code> to <code>[b.offset + itemsize - 1]</code> inclusive. And if
* <code>itemsize==1</code>, the item is simply the byte <code>b.storage[b.offset]</code>
@@ -317,13 +362,15 @@
* being exported to the consumer, in the case that array may be multi-dimensional. For a
* 3-dimensional contiguous buffer, assuming the following client code where <code>obj</code>
* has type <code>BufferProtocol</code>:
+ *
* <pre>
* int i, j, k;
* // ... calculation that assigns i, j, k
- * PyBuffer a = obj.getBuffer();
+ * PyBuffer a = obj.getBuffer(PyBUF.FULL);
* int itemsize = a.getItemsize();
* PyBuffer.Pointer b = a.getPointer(i,j,k);
* </pre>
+ *
* the item with index <code>[i,j,k]</code> is in the array <code>b.storage</code> at index
* <code>[b.offset]</code> to <code>[b.offset + itemsize - 1]</code> inclusive. And if
* <code>itemsize==1</code>, the item is simply the byte <code>b.storage[b.offset]</code>
diff --git a/src/org/python/core/PyByteArray.java b/src/org/python/core/PyByteArray.java
--- a/src/org/python/core/PyByteArray.java
+++ b/src/org/python/core/PyByteArray.java
@@ -352,7 +352,33 @@
* @param count the number of times to repeat this.
*/
protected synchronized void irepeat(int count) {
- this.setStorage(repeatImpl(count));
+ // There are several special cases
+ if (size == 0 || count == 1) {
+ // No resize, so no check (consistent with CPython)
+ // Value is unchanged.
+
+ } else if (count <= 0) {
+ // Treat as if count == 0.
+ resizeCheck();
+ this.setStorage(emptyStorage);
+
+ } else {
+ // Open up space (remembering the original size)
+ int orginalSize = size;
+ storageExtend(orginalSize * (count - 1));
+ if (orginalSize == 1) {
+ // Do it efficiently for single bytes
+ byte b = storage[offset];
+ for (int i = 1, p = offset + 1; i < count; i++) {
+ storage[p++] = b;
+ }
+ } else {
+ // General case
+ for (int i = 1, p = offset + orginalSize; i < count; i++, p += orginalSize) {
+ System.arraycopy(storage, offset, storage, p, orginalSize);
+ }
+ }
+ }
}
/**
@@ -924,7 +950,7 @@
return bytearray___imul__(n);
}
- @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.bytearray___mul___doc)
+ @ExposedMethod(type = MethodType.BINARY, doc = BuiltinDocs.bytearray___imul___doc)
final PyObject bytearray___imul__(PyObject n) {
if (!n.isIndex()) {
return null;
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -72,6 +72,17 @@
return str;
}
+ /**
+ * Determine whether the string consists entirely of basic-plane characters. For a
+ * {@link PyString}, of course, it is always <code>true</code>, but this is useful in cases
+ * where either a <code>PyString</code> or a {@link PyUnicode} is acceptable.
+ *
+ * @return true
+ */
+ public boolean isBasicPlane() {
+ return true;
+ }
+
@ExposedNew
static PyObject str_new(PyNewWrapper new_, boolean init, PyType subtype, PyObject[] args,
String[] keywords) {
@@ -144,6 +155,13 @@
return pybuf;
}
+ /**
+ * Return a substring of this object as a Java String.
+ *
+ * @param start the beginning index, inclusive.
+ * @param end the ending index, exclusive.
+ * @return the specified substring.
+ */
public String substring(int start, int end) {
return getString().substring(start, end);
}
@@ -153,7 +171,7 @@
return str___str__();
}
- public @ExposedMethod(doc = BuiltinDocs.str___str___doc)
+ @ExposedMethod(doc = BuiltinDocs.str___str___doc)
final PyString str___str__() {
if (getClass() == PyString.class) {
return this;
@@ -673,6 +691,14 @@
return new PyString(str);
}
+ /**
+ * Create an instance of the same type as this object, from the Java String given as argument.
+ * This is to be overridden in a subclass to return its own type.
+ *
+ * @param string UTF-16 string encoding the characters (as Java).
+ * @param isBasic is ignored in <code>PyString</code> (effectively true).
+ * @return
+ */
protected PyString createInstance(String str, boolean isBasic) {
// ignore isBasic, doesn't apply to PyString, just PyUnicode
return new PyString(str);
@@ -2353,59 +2379,28 @@
* @param endObj end of slice.
* @return count of occurrences
*/
- protected final int _count_old(String sub, PyObject startObj, PyObject endObj) {
-// xxx
+ protected final int _count(String sub, PyObject startObj, PyObject endObj) {
+
// Interpret the slice indices as concrete values
int[] indices = translateIndices(startObj, endObj);
int subLen = sub.length();
if (subLen == 0) {
- // Special case counting the occurrences of an empty string
- if (indices[2] > getString().length()) {
+ // Special case counting the occurrences of an empty string.
+ int start = indices[2], end = indices[3], n = __len__();
+ if (end < 0 || end < start || start > n) {
+ // Slice is reversed or does not overlap the string.
return 0;
} else {
- return indices[1] - indices[0] + 1;
+ // Count of '' is one more than number of characters in overlap.
+ return Math.min(end, n) - Math.max(start, 0) + 1;
}
} else {
+
// Skip down this string finding occurrences of sub
- int start = indices[0], end = indices[1], count = 0;
- while (true) {
- int index = getString().indexOf(sub, start);
- if (index < 0) {
- break; // not found
- } else {
- // Found at index. Next search begins at end of this instance, at:
- start = index + subLen;
- if (start <= end) {
- count += 1; // ... and the instance found fits within this string.
- } else {
- break; // ... but the instance found overlaps the end, so is not valid.
- }
- }
- }
- return count;
- }
- }
-
- protected final int _count(String sub, PyObject startObj, PyObject endObj) {
-
- // Interpret the slice indices as concrete values
- int[] indices = translateIndices(startObj, endObj);
- int subLen = sub.length();
-
- if (subLen == 0) {
- // Special case counting the occurrences of an empty string
- if (indices[2] > getString().length()) {
- return 0;
- } else {
- return indices[1] - indices[0] + 1;
- }
-
- } else {
-
- // Skip down this string finding occurrences of sub
- int start = indices[0], limit = indices[1] - subLen, count = 0;
+ int start = indices[0], end = indices[1];
+ int limit = end - subLen, count = 0;
while (start <= limit) {
int index = getString().indexOf(sub, start);
@@ -2496,17 +2491,36 @@
* {@link PyUnicode#unicode_find(PyObject, PyObject, PyObject)}.
*
* @param sub substring to find.
- * @param start start of slice.
- * @param end end of slice.
+ * @param startObj start of slice.
+ * @param endObj end of slice.
* @return index of <code>sub</code> in this object or -1 if not found.
*/
- protected final int _find(String sub, PyObject start, PyObject end) {
- int[] indices = translateIndices(start, end);
- int index = getString().indexOf(sub, indices[0]);
- if (index < indices[2] || index > indices[1]) {
- return -1;
+ protected final int _find(String sub, PyObject startObj, PyObject endObj) {
+ // Interpret the slice indices as concrete values
+ int[] indices = translateIndices(startObj, endObj);
+ int subLen = sub.length();
+
+ if (subLen == 0) {
+ // Special case: an empty string may be found anywhere, ...
+ int start = indices[2], end = indices[3];
+ if (end < 0 || end < start || start > __len__()) {
+ // ... except ln a reverse slice or beyond the end of the string,
+ return -1;
+ } else {
+ // ... and will be reported at the start of the overlap.
+ return indices[0];
+ }
+
+ } else {
+ // General case: search for first match then check against slice.
+ int start = indices[0], end = indices[1];
+ int found = getString().indexOf(sub, start);
+ if (found >= 0 && found + subLen <= end) {
+ return found;
+ } else {
+ return -1;
+ }
}
- return index;
}
/**
@@ -2582,17 +2596,36 @@
* {@link PyUnicode#unicode_rfind(PyObject, PyObject, PyObject)}.
*
* @param sub substring to find.
- * @param start start of slice.
- * @param end end of slice.
+ * @param startObj start of slice.
+ * @param endObj end of slice.
* @return index of <code>sub</code> in this object or -1 if not found.
*/
- protected final int _rfind(String sub, PyObject start, PyObject end) {
- int[] indices = translateIndices(start, end);
- int index = getString().lastIndexOf(sub, indices[1] - sub.length());
- if (index < indices[2]) {
- return -1;
+ protected final int _rfind(String sub, PyObject startObj, PyObject endObj) {
+ // Interpret the slice indices as concrete values
+ int[] indices = translateIndices(startObj, endObj);
+ int subLen = sub.length();
+
+ if (subLen == 0) {
+ // Special case: an empty string may be found anywhere, ...
+ int start = indices[2], end = indices[3];
+ if (end < 0 || end < start || start > __len__()) {
+ // ... except ln a reverse slice or beyond the end of the string,
+ return -1;
+ } else {
+ // ... and will be reported at the end of the overlap.
+ return indices[1];
+ }
+
+ } else {
+ // General case: search for first match then check against slice.
+ int start = indices[0], end = indices[1];
+ int found = getString().lastIndexOf(sub, end - subLen);
+ if (found >= start) {
+ return found;
+ } else {
+ return -1;
+ }
}
- return index;
}
public double atof() {
@@ -3284,51 +3317,80 @@
}
/**
- * Turns the possibly negative Python slice start and end into valid indices into this string.
+ * Many of the string methods deal with slices specified using Python slice semantics:
+ * endpoints, which are <code>PyObject</code>s, may be <code>null</code> or <code>None</code>
+ * (meaning default to one end or the other) or may be negative (meaning "from the end").
+ * Meanwhile, the implementation methods need integer indices, both within the array, and
+ * <code>0<=start<=end<=N</code> the length of the array.
+ * <p>
+ * This method first translates the Python slice <code>startObj</code> and <code>endObj</code>
+ * according to the slice semantics for null and negative values, and stores these in elements 2
+ * and 3 of the result. Then, since the end points of the range may lie outside this sequence's
+ * bounds (in either direction) it reduces them to the nearest points satisfying
+ * <code>0<=start<=end<=N</code>, and stores these in elements [0] and [1] of the
+ * result.
*
- * @return a 3 element array of indices into this string describing a substring from [0] to [1].
- * [0] <= [1], [0] >= 0 and [1] <= string.length(). The third element contains the
- * unadjusted start value (or nearest int).
+ * @param startObj Python start of slice
+ * @param endObj Python end of slice
+ * @return a 4 element array of two range-safe indices, and two original indices.
*/
- protected int[] translateIndices(PyObject start, PyObject end) {
- int iStart, iStartUnadjusted, iEnd;
- int n = getString().length();
-
- // Make sure the slice end decodes to something in range
- if (end == null || end == Py.None) {
- iEnd = n;
+ protected int[] translateIndices(PyObject startObj, PyObject endObj) {
+ int start, end;
+ int n = __len__();
+ int[] result = new int[4];
+
+ // Decode the start using slice semantics
+ if (startObj == null || startObj == Py.None) {
+ start = 0;
+ // result[2] = 0 already
} else {
- // Convert to int but limit to Integer.MIN_VALUE <= iEnd <= Integer.MAX_VALUE
- iEnd = end.asIndex(null);
- if (iEnd > n) {
- iEnd = n;
- } else if (iEnd < 0) {
- iEnd = n + iEnd;
- if (iEnd < 0) {
- iEnd = 0;
+ // Convert to int but limit to Integer.MIN_VALUE <= start <= Integer.MAX_VALUE
+ start = startObj.asIndex(null);
+ if (start < 0) {
+ // Negative value means "from the end"
+ start = n + start;
+ }
+ result[2] = start;
+ }
+
+ // Decode the end using slice semantics
+ if (endObj == null || endObj == Py.None) {
+ result[1] = result[3] = end = n;
+ } else {
+ // Convert to int but limit to Integer.MIN_VALUE <= end <= Integer.MAX_VALUE
+ end = endObj.asIndex(null);
+ if (end < 0) {
+ // Negative value means "from the end"
+ result[3] = end = end + n;
+ // Ensure end is safe for String.substring(start,end).
+ if (end < 0) {
+ end = 0;
+ // result[1] = 0 already
+ } else {
+ result[1] = end;
+ }
+ } else {
+ result[3] = end;
+ // Ensure end is safe for String.substring(start,end).
+ if (end > n) {
+ result[1] = end = n;
+ } else {
+ result[1] = end;
}
}
}
- // Make sure the slice start decodes to something in range
- if (start == null || start == Py.None) {
- iStartUnadjusted = iStart = 0;
+ // Ensure start is safe for String.substring(start,end).
+ if (start < 0) {
+ start = 0;
+ // result[0] = 0 already
+ } else if (start > end) {
+ result[0] = start = end;
} else {
- // Convert to int but limit to Integer.MIN_VALUE <= iStart <= Integer.MAX_VALUE
- iStartUnadjusted = iStart = start.asIndex(null);
- if (iStart > iEnd) {
- iStart = iEnd;
- } else if (iStart < 0) {
- iStart = n + iStart;
- if (iStart > iEnd) {
- iStart = iEnd;
- } else if (iStart < 0) {
- iStart = 0;
- }
- }
+ result[0] = start;
}
- return new int[] {iStart, iEnd, iStartUnadjusted};
+ return result;
}
/**
@@ -3847,7 +3909,8 @@
* @param keywords naming the keyword arguments.
* @return the object designated or <code>null</code>.
*/
- private PyObject getFieldObject(String fieldName, boolean bytes, PyObject[] args, String[] keywords) {
+ private PyObject getFieldObject(String fieldName, boolean bytes, PyObject[] args,
+ String[] keywords) {
FieldNameIterator iterator = new FieldNameIterator(fieldName, bytes);
PyObject head = iterator.pyHead();
PyObject obj = null;
diff --git a/src/org/python/core/PyUnicode.java b/src/org/python/core/PyUnicode.java
--- a/src/org/python/core/PyUnicode.java
+++ b/src/org/python/core/PyUnicode.java
@@ -22,31 +22,44 @@
@ExposedType(name = "unicode", base = PyBaseString.class, doc = BuiltinDocs.unicode_doc)
public class PyUnicode extends PyString implements Iterable {
- private enum Plane {
+ /**
+ * Nearly every significant method comes in two versions: one applicable when the string
+ * contains only basic plane characters, and one that is correct when supplementary characters
+ * are also present. Set this constant <code>true</code> to treat all strings as containing
+ * supplementary characters, so that these versions will be exercised in tests.
+ */
+ private static final boolean DEBUG_NON_BMP_METHODS = false;
- UNKNOWN, BASIC, ASTRAL
- }
-
- private volatile Plane plane = Plane.UNKNOWN;
- private volatile int codePointCount = -1;
public static final PyType TYPE = PyType.fromClass(PyUnicode.class);
// for PyJavaClass.init()
public PyUnicode() {
- this(TYPE, "");
+ this(TYPE, "", true);
}
+ /**
+ * Construct a PyUnicode interpreting the Java String argument as UTF-16.
+ *
+ * @param string UTF-16 string encoding the characters (as Java).
+ */
public PyUnicode(String string) {
- this(TYPE, string);
+ this(TYPE, string, false);
}
+ /**
+ * Construct a PyUnicode interpreting the Java String argument as UTF-16. If it is known that
+ * the string contains no supplementary characters, argument isBasic may be set true by the
+ * caller. If it is false, the PyUnicode will scan the string to find out.
+ *
+ * @param string UTF-16 string encoding the characters (as Java).
+ * @param isBasic true if it is known that only BMP characters are present.
+ */
public PyUnicode(String string, boolean isBasic) {
- this(TYPE, string);
- plane = isBasic ? Plane.BASIC : Plane.UNKNOWN;
+ this(TYPE, string, isBasic);
}
public PyUnicode(PyType subtype, String string) {
- super(subtype, string);
+ this(subtype, string, false);
}
public PyUnicode(PyString pystring) {
@@ -54,12 +67,13 @@
}
public PyUnicode(PyType subtype, PyString pystring) {
- this(subtype, pystring instanceof PyUnicode ? pystring.string : pystring.decode()
- .toString());
+ this(subtype, //
+ pystring instanceof PyUnicode ? pystring.string : pystring.decode().toString(), //
+ pystring.isBasicPlane());
}
public PyUnicode(char c) {
- this(TYPE, String.valueOf(c));
+ this(TYPE, String.valueOf(c), true);
}
public PyUnicode(int codepoint) {
@@ -90,6 +104,20 @@
this(ucs4.iterator());
}
+ /**
+ * Fundamental all-features constructor on which the others depend. If it is known that the
+ * string contains no supplementary characters, argument isBasic may be set true by the caller.
+ * If it is false, the PyUnicode will scan the string to find out.
+ *
+ * @param subtype actual type to create.
+ * @param string UTF-16 string encoding the characters (as Java).
+ * @param isBasic true if it is known that only BMP characters are present.
+ */
+ private PyUnicode(PyType subtype, String string, boolean isBasic) {
+ super(subtype, string);
+ translator = isBasic ? BASIC : this.chooseIndexTranslator();
+ }
+
@Override
public int[] toCodePoints() {
int n = getCodePointCount();
@@ -101,15 +129,428 @@
return codePoints;
}
- // modified to know something about codepoints; we just need to return the
- // corresponding substring; darn UTF16!
- // TODO: we could avoid doing this unnecessary copy
+ // ------------------------------------------------------------------------------------------
+ // Index translation for Unicode beyond the BMP
+ // ------------------------------------------------------------------------------------------
+
+ /**
+ * Index translation between code point index (as seen by Python) and UTF-16 index (as used in
+ * the Java String.
+ */
+ private interface IndexTranslator {
+
+ /** Number of supplementary characters (hence point code length may be found). */
+ public int suppCount();
+
+ /** Translate a UTF-16 code unit index to its equivalent code point index. */
+ public int codePointIndex(int utf16Index);
+
+ /** Translate a code point index to its equivalent UTF-16 code unit index. */
+ public int utf16Index(int codePointIndex);
+ }
+
+ /**
+ * The instance of index translation in use in this string. It will be set to either
+ * {@link #BASIC} or and instance of {@link #Supplementary}.
+ */
+ private final IndexTranslator translator;
+
+ /**
+ * A singleton provides the translation service (which is a pass-through) for all BMP strings.
+ */
+ static final IndexTranslator BASIC = new IndexTranslator() {
+
+ @Override
+ public int suppCount() {
+ return 0;
+ }
+
+ @Override
+ public int codePointIndex(int u) {
+ return u;
+ }
+
+ @Override
+ public int utf16Index(int i) {
+ return i;
+ }
+ };
+
+ /**
+ * A class of index translation that uses the cumulative count so far of supplementary
+ * characters, tabulated in blocks of a standard size. The count is then used as an offset
+ * between the code point index and the corresponding point in the UTF-16 representation.
+ */
+ private final class Supplementary implements IndexTranslator {
+
+ /** Tabulates cumulative count so far of supplementary characters, by blocks of size M. */
+ final int[] count;
+
+ /** Configure the block size M, as this power of 2. */
+ static final int LOG2M = 4;
+ /** The block size used for indexing (power of 2). */
+ static final int M = 1 << LOG2M;
+ /** A mask used to separate the block number and offset in the block. */
+ static final int MASK = M - 1;
+
+ /**
+ * The constructor works on a count array prepared by
+ * {@link PyUnicode#getSupplementaryCounts(String)}.
+ */
+ Supplementary(int[] count) {
+ this.count = count;
+ }
+
+ @Override
+ public int codePointIndex(int u) {
+ /*
+ * Let the desired result be j such that utf16Index(j) = u. As we have only a forward
+ * index of the string, we have to conduct a search. In principle, we bound j by a pair
+ * of values (j1,j2) such that j1<=j<j2, and in successive iterations, we shorten the
+ * range until a unique j is found. In the first part of the search, we work in terms of
+ * block numbers, that is, indexes into the count array. We have j<u, and so j<k2*M
+ * where:
+ */
+ int k2 = (u >> LOG2M) + 1;
+ // The count of supplementary characters before the start of block k2 is:
+ int c2 = count[k2 - 1];
+ /*
+ * Since the count array is non-decreasing, and j < k2*M, we have u-j <= count[k2-1].
+ * That is, j >= k1*M, where:
+ */
+ int k1 = Math.max(0, u - c2) >> LOG2M;
+ // The count of supplementary characters before the start of block k1 is:
+ int c1 = (k1 == 0) ? 0 : count[k1 - 1];
+
+ /*
+ * Now, j (to be found) is in an unknown block k, where k1<=k<k2. We make a binary
+ * search, maintaining the inequalities, but moving the end points. When k2=k1+1, we
+ * know that j must be in block k1.
+ */
+ while (true) {
+ if (c2 == c1) {
+ // We can stop: the region contains no supplementary characters so j is:
+ return u - c1;
+ }
+ // Choose a candidate k in the middle of the range
+ int k = (k1 + k2) / 2;
+ if (k == k1) {
+ // We must have k2=k1+1, so j is in block k1
+ break;
+ } else {
+ // kx divides the range: is j<kx*M?
+ int c = count[k - 1];
+ if ((k << LOG2M) + c > u) {
+ // k*M+c > u therefore j is not in block k but to its left.
+ k2 = k;
+ c2 = c;
+ } else {
+ // k*M+c <= u therefore j must be in block k, or to its right.
+ k1 = k;
+ c1 = c;
+ }
+ }
+ }
+
+ /*
+ * At this point, j is known to be in block k1 (and k2=k1+1). c1 is the number of
+ * supplementary characters to the left of code point index k1*M and c2 is the number of
+ * supplementary characters to the left of code point index (k1+1)*M. We have to search
+ * this block sequentially. The current position in the UTF-16 is:
+ */
+ int p = (k1 << LOG2M) + c1;
+ while (p < u) {
+ if (Character.isHighSurrogate(string.charAt(p++))) {
+ // c1 tracks the number of supplementary characters to the left of p
+ c1 += 1;
+ if (c1 == c2) {
+ // We have found all supplementary characters in the block.
+ break;
+ }
+ // Skip the trailing surrogate.
+ p++;
+ }
+ }
+ // c1 is the number of supplementary characters to the left of u, so the result j is:
+ return u - c1;
+ }
+
+ @Override
+ public int utf16Index(int i) {
+ // The code point index i lies in the k-th block where:
+ int k = i >> LOG2M;
+ // The offset for the code point index k*M is exactly
+ int d = (k == 0) ? 0 : count[k - 1];
+ // The offset for the code point index (k+1)*M is exactly
+ int e = count[k];
+ if (d == e) {
+ /*
+ * The offset for the code point index (k+1)*M is the same, and since this is a
+ * non-decreasing function of k, it is also the value for i.
+ */
+ return i + d;
+ } else {
+ /*
+ * The offset for the code point index (k+1)*M is different (higher). We must scan
+ * along until we have found all the supplementary characters that precede i,
+ * starting the scan at code point index k*M.
+ */
+ for (int q = i & ~MASK; q < i; q++) {
+ if (Character.isHighSurrogate(string.charAt(q + d))) {
+ d += 1;
+ if (d == e) {
+ /*
+ * We have found all the supplementary characters in this block, so we
+ * must have found all those to the left of i.
+ */
+ break;
+ }
+ }
+ }
+
+ // d counts all the supplementary characters to the left of i.
+ return i + d;
+ }
+ }
+
+ @Override
+ public int suppCount() {
+ // The last element of the count array is the total number of supplementary characters.
+ return count[count.length - 1];
+ }
+ }
+
+ /**
+ * Generate the table that is used by the class {@link Supplementary} to accelerate access to
+ * the the implementation string. The method returns <code>null</code> if the string passed
+ * contains no surrogate pairs, in which case we'll use {@link #BASIC} as the translator. This
+ * method is sensitive to {@link #DEBUG_NON_BMP_METHODS} which if true will prevent it returning
+ * null, hance we will always use a {@link Supplementary} {@link #translator}.
+ *
+ * @param string to index
+ * @return the index (counts) or null if basic plane
+ */
+ private static int[] getSupplementaryCounts(final String string) {
+
+ final int n = string.length();
+ int p; // Index of the current UTF-16 code unit.
+
+ /*
+ * We scan to the first surrogate code unit, in a simple loop. If we hit the end before we
+ * find one, no count array will be necessary and we'll use BASIC. If we find a surrogate it
+ * may be half a supplementary character, or a lone surrogate: we'll find out later.
+ */
+ for (p = 0; p < n; p++) {
+ if (Character.isSurrogate(string.charAt(p))) {
+ break;
+ }
+ }
+
+ if (p == n && !DEBUG_NON_BMP_METHODS) {
+ // There are no supplementary characters so the 1:1 translator is fine.
+ return null;
+
+ } else {
+ /*
+ * We have to do this properly, using a scheme in which code point indexes are
+ * efficiently translatable to UTF-16 indexes through a table called here count[]. In
+ * this array, count[k] contains the total number of supplementary characters up to the
+ * end of the k.th block, that is, to the left of code point (k+1)M. We have to fill
+ * this array by scanning the string.
+ */
+ int q = p; // The current code point index (q = p+s).
+ int k = q >> Supplementary.LOG2M; // The block number k = q/M.
+
+ /*
+ * When addressing with a code point index q<=L (the length in code points) we will
+ * index the count array with k = q/M. We have q<=L<=n, therefore q/M <= n/M, the
+ * maximum valid k is 1 + n/M. A q>=L should raise IndexOutOfBoundsException, but it
+ * doesn't matter whether that's from indexing this array, or the string later.
+ */
+ int[] count = new int[1 + (n >> Supplementary.LOG2M)];
+
+ /*
+ * To get the generation of count[] going efficiently, we need to advance the next whole
+ * block. The next loop will complete processing of the block containing the first
+ * supplementary character. Note that in all these loops, if we exit because p reaches a
+ * limit, the count for the last partial block is known from p-q and we take care of
+ * that right at the end of this method. The limit of these loops is n-1, so if we spot
+ * a lead surrogate, the we may access the low-surrogate confident that p+1<n.
+ */
+ while (p < n - 1) {
+
+ // Catch supplementary characters and lone surrogate code units.
+ p += calcAdvance(string, p);
+ // Advance the code point index
+ q += 1;
+
+ // Was that the last in a block?
+ if ((q & Supplementary.MASK) == 0) {
+ count[k++] = p - q;
+ break;
+ }
+ }
+
+ /*
+ * If the string is long enough, we can work in whole blocks of M, and there are fewer
+ * things to track. We can't know the number of blocks in advance, but we know there is
+ * at least one whole block to go when p+2*M<n.
+ */
+ while (p + 2 * Supplementary.M < n) {
+
+ for (int i = 0; i < Supplementary.M; i++) {
+ // Catch supplementary characters and lone surrogate code units.
+ p += calcAdvance(string, p);
+ }
+
+ // Advance the code point index one whole block
+ q += Supplementary.M;
+
+ // The number of supplementary characters to the left of code point index k*M is:
+ count[k++] = p - q;
+ }
+
+ /*
+ * Process the remaining UTF-16 code units, except possibly the last.
+ */
+ while (p < n - 1) {
+
+ // Catch supplementary characters and lone surrogate code units.
+ p += calcAdvance(string, p);
+ // Advance the code point index
+ q += 1;
+
+ // Was that the last in a block?
+ if ((q & Supplementary.MASK) == 0) {
+ count[k++] = p - q;
+ }
+ }
+
+ /*
+ * There may be just one UTF-16 unit left (if the last thing processed was not a
+ * surrogate pair).
+ */
+ if (p < n) {
+ // We are at the last UTF-16 unit in string. Any surrogate here is an error.
+ char c = string.charAt(p++);
+ if (Character.isSurrogate(c)) {
+ throw unpairedSurrogate(p - 1, c);
+ }
+ // Advance the code point index
+ q += 1;
+ }
+
+ /*
+ * There may still be some elements of count[] we haven't set, so we fill to the end
+ * with the total count. This also takes care of an incomplete final block.
+ */
+ int total = p - q;
+ while (k < count.length) {
+ count[k++] = total;
+ }
+
+ return count;
+ }
+ }
+
+ /**
+ * Called at each code point index, returns 2 if this is a surrogate pair, 1 otherwise, and
+ * detects lone surrogates as an error. The return is the amount to advance the UTF-16 index. An
+ * exception is raised if at <code>p</code> we find a lead surrogate without a trailing one
+ * following, or a trailing surrogate directly. It should not be called on the final code unit,
+ * when <code>p==string.length()-1</code>, since it may check the next code unit as well.
+ *
+ * @param string of UTF-16 code units
+ * @param p index into that string
+ * @return 2 if a surrogate pair stands at <code>p</code>, 1 if not
+ * @throws PyException(ValueError) if a lone surrogate stands at <code>p</code>.
+ */
+ private static int calcAdvance(String string, int p) throws PyException {
+
+ // Catch supplementary characters and lone surrogate code units.
+ char c = string.charAt(p);
+
+ if (c >= Character.MIN_SURROGATE) {
+ if (c < Character.MIN_LOW_SURROGATE) {
+ // This is a lead surrogate.
+ if (Character.isLowSurrogate(string.charAt(p + 1))) {
+ // Required trailing surrogate follows, so step over both.
+ return 2;
+ } else {
+ // Required trailing surrogate missing.
+ throw unpairedSurrogate(p, c);
+ }
+
+ } else if (c <= Character.MAX_SURROGATE) {
+ // This is a lone trailing surrogate
+ throw unpairedSurrogate(p, c);
+
+ } // else this is a private use or special character in 0xE000 to 0xFFFF.
+
+ }
+ return 1;
+ }
+
+ /**
+ * Return a ready-to-throw exception indicating an unpaired surrogate.
+ *
+ * @param p index within that sequence of the problematic code unit
+ * @param c the code unit
+ * @return an exception
+ */
+ private static PyException unpairedSurrogate(int p, int c) {
+ String fmt = "unpaired surrogate %#4x at code unit %d";
+ String msg = String.format(fmt, c, p);
+ return Py.ValueError(msg);
+ }
+
+ /**
+ * Choose an {@link IndexTranslator} implementation for efficient working, according to the
+ * contents of the {@link PyString#string}.
+ *
+ * @return chosen <code>IndexTranslator</code>
+ */
+ private IndexTranslator chooseIndexTranslator() {
+ int[] count = getSupplementaryCounts(string);
+ if (DEBUG_NON_BMP_METHODS) {
+ return new Supplementary(count);
+ } else {
+ return count == null ? BASIC : new Supplementary(count);
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ * <p>
+ * In the <code>PyUnicode</code> version, the arguments are code point indices, such as are
+ * received from the Python caller, while the first two elements of the returned array have been
+ * translated to UTF-16 indices in the implementation string.
+ */
+ @Override
+ protected int[] translateIndices(PyObject start, PyObject end) {
+ int[] indices = super.translateIndices(start, end);
+ indices[0] = translator.utf16Index(indices[0]);
+ indices[1] = translator.utf16Index(indices[1]);
+ // indices[2] and [3] remain Unicode indices (and may be out of bounds) relative to len()
+ return indices;
+ }
+
+ // ------------------------------------------------------------------------------------------
+
+ /**
+ * {@inheritDoc} The indices are code point indices, not UTF-16 (<code>char</code>) indices. For
+ * example:
+ *
+ * <pre>
+ * PyUnicode u = new PyUnicode("..\ud800\udc02\ud800\udc03...");
+ * // (Python) u = u'..\U00010002\U00010003...'
+ *
+ * String s = u.substring(2, 4); // = "\ud800\udc02\ud800\udc03" (Java)
+ * </pre>
+ */
@Override
public String substring(int start, int end) {
- if (isBasicPlane()) {
- return super.substring(start, end);
- }
- return new PyUnicode(newSubsequenceIterator(start, end, 1)).getString();
+ return super.substring(translator.utf16Index(start), translator.utf16Index(end));
}
/**
@@ -122,29 +563,18 @@
return uni;
}
+ /**
+ * {@inheritDoc}
+ *
+ * @return true if the string consists only of BMP characters
+ */
+ @Override
public boolean isBasicPlane() {
- if (plane == Plane.BASIC) {
- return true;
- } else if (plane == Plane.UNKNOWN) {
- plane = (getString().length() == getCodePointCount()) ? Plane.BASIC : Plane.ASTRAL;
- }
- return plane == Plane.BASIC;
+ return translator == BASIC;
}
-// RETAIN THE BELOW CODE, it facilitates testing astral support more completely
-
-// public boolean isBasicPlane() {
-// return false;
-// }
-
-// END RETAIN
-
public int getCodePointCount() {
- if (codePointCount >= 0) {
- return codePointCount;
- }
- codePointCount = getString().codePointCount(0, getString().length());
- return codePointCount;
+ return string.length() - translator.suppCount();
}
@ExposedNew
@@ -193,12 +623,13 @@
return new PyUnicode(str);
}
- // Unicode ops consisting of basic strings can only produce basic strings;
- // this may not be the case for astral ones - they also might be basic, in
- // case of deletes. So optimize by providing a tainting mechanism.
+ /**
+ * @param string UTF-16 string encoding the characters (as Java).
+ * @param isBasic true if it is known that only BMP characters are present.
+ */
@Override
- protected PyString createInstance(String str, boolean isBasic) {
- return new PyUnicode(str, isBasic);
+ protected PyString createInstance(String string, boolean isBasic) {
+ return new PyUnicode(string, isBasic);
}
@Override
@@ -295,37 +726,19 @@
@Override
protected PyObject pyget(int i) {
- if (isBasicPlane()) {
- return Py.makeCharacter(getString().charAt(i), true);
- }
-
- int k = 0;
- while (i > 0) {
- int W1 = getString().charAt(k);
- if (W1 >= 0xD800 && W1 < 0xDC00) {
- k += 2;
- } else {
- k += 1;
- }
- i--;
- }
- int codepoint = getString().codePointAt(k);
+ int codepoint = getString().codePointAt(translator.utf16Index(i));
return Py.makeCharacter(codepoint, true);
}
private class SubsequenceIteratorImpl implements Iterator {
- private int current, k, start, stop, step;
+ private int current, k, stop, step;
SubsequenceIteratorImpl(int start, int stop, int step) {
- k = 0;
current = start;
- this.start = start;
+ k = translator.utf16Index(current);
this.stop = stop;
this.step = step;
- for (int i = 0; i < start; i++) {
- nextCodePoint();
- }
}
SubsequenceIteratorImpl() {
@@ -443,10 +856,12 @@
private PyUnicode coerceToUnicode(PyObject o) {
if (o instanceof PyUnicode) {
return (PyUnicode)o;
+ } else if (o instanceof PyString) {
+ return new PyUnicode(((PyString)o).getString(), true);
} else if (o instanceof BufferProtocol) {
- // PyString or PyByteArray, PyMemoryView, Py2kBuffer ...
+ // PyByteArray, PyMemoryView, Py2kBuffer ...
try (PyBuffer buf = ((BufferProtocol)o).getBuffer(PyBUF.FULL_RO)) {
- return new PyUnicode(buf.toString());
+ return new PyUnicode(buf.toString(), true);
}
} else {
// o is some type not allowed:
@@ -998,7 +1413,7 @@
@Override
protected PyString fromSubstring(int begin, int end) {
assert (isBasicPlane()); // can only be used on a codepath from str_ equivalents
- return new PyUnicode(getString().substring(begin, end));
+ return new PyUnicode(getString().substring(begin, end), true);
}
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_index_doc)
@@ -1021,7 +1436,7 @@
if (isBasicPlane()) {
return _count(sub.getString(), start, end);
}
- int[] indices = translateIndices(start, end);
+ int[] indices = super.translateIndices(start, end); // do not convert to utf-16 indices.
int count = 0;
for (Iterator<Integer> mainIter = newSubsequenceIterator(indices[0], indices[1], 1); mainIter
.hasNext();) {
@@ -1043,12 +1458,14 @@
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_find_doc)
final int unicode_find(PyObject subObj, PyObject start, PyObject end) {
- return _find(coerceToUnicode(subObj).getString(), start, end);
+ int found = _find(coerceToUnicode(subObj).getString(), start, end);
+ return found < 0 ? -1 : translator.codePointIndex(found);
}
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_rfind_doc)
final int unicode_rfind(PyObject subObj, PyObject start, PyObject end) {
- return _rfind(coerceToUnicode(subObj).getString(), start, end);
+ int found = _rfind(coerceToUnicode(subObj).getString(), start, end);
+ return found < 0 ? -1 : translator.codePointIndex(found);
}
private static String padding(int n, int pad) {
@@ -1228,13 +1645,11 @@
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_startswith_doc)
final boolean unicode_startswith(PyObject prefix, PyObject start, PyObject end) {
- // FIXME: slice indexing logic incorrect when this is ASTRAL
return str_startswith(prefix, start, end);
}
@ExposedMethod(defaults = {"null", "null"}, doc = BuiltinDocs.unicode_endswith_doc)
final boolean unicode_endswith(PyObject suffix, PyObject start, PyObject end) {
- // FIXME: slice indexing logic incorrect when this is ASTRAL
return str_endswith(suffix, start, end);
}
diff --git a/src/org/python/core/buffer/BaseBuffer.java b/src/org/python/core/buffer/BaseBuffer.java
--- a/src/org/python/core/buffer/BaseBuffer.java
+++ b/src/org/python/core/buffer/BaseBuffer.java
@@ -1,5 +1,7 @@
package org.python.core.buffer;
+import java.nio.ByteBuffer;
+
import org.python.core.BufferProtocol;
import org.python.core.Py;
import org.python.core.PyBUF;
@@ -126,17 +128,17 @@
* Construct an instance of BaseBuffer in support of a sub-class, specifying the 'feature
* flags', or at least a starting set to be adjusted later. These are the features of the buffer
* exported, not the flags that form the consumer's request. The buffer will be read-only unless
- * {@link PyBUF#WRITABLE} is set in the feature flags. {@link PyBUF#FORMAT} is implicitly added
- * to the feature flags. The navigation arrays are all null, awaiting action by the sub-class
- * constructor. To complete initialisation, the sub-class normally must assign: the buffer (
- * {@link #storage}, {@link #index0}), and the navigation arrays ({@link #shape},
- * {@link #strides}), and call {@link #checkRequestFlags(int)} passing the consumer's request
- * flags.
+ * {@link PyBUF#WRITABLE} is set in the feature flags. {@link PyBUF#FORMAT} and
+ * {@link PyBUF#AS_ARRAY} are implicitly added to the feature flags. The navigation arrays are
+ * all null, awaiting action by the sub-class constructor. To complete initialisation, the
+ * sub-class normally must assign: the buffer ( {@link #storage}, {@link #index0}), and the
+ * navigation arrays ({@link #shape}, {@link #strides}), and call
+ * {@link #checkRequestFlags(int)} passing the consumer's request flags.
*
* @param featureFlags bit pattern that specifies the actual features allowed/required
*/
protected BaseBuffer(int featureFlags) {
- setFeatureFlags(featureFlags | FORMAT);
+ setFeatureFlags(featureFlags | FORMAT | AS_ARRAY);
}
/**
@@ -213,6 +215,12 @@
}
@Override
+ public boolean hasArray() {
+ // AS_ARRAY is a non-navigational flag, so is inverted in gFeatureFlags
+ return (gFeatureFlags & AS_ARRAY) != 0;
+ }
+
+ @Override
public int getNdim() {
return shape.length;
}
@@ -297,7 +305,7 @@
* accessors. The default implementation here is suited to N-dimensional arrays.
*
* @param indices of the item from the consumer
- * @return index relative to item x[0,...,0] in actual storage
+ * @return corresponding absolute index in storage
*/
protected int calcIndex(int... indices) throws IndexOutOfBoundsException {
final int N = checkDimension(indices);
@@ -313,6 +321,57 @@
}
/**
+ * Calculate the absolute byte index in the storage array of the last item of the exported data
+ * (if we are not using indirection). This is the greatest value attained by
+ * {@link #calcIndex(int...)}. The first byte not used will be one <code>itemsize</code> more
+ * than the returned value.
+ *
+ * @return greatest absolute index in storage
+ */
+ protected int calcGreatestIndex() throws IndexOutOfBoundsException {
+ final int N = shape.length;
+ // If all the strides are positive, the maximal value is found from:
+ // index = index0 + sum(k=0,N-1) (shape[k]-1)*strides[k]
+ // but in general, for any k where strides[k]<=0, the term should be zero.
+ int index = index0;
+ if (N > 0) {
+ int[] strides = getStrides();
+ for (int k = 0; k < N; k++) {
+ int stride = strides[k];
+ if (stride > 0) {
+ index += (shape[k] - 1) * stride;
+ }
+ }
+ }
+ return index;
+ }
+
+ /**
+ * Calculate the absolute byte index in the storage array of the first item of the exported data
+ * (if we are not using indirection). This is the least value attained by
+ * {@link #calcIndex(int...)}.
+ *
+ * @return least absolute index in storage
+ */
+ protected int calcLeastIndex() throws IndexOutOfBoundsException {
+ final int N = shape.length;
+ // If all the strides are positive, the maximal value is just index0,
+ // but in general, we must allow strides[k]<=0 for some k:
+ // index = index0 + sum(k=0,N-1) (strides[k]<0) ? (shape[k]-1)*strides[k] : 0
+ int index = index0;
+ if (N > 0) {
+ int[] strides = getStrides();
+ for (int k = 0; k < N; k++) {
+ int stride = strides[k];
+ if (stride < 0) {
+ index += (shape[k] - 1) * stride;
+ }
+ }
+ }
+ return index;
+ }
+
+ /**
* {@inheritDoc}
* <p>
* The default implementation in <code>BaseBuffer</code> deals with the general one-dimensional
@@ -540,6 +599,15 @@
// @Override public PyBuffer getBufferSlice(int flags, int start, int length, int stride) {}
@Override
+ public ByteBuffer getNIOByteBuffer() {
+ // Determine the limit of the buffer just beyond the last item.
+ int length = calcGreatestIndex() + getItemsize() - index0;
+ ByteBuffer b = ByteBuffer.wrap(storage, index0, length);
+ // Return as read-only if it is.
+ return isReadonly() ? b.asReadOnlyBuffer() : b;
+ }
+
+ @Override
public Pointer getBuf() {
return new Pointer(storage, index0);
}
@@ -664,6 +732,8 @@
return bufferRequires("shape array");
} else if ((syndrome & WRITABLE) != 0) {
return bufferIsNot("writable");
+ } else if ((syndrome & AS_ARRAY) != 0) {
+ return bufferIsNot("accessible as a Java array");
} else if ((syndrome & C_CONTIGUOUS) != 0) {
return bufferIsNot("C-contiguous");
} else if ((syndrome & F_CONTIGUOUS) != 0) {
diff --git a/src/org/python/core/buffer/SimpleBuffer.java b/src/org/python/core/buffer/SimpleBuffer.java
--- a/src/org/python/core/buffer/SimpleBuffer.java
+++ b/src/org/python/core/buffer/SimpleBuffer.java
@@ -1,5 +1,7 @@
package org.python.core.buffer;
+import java.nio.ByteBuffer;
+
import org.python.core.PyBuffer;
import org.python.core.PyException;
import org.python.core.util.StringUtil;
@@ -229,6 +231,13 @@
}
@Override
+ public ByteBuffer getNIOByteBuffer() {
+ // Simplify for one-dimensional contiguous bytes
+ ByteBuffer b = ByteBuffer.wrap(storage, index0, shape[0]);
+ return isReadonly() ? b.asReadOnlyBuffer() : b;
+ }
+
+ @Override
public Pointer getPointer(int index) throws IndexOutOfBoundsException {
return new Pointer(storage, index0 + index);
}
diff --git a/src/org/python/core/buffer/SimpleStringBuffer.java b/src/org/python/core/buffer/SimpleStringBuffer.java
--- a/src/org/python/core/buffer/SimpleStringBuffer.java
+++ b/src/org/python/core/buffer/SimpleStringBuffer.java
@@ -1,5 +1,7 @@
package org.python.core.buffer;
+import java.nio.ByteBuffer;
+
import org.python.core.PyBuffer;
import org.python.core.util.StringUtil;
@@ -112,12 +114,19 @@
return getBufferSlice(flags, start, length);
} else {
// Force creation of the actual byte array from the String.
- getBuf();
+ ensureHaveBytes();
// Now we are effectively a SimpleBuffer, return the strided view.
return super.getBufferSlice(flags, start, length, stride);
}
}
+ @Override
+ public ByteBuffer getNIOByteBuffer() {
+ // Force creation of the actual byte array from the String.
+ ensureHaveBytes();
+ return super.getNIOByteBuffer().asReadOnlyBuffer();
+ }
+
/**
* This method creates an actual byte array from the underlying String if none yet exists.
*/
diff --git a/src/org/python/modules/_io/PyFileIO.java b/src/org/python/modules/_io/PyFileIO.java
--- a/src/org/python/modules/_io/PyFileIO.java
+++ b/src/org/python/modules/_io/PyFileIO.java
@@ -251,8 +251,7 @@
PyBuffer pybuf = writablePyBuffer(buf);
try {
- PyBuffer.Pointer bp = pybuf.getBuf();
- ByteBuffer byteBuffer = ByteBuffer.wrap(bp.storage, bp.offset, pybuf.getLen());
+ ByteBuffer byteBuffer = pybuf.getNIOByteBuffer();
synchronized (ioDelegate) {
count = ioDelegate.readinto(byteBuffer);
}
@@ -293,8 +292,7 @@
try {
// Access the data as a java.nio.ByteBuffer [pos:limit] within possibly larger array
- PyBuffer.Pointer bp = pybuf.getBuf();
- ByteBuffer byteBuffer = ByteBuffer.wrap(bp.storage, bp.offset, pybuf.getLen());
+ ByteBuffer byteBuffer = pybuf.getNIOByteBuffer();
synchronized (ioDelegate) {
count = ioDelegate.write(byteBuffer);
}
diff --git a/src/org/python/modules/posix/PosixModule.java b/src/org/python/modules/posix/PosixModule.java
--- a/src/org/python/modules/posix/PosixModule.java
+++ b/src/org/python/modules/posix/PosixModule.java
@@ -29,7 +29,6 @@
import org.python.core.Py;
import org.python.core.PyBUF;
import org.python.core.PyBuffer;
-import org.python.core.PyBuffer.Pointer;
import org.python.core.PyBuiltinFunctionNarrow;
import org.python.core.PyDictionary;
import org.python.core.PyException;
@@ -826,10 +825,8 @@
public static int write(PyObject fd, BufferProtocol bytes) {
// Get a buffer view: we can cope with N-dimensional data, but not strided data.
try (PyBuffer buf = bytes.getBuffer(PyBUF.ND)) {
- // Get the array and offset of the first real byte.
- Pointer p = buf.getBuf();
- // Make a ByteBuffer of that array, setting the position and limit to the real data.
- ByteBuffer bb = ByteBuffer.wrap(p.storage, p.offset, buf.getLen());
+ // Get a ByteBuffer of that data, setting the position and limit to the real data.
+ ByteBuffer bb = buf.getNIOByteBuffer();
try {
// Write the data (returning the count of bytes).
return FileDescriptors.get(fd).write(bb);
@@ -902,7 +899,7 @@
}
/**
- * Return a path as a String from a PyObject
+ * Return a path as a String from a PyObject
*
* @param path a PyObject, raising a TypeError if an invalid path type
* @return a String path
diff --git a/tests/java/javatests/ProxyDeserialization.java b/tests/java/ProxyDeserialization.java
rename from tests/java/javatests/ProxyDeserialization.java
rename to tests/java/ProxyDeserialization.java
diff --git a/tests/python/unicode_index_times.py b/tests/python/unicode_index_times.py
new file mode 100644
--- /dev/null
+++ b/tests/python/unicode_index_times.py
@@ -0,0 +1,405 @@
+# -*- coding: utf-8 -*-
+# unicode speed tests for access operations and find
+# This is part of an effort to supply the Jython implementation of
+# unicode with efficient, correct translations between the visible index
+# of a character and and the index in the implementation array of the
+# UTF-16 code unit(s) that represent it. See also test.test_unicode_jy,
+# and Jython issue #2100. The presence of characters with Unicode
+# code points > 0xffff (supplementary characters), that it takes two
+# code units to represent, makes this non-trivial.
+#
+# The program defines a variety of test strings of differing lengths
+# and distribution of supplementary characters, then times some basic
+# operations involving index translation: of retrieval, slicing and
+# find, to provide an average time per operation. It runs several trials
+# of each test case (a combination of an operation and the test material)
+# and reports the shortest trial, using the same strategy as the timeit
+# module).
+#
+# It was difficult to get repeatable times from the test under Jython,
+# because the JIT compiler (?) is non-deterministic. It proved
+# impossible using a strategy that would run the same test case multiple
+# times in succession. The approach eventually taken was to run the all
+# the test cases once, then repeat this sequence several times, and
+# report the minimum time of each case in these widely separated trials.
+# This strategy provides stable results with the default JIT behaviour.
+#
+# Two interesting options are to run with the # JIT compiler disabled:
+# $ jython -J-Xint tests/python/unicode_index_times.py
+#
+# or with it continuously enabled:
+# $ jython -J-Xcomp tests/python/unicode_index_times.py
+#
+from __future__ import print_function
+import itertools
+import random
+import sys
+import time
+
+if sys.platform == "win32" or sys.platform.startswith("java"):
+ # On Windows and Jython, the best timer is time.clock()
+ timer = time.clock
+else:
+ # On most other platforms the best timer is time.time()
+ timer = time.time
+
+if sys.version_info[0] > 2:
+ unichr = chr
+else:
+ def next(it) : return it.next()
+
+# Proportion of characters that are supplementary (if not explicit)
+DEFAULT_RATE = 0.2 # 20%
+
+# We will test performance with these sizes
+SHORT = 10
+MEDIUM = 100
+LONG = 1000
+HUGE = 10000
+
+
+class UnicodeMaterial(object):
+ ''' Object holding a list of single characters and a unicode string
+ that is their concatenation. The sequence is created from a
+ background sequence of basic plane characters and random
+ replacement with supplementary plane characters (those with
+ point code>0xffff).
+ '''
+
+ base = tuple(u'abcdefghijklmnopqrstuvwxyz')
+ if sys.maxunicode < 0x10000:
+ print("Narrow build: all characters from BMP", file=sys.stderr)
+ supp = tuple(map(unichr, range(0x20, 0x2c)))
+ else:
+ # Wide build: we have real supplementary characters
+ supp = tuple(map(unichr, range(0x10000, 0x1000c)))
+ used = sorted(set(base+supp))
+
+ def __init__(self, size=20, pred=None, ran=None):
+ ''' Create size chars choosing an SP char at i where
+ pred(ran, i)==True where ran is an instance of
+ random.Random supplied in the constructor or created
+ locally (if ran==None).
+ '''
+
+ # Generators for the BMP and SP characters
+ base = itertools.cycle(UnicodeMaterial.base)
+ supp = itertools.cycle(UnicodeMaterial.supp)
+
+ # Each instance gets a random generator
+ if ran is None:
+ ran = random.Random()
+ self.random = ran
+
+ if pred is None:
+ pred = lambda ran, j : ran.random() < DEFAULT_RATE
+
+ # Generate the list
+ r = list()
+ for i in range(size):
+ if pred(self.random, i):
+ c = next(supp)
+ else:
+ c = next(base)
+ r.append(c)
+
+ # The list and its concatenation are our material
+ self.ref = r
+ self.size = len(r)
+ self.text = u''.join(r)
+ self.target = u''
+
+ def __len__(self):
+ return self.size
+
+ def insert(self, target, p=None):
+ ''' Insert target string at position p (or middle), truncating if
+ that would make the material any longer
+ '''
+ if p is None:
+ p = max(0, (self.size-len(target)) // 2)
+
+ n = 0
+ for t in target:
+ if p+n >= self.size:
+ break;
+ self.ref[p+n] = t
+ n += 1
+
+ self.target = target[:n]
+ self.text = u''.join(self.ref)
+
+
+class UnicodeActions(UnicodeMaterial):
+ ''' Provides test material with loops for timing.'''
+
+ def __init__(self, size=20, pred=None, ran=None):
+ super(UnicodeActions, self).__init__(size, pred, ran)
+ if self.size <= 0:
+ raise ValueError("The timings don't work for zero length")
+ self.used = UnicodeMaterial.used
+ self.trash = None
+ # String to find (note 'abcde' in base: nice for false alarms)
+ self.insert(u"abide")
+
+
+ def repeat_getitem(self, mincount):
+ ''' Access each code point in sequence repeatedly so that at
+ least mincount operations have been peformed, and return the
+ actual number of operations.
+ '''
+ n = self.size
+ t = self.text
+ opcount = 0
+ dummy = None
+ while opcount < mincount:
+ # Access each point code
+ i = 0
+ while i < n:
+ # Avoid optimising away the call
+ dummy = t[i]
+ i += 1
+ opcount += n
+ self.trash = dummy
+ return opcount
+
+ def repeat_slice(self, mincount):
+ ''' Extract a slice at each feasible position and length,
+ repeating enough times to do mincount operations, and
+ return the actual number of operations.
+ '''
+ n = self.size
+ t = self.text
+ opcount = 0
+ dummy = None
+
+ while opcount < mincount:
+ m = 1
+ while m <= n:
+ starts = n - m + 1
+ for i in range(starts):
+ dummy = t[i:i+m]
+ #print(i, i+m, step, dummy)
+ opcount += starts
+ m *= 5
+
+ return opcount
+
+ def repeat_slice_step(self, mincount):
+ ''' Extract a slice at each feasible position and length,
+ and using different sizes for the step,
+ repeating enough times to do mincount operations, and
+ return the actual number of operations.
+ '''
+ n = self.size
+ t = self.text
+ opcount = 0
+ dummy = None
+ steps = [3, -1, 10]
+
+ while opcount < mincount:
+ for step in steps:
+ if step > 0:
+ m = 1
+ while m <= n:
+ starts = n - m + 1
+ for i in range(starts):
+ dummy = t[i:i+m:step]
+ #print(i, i+m, step, dummy)
+ opcount += starts
+ m *= 5
+ else:
+ m = 1
+ while m <= n:
+ starts = n - m + 1
+ for i in range(starts):
+ dummy = t[i+m:i:step]
+ #print(i+m, i, step, dummy)
+ opcount += starts
+ m *= 5
+
+ return opcount
+
+ def repeat_find_char(self, mincount):
+ ''' Do an incremental find of each code point, repeating
+ enough times to do mincount finds, and return the actual
+ number of operations.
+ '''
+ opcount = 0
+ n = self.size
+ findop = self.text.find
+ dummy = 0
+
+ while opcount < mincount:
+ # The text is searched for every code c.
+ for c in self.used:
+ # ... and every occurrence is found.
+ start = 0
+ while start < n:
+ i = findop(c, start)
+ if i < 0: break
+ # Avoid optimising away the call
+ dummy += i
+ start = i + 1
+
+ # Every character in the text was a hit exactly once.
+ # And every character was also a miss, except for
+ # the character that ends the text. So we did:
+ opcount += n + len(self.used) - 1
+
+ self.trash = dummy
+ return opcount
+
+ def repeat_find_str(self, mincount):
+ ''' Find the target string within the material, repeating
+ enough times to do mincount finds, and return the actual
+ number of operations.
+ '''
+ opcount = 0
+ s = self.target
+ findop = self.text.find
+ dummy = 0
+
+ while opcount < mincount:
+ dummy += findop(s)
+ opcount += 1
+
+ self.trash = dummy
+ return opcount
+
+ def repeat_rfind_char(self, mincount):
+ ''' Do an incremental rfind of each code point, repeating
+ enough times to do mincount finds, and return the actual
+ number of operations.
+ '''
+ opcount = 0
+ n = self.size
+ findop = self.text.rfind
+
+ while opcount < mincount:
+ # The text is searched for every code c.
+ for c in self.used:
+ # ... and every occurrence is found.
+ end = n
+ while end >= 0:
+ end = findop(c, 0, end)
+
+ # Every character in the text was a hit exactly once.
+ # And every character was also a miss. So we did:
+ opcount += n + len(self.used)
+
+ return opcount
+
+ def repeat_rfind_str(self, mincount):
+ ''' Find the target string within the material, repeating
+ enough times to do mincount finds, and return the actual
+ number of operations.
+ '''
+ opcount = 0
+ s = self.target
+ findop = self.text.rfind
+ dummy = 0
+
+ while opcount < mincount:
+ dummy += findop(s)
+ opcount += 1
+
+ self.trash = dummy
+ return opcount
+
+
+def time_per_op(op, mincount):
+ ''' Repeat the given operation at least mincount times and
+ return the time per operation in microseconds.
+ '''
+ t = timer()
+ opcount = op(mincount)
+ return (timer() - t) * 1e6 / opcount
+
+# Functions defining particular distributions of SP codes
+#
+def evenly(rate=DEFAULT_RATE):
+ 'Evenly distributed at given rate'
+ def f(ran, i):
+ return ran.random() < rate
+ return f
+
+def evenly_before(k, rate=DEFAULT_RATE):
+ 'Evenly distributed on i<k at given rate'
+ def f(ran, i):
+ return i < k and ran.random() < rate
+ return f
+
+def evenly_from(k, rate=DEFAULT_RATE):
+ 'Evenly distributed on i>=k at given rate'
+ def f(ran, i):
+ return i >= k and ran.random() < rate
+ return f
+
+def time_all():
+
+ setups = [
+ #("empty", UnicodeActions(0)),
+ ("single bmp", UnicodeActions(1, (lambda ran, i : False))),
+ ("single", UnicodeActions(1, (lambda ran, i : True))),
+ ("short bmp", UnicodeActions(SHORT, (lambda ran, i : False))),
+ ("short 50%", UnicodeActions(SHORT, evenly(0.5))),
+ ("medium bmp", UnicodeActions(MEDIUM, (lambda ran, i : False))),
+ ("medium 10%", UnicodeActions(MEDIUM, evenly(0.1))),
+ ("long bmp", UnicodeActions(LONG, (lambda ran, i : False))),
+ ("long 1%", UnicodeActions(LONG, evenly(0.01))),
+ ("long 10%", UnicodeActions(LONG, evenly(0.1))),
+ ("long 10% L", UnicodeActions(LONG, evenly_before(LONG/4, 0.4))),
+ ("long 10% H", UnicodeActions(LONG, evenly_from(LONG-(LONG/4), 0.4))),
+ ("long 50%", UnicodeActions(LONG, evenly(0.5))),
+ ("huge bmp", UnicodeActions(HUGE, (lambda ran, i : False))),
+ ("huge 10%", UnicodeActions(HUGE, evenly(0.1))),
+ ]
+
+ ops = [
+ ("[i]", "repeat_getitem"),
+ ("[i:i+n]", "repeat_slice"),
+ ("[i:i+n:k]", "repeat_slice_step"),
+ ("find(c)", "repeat_find_char"),
+ ("find(s)", "repeat_find_str"),
+ ("rfind(c)", "repeat_rfind_char"),
+ ("rfind(s)", "repeat_rfind_str"),
+ ]
+
+
+ print("{:12s}{:>6s}".format("time (us)", "len"), end='')
+ for title, _ in ops:
+ print("{:>10s}".format(title), end='')
+ print()
+
+ mintime = dict()
+ def save_mintime(k, t):
+ mintime[k] = min(t, mintime.get(k, 1e9))
+
+ trashcan = []
+
+ # Cycle through the cases repeatedly.
+ for k in range(5):
+ for name, material in setups:
+ for _, opname in ops:
+ # For each case, keep the minimum time
+ op = material.__getattribute__(opname)
+ t = time_per_op(op, 1000)
+ save_mintime((name, opname), t)
+
+ if k == 0:
+ trashcan.append(material.trash)
+
+ # Cycle through the cases again to print them.
+ for name, material in setups:
+ print("{:12s}{:6d}".format(name, material.size), end='')
+ for _, opname in ops:
+ t = mintime[(name, opname)]
+ print("{:10.2f}".format(t), end='')
+ print()
+
+ print("y =", trashcan)
+
+if __name__ == "__main__":
+
+ time_all()
--
Repository URL: http://hg.python.org/jython
More information about the Jython-checkins
mailing list