[Jython-checkins] jython: Regression and speed tests preparatory to work on PyUnicode indexing.
jeff.allen
jython-checkins at python.org
Wed Sep 17 00:55:24 CEST 2014
http://hg.python.org/jython/rev/aed06364c44d
changeset: 7379:aed06364c44d
user: Jeff Allen <ja.py at farowl.co.uk>
date: Sat Aug 16 22:40:18 2014 +0100
summary:
Regression and speed tests preparatory to work on PyUnicode indexing.
Includes a failing test for unicode.find() and .rfind(), and a complicated
benchmark for performance of indexing, slice and (r)find (compatible with
CPython 2.7 and 3.4).
files:
Lib/test/test_unicode_jy.py | 211 +++++++++
tests/python/unicode_index_times.py | 368 ++++++++++++++++
2 files changed, 579 insertions(+), 0 deletions(-)
diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py
--- a/Lib/test/test_unicode_jy.py
+++ b/Lib/test/test_unicode_jy.py
@@ -3,6 +3,8 @@
Made for Jython.
"""
+import itertools
+import random
import re
import string
import sys
@@ -150,6 +152,214 @@
u'f')
+class UnicodeMaterial(object):
+ ''' Object holding a list of single characters and a unicode string
+ that is their concatenation. The sequence is created from a
+ background sequence of basic plane characters and random
+ replacement with supplementary plane characters (those with
+ point code>0xffff).
+ '''
+
+ base = tuple(u'abcdefghijklmnopqrstuvwxyz')
+ supp = tuple(map(unichr, range(0x10000, 0x1000c)))
+ used = sorted(set(base+supp))
+
+ def __init__(self, size=20, pred=None, ran=None):
+ ''' Create size chars choosing an SP char at i where
+ pred(ran, i)==True where ran is an instance of
+ random.Random supplied in the constructor or created
+ locally (if ran==None).
+ '''
+
+ # Generators for the BMP and SP characters
+ base = itertools.cycle(UnicodeMaterial.base)
+ supp = itertools.cycle(UnicodeMaterial.supp)
+
+ # Each instance gets a random generator
+ if ran is None:
+ ran = random.Random()
+ self.random = ran
+
+ if pred is None:
+ pred = lambda ran, j : ran.random() < DEFAULT_RATE
+
+ # Generate the list
+ r = list()
+ for i in range(size):
+ if pred(self.random, i):
+ c = supp.next()
+ else:
+ c = base.next()
+ r.append(c)
+
+ # The list and its concatenation are our material
+ self.ref = r
+ self.size = len(r)
+ self.text = u''.join(r)
+ self.target = u''
+
+ def __len__(self):
+ return self.size
+
+ def insert(self, target, p=None):
+ ''' Insert target string at position p (or middle), truncating if
+ that would make the material any longer
+ '''
+ if p is None:
+ p = max(0, (self.size-len(target)) // 2)
+
+ n = 0
+ for t in target:
+ if p+n >= self.size:
+ break;
+ self.ref[p+n] = t
+ n += 1
+
+ self.target = target[:n]
+ self.text = u''.join(self.ref)
+
+
+class UnicodeIndexMixTest(unittest.TestCase):
+ # Test indexing where there may be more than one code unit per code point.
+ # See Jython Issue #2100.
+
+ # Functions defining particular distributions of SP codes
+ #
+ def evenly(self, rate=0.2):
+ 'Evenly distributed at given rate'
+ def f(ran, i):
+ return ran.random() < rate
+ return f
+
+ def evenly_before(self, k, rate=0.2):
+ 'Evenly distributed on i<k at given rate'
+ def f(ran, i):
+ return i < k and ran.random() < rate
+ return f
+
+ def evenly_from(self, k, rate=0.2):
+ 'Evenly distributed on i>=k at given rate'
+ def f(ran, i):
+ return i >= k and ran.random() < rate
+ return f
+
+ def at(self, places):
+ 'Only at specified places'
+ def f(ran, i):
+ return i in places
+ return f
+
+ def setUp(self):
+ ran = random.Random(1234) # ensure repeatable
+ mat = list()
+ mat.append(UnicodeMaterial(10, self.at([2]), ran))
+ mat.append(UnicodeMaterial(10, self.at([2, 5]), ran))
+ mat.append(UnicodeMaterial(50, self.evenly(), ran))
+ mat.append(UnicodeMaterial(200, self.evenly_before(70), ran))
+ mat.append(UnicodeMaterial(200, self.evenly_from(130), ran))
+ mat.append(UnicodeMaterial(1000, self.evenly(), ran))
+
+ self.material = mat
+
+
+ def test_getitem(self):
+ # Test map from to code point index to internal representation
+ # Fails in Jython 2.7b3
+
+ def check_getitem(m):
+ # Check indexing the string returns the expected point code
+ for i in xrange(m.size):
+ self.assertEqual(m.text[i], m.ref[i])
+
+ for m in self.material:
+ check_getitem(m)
+
+ def test_slice(self):
+ # Test indexing gets the slice ends correct.
+ # Passes in Jython 2.7b3, but may be touched by #2100 changes.
+
+ def check_slice(m):
+ # Check a range of slices against slices of the reference.
+ n = 1
+ while n <= m.size:
+ for i in range(m.size - n):
+ exp = u''.join(m.ref[i:i+n])
+ self.assertEqual(m.text[i:i+n], exp)
+ n *= 3
+
+ for m in self.material:
+ check_slice(m)
+
+ @unittest.skip("Fails on Jython 2.7b3 issue #2100")
+ def test_find(self):
+ # Test map from internal find result to code point index
+ # Fails in Jython 2.7b3
+
+ def check_find(ref):
+ # Check find returns indexes for single point codes
+ for c in set(m.used):
+ start = 0
+ u = m.text
+ while start < m.size:
+ i = u.find(c, start)
+ if i < 0: break
+ self.assertEqual(u[i], c)
+ start = i + 1
+
+ def check_find_str(m, t):
+ # Check find returns correct index for string target
+ i = m.text.find(t)
+ self.assertEqual(list(t), m.ref[i:i+len(t)])
+
+ targets = [
+ u"this",
+ u"ab\U00010041de",
+ u"\U00010041\U00010042\U00010042xx",
+ u"xx\U00010041\U00010042\U00010043yy",
+ ]
+
+ for m in self.material:
+ check_find(m)
+ for t in targets:
+ # Insert in middle then try to find it
+ m.insert(t)
+ check_find_str(m, t)
+
+ @unittest.skip("Fails on Jython 2.7b3 issue #2100")
+ def test_rfind(self):
+ # Test map from internal rfind result to code point index
+ # Fails in Jython 2.7b3
+
+ def check_rfind(ref):
+ # Check rfind returns indexes for single point codes
+ for c in set(m.used):
+ end = m.size
+ u = m.text
+ while True:
+ end = u.rfind(c, 0, end)
+ if end < 0: break
+ self.assertEqual(u[end], c)
+
+ def check_rfind_str(m, t):
+ # Check rfind returns correct index for string target
+ i = m.text.rfind(t)
+ self.assertEqual(list(t), m.ref[i:i+len(t)])
+
+ targets = [
+ u"this",
+ u"ab\U00010041de",
+ u"\U00010041\U00010042\U00010042xx",
+ u"xx\U00010041\U00010042\U00010043yy",
+ ]
+
+ for m in self.material:
+ check_rfind(m)
+ for t in targets:
+ # Insert in middle then try to find it
+ m.insert(t)
+ check_rfind_str(m, t)
+
+
class UnicodeFormatTestCase(unittest.TestCase):
def test_unicode_mapping(self):
@@ -596,6 +806,7 @@
def test_main():
test_support.run_unittest(
UnicodeTestCase,
+ UnicodeIndexMixTest,
UnicodeFormatTestCase,
UnicodeStdIOTestCase,
UnicodeFormatStrTest,
diff --git a/tests/python/unicode_index_times.py b/tests/python/unicode_index_times.py
new file mode 100644
--- /dev/null
+++ b/tests/python/unicode_index_times.py
@@ -0,0 +1,368 @@
+# -*- coding: utf-8 -*-
+# unicode speed tests for access operations and find
+# This is part of an effort to supply the Jython implementation of
+# unicode with efficient, correct translations between the visible index
+# of a character and and the index in the implementation array of the
+# UTF-16 code unit(s) that represent it. See also test.test_unicode_jy,
+# and Jython issue #2100. The presence of characters with Unicode
+# code points > 0xffff (supplementary characters), that it takes two
+# code units to represent, makes this non-trivial.
+#
+# The program defines a variety of test strings of differing lengths
+# and distribution of supplementary characters, then times some basic
+# operations involving index translation: of retrieval, slicing and
+# find, to provide an average time per operation. It runs several trials
+# of each test case (a combination of an operation and the test material)
+# and reports the shortest trial, using the same strategy as the timeit
+# module).
+#
+# It was difficult to get repeatable times from the test under Jython,
+# because the JIT compiler (?) is non-deterministic. It proved
+# impossible using a strategy that would run the same test case multiple
+# times in succession. The approach eventually taken was to run the all
+# the test cases once, then repeat this sequence several times, and
+# report the minimum time of each case in these widely separated trials.
+# This strategy provides stable results with the default JIT behaviour.
+#
+# Two interesting options are to run with the # JIT compiler disabled:
+# $ jython -J-Xint tests/python/unicode_index_times.py
+#
+# or with it continuously enabled:
+# $ jython -J-Xcomp tests/python/unicode_index_times.py
+#
+from __future__ import print_function
+import itertools
+import random
+import sys
+import time
+
+if sys.platform == "win32" or sys.platform.startswith("java"):
+ # On Windows and Jython, the best timer is time.clock()
+ timer = time.clock
+else:
+ # On most other platforms the best timer is time.time()
+ timer = time.time
+
+if sys.version_info[0] > 2:
+ unichr = chr
+else:
+ def next(it) : return it.next()
+
+# Proportion of characters that are supplementary (if not explicit)
+DEFAULT_RATE = 0.2 # 20%
+
+# We will test performance with these sizes
+SHORT = 10
+MEDIUM = 100
+LONG = 1000
+HUGE = 10000
+
+
+class UnicodeMaterial(object):
+ ''' Object holding a list of single characters and a unicode string
+ that is their concatenation. The sequence is created from a
+ background sequence of basic plane characters and random
+ replacement with supplementary plane characters (those with
+ point code>0xffff).
+ '''
+
+ base = tuple(u'abcdefghijklmnopqrstuvwxyz')
+ if sys.maxunicode < 0x10000:
+ print("Narrow build: all characters from BMP", file=sys.stderr)
+ supp = tuple(map(unichr, range(0x20, 0x2c)))
+ else:
+ # Wide build: we have real supplementary characters
+ supp = tuple(map(unichr, range(0x10000, 0x1000c)))
+ used = sorted(set(base+supp))
+
+ def __init__(self, size=20, pred=None, ran=None):
+ ''' Create size chars choosing an SP char at i where
+ pred(ran, i)==True where ran is an instance of
+ random.Random supplied in the constructor or created
+ locally (if ran==None).
+ '''
+
+ # Generators for the BMP and SP characters
+ base = itertools.cycle(UnicodeMaterial.base)
+ supp = itertools.cycle(UnicodeMaterial.supp)
+
+ # Each instance gets a random generator
+ if ran is None:
+ ran = random.Random()
+ self.random = ran
+
+ if pred is None:
+ pred = lambda ran, j : ran.random() < DEFAULT_RATE
+
+ # Generate the list
+ r = list()
+ for i in range(size):
+ if pred(self.random, i):
+ c = next(supp)
+ else:
+ c = next(base)
+ r.append(c)
+
+ # The list and its concatenation are our material
+ self.ref = r
+ self.size = len(r)
+ self.text = u''.join(r)
+ self.target = u''
+
+ def __len__(self):
+ return self.size
+
+ def insert(self, target, p=None):
+ ''' Insert target string at position p (or middle), truncating if
+ that would make the material any longer
+ '''
+ if p is None:
+ p = max(0, (self.size-len(target)) // 2)
+
+ n = 0
+ for t in target:
+ if p+n >= self.size:
+ break;
+ self.ref[p+n] = t
+ n += 1
+
+ self.target = target[:n]
+ self.text = u''.join(self.ref)
+
+
+class UnicodeActions(UnicodeMaterial):
+ ''' Provides test material with loops for timing.'''
+
+ def __init__(self, size=20, pred=None, ran=None):
+ super(UnicodeActions, self).__init__(size, pred, ran)
+ if self.size <= 0:
+ raise ValueError("The timings don't work for zero length")
+ self.used = UnicodeMaterial.used
+ self.trash = None
+ # String to find (note 'abcde' in base: nice for false alarms)
+ self.insert(u"abide")
+
+
+ def repeat_getitem(self, mincount):
+ ''' Access each code point in sequence repeatedly so that at
+ least mincount operations have been peformed, and return the
+ actual number of operations.
+ '''
+ n = self.size
+ t = self.text
+ opcount = 0
+ dummy = None
+ while opcount < mincount:
+ # Access each point code
+ i = 0
+ while i < n:
+ # Avoid optimising away the call
+ dummy = t[i]
+ i += 1
+ opcount += n
+ self.trash = dummy
+ return opcount
+
+ def repeat_slice(self, mincount):
+ ''' Extract a slice at each feasible position and length,
+ repeating enough times to do mincount operations, and
+ return the actual number of operations.
+ '''
+ n = self.size
+ t = self.text
+ opcount = 0
+ dummy = None
+
+ while opcount < mincount:
+ m = 1
+ while m <= n:
+ starts = n - m + 1
+ for i in range(starts):
+ dummy = t[i:i+m]
+ opcount += starts
+ m *= 5
+
+ return opcount
+
+ def repeat_find_char(self, mincount):
+ ''' Do an incremental find of each code point, repeating
+ enough times to do mincount finds, and return the actual
+ number of operations.
+ '''
+ opcount = 0
+ n = self.size
+ findop = self.text.find
+ dummy = 0
+
+ while opcount < mincount:
+ # The text is searched for every code c.
+ for c in self.used:
+ # ... and every occurrence is found.
+ start = 0
+ while start < n:
+ i = findop(c, start)
+ if i < 0: break
+ # Avoid optimising away the call
+ dummy += i
+ start = i + 1
+
+ # Every character in the text was a hit exactly once.
+ # And every character was also a miss, except for
+ # the character that ends the text. So we did:
+ opcount += n + len(self.used) - 1
+
+ self.trash = dummy
+ return opcount
+
+ def repeat_find_str(self, mincount):
+ ''' Find the target string within the material, repeating
+ enough times to do mincount finds, and return the actual
+ number of operations.
+ '''
+ opcount = 0
+ s = self.target
+ findop = self.text.find
+ dummy = 0
+
+ while opcount < mincount:
+ dummy += findop(s)
+ opcount += 1
+
+ self.trash = dummy
+ return opcount
+
+ def repeat_rfind_char(self, mincount):
+ ''' Do an incremental rfind of each code point, repeating
+ enough times to do mincount finds, and return the actual
+ number of operations.
+ '''
+ opcount = 0
+ n = self.size
+ findop = self.text.rfind
+
+ while opcount < mincount:
+ # The text is searched for every code c.
+ for c in self.used:
+ # ... and every occurrence is found.
+ end = n
+ while end >= 0:
+ end = findop(c, 0, end)
+
+ # Every character in the text was a hit exactly once.
+ # And every character was also a miss. So we did:
+ opcount += n + len(self.used)
+
+ return opcount
+
+ def repeat_rfind_str(self, mincount):
+ ''' Find the target string within the material, repeating
+ enough times to do mincount finds, and return the actual
+ number of operations.
+ '''
+ opcount = 0
+ s = self.target
+ findop = self.text.rfind
+ dummy = 0
+
+ while opcount < mincount:
+ dummy += findop(s)
+ opcount += 1
+
+ self.trash = dummy
+ return opcount
+
+
+def time_per_op(op, mincount):
+ ''' Repeat the given operation at least mincount times and
+ return the time per operation in microseconds.
+ '''
+ t = timer()
+ opcount = op(mincount)
+ return (timer() - t) * 1e6 / opcount
+
+# Functions defining particular distributions of SP codes
+#
+def evenly(rate=DEFAULT_RATE):
+ 'Evenly distributed at given rate'
+ def f(ran, i):
+ return ran.random() < rate
+ return f
+
+def evenly_before(k, rate=DEFAULT_RATE):
+ 'Evenly distributed on i<k at given rate'
+ def f(ran, i):
+ return i < k and ran.random() < rate
+ return f
+
+def evenly_from(k, rate=DEFAULT_RATE):
+ 'Evenly distributed on i>=k at given rate'
+ def f(ran, i):
+ return i >= k and ran.random() < rate
+ return f
+
+def time_all():
+
+ setups = [
+ #("empty", UnicodeActions(0)),
+ ("single bmp", UnicodeActions(1, (lambda ran, i : False))),
+ ("single", UnicodeActions(1, (lambda ran, i : True))),
+ ("short bmp", UnicodeActions(SHORT, (lambda ran, i : False))),
+ ("short 50%", UnicodeActions(SHORT, evenly(0.5))),
+ ("medium bmp", UnicodeActions(MEDIUM, (lambda ran, i : False))),
+ ("medium 10%", UnicodeActions(MEDIUM, evenly(0.1))),
+ ("long bmp", UnicodeActions(LONG, (lambda ran, i : False))),
+ ("long 1%", UnicodeActions(LONG, evenly(0.01))),
+ ("long 10%", UnicodeActions(LONG, evenly(0.1))),
+ ("long 10% low", UnicodeActions(LONG, evenly_before(LONG/4, 0.4))),
+ ("long 10% high", UnicodeActions(LONG, evenly_from(LONG-(LONG/4), 0.4))),
+ ("long 50%", UnicodeActions(LONG, evenly(0.5))),
+ ("huge bmp", UnicodeActions(HUGE, (lambda ran, i : False))),
+ ("huge 10%", UnicodeActions(HUGE, evenly(0.1))),
+ ]
+
+ ops = [
+ ("[i]", "repeat_getitem"),
+ ("[i:i+n]", "repeat_slice"),
+ ("find(c)", "repeat_find_char"),
+ ("find(str)", "repeat_find_str"),
+ ("rfind(c)", "repeat_rfind_char"),
+ ("rfind(str)", "repeat_rfind_str"),
+ ]
+
+
+ print("{:15s}{:>6s}".format("time (us)", "len"), end='')
+ for title, _ in ops:
+ print("{:>12s}".format(title), end='')
+ print()
+
+ mintime = dict()
+ def save_mintime(k, t):
+ mintime[k] = min(t, mintime.get(k, 1e9))
+
+ trashcan = []
+
+ # Cycle through the cases repeatedly.
+ for k in range(5):
+ for name, material in setups:
+ for _, opname in ops:
+ # For each case, keep the minimum time
+ op = material.__getattribute__(opname)
+ t = time_per_op(op, 1000)
+ save_mintime((name, opname), t)
+
+ if k == 0:
+ trashcan.append(material.trash)
+
+ # Cycle through the cases again to print them.
+ for name, material in setups:
+ print("{:15s}{:6d}".format(name, material.size), end='')
+ for _, opname in ops:
+ t = mintime[(name, opname)]
+ print("{:12.3f}".format(t), end='')
+ print()
+
+ print("y =", trashcan)
+
+if __name__ == "__main__":
+
+ time_all()
--
Repository URL: http://hg.python.org/jython
More information about the Jython-checkins
mailing list