[Jython-checkins] jython: Regression and speed tests preparatory to work on PyUnicode indexing.

Wed Sep 17 00:55:24 CEST 2014

http://hg.python.org/jython/rev/aed06364c44d
changeset:   7379:aed06364c44d
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Sat Aug 16 22:40:18 2014 +0100
summary:
  Regression and speed tests preparatory to work on PyUnicode indexing.
Includes a failing test for unicode.find() and .rfind(), and a complicated
benchmark for performance of indexing, slice and (r)find (compatible with
CPython 2.7 and 3.4).

files:
  Lib/test/test_unicode_jy.py         |  211 +++++++++
  tests/python/unicode_index_times.py |  368 ++++++++++++++++
  2 files changed, 579 insertions(+), 0 deletions(-)

diff --git a/Lib/test/test_unicode_jy.py b/Lib/test/test_unicode_jy.py
--- a/Lib/test/test_unicode_jy.py
+++ b/Lib/test/test_unicode_jy.py
@@ -3,6 +3,8 @@
 
 Made for Jython.
 """
+import itertools
+import random
 import re
 import string
 import sys
@@ -150,6 +152,214 @@
             u'f')
 
 
+class UnicodeMaterial(object):
+    ''' Object holding a list of single characters and a unicode string
+        that is their concatenation. The sequence is created from a
+        background sequence of basic plane characters and random
+        replacement with supplementary plane characters (those with
+        point code>0xffff).
+    '''
+
+    base = tuple(u'abcdefghijklmnopqrstuvwxyz')
+    supp = tuple(map(unichr, range(0x10000, 0x1000c)))
+    used = sorted(set(base+supp))
+
+    def __init__(self, size=20, pred=None, ran=None):
+        ''' Create size chars choosing an SP char at i where
+            pred(ran, i)==True where ran is an instance of
+            random.Random supplied in the constructor or created
+            locally (if ran==None).
+        '''
+
+        # Generators for the BMP and SP characters
+        base = itertools.cycle(UnicodeMaterial.base)
+        supp = itertools.cycle(UnicodeMaterial.supp)
+
+        # Each instance gets a random generator
+        if ran is None:
+            ran = random.Random()
+        self.random = ran
+
+        if pred is None:
+            pred = lambda ran, j : ran.random() < DEFAULT_RATE
+
+        # Generate the list
+        r = list()
+        for i in range(size):
+            if pred(self.random, i):
+                c = supp.next()
+            else:
+                c = base.next()
+            r.append(c)
+
+        # The list and its concatenation are our material
+        self.ref = r
+        self.size = len(r)
+        self.text = u''.join(r)
+        self.target = u''
+
+    def __len__(self):
+        return self.size
+
+    def insert(self, target, p=None):
+        ''' Insert target string at position p (or middle), truncating if
+            that would make the material any longer
+        '''
+        if p is None:
+            p = max(0, (self.size-len(target)) // 2)
+
+        n = 0
+        for t in target:
+            if p+n >= self.size:
+                break;
+            self.ref[p+n] = t
+            n += 1
+
+        self.target = target[:n]
+        self.text = u''.join(self.ref)
+
+
+class UnicodeIndexMixTest(unittest.TestCase):
+    # Test indexing where there may be more than one code unit per code point.
+    # See Jython Issue #2100.
+
+    # Functions defining particular distributions of SP codes
+    #
+    def evenly(self, rate=0.2):
+        'Evenly distributed at given rate'
+        def f(ran, i):
+            return ran.random() < rate
+        return f
+
+    def evenly_before(self, k, rate=0.2):
+        'Evenly distributed on i<k at given rate'
+        def f(ran, i):
+            return i < k and ran.random() < rate
+        return f
+
+    def evenly_from(self, k, rate=0.2):
+        'Evenly distributed on i>=k at given rate'
+        def f(ran, i):
+            return i >= k and ran.random() < rate
+        return f
+
+    def at(self, places):
+        'Only at specified places'
+        def f(ran, i):
+            return i in places
+        return f
+
+    def setUp(self):
+        ran = random.Random(1234)  # ensure repeatable
+        mat = list()
+        mat.append(UnicodeMaterial(10, self.at([2]), ran))
+        mat.append(UnicodeMaterial(10, self.at([2, 5]), ran))
+        mat.append(UnicodeMaterial(50, self.evenly(), ran))
+        mat.append(UnicodeMaterial(200, self.evenly_before(70), ran))
+        mat.append(UnicodeMaterial(200, self.evenly_from(130), ran))
+        mat.append(UnicodeMaterial(1000, self.evenly(), ran))
+
+        self.material = mat
+
+
+    def test_getitem(self):
+        # Test map from to code point index to internal representation
+        # Fails in Jython 2.7b3
+
+        def check_getitem(m):
+            # Check indexing the string returns the expected point code
+            for i in xrange(m.size):
+                self.assertEqual(m.text[i], m.ref[i])
+
+        for m in self.material:
+            check_getitem(m)
+
+    def test_slice(self):
+        # Test indexing gets the slice ends correct.
+        # Passes in Jython 2.7b3, but may be touched by #2100 changes.
+
+        def check_slice(m):
+            # Check a range of slices against slices of the reference.
+            n = 1
+            while n <= m.size:
+                for i in range(m.size - n):
+                    exp = u''.join(m.ref[i:i+n])
+                    self.assertEqual(m.text[i:i+n], exp)
+                n *= 3
+
+        for m in self.material:
+            check_slice(m)
+
+    @unittest.skip("Fails on Jython 2.7b3 issue #2100")
+    def test_find(self):
+        # Test map from internal find result to code point index
+        # Fails in Jython 2.7b3
+
+        def check_find(ref):
+            # Check find returns indexes for single point codes
+            for c in set(m.used):
+                start = 0
+                u = m.text
+                while start < m.size:
+                    i = u.find(c, start)
+                    if i < 0: break
+                    self.assertEqual(u[i], c)
+                    start = i + 1
+
+        def check_find_str(m, t):
+            # Check find returns correct index for string target
+            i = m.text.find(t)
+            self.assertEqual(list(t), m.ref[i:i+len(t)])
+
+        targets = [
+            u"this",
+            u"ab\U00010041de", 
+            u"\U00010041\U00010042\U00010042xx",
+            u"xx\U00010041\U00010042\U00010043yy",
+        ]
+
+        for m in self.material:
+            check_find(m)
+            for t in targets:
+                # Insert in middle then try to find it
+                m.insert(t)
+                check_find_str(m, t)
+
+    @unittest.skip("Fails on Jython 2.7b3 issue #2100")
+    def test_rfind(self):
+        # Test map from internal rfind result to code point index
+        # Fails in Jython 2.7b3
+
+        def check_rfind(ref):
+            # Check rfind returns indexes for single point codes
+            for c in set(m.used):
+                end = m.size
+                u = m.text
+                while True:
+                    end = u.rfind(c, 0, end)
+                    if end < 0: break
+                    self.assertEqual(u[end], c)
+
+        def check_rfind_str(m, t):
+            # Check rfind returns correct index for string target
+            i = m.text.rfind(t)
+            self.assertEqual(list(t), m.ref[i:i+len(t)])
+
+        targets = [
+            u"this",
+            u"ab\U00010041de", 
+            u"\U00010041\U00010042\U00010042xx",
+            u"xx\U00010041\U00010042\U00010043yy",
+        ]
+
+        for m in self.material:
+            check_rfind(m)
+            for t in targets:
+                # Insert in middle then try to find it
+                m.insert(t)
+                check_rfind_str(m, t)
+
+
 class UnicodeFormatTestCase(unittest.TestCase):
 
     def test_unicode_mapping(self):
@@ -596,6 +806,7 @@
 def test_main():
     test_support.run_unittest(
                 UnicodeTestCase,
+                UnicodeIndexMixTest,
                 UnicodeFormatTestCase,
                 UnicodeStdIOTestCase,
                 UnicodeFormatStrTest,
diff --git a/tests/python/unicode_index_times.py b/tests/python/unicode_index_times.py
new file mode 100644
--- /dev/null
+++ b/tests/python/unicode_index_times.py
@@ -0,0 +1,368 @@
+# -*- coding: utf-8 -*-
+# unicode speed tests for access operations and find
+# This is part of an effort to supply the Jython implementation of
+# unicode with efficient, correct translations between the visible index
+# of a character and and the index in the implementation array of the
+# UTF-16 code unit(s) that represent it. See also test.test_unicode_jy,
+# and Jython issue #2100. The presence of characters with Unicode
+# code points > 0xffff (supplementary characters), that it takes two
+# code units to represent, makes this non-trivial.
+#
+# The program defines a variety of test strings of differing lengths
+# and distribution of supplementary characters, then times some basic
+# operations involving index translation: of retrieval, slicing and
+# find, to provide an average time per operation. It runs several trials
+# of each test case (a combination of an operation and the test material)
+# and reports the shortest trial, using the same strategy as the timeit
+# module).
+#
+# It was difficult to get repeatable times from the test under Jython,
+# because the JIT compiler (?) is non-deterministic. It proved
+# impossible using a strategy that would run the same test case multiple
+# times in succession. The approach eventually taken was to run the all
+# the test cases once, then repeat this sequence several times, and
+# report the minimum time of each case in these widely separated trials.
+# This strategy provides stable results with the default JIT behaviour.
+#
+# Two interesting options are to run with the # JIT compiler disabled:
+#   $ jython -J-Xint tests/python/unicode_index_times.py
+#
+# or with it continuously enabled:
+#   $ jython -J-Xcomp tests/python/unicode_index_times.py
+#
+from __future__ import print_function
+import itertools
+import random
+import sys
+import time
+
+if sys.platform == "win32" or sys.platform.startswith("java"):
+    # On Windows and Jython, the best timer is time.clock()
+    timer = time.clock
+else:
+    # On most other platforms the best timer is time.time()
+    timer = time.time
+
+if sys.version_info[0] > 2:
+    unichr = chr
+else:
+    def next(it) : return it.next()
+
+# Proportion of characters that are supplementary (if not explicit)
+DEFAULT_RATE = 0.2  # 20%
+
+# We will test performance with these sizes
+SHORT = 10
+MEDIUM = 100
+LONG = 1000
+HUGE = 10000
+
+
+class UnicodeMaterial(object):
+    ''' Object holding a list of single characters and a unicode string
+        that is their concatenation. The sequence is created from a
+        background sequence of basic plane characters and random
+        replacement with supplementary plane characters (those with
+        point code>0xffff).
+    '''
+
+    base = tuple(u'abcdefghijklmnopqrstuvwxyz')
+    if sys.maxunicode < 0x10000:
+        print("Narrow build: all characters from BMP", file=sys.stderr)
+        supp = tuple(map(unichr, range(0x20, 0x2c)))
+    else:
+        # Wide build: we have real supplementary characters
+        supp = tuple(map(unichr, range(0x10000, 0x1000c)))
+    used = sorted(set(base+supp))
+
+    def __init__(self, size=20, pred=None, ran=None):
+        ''' Create size chars choosing an SP char at i where
+            pred(ran, i)==True where ran is an instance of
+            random.Random supplied in the constructor or created
+            locally (if ran==None).
+        '''
+
+        # Generators for the BMP and SP characters
+        base = itertools.cycle(UnicodeMaterial.base)
+        supp = itertools.cycle(UnicodeMaterial.supp)
+
+        # Each instance gets a random generator
+        if ran is None:
+            ran = random.Random()
+        self.random = ran
+
+        if pred is None:
+            pred = lambda ran, j : ran.random() < DEFAULT_RATE
+
+        # Generate the list
+        r = list()
+        for i in range(size):
+            if pred(self.random, i):
+                c = next(supp)
+            else:
+                c = next(base)
+            r.append(c)
+
+        # The list and its concatenation are our material
+        self.ref = r
+        self.size = len(r)
+        self.text = u''.join(r)
+        self.target = u''
+
+    def __len__(self):
+        return self.size
+
+    def insert(self, target, p=None):
+        ''' Insert target string at position p (or middle), truncating if
+            that would make the material any longer
+        '''
+        if p is None:
+            p = max(0, (self.size-len(target)) // 2)
+
+        n = 0
+        for t in target:
+            if p+n >= self.size:
+                break;
+            self.ref[p+n] = t
+            n += 1
+
+        self.target = target[:n]
+        self.text = u''.join(self.ref)
+
+
+class UnicodeActions(UnicodeMaterial):
+    ''' Provides test material with loops for timing.'''
+
+    def __init__(self, size=20, pred=None, ran=None):
+        super(UnicodeActions, self).__init__(size, pred, ran)
+        if self.size <= 0:
+            raise ValueError("The timings don't work for zero length")
+        self.used =  UnicodeMaterial.used
+        self.trash = None
+        # String to find (note 'abcde' in base: nice for false alarms)
+        self.insert(u"abide")
+
+
+    def repeat_getitem(self, mincount):
+        ''' Access each code point in sequence repeatedly so that at
+            least mincount operations have been peformed, and return the
+            actual number of operations.
+        '''
+        n = self.size
+        t = self.text
+        opcount = 0
+        dummy = None
+        while opcount < mincount:
+            # Access each point code
+            i = 0
+            while i < n:
+                # Avoid optimising away the call
+                dummy = t[i]
+                i += 1
+            opcount += n
+        self.trash = dummy
+        return opcount
+
+    def repeat_slice(self, mincount):
+        ''' Extract a slice at each feasible position and length,
+            repeating enough times to do mincount operations, and
+            return the actual number of operations.
+        '''
+        n = self.size
+        t = self.text
+        opcount = 0
+        dummy = None
+
+        while opcount < mincount:
+            m = 1
+            while m <= n:
+                starts = n - m + 1
+                for i in range(starts):
+                    dummy = t[i:i+m]
+                opcount += starts
+                m *= 5
+
+        return opcount
+
+    def repeat_find_char(self, mincount):
+        ''' Do an incremental find of each code point, repeating
+            enough times to do mincount finds, and return the actual
+            number of operations.
+        '''
+        opcount = 0
+        n = self.size
+        findop = self.text.find
+        dummy = 0
+
+        while opcount < mincount:
+            # The text is searched for every code c.
+            for c in self.used:
+                # ... and every occurrence is found.
+                start = 0
+                while start < n:
+                    i = findop(c, start)
+                    if i < 0: break
+                    # Avoid optimising away the call
+                    dummy += i
+                    start = i + 1
+
+            # Every character in the text was a hit exactly once.
+            # And every character was also a miss, except for
+            # the character that ends the text. So we did:
+            opcount += n + len(self.used) - 1
+
+        self.trash = dummy
+        return opcount
+
+    def repeat_find_str(self, mincount):
+        ''' Find the target string within the material, repeating
+            enough times to do mincount finds, and return the actual
+            number of operations.
+        '''
+        opcount = 0
+        s = self.target
+        findop = self.text.find
+        dummy = 0
+
+        while opcount < mincount:
+            dummy += findop(s)
+            opcount += 1
+
+        self.trash = dummy
+        return opcount
+
+    def repeat_rfind_char(self, mincount):
+        ''' Do an incremental rfind of each code point, repeating
+            enough times to do mincount finds, and return the actual
+            number of operations.
+        '''
+        opcount = 0
+        n = self.size
+        findop = self.text.rfind
+
+        while opcount < mincount:
+            # The text is searched for every code c.
+            for c in self.used:
+                # ... and every occurrence is found.
+                end = n
+                while end >= 0:
+                    end = findop(c, 0, end)
+
+            # Every character in the text was a hit exactly once.
+            # And every character was also a miss. So we did:
+            opcount += n + len(self.used)
+
+        return opcount
+
+    def repeat_rfind_str(self, mincount):
+        ''' Find the target string within the material, repeating
+            enough times to do mincount finds, and return the actual
+            number of operations.
+        '''
+        opcount = 0
+        s = self.target
+        findop = self.text.rfind
+        dummy = 0
+
+        while opcount < mincount:
+            dummy += findop(s)
+            opcount += 1
+
+        self.trash = dummy
+        return opcount
+
+
+def time_per_op(op, mincount):
+    ''' Repeat the given operation at least mincount times and
+        return the time per operation in microseconds.
+    '''
+    t = timer()
+    opcount = op(mincount)
+    return (timer() - t) * 1e6 / opcount
+
+# Functions defining particular distributions of SP codes
+#
+def evenly(rate=DEFAULT_RATE):
+    'Evenly distributed at given rate'
+    def f(ran, i):
+        return ran.random() < rate
+    return f
+
+def evenly_before(k, rate=DEFAULT_RATE):
+    'Evenly distributed on i<k at given rate'
+    def f(ran, i):
+        return i < k and ran.random() < rate
+    return f
+
+def evenly_from(k, rate=DEFAULT_RATE):
+    'Evenly distributed on i>=k at given rate'
+    def f(ran, i):
+        return i >= k and ran.random() < rate
+    return f
+
+def time_all():
+
+    setups = [
+        #("empty", UnicodeActions(0)),
+        ("single bmp", UnicodeActions(1, (lambda ran, i : False))),
+        ("single", UnicodeActions(1, (lambda ran, i : True))),
+        ("short bmp", UnicodeActions(SHORT, (lambda ran, i : False))),
+        ("short 50%", UnicodeActions(SHORT, evenly(0.5))),
+        ("medium bmp", UnicodeActions(MEDIUM, (lambda ran, i : False))),
+        ("medium 10%", UnicodeActions(MEDIUM, evenly(0.1))),
+        ("long bmp", UnicodeActions(LONG, (lambda ran, i : False))),
+        ("long 1%", UnicodeActions(LONG, evenly(0.01))),
+        ("long 10%", UnicodeActions(LONG, evenly(0.1))),
+        ("long 10% low", UnicodeActions(LONG, evenly_before(LONG/4, 0.4))),
+        ("long 10% high", UnicodeActions(LONG, evenly_from(LONG-(LONG/4), 0.4))),
+        ("long 50%", UnicodeActions(LONG, evenly(0.5))),
+        ("huge bmp", UnicodeActions(HUGE, (lambda ran, i : False))),
+        ("huge 10%", UnicodeActions(HUGE, evenly(0.1))),
+    ]
+
+    ops = [
+        ("[i]", "repeat_getitem"),
+        ("[i:i+n]", "repeat_slice"),
+        ("find(c)", "repeat_find_char"),
+        ("find(str)", "repeat_find_str"),
+        ("rfind(c)", "repeat_rfind_char"),
+        ("rfind(str)", "repeat_rfind_str"),
+    ]
+
+
+    print("{:15s}{:>6s}".format("time (us)", "len"), end='')
+    for title, _ in ops:
+        print("{:>12s}".format(title), end='')
+    print()
+
+    mintime = dict()
+    def save_mintime(k, t):
+        mintime[k] = min(t, mintime.get(k, 1e9))
+
+    trashcan = []
+
+    # Cycle through the cases repeatedly.
+    for k in range(5):
+        for name, material in setups:
+            for _, opname in ops:
+                # For each case, keep the minimum time
+                op = material.__getattribute__(opname)
+                t = time_per_op(op, 1000)
+                save_mintime((name, opname), t)
+
+            if k == 0:
+                trashcan.append(material.trash)
+
+    # Cycle through the cases again to print them.
+    for name, material in setups:
+        print("{:15s}{:6d}".format(name, material.size), end='')
+        for _, opname in ops:
+            t = mintime[(name, opname)]
+            print("{:12.3f}".format(t), end='')
+        print()
+
+    print("y =", trashcan)
+
+if __name__ == "__main__":
+
+    time_all()

-- 
Repository URL: http://hg.python.org/jython