[pypy-commit] pypy unicode-utf8-re: in-progress

arigo pypy.commits at gmail.com
Sun Dec 3 09:44:59 EST 2017


Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8-re
Changeset: r93244:4b6473b3ea05
Date: 2017-12-03 15:44 +0100
http://bitbucket.org/pypy/pypy/changeset/4b6473b3ea05/

Log:	in-progress

diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -1164,7 +1164,7 @@
             if sre_match(ctx, base, start, None) is not None:
                 ctx.match_start = start
                 return True
-        start += 1
+        start = ctx.next(start)
     return False
 
 install_jitdriver_spec('FastSearch',
@@ -1183,6 +1183,8 @@
     prefix_len = ctx.pat(5)
     assert prefix_len >= 0
     i = 0
+    j = 0
+    past_start_positions = [0] * (prefix_len - 1)
     while True:
         ctx.jitdriver_FastSearch.jit_merge_point(ctx=ctx,
                 string_position=string_position, i=i, prefix_len=prefix_len)
@@ -1196,10 +1198,26 @@
             i += 1
             if i == prefix_len:
                 # found a potential match
-                start = string_position + 1 - prefix_len
-                assert start >= 0
+
+                # This would be 'start = string_position + 1 - prefix_len'
+                # but it's probably faster to record the 'prefix_len'
+                # most recent locations, for utf8
+                start = past_start_positions[j]
+                assert start >= ctx.ZERO
                 prefix_skip = ctx.pat(6)
-                ptr = start + prefix_skip
+                if prefix_skip >= prefix_len - 1:
+                    try:
+                        ptr = ctx.next_n(string_position,
+                                         prefix_skip - (prefix_len - 1),
+                                         ctx.end)
+                    except EndOfString:
+                        ptr = -1
+                else:
+                    assert prefix_skip < prefix_len - 1
+                    j_prefix_skip = j + prefix_skip
+                    if j_prefix_skip >= prefix_len - 1:
+                        j_prefix_skip -= (prefix_len - 1)
+                    ptr = past_start_positions[j_prefix_skip]
                 #flags = ctx.pat(2)
                 #if flags & rsre_char.SRE_INFO_LITERAL:
                 #    # matched all of pure literal pattern
@@ -1209,11 +1227,16 @@
                 #    return True
                 pattern_offset = ctx.pat(1) + 1
                 ppos_start = pattern_offset + 2 * prefix_skip
-                if sre_match(ctx, ppos_start, ptr, None) is not None:
+                if (ptr >= ctx.ZERO and
+                        sre_match(ctx, ppos_start, ptr, None) is not None):
                     ctx.match_start = start
                     return True
                 overlap_offset = prefix_len + (7 - 1)
                 i = ctx.pat(overlap_offset + i)
-        string_position += 1
+        past_start_positions[j] = string_position
+        string_position = ctx.next(string_position)
         if string_position >= ctx.end:
             return False
+        j += 1
+        if j == prefix_len - 1:
+            j = 0
diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py
--- a/rpython/rlib/rsre/test/support.py
+++ b/rpython/rlib/rsre/test/support.py
@@ -1,6 +1,6 @@
 import sys, random
 from rpython.rlib import debug
-from rpython.rlib.rsre.rsre_core import _adjust, match_context
+from rpython.rlib.rsre.rsre_core import _adjust, match_context, search_context
 from rpython.rlib.rsre.rsre_core import StrMatchContext, EndOfString
 
 
@@ -112,3 +112,13 @@
 
 def fullmatch(pattern, string, start=0, end=sys.maxint, flags=0):
     return match(pattern, string, start, end, flags, fullmatch=True)
+
+def search(pattern, string, start=0, end=sys.maxint, flags=0):
+    start, end = _adjust(start, end, len(string))
+    start = Position(start)
+    end = Position(end)
+    ctx = MatchContextForTests(pattern, string, start, end, flags)
+    if search_context(ctx):
+        return ctx
+    else:
+        return None
diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py
--- a/rpython/rlib/rsre/test/test_search.py
+++ b/rpython/rlib/rsre/test/test_search.py
@@ -1,44 +1,44 @@
 import re, py
-from rpython.rlib.rsre import rsre_core
 from rpython.rlib.rsre.test.test_match import get_code, get_code_and_re
+from rpython.rlib.rsre.test.support import search, match
 
 
 class TestSearch:
 
     def test_code1(self):
         r_code1 = get_code(r'[abc][def][ghi]')
-        res = rsre_core.search(r_code1, "fooahedixxx")
+        res = search(r_code1, "fooahedixxx")
         assert res is None
-        res = rsre_core.search(r_code1, "fooahcdixxx")
+        res = search(r_code1, "fooahcdixxx")
         assert res is not None
         assert res.span() == (5, 8)
 
     def test_code2(self):
         r_code2 = get_code(r'<item>\s*<title>(.*?)</title>')
-        res = rsre_core.search(r_code2, "foo bar <item>  <title>abc</title>def")
+        res = search(r_code2, "foo bar <item>  <title>abc</title>def")
         assert res is not None
         assert res.span() == (8, 34)
 
     def test_pure_literal(self):
         r_code3 = get_code(r'foobar')
-        res = rsre_core.search(r_code3, "foo bar foobar baz")
+        res = search(r_code3, "foo bar foobar baz")
         assert res is not None
         assert res.span() == (8, 14)
 
     def test_code3(self):
         r_code1 = get_code(r'<item>\s*<title>(.*?)</title>')
-        res = rsre_core.match(r_code1, "<item>  <title>abc</title>def")
+        res = match(r_code1, "<item>  <title>abc</title>def")
         assert res is not None
 
     def test_max_until_0_65535(self):
         r_code2 = get_code(r'<abc>(?:xy)*xy</abc>')
-        #res = rsre_core.match(r_code2, '<abc></abc>def')
+        #res = match(r_code2, '<abc></abc>def')
         #assert res is None
-        #res = rsre_core.match(r_code2, '<abc>xy</abc>def')
+        #res = match(r_code2, '<abc>xy</abc>def')
         #assert res is not None
-        res = rsre_core.match(r_code2, '<abc>xyxyxy</abc>def')
+        res = match(r_code2, '<abc>xyxyxy</abc>def')
         assert res is not None
-        res = rsre_core.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
+        res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
         assert res is not None
 
     def test_max_until_3_5(self):
@@ -46,18 +46,18 @@
         for i in range(8):
             s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef'
             assert (r.match(s) is not None) is (3 <= i-1 <= 5)
-            res = rsre_core.match(r_code2, s)
+            res = match(r_code2, s)
             assert (res is not None) is (3 <= i-1 <= 5)
 
     def test_min_until_0_65535(self):
         r_code2 = get_code(r'<abc>(?:xy)*?xy</abc>')
-        res = rsre_core.match(r_code2, '<abc></abc>def')
+        res = match(r_code2, '<abc></abc>def')
         assert res is None
-        res = rsre_core.match(r_code2, '<abc>xy</abc>def')
+        res = match(r_code2, '<abc>xy</abc>def')
         assert res is not None
-        res = rsre_core.match(r_code2, '<abc>xyxyxy</abc>def')
+        res = match(r_code2, '<abc>xyxyxy</abc>def')
         assert res is not None
-        res = rsre_core.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
+        res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
         assert res is not None
 
     def test_min_until_3_5(self):
@@ -65,44 +65,44 @@
         for i in range(8):
             s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef'
             assert (r.match(s) is not None) is (3 <= i-1 <= 5)
-            res = rsre_core.match(r_code2, s)
+            res = match(r_code2, s)
             assert (res is not None) is (3 <= i-1 <= 5)
 
     def test_min_repeat_one(self):
         r_code3 = get_code(r'<abc>.{3,5}?y')
         for i in range(8):
-            res = rsre_core.match(r_code3, '<abc>' + 'x'*i + 'y')
+            res = match(r_code3, '<abc>' + 'x'*i + 'y')
             assert (res is not None) is (3 <= i <= 5)
 
     def test_simple_group(self):
         r_code4 = get_code(r'<abc>(x.)</abc>')
-        res = rsre_core.match(r_code4, '<abc>xa</abc>def')
+        res = match(r_code4, '<abc>xa</abc>def')
         assert res is not None
         assert res.get_mark(0) == 5
         assert res.get_mark(1) == 7
 
     def test_max_until_groups(self):
         r_code4 = get_code(r'<abc>(x.)*xy</abc>')
-        res = rsre_core.match(r_code4, '<abc>xaxbxy</abc>def')
+        res = match(r_code4, '<abc>xaxbxy</abc>def')
         assert res is not None
         assert res.get_mark(0) == 7
         assert res.get_mark(1) == 9
 
     def test_group_branch(self):
         r_code5 = get_code(r'<abc>(ab|c)</abc>')
-        res = rsre_core.match(r_code5, '<abc>ab</abc>def')
+        res = match(r_code5, '<abc>ab</abc>def')
         assert (res.get_mark(0), res.get_mark(1)) == (5, 7)
-        res = rsre_core.match(r_code5, '<abc>c</abc>def')
+        res = match(r_code5, '<abc>c</abc>def')
         assert (res.get_mark(0), res.get_mark(1)) == (5, 6)
-        res = rsre_core.match(r_code5, '<abc>de</abc>def')
+        res = match(r_code5, '<abc>de</abc>def')
         assert res is None
 
     def test_group_branch_max_until(self):
         r_code6 = get_code(r'<abc>(ab|c)*a</abc>')
-        res = rsre_core.match(r_code6, '<abc>ccabcccaba</abc>def')
+        res = match(r_code6, '<abc>ccabcccaba</abc>def')
         assert (res.get_mark(0), res.get_mark(1)) == (12, 14)
         r_code7 = get_code(r'<abc>((ab)|(c))*a</abc>')
-        res = rsre_core.match(r_code7, '<abc>ccabcccaba</abc>def')
+        res = match(r_code7, '<abc>ccabcccaba</abc>def')
         assert (res.get_mark(0), res.get_mark(1)) == (12, 14)
         assert (res.get_mark(2), res.get_mark(3)) == (12, 14)
         assert (res.get_mark(4), res.get_mark(5)) == (11, 12)
@@ -113,7 +113,7 @@
         assert match.span(1) == (12, 13)
         assert match.span(3) == (12, 13)
         assert match.span(2) == (8, 9)
-        res = rsre_core.match(r_code7, '<abc>bbbabbbb</abc>')
+        res = match(r_code7, '<abc>bbbabbbb</abc>')
         assert (res.get_mark(0), res.get_mark(1)) == (12, 13)
         assert (res.get_mark(4), res.get_mark(5)) == (12, 13)
         assert (res.get_mark(2), res.get_mark(3)) == (8, 9)
@@ -124,7 +124,7 @@
         assert match.span(1) == (6, 7)
         assert match.span(3) == (6, 7)
         assert match.span(2) == (5, 6)
-        res = rsre_core.match(r_code8, '<abc>ab</abc>')
+        res = match(r_code8, '<abc>ab</abc>')
         assert (res.get_mark(0), res.get_mark(1)) == (6, 7)
         assert (res.get_mark(4), res.get_mark(5)) == (6, 7)
         assert (res.get_mark(2), res.get_mark(3)) == (5, 6)
@@ -134,7 +134,7 @@
         match = r9.match('xyzxc')
         assert match.span(1) == (3, 4)
         assert match.span(2) == (-1, -1)
-        res = rsre_core.match(r_code9, 'xyzxc')
+        res = match(r_code9, 'xyzxc')
         assert (res.get_mark(0), res.get_mark(1)) == (3, 4)
         assert (res.get_mark(2), res.get_mark(3)) == (-1, -1)
 
@@ -143,7 +143,7 @@
         match = r9.match('xycxyzxc')
         assert match.span(2) == (6, 7)
         #assert match.span(3) == (1, 2) --- bug of CPython
-        res = rsre_core.match(r_code9, 'xycxyzxc')
+        res = match(r_code9, 'xycxyzxc')
         assert (res.get_mark(2), res.get_mark(3)) == (6, 7)
         assert (res.get_mark(4), res.get_mark(5)) == (1, 2)
 
@@ -151,19 +151,19 @@
         r_code, r = get_code_and_re(r'(a?)+y')
         assert r.match('y')
         assert r.match('aaayaaay').span() == (0, 4)
-        res = rsre_core.match(r_code, 'y')
+        res = match(r_code, 'y')
         assert res
-        res = rsre_core.match(r_code, 'aaayaaay')
+        res = match(r_code, 'aaayaaay')
         assert res and res.span() == (0, 4)
         #
         r_code, r = get_code_and_re(r'(a?){4,6}y')
         assert r.match('y')
-        res = rsre_core.match(r_code, 'y')
+        res = match(r_code, 'y')
         assert res
         #
         r_code, r = get_code_and_re(r'(a?)*y')
         assert r.match('y')
-        res = rsre_core.match(r_code, 'y')
+        res = match(r_code, 'y')
         assert res
 
     def test_empty_maxuntil_2(self):
@@ -173,24 +173,24 @@
             py.test.skip("older version of the stdlib: %s" % (e,))
         assert r.match('XfooXbarX').span() == (0, 5)
         assert r.match('XfooXbarX').span(1) == (4, 4)
-        res = rsre_core.match(r_code, 'XfooXbarX')
+        res = match(r_code, 'XfooXbarX')
         assert res.span() == (0, 5)
         assert res.span(1) == (4, 4)
 
     def test_empty_minuntil(self):
         r_code, r = get_code_and_re(r'(a?)+?y')
         #assert not r.match('z') -- CPython bug (at least 2.5) eats all memory
-        res = rsre_core.match(r_code, 'z')
+        res = match(r_code, 'z')
         assert not res
         #
         r_code, r = get_code_and_re(r'(a?){4,6}?y')
         assert not r.match('z')
-        res = rsre_core.match(r_code, 'z')
+        res = match(r_code, 'z')
         assert not res
         #
         r_code, r = get_code_and_re(r'(a?)*?y')
         #assert not r.match('z') -- CPython bug (at least 2.5) eats all memory
-        res = rsre_core.match(r_code, 'z')
+        res = match(r_code, 'z')
         assert not res
 
     def test_empty_search(self):
@@ -198,7 +198,7 @@
         for j in range(-2, 6):
             for i in range(-2, 6):
                 match = r.search('abc', i, j)
-                res = rsre_core.search(r_code, 'abc', i, j)
+                res = search(r_code, 'abc', i, j)
                 jk = min(max(j, 0), 3)
                 ik = min(max(i, 0), 3)
                 if ik <= jk:


More information about the pypy-commit mailing list