[pypy-commit] pypy unicode-utf8-re: in-progress
arigo
pypy.commits at gmail.com
Sun Dec 3 09:44:59 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8-re
Changeset: r93244:4b6473b3ea05
Date: 2017-12-03 15:44 +0100
http://bitbucket.org/pypy/pypy/changeset/4b6473b3ea05/
Log: in-progress
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -1164,7 +1164,7 @@
if sre_match(ctx, base, start, None) is not None:
ctx.match_start = start
return True
- start += 1
+ start = ctx.next(start)
return False
install_jitdriver_spec('FastSearch',
@@ -1183,6 +1183,8 @@
prefix_len = ctx.pat(5)
assert prefix_len >= 0
i = 0
+ j = 0
+ past_start_positions = [0] * (prefix_len - 1)
while True:
ctx.jitdriver_FastSearch.jit_merge_point(ctx=ctx,
string_position=string_position, i=i, prefix_len=prefix_len)
@@ -1196,10 +1198,26 @@
i += 1
if i == prefix_len:
# found a potential match
- start = string_position + 1 - prefix_len
- assert start >= 0
+
+ # This would be 'start = string_position + 1 - prefix_len'
+ # but it's probably faster to record the 'prefix_len - 1'
+ # most recent locations, for utf8
+ start = past_start_positions[j]
+ assert start >= ctx.ZERO
prefix_skip = ctx.pat(6)
- ptr = start + prefix_skip
+ if prefix_skip >= prefix_len - 1:
+ try:
+ ptr = ctx.next_n(string_position,
+ prefix_skip - (prefix_len - 1),
+ ctx.end)
+ except EndOfString:
+ ptr = -1
+ else:
+ assert prefix_skip < prefix_len - 1
+ j_prefix_skip = j + prefix_skip
+ if j_prefix_skip >= prefix_len - 1:
+ j_prefix_skip -= (prefix_len - 1)
+ ptr = past_start_positions[j_prefix_skip]
#flags = ctx.pat(2)
#if flags & rsre_char.SRE_INFO_LITERAL:
# # matched all of pure literal pattern
@@ -1209,11 +1227,16 @@
# return True
pattern_offset = ctx.pat(1) + 1
ppos_start = pattern_offset + 2 * prefix_skip
- if sre_match(ctx, ppos_start, ptr, None) is not None:
+ if (ptr >= ctx.ZERO and
+ sre_match(ctx, ppos_start, ptr, None) is not None):
ctx.match_start = start
return True
overlap_offset = prefix_len + (7 - 1)
i = ctx.pat(overlap_offset + i)
- string_position += 1
+ past_start_positions[j] = string_position
+ string_position = ctx.next(string_position)
if string_position >= ctx.end:
return False
+ j += 1
+ if j == prefix_len - 1:
+ j = 0
diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py
--- a/rpython/rlib/rsre/test/support.py
+++ b/rpython/rlib/rsre/test/support.py
@@ -1,6 +1,6 @@
import sys, random
from rpython.rlib import debug
-from rpython.rlib.rsre.rsre_core import _adjust, match_context
+from rpython.rlib.rsre.rsre_core import _adjust, match_context, search_context
from rpython.rlib.rsre.rsre_core import StrMatchContext, EndOfString
@@ -112,3 +112,13 @@
def fullmatch(pattern, string, start=0, end=sys.maxint, flags=0):
return match(pattern, string, start, end, flags, fullmatch=True)
+
+def search(pattern, string, start=0, end=sys.maxint, flags=0):
+ start, end = _adjust(start, end, len(string))
+ start = Position(start)
+ end = Position(end)
+ ctx = MatchContextForTests(pattern, string, start, end, flags)
+ if search_context(ctx):
+ return ctx
+ else:
+ return None
diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py
--- a/rpython/rlib/rsre/test/test_search.py
+++ b/rpython/rlib/rsre/test/test_search.py
@@ -1,44 +1,44 @@
import re, py
-from rpython.rlib.rsre import rsre_core
from rpython.rlib.rsre.test.test_match import get_code, get_code_and_re
+from rpython.rlib.rsre.test.support import search, match
class TestSearch:
def test_code1(self):
r_code1 = get_code(r'[abc][def][ghi]')
- res = rsre_core.search(r_code1, "fooahedixxx")
+ res = search(r_code1, "fooahedixxx")
assert res is None
- res = rsre_core.search(r_code1, "fooahcdixxx")
+ res = search(r_code1, "fooahcdixxx")
assert res is not None
assert res.span() == (5, 8)
def test_code2(self):
r_code2 = get_code(r'<item>\s*<title>(.*?)</title>')
- res = rsre_core.search(r_code2, "foo bar <item> <title>abc</title>def")
+ res = search(r_code2, "foo bar <item> <title>abc</title>def")
assert res is not None
assert res.span() == (8, 34)
def test_pure_literal(self):
r_code3 = get_code(r'foobar')
- res = rsre_core.search(r_code3, "foo bar foobar baz")
+ res = search(r_code3, "foo bar foobar baz")
assert res is not None
assert res.span() == (8, 14)
def test_code3(self):
r_code1 = get_code(r'<item>\s*<title>(.*?)</title>')
- res = rsre_core.match(r_code1, "<item> <title>abc</title>def")
+ res = match(r_code1, "<item> <title>abc</title>def")
assert res is not None
def test_max_until_0_65535(self):
r_code2 = get_code(r'<abc>(?:xy)*xy</abc>')
- #res = rsre_core.match(r_code2, '<abc></abc>def')
+ #res = match(r_code2, '<abc></abc>def')
#assert res is None
- #res = rsre_core.match(r_code2, '<abc>xy</abc>def')
+ #res = match(r_code2, '<abc>xy</abc>def')
#assert res is not None
- res = rsre_core.match(r_code2, '<abc>xyxyxy</abc>def')
+ res = match(r_code2, '<abc>xyxyxy</abc>def')
assert res is not None
- res = rsre_core.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
+ res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
assert res is not None
def test_max_until_3_5(self):
@@ -46,18 +46,18 @@
for i in range(8):
s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef'
assert (r.match(s) is not None) is (3 <= i-1 <= 5)
- res = rsre_core.match(r_code2, s)
+ res = match(r_code2, s)
assert (res is not None) is (3 <= i-1 <= 5)
def test_min_until_0_65535(self):
r_code2 = get_code(r'<abc>(?:xy)*?xy</abc>')
- res = rsre_core.match(r_code2, '<abc></abc>def')
+ res = match(r_code2, '<abc></abc>def')
assert res is None
- res = rsre_core.match(r_code2, '<abc>xy</abc>def')
+ res = match(r_code2, '<abc>xy</abc>def')
assert res is not None
- res = rsre_core.match(r_code2, '<abc>xyxyxy</abc>def')
+ res = match(r_code2, '<abc>xyxyxy</abc>def')
assert res is not None
- res = rsre_core.match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
+ res = match(r_code2, '<abc>' + 'xy'*1000 + '</abc>def')
assert res is not None
def test_min_until_3_5(self):
@@ -65,44 +65,44 @@
for i in range(8):
s = '<abc>' + 'xy'*i + '</abc>defdefdefdefdef'
assert (r.match(s) is not None) is (3 <= i-1 <= 5)
- res = rsre_core.match(r_code2, s)
+ res = match(r_code2, s)
assert (res is not None) is (3 <= i-1 <= 5)
def test_min_repeat_one(self):
r_code3 = get_code(r'<abc>.{3,5}?y')
for i in range(8):
- res = rsre_core.match(r_code3, '<abc>' + 'x'*i + 'y')
+ res = match(r_code3, '<abc>' + 'x'*i + 'y')
assert (res is not None) is (3 <= i <= 5)
def test_simple_group(self):
r_code4 = get_code(r'<abc>(x.)</abc>')
- res = rsre_core.match(r_code4, '<abc>xa</abc>def')
+ res = match(r_code4, '<abc>xa</abc>def')
assert res is not None
assert res.get_mark(0) == 5
assert res.get_mark(1) == 7
def test_max_until_groups(self):
r_code4 = get_code(r'<abc>(x.)*xy</abc>')
- res = rsre_core.match(r_code4, '<abc>xaxbxy</abc>def')
+ res = match(r_code4, '<abc>xaxbxy</abc>def')
assert res is not None
assert res.get_mark(0) == 7
assert res.get_mark(1) == 9
def test_group_branch(self):
r_code5 = get_code(r'<abc>(ab|c)</abc>')
- res = rsre_core.match(r_code5, '<abc>ab</abc>def')
+ res = match(r_code5, '<abc>ab</abc>def')
assert (res.get_mark(0), res.get_mark(1)) == (5, 7)
- res = rsre_core.match(r_code5, '<abc>c</abc>def')
+ res = match(r_code5, '<abc>c</abc>def')
assert (res.get_mark(0), res.get_mark(1)) == (5, 6)
- res = rsre_core.match(r_code5, '<abc>de</abc>def')
+ res = match(r_code5, '<abc>de</abc>def')
assert res is None
def test_group_branch_max_until(self):
r_code6 = get_code(r'<abc>(ab|c)*a</abc>')
- res = rsre_core.match(r_code6, '<abc>ccabcccaba</abc>def')
+ res = match(r_code6, '<abc>ccabcccaba</abc>def')
assert (res.get_mark(0), res.get_mark(1)) == (12, 14)
r_code7 = get_code(r'<abc>((ab)|(c))*a</abc>')
- res = rsre_core.match(r_code7, '<abc>ccabcccaba</abc>def')
+ res = match(r_code7, '<abc>ccabcccaba</abc>def')
assert (res.get_mark(0), res.get_mark(1)) == (12, 14)
assert (res.get_mark(2), res.get_mark(3)) == (12, 14)
assert (res.get_mark(4), res.get_mark(5)) == (11, 12)
@@ -113,7 +113,7 @@
assert match.span(1) == (12, 13)
assert match.span(3) == (12, 13)
assert match.span(2) == (8, 9)
- res = rsre_core.match(r_code7, '<abc>bbbabbbb</abc>')
+ res = match(r_code7, '<abc>bbbabbbb</abc>')
assert (res.get_mark(0), res.get_mark(1)) == (12, 13)
assert (res.get_mark(4), res.get_mark(5)) == (12, 13)
assert (res.get_mark(2), res.get_mark(3)) == (8, 9)
@@ -124,7 +124,7 @@
assert match.span(1) == (6, 7)
assert match.span(3) == (6, 7)
assert match.span(2) == (5, 6)
- res = rsre_core.match(r_code8, '<abc>ab</abc>')
+ res = match(r_code8, '<abc>ab</abc>')
assert (res.get_mark(0), res.get_mark(1)) == (6, 7)
assert (res.get_mark(4), res.get_mark(5)) == (6, 7)
assert (res.get_mark(2), res.get_mark(3)) == (5, 6)
@@ -134,7 +134,7 @@
match = r9.match('xyzxc')
assert match.span(1) == (3, 4)
assert match.span(2) == (-1, -1)
- res = rsre_core.match(r_code9, 'xyzxc')
+ res = match(r_code9, 'xyzxc')
assert (res.get_mark(0), res.get_mark(1)) == (3, 4)
assert (res.get_mark(2), res.get_mark(3)) == (-1, -1)
@@ -143,7 +143,7 @@
match = r9.match('xycxyzxc')
assert match.span(2) == (6, 7)
#assert match.span(3) == (1, 2) --- bug of CPython
- res = rsre_core.match(r_code9, 'xycxyzxc')
+ res = match(r_code9, 'xycxyzxc')
assert (res.get_mark(2), res.get_mark(3)) == (6, 7)
assert (res.get_mark(4), res.get_mark(5)) == (1, 2)
@@ -151,19 +151,19 @@
r_code, r = get_code_and_re(r'(a?)+y')
assert r.match('y')
assert r.match('aaayaaay').span() == (0, 4)
- res = rsre_core.match(r_code, 'y')
+ res = match(r_code, 'y')
assert res
- res = rsre_core.match(r_code, 'aaayaaay')
+ res = match(r_code, 'aaayaaay')
assert res and res.span() == (0, 4)
#
r_code, r = get_code_and_re(r'(a?){4,6}y')
assert r.match('y')
- res = rsre_core.match(r_code, 'y')
+ res = match(r_code, 'y')
assert res
#
r_code, r = get_code_and_re(r'(a?)*y')
assert r.match('y')
- res = rsre_core.match(r_code, 'y')
+ res = match(r_code, 'y')
assert res
def test_empty_maxuntil_2(self):
@@ -173,24 +173,24 @@
py.test.skip("older version of the stdlib: %s" % (e,))
assert r.match('XfooXbarX').span() == (0, 5)
assert r.match('XfooXbarX').span(1) == (4, 4)
- res = rsre_core.match(r_code, 'XfooXbarX')
+ res = match(r_code, 'XfooXbarX')
assert res.span() == (0, 5)
assert res.span(1) == (4, 4)
def test_empty_minuntil(self):
r_code, r = get_code_and_re(r'(a?)+?y')
#assert not r.match('z') -- CPython bug (at least 2.5) eats all memory
- res = rsre_core.match(r_code, 'z')
+ res = match(r_code, 'z')
assert not res
#
r_code, r = get_code_and_re(r'(a?){4,6}?y')
assert not r.match('z')
- res = rsre_core.match(r_code, 'z')
+ res = match(r_code, 'z')
assert not res
#
r_code, r = get_code_and_re(r'(a?)*?y')
#assert not r.match('z') -- CPython bug (at least 2.5) eats all memory
- res = rsre_core.match(r_code, 'z')
+ res = match(r_code, 'z')
assert not res
def test_empty_search(self):
@@ -198,7 +198,7 @@
for j in range(-2, 6):
for i in range(-2, 6):
match = r.search('abc', i, j)
- res = rsre_core.search(r_code, 'abc', i, j)
+ res = search(r_code, 'abc', i, j)
jk = min(max(j, 0), 3)
ik = min(max(i, 0), 3)
if ik <= jk:
More information about the pypy-commit
mailing list