[pypy-commit] pypy unicode-utf8-re: Remove slowly_convert_byte_pos_to_index

Fri Dec 8 06:45:36 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8-re
Changeset: r93312:b58a53172e21
Date: 2017-12-08 12:44 +0100
http://bitbucket.org/pypy/pypy/changeset/b58a53172e21/

Log:	Remove slowly_convert_byte_pos_to_index

diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -159,9 +159,6 @@
     def prev_n(self, position, n, start_position):
         raise NotImplementedError
     @not_rpython
-    def slowly_convert_byte_pos_to_index(self, position):
-        raise NotImplementedError
-    @not_rpython
     def debug_check_pos(self, position):
         raise NotImplementedError
     @not_rpython
@@ -178,15 +175,13 @@
         raise NotImplementedError
 
     def get_mark(self, gid):
-        mark = find_mark(self.match_marks, gid)
-        return self.slowly_convert_byte_pos_to_index(mark)
+        return find_mark(self.match_marks, gid)
 
     def flatten_marks(self):
         # for testing
         if self.match_marks_flat is None:
             self._compute_flattened_marks()
-        return [self.slowly_convert_byte_pos_to_index(i)
-                for i in self.match_marks_flat]
+        return self.match_marks_flat
 
     def _compute_flattened_marks(self):
         self.match_marks_flat = [self.match_start, self.match_end]
@@ -249,9 +244,6 @@
             raise EndOfString
         return position
 
-    def slowly_convert_byte_pos_to_index(self, position):
-        return position
-
     def debug_check_pos(self, position):
         pass
 
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -3,16 +3,19 @@
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString
 from rpython.rlib.rsre import rsre_char
+from rpython.rlib.objectmodel import we_are_translated
 from rpython.rlib import rutf8
 
 
 class Utf8MatchContext(AbstractMatchContext):
+    """A context that matches unicode, but encoded in a utf8 string.
+    Be careful because most positions taken by, handled in, and returned
+    by this class are expressed in *bytes*, not in characters.
+    """
 
-    def __init__(self, pattern, utf8string, index_storage,
-                 match_start, end, flags):
+    def __init__(self, pattern, utf8string, match_start, end, flags):
         AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
         self._utf8 = utf8string
-        self._index_storage = index_storage
 
     def str(self, index):
         check_nonneg(index)
@@ -58,16 +61,15 @@
         assert position >= 0
         return position
 
-    def slowly_convert_byte_pos_to_index(self, position):
-        return rutf8.codepoint_index_at_byte_position(
-            self._utf8, self._index_storage, position)
-
     def debug_check_pos(self, position):
+        if we_are_translated():
+            return   # sanity check, only performed when running untranslated
+        if position == len(self._utf8):
+            return   # end of string is fine
-        assert not (0x80 <= self._utf8[position] < 0xC0)   # continuation byte
+        assert not (0x80 <= ord(self._utf8[position]) < 0xC0)   # continuation byte; ord() so ints are compared, not str vs int
 
 
-def utf8search(pattern, utf8string, index_storage=None, bytestart=0,
-               byteend=sys.maxint, flags=0):
+def utf8search(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0):
     # bytestart and byteend must be valid byte positions inside the
     # utf8string.
     from rpython.rlib.rsre.rsre_core import search_context
@@ -76,11 +78,9 @@
     assert 0 <= byteend
     if byteend > len(utf8string):
         byteend = len(utf8string)
-    if index_storage is None:     # should be restricted to tests only
-        length = rutf8.check_utf8(utf8string, allow_surrogates=True)
-        index_storage = rutf8.create_utf8_index_storage(utf8string, length)
-    ctx = Utf8MatchContext(pattern, utf8string, index_storage,
-                           bytestart, byteend, flags)
+    ctx = Utf8MatchContext(pattern, utf8string, bytestart, byteend, flags)
+    ctx.debug_check_pos(bytestart)
+    ctx.debug_check_pos(byteend)
     if search_context(ctx):
         return ctx
     else:
diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py
--- a/rpython/rlib/rsre/test/test_search.py
+++ b/rpython/rlib/rsre/test/test_search.py
@@ -12,19 +12,22 @@
         assert res is None
         res = self.search(r_code1, "fooahcdixxx")
         assert res is not None
-        assert res.span() == (5, 8)
+        P = self.P
+        assert res.span() == (P(5), P(8))
 
     def test_code2(self):
         r_code2 = get_code(r'<item>\s*<title>(.*?)</title>')
         res = self.search(r_code2, "foo bar <item>  <title>abc</title>def")
         assert res is not None
-        assert res.span() == (8, 34)
+        P = self.P
+        assert res.span() == (P(8), P(34))
 
     def test_pure_literal(self):
         r_code3 = get_code(r'foobar')
         res = self.search(r_code3, "foo bar foobar baz")
         assert res is not None
-        assert res.span() == (8, 14)
+        P = self.P
+        assert res.span() == (P(8), P(14))
 
     def test_code3(self):
         r_code1 = get_code(r'<item>\s*<title>(.*?)</title>')
@@ -79,34 +82,38 @@
         r_code4 = get_code(r'<abc>(x.)</abc>')
         res = self.match(r_code4, '<abc>xa</abc>def')
         assert res is not None
-        assert res.get_mark(0) == 5
-        assert res.get_mark(1) == 7
+        P = self.P
+        assert res.get_mark(0) == P(5)
+        assert res.get_mark(1) == P(7)
 
     def test_max_until_groups(self):
         r_code4 = get_code(r'<abc>(x.)*xy</abc>')
         res = self.match(r_code4, '<abc>xaxbxy</abc>def')
         assert res is not None
-        assert res.get_mark(0) == 7
-        assert res.get_mark(1) == 9
+        P = self.P
+        assert res.get_mark(0) == P(7)
+        assert res.get_mark(1) == P(9)
 
     def test_group_branch(self):
         r_code5 = get_code(r'<abc>(ab|c)</abc>')
         res = self.match(r_code5, '<abc>ab</abc>def')
-        assert (res.get_mark(0), res.get_mark(1)) == (5, 7)
+        P = self.P
+        assert (res.get_mark(0), res.get_mark(1)) == (P(5), P(7))
         res = self.match(r_code5, '<abc>c</abc>def')
-        assert (res.get_mark(0), res.get_mark(1)) == (5, 6)
+        assert (res.get_mark(0), res.get_mark(1)) == (P(5), P(6))
         res = self.match(r_code5, '<abc>de</abc>def')
         assert res is None
 
     def test_group_branch_max_until(self):
         r_code6 = get_code(r'<abc>(ab|c)*a</abc>')
         res = self.match(r_code6, '<abc>ccabcccaba</abc>def')
-        assert (res.get_mark(0), res.get_mark(1)) == (12, 14)
+        P = self.P
+        assert (res.get_mark(0), res.get_mark(1)) == (P(12), P(14))
         r_code7 = get_code(r'<abc>((ab)|(c))*a</abc>')
         res = self.match(r_code7, '<abc>ccabcccaba</abc>def')
-        assert (res.get_mark(0), res.get_mark(1)) == (12, 14)
-        assert (res.get_mark(2), res.get_mark(3)) == (12, 14)
-        assert (res.get_mark(4), res.get_mark(5)) == (11, 12)
+        assert (res.get_mark(0), res.get_mark(1)) == (P(12), P(14))
+        assert (res.get_mark(2), res.get_mark(3)) == (P(12), P(14))
+        assert (res.get_mark(4), res.get_mark(5)) == (P(11), P(12))
 
     def test_group_7(self):
         r_code7, r7 = get_code_and_re(r'<abc>((a)?(b))*</abc>')
@@ -115,9 +122,10 @@
         assert m.span(3) == (12, 13)
         assert m.span(2) == (8, 9)
         res = self.match(r_code7, '<abc>bbbabbbb</abc>')
-        assert (res.get_mark(0), res.get_mark(1)) == (12, 13)
-        assert (res.get_mark(4), res.get_mark(5)) == (12, 13)
-        assert (res.get_mark(2), res.get_mark(3)) == (8, 9)
+        P = self.P
+        assert (res.get_mark(0), res.get_mark(1)) == (P(12), P(13))
+        assert (res.get_mark(4), res.get_mark(5)) == (P(12), P(13))
+        assert (res.get_mark(2), res.get_mark(3)) == (P(8), P(9))
 
     def test_group_branch_repeat_complex_case(self):
         r_code8, r8 = get_code_and_re(r'<abc>((a)|(b))*</abc>')
@@ -126,9 +134,10 @@
         assert m.span(3) == (6, 7)
         assert m.span(2) == (5, 6)
         res = self.match(r_code8, '<abc>ab</abc>')
-        assert (res.get_mark(0), res.get_mark(1)) == (6, 7)
-        assert (res.get_mark(4), res.get_mark(5)) == (6, 7)
-        assert (res.get_mark(2), res.get_mark(3)) == (5, 6)
+        P = self.P
+        assert (res.get_mark(0), res.get_mark(1)) == (P(6), P(7))
+        assert (res.get_mark(4), res.get_mark(5)) == (P(6), P(7))
+        assert (res.get_mark(2), res.get_mark(3)) == (P(5), P(6))
 
     def test_minuntil_lastmark_restore(self):
         r_code9, r9 = get_code_and_re(r'(x|yz)+?(y)??c')
@@ -136,7 +145,8 @@
         assert m.span(1) == (3, 4)
         assert m.span(2) == (-1, -1)
         res = self.match(r_code9, 'xyzxc')
-        assert (res.get_mark(0), res.get_mark(1)) == (3, 4)
+        P = self.P
+        assert (res.get_mark(0), res.get_mark(1)) == (P(3), P(4))
         assert (res.get_mark(2), res.get_mark(3)) == (-1, -1)
 
     def test_minuntil_bug(self):
@@ -145,8 +155,9 @@
         assert m.span(2) == (6, 7)
         #assert self.match.span(3) == (1, 2) --- bug of CPython
         res = self.match(r_code9, 'xycxyzxc')
-        assert (res.get_mark(2), res.get_mark(3)) == (6, 7)
-        assert (res.get_mark(4), res.get_mark(5)) == (1, 2)
+        P = self.P
+        assert (res.get_mark(2), res.get_mark(3)) == (P(6), P(7))
+        assert (res.get_mark(4), res.get_mark(5)) == (P(1), P(2))
 
     def test_empty_maxuntil(self):
         r_code, r = get_code_and_re(r'(a?)+y')
@@ -155,7 +166,8 @@
         res = self.match(r_code, 'y')
         assert res
         res = self.match(r_code, 'aaayaaay')
-        assert res and res.span() == (0, 4)
+        P = self.P
+        assert res and res.span() == (P(0), P(4))
         #
         r_code, r = get_code_and_re(r'(a?){4,6}y')
         assert r.match('y')
@@ -175,8 +187,9 @@
         assert r.match('XfooXbarX').span() == (0, 5)
         assert r.match('XfooXbarX').span(1) == (4, 4)
         res = self.match(r_code, 'XfooXbarX')
-        assert res.span() == (0, 5)
-        assert res.span(1) == (4, 4)
+        P = self.P
+        assert res.span() == (P(0), P(5))
+        assert res.span(1) == (P(4), P(4))
 
     def test_empty_minuntil(self):
         r_code, r = get_code_and_re(r'(a?)+?y')
@@ -206,8 +219,8 @@
                     assert match is not None
                     assert match.span() == (ik, ik)
                     assert res is not None
-                    assert res.match_start == self.Position(ik)
-                    assert res.match_end == self.Position(ik)
+                    assert res.match_start == self.P(ik)
+                    assert res.match_end == self.P(ik)
                 else:
                     assert match is None
                     assert res is None
@@ -216,14 +229,14 @@
 class TestSearchCustom(BaseTestSearch):
     search = staticmethod(support.search)
     match = staticmethod(support.match)
-    Position = support.Position
+    P = support.Position
 
 class TestSearchStr(BaseTestSearch):
     search = staticmethod(rsre_core.search)
     match = staticmethod(rsre_core.match)
-    Position = staticmethod(lambda n: n)
+    P = staticmethod(lambda n: n)
 
 class TestSearchUtf8(BaseTestSearch):
     search = staticmethod(rsre_utf8.utf8search)
     match = staticmethod(rsre_utf8.utf8match)
-    Position = staticmethod(lambda n: n)   # NB. only for plain ascii
+    P = staticmethod(lambda n: n)   # NB. only for plain ascii