[pypy-commit] pypy py3.5: Trying to fix some re failures in 3.5 (will graft back to default)

Sat Oct 15 07:56:27 EDT 2016

Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5
Changeset: r87812:bff060a4f720
Date: 2016-10-15 13:55 +0200
http://bitbucket.org/pypy/pypy/changeset/bff060a4f720/

Log:	Trying to fix some re failures in 3.5 (will graft back to default)

diff --git a/rpython/rlib/rlocale.py b/rpython/rlib/rlocale.py
--- a/rpython/rlib/rlocale.py
+++ b/rpython/rlib/rlocale.py
@@ -195,6 +195,7 @@
 
 isalpha = external('isalpha', [rffi.INT], rffi.INT)
 isupper = external('isupper', [rffi.INT], rffi.INT)
+toupper = external('toupper', [rffi.INT], rffi.INT)
 islower = external('islower', [rffi.INT], rffi.INT)
 tolower = external('tolower', [rffi.INT], rffi.INT)
 isalnum = external('isalnum', [rffi.INT], rffi.INT)
diff --git a/rpython/rlib/rsre/rpy/sre_constants.py b/rpython/rlib/rsre/rpy/sre_constants.py
--- a/rpython/rlib/rsre/rpy/sre_constants.py
+++ b/rpython/rlib/rsre/rpy/sre_constants.py
@@ -58,6 +58,7 @@
 REPEAT_ONE = "repeat_one"
 SUBPATTERN = "subpattern"
 MIN_REPEAT_ONE = "min_repeat_one"
+RANGE_IGNORE = "range_ignore"
 
 # positions
 AT_BEGINNING = "at_beginning"
@@ -119,8 +120,8 @@
     REPEAT,
     REPEAT_ONE,
     SUBPATTERN,
-    MIN_REPEAT_ONE
-
+    MIN_REPEAT_ONE,
+    RANGE_IGNORE,
 ]
 
 ATCODES = [
diff --git a/rpython/rlib/rsre/rsre_char.py b/rpython/rlib/rsre/rsre_char.py
--- a/rpython/rlib/rsre/rsre_char.py
+++ b/rpython/rlib/rsre/rsre_char.py
@@ -2,7 +2,7 @@
 Character categories and charsets.
 """
 import sys
-from rpython.rlib.rlocale import tolower, isalnum
+from rpython.rlib.rlocale import tolower, toupper, isalnum
 from rpython.rlib.unroll import unrolling_iterable
 from rpython.rlib import jit
 from rpython.rlib.rarithmetic import int_between
@@ -67,6 +67,19 @@
             char_ord += ord('a') - ord('A')
     return char_ord
 
+def getupper(char_ord, flags):
+    if flags & SRE_FLAG_LOCALE:
+        if char_ord < 256:      # cheating!  Well, CPython does too.
+            char_ord = toupper(char_ord)
+        return char_ord
+    elif flags & SRE_FLAG_UNICODE:
+        assert unicodedb is not None
+        char_ord = unicodedb.toupper(char_ord)
+    else:
+        if int_between(ord('a'), char_ord, ord('z') + 1):   # ASCII upper
+            char_ord += ord('A') - ord('a')
+    return char_ord
+
 #### Category helpers
 
 is_a_word = [(chr(i).isalnum() or chr(i) == '_') for i in range(256)]
@@ -139,16 +152,17 @@
 ##### Charset evaluation
 
 @jit.unroll_safe
-def check_charset(pattern, ppos, char_code):
+def check_charset(ctx, ppos, char_code):
     """Checks whether a character matches set of arbitrary length.
     The set starts at pattern[ppos]."""
     negated = False
     result = False
+    pattern = ctx.pattern
     while True:
         opcode = pattern[ppos]
         for i, function in set_dispatch_unroll:
             if opcode == i:
-                newresult, ppos = function(pattern, ppos, char_code)
+                newresult, ppos = function(ctx, ppos, char_code)
                 result |= newresult
                 break
         else:
@@ -163,18 +177,21 @@
         return not result
     return result
 
-def set_literal(pat, index, char_code):
+def set_literal(ctx, index, char_code):
     # <LITERAL> <code>
+    pat = ctx.pattern
     match = pat[index+1] == char_code
     return match, index + 2
 
-def set_category(pat, index, char_code):
+def set_category(ctx, index, char_code):
     # <CATEGORY> <code>
+    pat = ctx.pattern
     match = category_dispatch(pat[index+1], char_code)
     return match, index + 2
 
-def set_charset(pat, index, char_code):
+def set_charset(ctx, index, char_code):
     # <CHARSET> <bitmap> (16 bits per code word)
+    pat = ctx.pattern
     if CODESIZE == 2:
         match = char_code < 256 and \
                 (pat[index+1+(char_code >> 4)] & (1 << (char_code & 15)))
@@ -184,13 +201,25 @@
                 (pat[index+1+(char_code >> 5)] & (1 << (char_code & 31)))
         return match, index + 9   # skip bitmap
 
-def set_range(pat, index, char_code):
+def set_range(ctx, index, char_code):
     # <RANGE> <lower> <upper>
+    pat = ctx.pattern
     match = int_between(pat[index+1], char_code, pat[index+2] + 1)
     return match, index + 3
 
-def set_bigcharset(pat, index, char_code):
+def set_range_ignore(ctx, index, char_code):
+    # <RANGE_IGNORE> <lower> <upper>
+    # the char_code is already lower cased
+    pat = ctx.pattern
+    lower = pat[index + 1]
+    upper = pat[index + 2]
+    match1 = int_between(lower, char_code, upper + 1)
+    match2 = int_between(lower, getupper(char_code, ctx.flags), upper + 1)
+    return match1 | match2, index + 3
+
+def set_bigcharset(ctx, index, char_code):
     # <BIGCHARSET> <blockcount> <256 blockindices> <blocks>
+    pat = ctx.pattern
     count = pat[index+1]
     index += 2
 
@@ -224,7 +253,7 @@
     index += count * (32 / CODESIZE)  # skip blocks
     return match, index
 
-def set_unicode_general_category(pat, index, char_code):
+def set_unicode_general_category(ctx, index, char_code):
     # Unicode "General category property code" (not used by Python).
     # A general category is two letters.  'pat[index+1]' contains both
     # the first character, and the second character shifted by 8.
@@ -233,6 +262,7 @@
     # Negative matches are triggered by bit number 7.
     assert unicodedb is not None
     cat = unicodedb.category(char_code)
+    pat = ctx.pattern
     category_code = pat[index + 1]
     first_character = category_code & 0x7F
     second_character = (category_code >> 8) & 0x7F
@@ -260,6 +290,7 @@
     11: set_bigcharset,
     19: set_literal,
     27: set_range,
+    32: set_range_ignore,
     70: set_unicode_general_category,
 }
 set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items()))
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -40,6 +40,7 @@
 OPCODE_REPEAT_ONE         = 29
 #OPCODE_SUBPATTERN        = 30
 OPCODE_MIN_REPEAT_ONE     = 31
+OPCODE_RANGE_IGNORE       = 32
 
 # not used by Python itself
 OPCODE_UNICODE_GENERAL_CATEGORY = 70
@@ -640,8 +641,7 @@
         elif op == OPCODE_IN:
             # match set member (or non_member)
             # <IN> <skip> <set>
-            if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern,
-                                                             ppos+1,
+            if ptr >= ctx.end or not rsre_char.check_charset(ctx, ppos+1,
                                                              ctx.str(ptr)):
                 return
             ppos += ctx.pat(ppos)
@@ -650,8 +650,7 @@
         elif op == OPCODE_IN_IGNORE:
             # match set member (or non_member), ignoring case
             # <IN> <skip> <set>
-            if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern,
-                                                             ppos+1,
+            if ptr >= ctx.end or not rsre_char.check_charset(ctx, ppos+1,
                                                              ctx.lowstr(ptr)):
                 return
             ppos += ctx.pat(ppos)
@@ -871,10 +870,10 @@
     return True    # match anything (including a newline)
 @specializectx
 def match_IN(ctx, ptr, ppos):
-    return rsre_char.check_charset(ctx.pattern, ppos+2, ctx.str(ptr))
+    return rsre_char.check_charset(ctx, ppos+2, ctx.str(ptr))
 @specializectx
 def match_IN_IGNORE(ctx, ptr, ppos):
-    return rsre_char.check_charset(ctx.pattern, ppos+2, ctx.lowstr(ptr))
+    return rsre_char.check_charset(ctx, ppos+2, ctx.lowstr(ptr))
 @specializectx
 def match_LITERAL(ctx, ptr, ppos):
     return ctx.str(ptr) == ctx.pat(ppos+1)
@@ -1134,7 +1133,7 @@
     while start < ctx.end:
         ctx.jitdriver_CharsetSearch.jit_merge_point(ctx=ctx, start=start,
                                                     base=base)
-        if rsre_char.check_charset(ctx.pattern, 5, ctx.str(start)):
+        if rsre_char.check_charset(ctx, 5, ctx.str(start)):
             if sre_match(ctx, base, start, None) is not None:
                 ctx.match_start = start
                 return True
diff --git a/rpython/rlib/rsre/test/test_char.py b/rpython/rlib/rsre/test/test_char.py
--- a/rpython/rlib/rsre/test/test_char.py
+++ b/rpython/rlib/rsre/test/test_char.py
@@ -34,6 +34,22 @@
     assert rsre_char.getlower(UPPER_PI, SRE_FLAG_LOCALE | SRE_FLAG_UNICODE) \
                                                          == UPPER_PI
 
+def test_getupper():
+    assert rsre_char.getupper(ord('A'), 0) == ord('A')
+    assert rsre_char.getupper(ord('b'), 0) == ord('B')
+    assert rsre_char.getupper(10, 0) == 10
+    assert rsre_char.getupper(LOWER_PI, 0) == LOWER_PI
+    #
+    assert rsre_char.getupper(ord('a'), SRE_FLAG_UNICODE) == ord('A')
+    assert rsre_char.getupper(ord('2'), SRE_FLAG_UNICODE) == ord('2')
+    assert rsre_char.getupper(10, SRE_FLAG_UNICODE) == 10
+    assert rsre_char.getupper(LOWER_PI, SRE_FLAG_UNICODE) == UPPER_PI
+    #
+    assert rsre_char.getupper(LOWER_PI, SRE_FLAG_LOCALE) == LOWER_PI
+    assert rsre_char.getupper(LOWER_PI, SRE_FLAG_LOCALE | SRE_FLAG_UNICODE) \
+                                                         == LOWER_PI
+
+
 def test_is_word():
     assert rsre_char.is_word(ord('A'))
     assert rsre_char.is_word(ord('_'))
@@ -128,6 +144,10 @@
     assert     cat(CHCODES["category_uni_not_digit"], DINGBAT_CIRCLED)
 
 
+class Ctx:
+    def __init__(self, pattern):
+        self.pattern = pattern
+
 def test_general_category():
     from rpython.rlib.unicodedata import unicodedb
 
@@ -137,12 +157,12 @@
         pat_neg = [70, ord(cat) | 0x80, 0]
         for c in positive:
             assert unicodedb.category(ord(c)).startswith(cat)
-            assert rsre_char.check_charset(pat_pos, 0, ord(c))
-            assert not rsre_char.check_charset(pat_neg, 0, ord(c))
+            assert rsre_char.check_charset(Ctx(pat_pos), 0, ord(c))
+            assert not rsre_char.check_charset(Ctx(pat_neg), 0, ord(c))
         for c in negative:
             assert not unicodedb.category(ord(c)).startswith(cat)
-            assert not rsre_char.check_charset(pat_pos, 0, ord(c))
-            assert rsre_char.check_charset(pat_neg, 0, ord(c))
+            assert not rsre_char.check_charset(Ctx(pat_pos), 0, ord(c))
+            assert rsre_char.check_charset(Ctx(pat_neg), 0, ord(c))
 
     def cat2num(cat):
         return ord(cat[0]) | (ord(cat[1]) << 8)
@@ -153,17 +173,17 @@
         pat_neg = [70, cat2num(cat) | 0x80, 0]
         for c in positive:
             assert unicodedb.category(ord(c)) == cat
-            assert rsre_char.check_charset(pat_pos, 0, ord(c))
-            assert not rsre_char.check_charset(pat_neg, 0, ord(c))
+            assert rsre_char.check_charset(Ctx(pat_pos), 0, ord(c))
+            assert not rsre_char.check_charset(Ctx(pat_neg), 0, ord(c))
         for c in negative:
             assert unicodedb.category(ord(c)) != cat
-            assert not rsre_char.check_charset(pat_pos, 0, ord(c))
-            assert rsre_char.check_charset(pat_neg, 0, ord(c))
+            assert not rsre_char.check_charset(Ctx(pat_pos), 0, ord(c))
+            assert rsre_char.check_charset(Ctx(pat_neg), 0, ord(c))
 
     # test for how the common 'L&' pattern might be compiled
     pat = [70, cat2num('Lu'), 70, cat2num('Ll'), 70, cat2num('Lt'), 0]
-    assert rsre_char.check_charset(pat, 0, 65)    # Lu
-    assert rsre_char.check_charset(pat, 0, 99)    # Ll
-    assert rsre_char.check_charset(pat, 0, 453)   # Lt
-    assert not rsre_char.check_charset(pat, 0, 688)    # Lm
-    assert not rsre_char.check_charset(pat, 0, 5870)   # Nl
+    assert rsre_char.check_charset(Ctx(pat), 0, 65)    # Lu
+    assert rsre_char.check_charset(Ctx(pat), 0, 99)    # Ll
+    assert rsre_char.check_charset(Ctx(pat), 0, 453)   # Lt
+    assert not rsre_char.check_charset(Ctx(pat), 0, 688)    # Lm
+    assert not rsre_char.check_charset(Ctx(pat), 0, 5870)   # Nl
diff --git a/rpython/rlib/rsre/test/test_match.py b/rpython/rlib/rsre/test/test_match.py
--- a/rpython/rlib/rsre/test/test_match.py
+++ b/rpython/rlib/rsre/test/test_match.py
@@ -1,5 +1,5 @@
 import re, random, py
-from rpython.rlib.rsre import rsre_core
+from rpython.rlib.rsre import rsre_core, rsre_char
 from rpython.rlib.rsre.rpy import get_code, VERSION
 
 
@@ -299,3 +299,12 @@
         assert rsre_core.fullmatch(r, "ab")
         r = get_code(r"(?!a)..")
         assert not rsre_core.fullmatch(r, "ab")
+
+    def test_range_ignore(self):
+        from rpython.rlib.unicodedata import unicodedb
+        rsre_char.set_unicode_db(unicodedb)
+        #
+        r = get_code(u"[\U00010428-\U0001044f]", re.I)
+        assert r.count(27) == 1       # OPCODE_RANGE
+        r[r.index(27)] = 32           # => OPCODE_RANGE_IGNORE
+        assert rsre_core.match(r, u"\U00010428")