[pypy-commit] pypy py3.5: Trying to fix some re failures in 3.5 (will graft back to default)
arigo
pypy.commits at gmail.com
Sat Oct 15 07:56:27 EDT 2016
Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5
Changeset: r87812:bff060a4f720
Date: 2016-10-15 13:55 +0200
http://bitbucket.org/pypy/pypy/changeset/bff060a4f720/
Log: Trying to fix some re failures in 3.5 (will graft back to default)
diff --git a/rpython/rlib/rlocale.py b/rpython/rlib/rlocale.py
--- a/rpython/rlib/rlocale.py
+++ b/rpython/rlib/rlocale.py
@@ -195,6 +195,7 @@
isalpha = external('isalpha', [rffi.INT], rffi.INT)
isupper = external('isupper', [rffi.INT], rffi.INT)
+toupper = external('toupper', [rffi.INT], rffi.INT)
islower = external('islower', [rffi.INT], rffi.INT)
tolower = external('tolower', [rffi.INT], rffi.INT)
isalnum = external('isalnum', [rffi.INT], rffi.INT)
diff --git a/rpython/rlib/rsre/rpy/sre_constants.py b/rpython/rlib/rsre/rpy/sre_constants.py
--- a/rpython/rlib/rsre/rpy/sre_constants.py
+++ b/rpython/rlib/rsre/rpy/sre_constants.py
@@ -58,6 +58,7 @@
REPEAT_ONE = "repeat_one"
SUBPATTERN = "subpattern"
MIN_REPEAT_ONE = "min_repeat_one"
+RANGE_IGNORE = "range_ignore"
# positions
AT_BEGINNING = "at_beginning"
@@ -119,8 +120,8 @@
REPEAT,
REPEAT_ONE,
SUBPATTERN,
- MIN_REPEAT_ONE
-
+ MIN_REPEAT_ONE,
+ RANGE_IGNORE,
]
ATCODES = [
diff --git a/rpython/rlib/rsre/rsre_char.py b/rpython/rlib/rsre/rsre_char.py
--- a/rpython/rlib/rsre/rsre_char.py
+++ b/rpython/rlib/rsre/rsre_char.py
@@ -2,7 +2,7 @@
Character categories and charsets.
"""
import sys
-from rpython.rlib.rlocale import tolower, isalnum
+from rpython.rlib.rlocale import tolower, toupper, isalnum
from rpython.rlib.unroll import unrolling_iterable
from rpython.rlib import jit
from rpython.rlib.rarithmetic import int_between
@@ -67,6 +67,19 @@
char_ord += ord('a') - ord('A')
return char_ord
+def getupper(char_ord, flags):
+ if flags & SRE_FLAG_LOCALE:
+ if char_ord < 256: # cheating! Well, CPython does too.
+ char_ord = toupper(char_ord)
+ return char_ord
+ elif flags & SRE_FLAG_UNICODE:
+ assert unicodedb is not None
+ char_ord = unicodedb.toupper(char_ord)
+ else:
+ if int_between(ord('a'), char_ord, ord('z') + 1): # ASCII upper
+ char_ord += ord('A') - ord('a')
+ return char_ord
+
#### Category helpers
is_a_word = [(chr(i).isalnum() or chr(i) == '_') for i in range(256)]
@@ -139,16 +152,17 @@
##### Charset evaluation
@jit.unroll_safe
-def check_charset(pattern, ppos, char_code):
+def check_charset(ctx, ppos, char_code):
"""Checks whether a character matches set of arbitrary length.
The set starts at pattern[ppos]."""
negated = False
result = False
+ pattern = ctx.pattern
while True:
opcode = pattern[ppos]
for i, function in set_dispatch_unroll:
if opcode == i:
- newresult, ppos = function(pattern, ppos, char_code)
+ newresult, ppos = function(ctx, ppos, char_code)
result |= newresult
break
else:
@@ -163,18 +177,21 @@
return not result
return result
-def set_literal(pat, index, char_code):
+def set_literal(ctx, index, char_code):
# <LITERAL> <code>
+ pat = ctx.pattern
match = pat[index+1] == char_code
return match, index + 2
-def set_category(pat, index, char_code):
+def set_category(ctx, index, char_code):
# <CATEGORY> <code>
+ pat = ctx.pattern
match = category_dispatch(pat[index+1], char_code)
return match, index + 2
-def set_charset(pat, index, char_code):
+def set_charset(ctx, index, char_code):
# <CHARSET> <bitmap> (16 bits per code word)
+ pat = ctx.pattern
if CODESIZE == 2:
match = char_code < 256 and \
(pat[index+1+(char_code >> 4)] & (1 << (char_code & 15)))
@@ -184,13 +201,25 @@
(pat[index+1+(char_code >> 5)] & (1 << (char_code & 31)))
return match, index + 9 # skip bitmap
-def set_range(pat, index, char_code):
+def set_range(ctx, index, char_code):
# <RANGE> <lower> <upper>
+ pat = ctx.pattern
match = int_between(pat[index+1], char_code, pat[index+2] + 1)
return match, index + 3
-def set_bigcharset(pat, index, char_code):
+def set_range_ignore(ctx, index, char_code):
+ # <RANGE_IGNORE> <lower> <upper>
+ # the char_code is already lower cased
+ pat = ctx.pattern
+ lower = pat[index + 1]
+ upper = pat[index + 2]
+ match1 = int_between(lower, char_code, upper + 1)
+ match2 = int_between(lower, getupper(char_code, ctx.flags), upper + 1)
+ return match1 | match2, index + 3
+
+def set_bigcharset(ctx, index, char_code):
# <BIGCHARSET> <blockcount> <256 blockindices> <blocks>
+ pat = ctx.pattern
count = pat[index+1]
index += 2
@@ -224,7 +253,7 @@
index += count * (32 / CODESIZE) # skip blocks
return match, index
-def set_unicode_general_category(pat, index, char_code):
+def set_unicode_general_category(ctx, index, char_code):
# Unicode "General category property code" (not used by Python).
# A general category is two letters. 'pat[index+1]' contains both
# the first character, and the second character shifted by 8.
@@ -233,6 +262,7 @@
# Negative matches are triggered by bit number 7.
assert unicodedb is not None
cat = unicodedb.category(char_code)
+ pat = ctx.pattern
category_code = pat[index + 1]
first_character = category_code & 0x7F
second_character = (category_code >> 8) & 0x7F
@@ -260,6 +290,7 @@
11: set_bigcharset,
19: set_literal,
27: set_range,
+ 32: set_range_ignore,
70: set_unicode_general_category,
}
set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items()))
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -40,6 +40,7 @@
OPCODE_REPEAT_ONE = 29
#OPCODE_SUBPATTERN = 30
OPCODE_MIN_REPEAT_ONE = 31
+OPCODE_RANGE_IGNORE = 32
# not used by Python itself
OPCODE_UNICODE_GENERAL_CATEGORY = 70
@@ -640,8 +641,7 @@
elif op == OPCODE_IN:
# match set member (or non_member)
# <IN> <skip> <set>
- if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern,
- ppos+1,
+ if ptr >= ctx.end or not rsre_char.check_charset(ctx, ppos+1,
ctx.str(ptr)):
return
ppos += ctx.pat(ppos)
@@ -650,8 +650,7 @@
elif op == OPCODE_IN_IGNORE:
# match set member (or non_member), ignoring case
# <IN> <skip> <set>
- if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern,
- ppos+1,
+ if ptr >= ctx.end or not rsre_char.check_charset(ctx, ppos+1,
ctx.lowstr(ptr)):
return
ppos += ctx.pat(ppos)
@@ -871,10 +870,10 @@
return True # match anything (including a newline)
@specializectx
def match_IN(ctx, ptr, ppos):
- return rsre_char.check_charset(ctx.pattern, ppos+2, ctx.str(ptr))
+ return rsre_char.check_charset(ctx, ppos+2, ctx.str(ptr))
@specializectx
def match_IN_IGNORE(ctx, ptr, ppos):
- return rsre_char.check_charset(ctx.pattern, ppos+2, ctx.lowstr(ptr))
+ return rsre_char.check_charset(ctx, ppos+2, ctx.lowstr(ptr))
@specializectx
def match_LITERAL(ctx, ptr, ppos):
return ctx.str(ptr) == ctx.pat(ppos+1)
@@ -1134,7 +1133,7 @@
while start < ctx.end:
ctx.jitdriver_CharsetSearch.jit_merge_point(ctx=ctx, start=start,
base=base)
- if rsre_char.check_charset(ctx.pattern, 5, ctx.str(start)):
+ if rsre_char.check_charset(ctx, 5, ctx.str(start)):
if sre_match(ctx, base, start, None) is not None:
ctx.match_start = start
return True
diff --git a/rpython/rlib/rsre/test/test_char.py b/rpython/rlib/rsre/test/test_char.py
--- a/rpython/rlib/rsre/test/test_char.py
+++ b/rpython/rlib/rsre/test/test_char.py
@@ -34,6 +34,22 @@
assert rsre_char.getlower(UPPER_PI, SRE_FLAG_LOCALE | SRE_FLAG_UNICODE) \
== UPPER_PI
+def test_getupper():
+ assert rsre_char.getupper(ord('A'), 0) == ord('A')
+ assert rsre_char.getupper(ord('b'), 0) == ord('B')
+ assert rsre_char.getupper(10, 0) == 10
+ assert rsre_char.getupper(LOWER_PI, 0) == LOWER_PI
+ #
+ assert rsre_char.getupper(ord('a'), SRE_FLAG_UNICODE) == ord('A')
+ assert rsre_char.getupper(ord('2'), SRE_FLAG_UNICODE) == ord('2')
+ assert rsre_char.getupper(10, SRE_FLAG_UNICODE) == 10
+ assert rsre_char.getupper(LOWER_PI, SRE_FLAG_UNICODE) == UPPER_PI
+ #
+ assert rsre_char.getupper(LOWER_PI, SRE_FLAG_LOCALE) == LOWER_PI
+ assert rsre_char.getupper(LOWER_PI, SRE_FLAG_LOCALE | SRE_FLAG_UNICODE) \
+ == LOWER_PI
+
+
def test_is_word():
assert rsre_char.is_word(ord('A'))
assert rsre_char.is_word(ord('_'))
@@ -128,6 +144,10 @@
assert cat(CHCODES["category_uni_not_digit"], DINGBAT_CIRCLED)
+class Ctx:
+ def __init__(self, pattern):
+ self.pattern = pattern
+
def test_general_category():
from rpython.rlib.unicodedata import unicodedb
@@ -137,12 +157,12 @@
pat_neg = [70, ord(cat) | 0x80, 0]
for c in positive:
assert unicodedb.category(ord(c)).startswith(cat)
- assert rsre_char.check_charset(pat_pos, 0, ord(c))
- assert not rsre_char.check_charset(pat_neg, 0, ord(c))
+ assert rsre_char.check_charset(Ctx(pat_pos), 0, ord(c))
+ assert not rsre_char.check_charset(Ctx(pat_neg), 0, ord(c))
for c in negative:
assert not unicodedb.category(ord(c)).startswith(cat)
- assert not rsre_char.check_charset(pat_pos, 0, ord(c))
- assert rsre_char.check_charset(pat_neg, 0, ord(c))
+ assert not rsre_char.check_charset(Ctx(pat_pos), 0, ord(c))
+ assert rsre_char.check_charset(Ctx(pat_neg), 0, ord(c))
def cat2num(cat):
return ord(cat[0]) | (ord(cat[1]) << 8)
@@ -153,17 +173,17 @@
pat_neg = [70, cat2num(cat) | 0x80, 0]
for c in positive:
assert unicodedb.category(ord(c)) == cat
- assert rsre_char.check_charset(pat_pos, 0, ord(c))
- assert not rsre_char.check_charset(pat_neg, 0, ord(c))
+ assert rsre_char.check_charset(Ctx(pat_pos), 0, ord(c))
+ assert not rsre_char.check_charset(Ctx(pat_neg), 0, ord(c))
for c in negative:
assert unicodedb.category(ord(c)) != cat
- assert not rsre_char.check_charset(pat_pos, 0, ord(c))
- assert rsre_char.check_charset(pat_neg, 0, ord(c))
+ assert not rsre_char.check_charset(Ctx(pat_pos), 0, ord(c))
+ assert rsre_char.check_charset(Ctx(pat_neg), 0, ord(c))
# test for how the common 'L&' pattern might be compiled
pat = [70, cat2num('Lu'), 70, cat2num('Ll'), 70, cat2num('Lt'), 0]
- assert rsre_char.check_charset(pat, 0, 65) # Lu
- assert rsre_char.check_charset(pat, 0, 99) # Ll
- assert rsre_char.check_charset(pat, 0, 453) # Lt
- assert not rsre_char.check_charset(pat, 0, 688) # Lm
- assert not rsre_char.check_charset(pat, 0, 5870) # Nl
+ assert rsre_char.check_charset(Ctx(pat), 0, 65) # Lu
+ assert rsre_char.check_charset(Ctx(pat), 0, 99) # Ll
+ assert rsre_char.check_charset(Ctx(pat), 0, 453) # Lt
+ assert not rsre_char.check_charset(Ctx(pat), 0, 688) # Lm
+ assert not rsre_char.check_charset(Ctx(pat), 0, 5870) # Nl
diff --git a/rpython/rlib/rsre/test/test_match.py b/rpython/rlib/rsre/test/test_match.py
--- a/rpython/rlib/rsre/test/test_match.py
+++ b/rpython/rlib/rsre/test/test_match.py
@@ -1,5 +1,5 @@
import re, random, py
-from rpython.rlib.rsre import rsre_core
+from rpython.rlib.rsre import rsre_core, rsre_char
from rpython.rlib.rsre.rpy import get_code, VERSION
@@ -299,3 +299,12 @@
assert rsre_core.fullmatch(r, "ab")
r = get_code(r"(?!a)..")
assert not rsre_core.fullmatch(r, "ab")
+
+ def test_range_ignore(self):
+ from rpython.rlib.unicodedata import unicodedb
+ rsre_char.set_unicode_db(unicodedb)
+ #
+ r = get_code(u"[\U00010428-\U0001044f]", re.I)
+ assert r.count(27) == 1 # OPCODE_RANGE
+ r[r.index(27)] = 32 # => OPCODE_RANGE_IGNORE
+ assert rsre_core.match(r, u"\U00010428")
More information about the pypy-commit mailing list