[pypy-svn] r51871 - in pypy/dist/pypy/rlib/parsing: . test
jared.grubb at codespeak.net
jared.grubb at codespeak.net
Tue Feb 26 10:56:31 CET 2008
Author: jared.grubb
Date: Tue Feb 26 10:56:29 2008
New Revision: 51871
Added:
pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py (contents, props changed)
Modified:
pypy/dist/pypy/rlib/parsing/deterministic.py
pypy/dist/pypy/rlib/parsing/regexparse.py
pypy/dist/pypy/rlib/parsing/test/test_deterministic.py
pypy/dist/pypy/rlib/parsing/test/test_regexparse.py
Log:
parsing/regex stuff: add support for {n,} '\cx' '\377'; also working on adapter class to let us run some PCRE regression tests (what better way to test our NFA's and DFA's than some vigorous RE's :)
parsing/deterministic: make_nice_charset_repr now escapes the ] as well
Modified: pypy/dist/pypy/rlib/parsing/deterministic.py
==============================================================================
--- pypy/dist/pypy/rlib/parsing/deterministic.py (original)
+++ pypy/dist/pypy/rlib/parsing/deterministic.py Tue Feb 26 10:56:29 2008
@@ -27,13 +27,14 @@
# Change the above list into a list of sorted tuples
real_result = [(c,l) for [c,l] in result]
+ # Sort longer runs first (hence -c), then alphabetically
real_result.sort(key=lambda (l,c): (-c,l))
return real_result
def make_nice_charset_repr(chars):
# Compress the letters & digits
letters = set(chars) & set("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
- therest = set(chars) - letters - set('-')
+ therest = set(chars) - letters - set(['-',']'])
charranges = compress_char_set(letters)
result = []
for a, num in charranges:
@@ -45,8 +46,11 @@
else:
result.append("%s-%s" % (repr(a)[1:-1], repr(chr(ord(a) + num - 1))[1:-1]))
result += [repr(c)[1:-1] for c in therest]
+ # Handle the special chars that MUST get escaped
if '-' in chars:
result += ['\\-']
+ if ']' in chars:
+ result += ['\\]']
return "".join(result)
class LexerError(Exception):
@@ -214,6 +218,9 @@
result.emit("i = 0")
result.emit("state = 0")
result.start_block("while 1:")
+
+ # state_to_chars maps each state to a dict of {nextstate: set of chars leading there}
+ # Ex: state_to_chars = { 0: {1: set(['a','b','c'])}, ...}
state_to_chars = {}
for (state, char), nextstate in self.transitions.iteritems():
state_to_chars.setdefault(state, {}).setdefault(nextstate, set()).add(char)
Modified: pypy/dist/pypy/rlib/parsing/regexparse.py
==============================================================================
--- pypy/dist/pypy/rlib/parsing/regexparse.py (original)
+++ pypy/dist/pypy/rlib/parsing/regexparse.py Tue Feb 26 10:56:29 2008
@@ -20,23 +20,38 @@
}
for i in range(256):
- # 'x' and numbers are reserved for hexadecimal/octal escapes
- if chr(i) in 'x01234567':
- continue
- escaped = "\\" + chr(i)
+ if chr(i) not in 'x01234567':
+ # 'x' and numbers are reserved for hexadecimal/octal escapes
+ escaped = "\\" + chr(i)
+ if escaped not in ESCAPES:
+ ESCAPES[escaped] = chr(i)
+
+ # Three digit octals
+ escaped = "\\%03o" % i
if escaped not in ESCAPES:
ESCAPES[escaped] = chr(i)
+
+ if 0 <= i <= 077:
+ # Two-digit octal escapes are ok too
+ escaped = "\\%02o" % i
+ if escaped not in ESCAPES:
+ ESCAPES[escaped] = chr(i)
+
+ # Add the ctrl-x types:
+ # Rule, according to PCRE:
+ # if x is a lower case letter, it is converted to upper case.
+ # Then bit 6 of the character (hex 40) is inverted.
+ # Thus, \cz => 0x1A, but \c{ => 0x3B, while \c; => 0x7B.
+ escaped = "\\c%s" % chr(i)
+ if escaped not in ESCAPES:
+ ESCAPES[escaped] = chr(ord(chr(i).upper()) ^ 0x40)
+
+
for a in "0123456789ABCDEFabcdef":
for b in "0123456789ABCDEFabcdef":
escaped = "\\x%s%s" % (a, b)
if escaped not in ESCAPES:
ESCAPES[escaped] = chr(int("%s%s" % (a, b), 16))
-for a in "0123":
- for b in "01234567":
- for c in "01234567":
- escaped = "\\x%s%s%s" % (a, b, c)
- if escaped not in ESCAPES:
- ESCAPES[escaped] = chr(int("%s%s%s" % (a, b, c), 8))
def unescape(s):
result = []
@@ -90,11 +105,16 @@
| r1 = primary
'?'
return {regex.StringExpression("") | r1}
- | r = primary
+ | r1 = primary
+ '{'
+ n = clippednumrange
+ '}'
+ return {r1 * n + r1.kleene()}
+ | r1 = primary
'{'
n = numrange
'}'
- return {r * n[0] + reduce(operator.or_, [r * i for i in range(n[1] - n[0] + 1)], regex.StringExpression(""))}
+ return {r1 * n[0] + reduce(operator.or_, [r1 * i for i in range(n[1] - n[0] + 1)], regex.StringExpression(""))}
| primary;
primary:
@@ -112,7 +132,7 @@
return {c};
QUOTEDCHAR:
- `(\\x[0-9a-fA-F]{2})|(\\.)`;
+ `(\\x[0-9a-fA-F]{2})|(\\[0-3]?[0-7][0-7])|(\\c.)|(\\.)`;
CHAR:
`[^\*\+\(\)\[\]\{\}\|\.\-\?\,\^]`;
@@ -149,6 +169,11 @@
| n1 = NUM
return {n1, n1};
+clippednumrange:
+ n1 = NUM
+ ','
+ return {n1};
+
NUM:
c = `0|([1-9][0-9]*)`
return {int(c)};
@@ -537,14 +562,14 @@
_call_status = self._primary()
_result = _call_status.result
_error = self._combine_errors(_error, _call_status.error)
- r = _result
+ r1 = _result
_result = self.__chars__('{')
- _call_status = self._numrange()
+ _call_status = self._clippednumrange()
_result = _call_status.result
_error = self._combine_errors(_error, _call_status.error)
n = _result
_result = self.__chars__('}')
- _result = (r * n[0] + reduce(operator.or_, [r * i for i in range(n[1] - n[0] + 1)], regex.StringExpression("")))
+ _result = (r1 * n + r1.kleene())
break
except BacktrackException, _exc:
_error = self._combine_errors(_error, _exc.error)
@@ -554,10 +579,27 @@
_call_status = self._primary()
_result = _call_status.result
_error = self._combine_errors(_error, _call_status.error)
+ r1 = _result
+ _result = self.__chars__('{')
+ _call_status = self._numrange()
+ _result = _call_status.result
+ _error = self._combine_errors(_error, _call_status.error)
+ n = _result
+ _result = self.__chars__('}')
+ _result = (r1 * n[0] + reduce(operator.or_, [r1 * i for i in range(n[1] - n[0] + 1)], regex.StringExpression("")))
break
except BacktrackException, _exc:
_error = self._combine_errors(_error, _exc.error)
self._pos = _choice4
+ _choice5 = self._pos
+ try:
+ _call_status = self._primary()
+ _result = _call_status.result
+ _error = self._combine_errors(_error, _call_status.error)
+ break
+ except BacktrackException, _exc:
+ _error = self._combine_errors(_error, _exc.error)
+ self._pos = _choice5
raise BacktrackException(_error)
_call_status = self._primary()
_result = _call_status.result
@@ -787,7 +829,7 @@
try:
_result = None
_error = None
- _result = self._regex1380912319()
+ _result = self._regex1192240515()
assert _status.status != _status.LEFTRECURSION
_status.status = _status.NORMAL
_status.pos = self._pos
@@ -1216,6 +1258,64 @@
_status.error = _error
_status.status = _status.ERROR
raise BacktrackException(_error)
+ def clippednumrange(self):
+ return self._clippednumrange().result
+ def _clippednumrange(self):
+ _key = self._pos
+ _status = self._dict_clippednumrange.get(_key, None)
+ if _status is None:
+ _status = self._dict_clippednumrange[_key] = Status()
+ else:
+ _statusstatus = _status.status
+ if _statusstatus == _status.NORMAL:
+ self._pos = _status.pos
+ return _status
+ elif _statusstatus == _status.ERROR:
+ raise BacktrackException(_status.error)
+ elif (_statusstatus == _status.INPROGRESS or
+ _statusstatus == _status.LEFTRECURSION):
+ _status.status = _status.LEFTRECURSION
+ if _status.result is not None:
+ self._pos = _status.pos
+ return _status
+ else:
+ raise BacktrackException(None)
+ elif _statusstatus == _status.SOMESOLUTIONS:
+ _status.status = _status.INPROGRESS
+ _startingpos = self._pos
+ try:
+ _result = None
+ _error = None
+ _call_status = self._NUM()
+ _result = _call_status.result
+ _error = _call_status.error
+ n1 = _result
+ _result = self.__chars__(',')
+ _result = (n1)
+ if _status.status == _status.LEFTRECURSION:
+ if _status.result is not None:
+ if _status.pos >= self._pos:
+ _status.status = _status.NORMAL
+ self._pos = _status.pos
+ return _status
+ _status.pos = self._pos
+ _status.status = _status.SOMESOLUTIONS
+ _status.result = _result
+ _status.error = _error
+ self._pos = _startingpos
+ return self._clippednumrange()
+ _status.status = _status.NORMAL
+ _status.pos = self._pos
+ _status.result = _result
+ _status.error = _error
+ return _status
+ except BacktrackException, _exc:
+ _status.pos = -1
+ _status.result = None
+ _error = self._combine_errors(_error, _exc.error)
+ _status.error = _error
+ _status.status = _status.ERROR
+ raise BacktrackException(_error)
def NUM(self):
return self._NUM().result
def _NUM(self):
@@ -1265,6 +1365,7 @@
self._dict_subrange = {}
self._dict_rangeelement = {}
self._dict_numrange = {}
+ self._dict_clippednumrange = {}
self._dict_NUM = {}
self._pos = 0
self._inputstream = inputstream
@@ -1282,10 +1383,10 @@
_result = self._inputstream[_pos: _upto]
self._pos = _upto
return _result
- def _regex1323868075(self):
+ def _regex1192240515(self):
_choice1 = self._pos
_runner = self._Runner(self._inputstream, self._pos)
- _i = _runner.recognize_1323868075(self._pos)
+ _i = _runner.recognize_1192240515(self._pos)
if _runner.last_matched_state == -1:
self._pos = _choice1
raise BacktrackException
@@ -1296,10 +1397,10 @@
_result = self._inputstream[_pos: _upto]
self._pos = _upto
return _result
- def _regex1380912319(self):
+ def _regex1323868075(self):
_choice2 = self._pos
_runner = self._Runner(self._inputstream, self._pos)
- _i = _runner.recognize_1380912319(self._pos)
+ _i = _runner.recognize_1323868075(self._pos)
if _runner.last_matched_state == -1:
self._pos = _choice2
raise BacktrackException
@@ -1360,7 +1461,7 @@
break
runner.state = state
return ~i
- def recognize_1323868075(runner, i):
+ def recognize_1192240515(runner, i):
#auto-generated code, don't edit
assert i >= 0
input = runner.text
@@ -1374,95 +1475,144 @@
runner.state = 0
return ~i
if char == '\\':
- state = 1
- elif '/' <= char <= '>':
- state = 1
- elif '@' <= char <= 'Z':
- state = 1
- elif '_' <= char <= 'z':
- state = 1
- elif '\x00' <= char <= "'":
- state = 1
- elif '~' <= char <= '\xff':
- state = 1
+ state = 6
else:
break
- runner.last_matched_state = state
- runner.last_matched_index = i - 1
- runner.state = state
- if i == len(input):
- return i
- else:
- return ~i
- break
- runner.state = state
- return ~i
- def recognize_1380912319(runner, i):
- #auto-generated code, don't edit
- assert i >= 0
- input = runner.text
- state = 0
- while 1:
- if state == 0:
+ if state == 1:
+ runner.last_matched_index = i - 1
+ runner.last_matched_state = state
try:
char = input[i]
i += 1
except IndexError:
- runner.state = 0
- return ~i
- if char == '\\':
+ runner.state = 1
+ return i
+ if '0' <= char <= '7':
state = 4
else:
break
- if state == 1:
+ if state == 2:
+ runner.last_matched_index = i - 1
+ runner.last_matched_state = state
try:
char = input[i]
i += 1
except IndexError:
- runner.state = 1
- return ~i
- if 'A' <= char <= 'F':
- state = 3
+ runner.state = 2
+ return i
+ if '0' <= char <= '9':
+ state = 5
+ elif 'A' <= char <= 'F':
+ state = 5
elif 'a' <= char <= 'f':
- state = 3
- elif '0' <= char <= '9':
- state = 3
+ state = 5
else:
break
- if state == 2:
+ if state == 3:
runner.last_matched_index = i - 1
runner.last_matched_state = state
try:
char = input[i]
i += 1
except IndexError:
- runner.state = 2
+ runner.state = 3
return i
- if 'A' <= char <= 'F':
- state = 1
- continue
- elif 'a' <= char <= 'f':
- state = 1
- continue
- elif '0' <= char <= '9':
- state = 1
- continue
+ if '\x00' <= char <= '\xff':
+ state = 7
else:
break
if state == 4:
+ runner.last_matched_index = i - 1
+ runner.last_matched_state = state
try:
char = input[i]
i += 1
except IndexError:
runner.state = 4
+ return i
+ if '0' <= char <= '7':
+ state = 7
+ else:
+ break
+ if state == 5:
+ try:
+ char = input[i]
+ i += 1
+ except IndexError:
+ runner.state = 5
+ return ~i
+ if '0' <= char <= '9':
+ state = 7
+ elif 'A' <= char <= 'F':
+ state = 7
+ elif 'a' <= char <= 'f':
+ state = 7
+ else:
+ break
+ if state == 6:
+ try:
+ char = input[i]
+ i += 1
+ except IndexError:
+ runner.state = 6
return ~i
- if char == 'x':
+ if '0' <= char <= '3':
+ state = 1
+ continue
+ elif char == 'x':
state = 2
continue
- elif '\x00' <= char <= 'w':
+ elif char == 'c':
state = 3
+ continue
+ elif '4' <= char <= '7':
+ state = 4
+ continue
elif 'y' <= char <= '\xff':
- state = 3
+ state = 7
+ elif '\x00' <= char <= '/':
+ state = 7
+ elif '8' <= char <= 'b':
+ state = 7
+ elif 'd' <= char <= 'w':
+ state = 7
+ else:
+ break
+ runner.last_matched_state = state
+ runner.last_matched_index = i - 1
+ runner.state = state
+ if i == len(input):
+ return i
+ else:
+ return ~i
+ break
+ runner.state = state
+ return ~i
+ def recognize_1323868075(runner, i):
+ #auto-generated code, don't edit
+ assert i >= 0
+ input = runner.text
+ state = 0
+ while 1:
+ if state == 0:
+ try:
+ char = input[i]
+ i += 1
+ except IndexError:
+ runner.state = 0
+ return ~i
+ if '~' <= char <= '\xff':
+ state = 1
+ elif '\x00' <= char <= "'":
+ state = 1
+ elif '_' <= char <= 'z':
+ state = 1
+ elif '@' <= char <= 'Z':
+ state = 1
+ elif '/' <= char <= '>':
+ state = 1
+ elif char == '\\':
+ state = 1
else:
break
runner.last_matched_state = state
@@ -1506,6 +1656,13 @@
+
+
+
+
+
+
+
def test_generate():
f = py.magic.autopath()
oldcontent = f.read()
Modified: pypy/dist/pypy/rlib/parsing/test/test_deterministic.py
==============================================================================
--- pypy/dist/pypy/rlib/parsing/test/test_deterministic.py (original)
+++ pypy/dist/pypy/rlib/parsing/test/test_deterministic.py Tue Feb 26 10:56:29 2008
@@ -170,4 +170,6 @@
assert make_nice_charset_repr("ABCabc") == 'A-Ca-c'
assert make_nice_charset_repr("zycba") == 'a-cyz'
assert make_nice_charset_repr(string.ascii_letters) == 'A-Za-z'
- assert make_nice_charset_repr(string.printable) == 'A-Za-z0-9\\t\\x0b\\n\\r\\x0c! #"%$\'&)(+*,/.;:=<?>@[]\\\\_^`{}|~\\-'
+ # this next one is ugly... need to clean it up (sometimes it fails because it's
+ # being generated from a set, so the order is funky)
+ assert make_nice_charset_repr(string.printable) == 'A-Za-z0-9\\t\\x0b\\n\\r\\x0c! #"%$\'&)(+*,/.;:=<?>@[\\\\_^`{}|~\\-\\]'
\ No newline at end of file
Added: pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py Tue Feb 26 10:56:29 2008
@@ -0,0 +1,172 @@
+# This file can (in progress) read and parse PCRE regression tests to try out
+# on our regular expression library.
+#
+# To try this out, 'man pcretest' and then grab testinput1 and testoutput1 from
+# the PCRE source code. (I need to look into whether we could distribute these
+# files with pypy?)
+
+import py
+from pypy.rlib.parsing.regexparse import make_runner, unescape, RegexParser
+import string
+import re
+
+py.test.skip("In Progress...")
+
+def get_simult_lines(tests, results, test_line_num=0):
+ """Returns a line from the input/output, ensuring that
+ we are sync'd up between the two."""
+ test = tests.pop(0)
+ result = results.pop(0)
+
+ test_line_num += 1
+
+ if test != result:
+ raise Exception("Lost sync between files at input line %d.\n INPUT: %s\n OUTPUT: %s" % (test_line_num, test, result))
+
+ return test
+
+def get_definition_line(tests, results):
+ """Gets a test definition line, formatted per the PCRE spec."""
+ delim = None
+ test = ''
+ result = ''
+
+ # A line is marked by a start-delimiter and an end-delimiter.
+ # The delimiter is non-alphanumeric
+ # If a backslash follows the delimiter, then the backslash should
+ # be appended to the end. (Otherwise, \ + delim would not be a
+ # delim anymore!)
+ while 1:
+ test += get_simult_lines(tests, results)
+
+ if delim is None:
+ delim = test[0]
+ assert delim in (set(string.printable) - set(string.letters) - set(string.digits))
+ test_re = re.compile(r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)(.*)' % {'delim': delim})
+
+ matches = test_re.findall(test)
+ if matches:
+ break
+
+ assert len(matches)==1
+ test = matches[0][0]
+
+ # Add the backslash, if we gotta
+ test += matches[0][-2]
+ flags = matches[0][-1]
+
+ return test, flags
+
+def get_test_result(tests, results):
+ """Gets the expected return from the regular expression"""
+ # Second line is the test to run against the regex
+ # ' TEXT'
+ test = get_simult_lines(tests, results)
+ if not test:
+ return None, None
+ if not test.startswith(' '):
+ raise Exception("Input & output match, but I don't understand. (Got %r)" % test)
+ test = unescape(test[4:])
+
+ # Third line in the OUTPUT is the result, either:
+ # ' 0: ...' for a match
+ # 'No match' for no match
+ result = unescape(results.pop(0))
+ if result == 'No match':
+ pass
+ elif result.startswith(' 0: '):
+ # Now we need to eat any further lines like:
+ # ' 1: ....' a subgroup match
+ while results[0]:
+ if results[0][2] == ':':
+ results.pop(0)
+ else:
+ break
+ else:
+ raise Exception("Lost sync in output.")
+ return test, result
+
+def test_file():
+ """Open the PCRE tests and run them."""
+ tests = [line.rstrip() for line in open('testinput1','r').readlines()]
+ results = [line.rstrip() for line in open('testoutput1','r').readlines()]
+
+ regex_flag_mapping = { '': lambda s: s,
+ 'i': lambda s: s.upper()
+ }
+
+ import pdb
+ while tests:
+ # First line is a test, in the form:
+ # '/regex expression/FLAGS'
+ regex, regex_flags = get_definition_line(tests, results)
+
+ # Handle the flags:
+ try:
+ text_prepare = regex_flag_mapping[regex_flags]
+ except KeyError:
+ print "UNKNOWN FLAGS: %s" % regex_flags
+ continue
+
+ print '%r' % regex
+
+ skipped = any([op in regex for op in ['*?', '??', '+?', '}?']])
+ if skipped:
+ print " SKIPPED (cant do non-greedy operators)"
+ # now burn all the tests for this regex
+ while 1:
+ test, result = get_test_result(tests, results)
+ if not test:
+ break # A blank line means we have nothing to do
+ continue
+
+ regex_to_use = text_prepare(regex)
+
+ anchor_left = regex_to_use.startswith('^')
+ anchor_right = regex_to_use.endswith('$') and not regex_to_use.endswith('\\$')
+ if anchor_left:
+ regex_to_use = regex_to_use[1:] # chop the ^ if it's there
+ if anchor_right:
+ regex_to_use = regex_to_use[:-1] # chop the $ if it's there
+
+ # Finally, we make the pypy regex runner
+ runner = make_runner(regex_to_use)
+
+ # Now run the test expressions against the Regex
+ while 1:
+ test, result = get_test_result(tests, results)
+ if not test:
+ break # A blank line means we have nothing to do
+
+ # Create possible subsequences that we should test
+ if anchor_left:
+ subseq_gen = [0]
+ else:
+ subseq_gen = (start for start in range(0, len(test)))
+
+ if anchor_right:
+ subseq_gen = ( (start, len(test)) for start in subseq_gen )
+ else:
+ # Go backwards to simulate greediness
+ subseq_gen = ( (start, end) for start in subseq_gen for end in range(len(test)+1, start+1, -1) )
+
+ # Search the possibilities for a match...
+ for start, end in subseq_gen:
+ attempt = text_prepare(test[start:end])
+ matched = runner.recognize(attempt)
+ if matched:
+ break
+
+ # Did we get what we expected?
+ if result == 'No match':
+ if matched:
+ print " FALSE MATCH: regex==%r test==%r" % (regex, test)
+ else:
+ print " pass : regex==%r test==%r" % (regex, test)
+ elif result.startswith(' 0: '):
+ if not matched:
+ print " MISSED: regex==%r test==%r" % (regex, test)
+ elif not attempt==text_prepare(result[4:]):
+ print " BAD MATCH: regex==%r test==%r found==%r expect==%r" % (regex, test, attempt, result[4:])
+ else:
+ print " pass : regex==%r test==%r" % (regex, test)
Modified: pypy/dist/pypy/rlib/parsing/test/test_regexparse.py
==============================================================================
--- pypy/dist/pypy/rlib/parsing/test/test_regexparse.py (original)
+++ pypy/dist/pypy/rlib/parsing/test/test_regexparse.py Tue Feb 26 10:56:29 2008
@@ -96,7 +96,7 @@
assert r.recognize("a" * 15)
assert not r.recognize("a" * 14)
assert not r.recognize("a" * 16)
- assert not r.recognize("b" * 16)
+ assert not r.recognize("b" * 15)
r = make_runner('a{2,10}')
assert r.recognize("a" * 2)
assert r.recognize("a" * 5)
@@ -105,6 +105,14 @@
assert not r.recognize("a" + "b")
assert not r.recognize("a" * 11)
assert not r.recognize("a" * 12)
+ r = make_runner('a{3,}')
+ assert r.recognize("a" * 3)
+ assert r.recognize("a" * 5)
+ assert r.recognize("a" * 10)
+ assert r.recognize("a" * 12)
+ assert not r.recognize("a")
+ assert not r.recognize("a" + "b")
+ assert not r.recognize("a" * 2)
def test_quotes():
r = make_runner('"[^\\"]*"')
@@ -114,6 +122,13 @@
r = make_runner('\\n\\x0a')
assert not r.recognize("n\n")
assert r.recognize("\n\n")
+ r = make_runner('\\12\\012')
+ assert r.recognize("\n\n")
+ r = make_runner('\\377\\xff')
+ assert r.recognize("\xff\xff")
+ r = make_runner('\\?')
+ assert r.recognize("?")
+ assert not r.recognize("a")
def test_comment():
r = make_runner("(/\\*[^\\*/]*\\*/)")
More information about the Pypy-commit
mailing list