Inverse regex?
Jeff Petkau
jpet at eskimo.com
Sun Aug 26 18:00:09 EDT 2001
Andrew Henshaw <ahenshaw at apower dot com> wrote in message
news:tkcud8arriomfa at corp.supernews.com...
> Has anybody seen or developed what might be called an inverse regex
> generator. We would like to do some unit testing of modules that have
user
> input validated by regular expressions. It would be nice to throw some
some
> strings at them and see if the propagated input causes an error in the
code
> downstream. In other words, it may help to detect if our validation
> routines are insufficient.
>
> I understand that, in many cases, a complete list of valid strings is
> impossible. However, for certain regular expressions it would be
certainly
> reasonable. For others, it will be possible, but practically impossible
> (set too large). For the impossible, and near impossible, it would be
nice
> to generate a bounded set that hits many of the degenerate and extreme
> cases.
>
> Any ideas?
>
> Andrew Henshaw
Here's some code that generates random strings from regexes. It's not
quite what you were after and it's very incomplete, but it might help
and it's fun to play with.
---
"""
inverse regex:
given a regular expression string, produces random strings that
match the expression.
usage:
import reinv
print reinv.random_from_pattern("[a-z]*")
--> gxbf
print reinv.random_from_pattern("(cat |dog )+")
--> cat cat dog cat
print reinv.random_from_pattern("(cat |dog )+")
--> dog dog
some of the many cases that this code doesn't handle well:
(foo|x*)(^blah|stuff)
matches blah, foostuff, xstuff, xxstuff, xxxstuff, ...
one of the second group's cases is made impossible by what the
first group matched. We need to backtrack to generate correct
results.
((cat|dog)\2)*
I don't know what the semantics of this case are supposed to be;
playing with sre.match() didn't help much.
case insensitivity, regular expression flags, etc.
As a cheap hack to deal with being so crippled, we make repeated attempts
to generate random strings, check them with re.match(), and return the
first one that matches. This means that we'll never return an invalid
string, although we may not return all valid strings and we may be unable
to generate any strings for some legitimate expressions.
"""
import random,string
from sre_constants import *
import sre_parse,sre
STAR_REPEAT_CHANCE = .7
MAX_RE_ATTEMPTS = 10
class FailedToGenerate:
pass
def nots(s):
r = [chr(i) for i in range(256) if chr(i) not in s]
return ''.join(r)
allchars = nots('')
category_chars = {
CATEGORY_SPACE: string.whitespace,
CATEGORY_NOT_SPACE: nots(string.whitespace),
CATEGORY_DIGIT: string.digits,
CATEGORY_NOT_DIGIT: nots(string.digits),
CATEGORY_WORD: string.digits + string.letters + '_',
CATEGORY_NOT_WORD: nots(string.digits + string.letters + '_'),
CATEGORY_LINEBREAK: '\n\r',
CATEGORY_NOT_LINEBREAK: nots('\n\r'),
}
def cons_lookup(index, groups):
"""cons_lookup(int, cons_list) -> string
cons_list is either None, or a tuple of the form (int, string,
cons_list).
This is a handy way to keep track of group bindings we've made.
"""
while groups is not None:
i, v, groups = groups
if i==index:
return v
raise FailedToGenerate()
def random_from_sub(tok,val,groups):
"""given a token and associated value (as produced by sre_parse),
and a cons-list of group bindings, returns a string that matches
the subpattern defined by the token/value/groups, and a new
cons-list of group bindings.
"""
if tok is LITERAL:
return chr(val), groups
if tok is ANY:
return random.choice(allchars), groups
if tok is AT:
# ignore AT tokens
return '', groups
if tok is IN:
t,v = random.choice(val)
return random_from_sub(t, v, groups)
if tok is RANGE:
lo, hi = val
return chr(random.randint(lo,hi)), groups
if tok is BRANCH:
assert val[0] is None
return random_from_seq(random.choice(val[1]), groups)
if tok is MAX_REPEAT:
lo, hi, seq = val
r = []
for i in range(hi):
if i>=lo and random.random()>STAR_REPEAT_CHANCE: break
s,groups = random_from_seq(seq,groups)
r.append(s)
return ''.join(r), groups
if tok is SUBPATTERN:
index, seq = val
r, g = random_from_seq(seq, groups)
return r, (index, r, g)
if tok is GROUPREF:
return cons_lookup(val,groups), groups
if tok is CATEGORY:
return random.choice(category_chars[val]), groups
raise FailedToGenerate()
def random_from_seq(seq,groups):
oseq = []
for tok,val in seq:
s,groups = random_from_sub(tok,val,groups)
oseq.append(s)
return ''.join(oseq), groups
def random_from_pattern(pattern):
seq = sre_parse.parse(pattern)
# Make several tries at generation before giving up.
# Failure can be indicated by a FailedToGenerate exception,
# or just by returning a bogus string. (The latter check
# allows us to be a little sloppy in some of the difficult
# cases.)
for i in range(MAX_RE_ATTEMPTS):
try:
r = random_from_seq(seq,None)[0]
if sre.match(pattern, r):
return r
except FailedToGenerate:
pass
raise FailedToGenerate()
if __name__ == '__main__':
def check(p):
print '\nMatches for %s:' % p
for i in range(5):
try:
r = random_from_pattern(p)
assert sre.match(p,r)
except FailedToGenerate:
r = 'failed!'
print ' ',r
check('foo$foo')
check('x')
check('xy')
check('x|y')
check('x|y|z')
check('[abc]')
check('[a-z]')
check('[A-Z]')
check('cat|dog')
check('cat|dog|mouse')
check('c[aeiou]t|d[aeiou]g|mouse')
check('x*')
check('y+')
check('(cat |dog )*')
check('(cat |dog )(\\1)*')
check('-?[0-9]+\.[0-9]{2}')
check('-?\d+\.\d\d')
check('.*foo.*')
check('\w+( \w+)*')
check('[A-Z][a-z]*( [a-z]+)*\\.')
check('( [aeiouAEIOU]?([bcdfghjklmnpqrstvwxyz][aeiou]){1,4})+')
check('(( [aeiouAEIOU]?([bcdfghjklmnpqrstvwxyz][aeiou]){1,4})\\2?)+')
More information about the Python-list
mailing list