pattern-based string expansion
Nick Mathewson
nickm at mit.edu
Thu Apr 20 00:12:26 EDT 2000
On Thu, 20 Apr 2000 02:51:50 GMT, sue <smalleys at gte.net> wrote:
>I'm not looking for all possible strings with a wildcard,
>just a pseudo-RE-based expansion. The RE would not support
>open-ended arguments (* or +), but would have ?,[] and so
>on.
>The expansion needs to be ordered.
>
>I've thought about hacking the glob(3) code to work on an
>input RE, but I was hoping this already existed somewhere.
Well, the code below will probably only be useful to a hacker, but if
you're prepared to hack glob(3), this should be a snap.
It's a very very simplified version of (a subset of (a more general
regex-manipulation module for (a project that I'm working on))). I
didn't need an expansion function for my project, but it was pretty
easy to hack one up.
============================================================
#!/usr/bin/python
import string
def expand_re(s):
'''Given a regular expression, returns all a list of all possible
expansions. It supports (), |, [], ?, \[dwsDWS], and '.'.
The backslash character escapes any other character.
Example:
expand_re('spam( spam( spam[!?]?)?)?|Bloody Vikings!')
=> [ 'spam spam spam!',
'spam spam spam?',
'spam spam spam',
'spam spam',
'spam',
'Bloody Vikings!' ]
BUG: Only mildly tested.
BUG: Doesn't catch bad regular expressions, or give good error
messages.
BUG: Isn't very fast or efficient.
BUG: Can't handle character escapes within [].
BUG: Can't handle [^...].
BUG: Doesn't support unicode.
BUG: Doesn't support case-insensitive matching.
BUG: Always assumes that . should match newline.
BUG: Doesn't support +, *, {n,m}. In order to do these, we'd need
a function to only consider expansions up to a certain length,
or to only return the first n expansions.
'''
re, pos = _parseRE(s,0)
return re.get_expansions()
def _parseRE(s, pos):
'''Does the dirty work of parsing a regular expression from string 's',
starting at position 'pos'. Returns the Seq/Alt/Opt/Charset object
corresponding to s, along with the position immediately following
returns element, end tuple'''
# First, we find all the elements in s. Elements are:
# (groups)
# [character sets] (also '.', \d, and friends.)
# characters
# optional-elements?
# |
elements = []
while pos < len(s):
ch = s[pos]
if ch == '(':
group, pos = _parseRE(s, pos+1)
pos = pos-1
elements.append(group)
elif ch == ')':
pos = pos + 1
break
elif ch == '\\':
next = _special.get(s[pos+1], None)
if next:
elements.append(next)
else:
elements.append(s[pos+1])
pos = pos + 1
elif ch == '[':
chrs = []
while 1:
pos = pos + 1
ch = s[pos]
if ch == ']':
break
chrs.append(ch)
elements.append(CharSet(string.join(chrs,'')))
elif ch == '?':
elements[-1] = Alt([elements[-1], ''])
elif ch == '|':
elements.append(None) #HACKHACK
elif ch == '.':
elements.append(_allChars)
else:
elements.append(ch)
pos = pos + 1
# Now we compact all adjacent characters into strings. This will help
# our efficiency in the end, I think. The revised list is stored in
# 'els'.
els = []
cur_str = None
for el in elements:
if type(el) == type(''):
if cur_str is not None:
cur_str = cur_str + el
else:
cur_str = el
else:
if cur_str is not None:
els.append(cur_str)
cur_str = None
els.append(el)
if cur_str:
els.append(cur_str)
# Now we go through the list to look for vertical bars -- encoded as
# None. We find all alternatives within the current group.
groups = []
cur_group = []
for el in els:
if el is None:
groups.append(cur_group)
cur_group = []
else:
cur_group.append(el)
groups.append(cur_group)
# If there's more than one alternative, return an Alt object. Otherwise,
# a Seq.
if len(groups) == 1:
return Seq(groups[0]), pos
else:
return Alt(map(Seq,groups)), pos
class Seq:
'''This class represents a series of regular expression elements, one
following another'''
def __init__(self, members):
self.members = members
def get_expansions(self):
member_expansions = map(_expand, self.members)
return _joinings(member_expansions)
class Alt:
'''This class represents a choice between two or more elements.'''
def __init__(self, alternatives):
self.alternatives = alternatives
def get_expansions(self):
member_expansions = map(_expand, self.alternatives)
ex = []
for el in member_expansions:
# This is a hack; I'd rather say 'ex.extend(el)', but not
# everyone is using Python >=1.5.2
ex[len(ex):len(ex)] = el
return ex
class CharSet:
'''This class represents a group of possible characters,
such as [abc] or \d' '''
def __init__(self, characters):
self.characters = list(characters)
def get_expansions(self):
return self.characters
def _expand(element):
'''Returns the expansion of an element.'''
if type(element) == type(''):
return [ element ]
else:
return element.get_expansions()
def _joinings(lst):
''' Takes a list of lists of strings, such as [ ['a1','a2'], ['b1, b2']].
Returns a new list consisting of the joins of one member from list1,
one from list2, and so forth. (Such as ['a1b1','a1b2','a2b1','a1b2'])
'''
prefixes = ['']
for sublist in lst:
next_prefixes = []
if len(sublist) == 0:
return []
for prefix in prefixes:
for el in sublist:
next_prefixes.append(prefix + el)
prefixes = next_prefixes
return prefixes
_allChars = CharSet(string.join(map(chr,range(0,255)),''))
_special = {
't' : '\t',
'n' : '\n',
'r' : '\r',
'f' : '\f',
'e' : '\e',
's' : CharSet(' \t\n\f\r'),
'd' : CharSet(string.digits),
'w' : CharSet(string.letters + string.digits + "_"),
'W': CharSet(map(chr, [32,96] + range(0,48) + range(58,65) +
range(91,95) + range(123,256))),
'S': CharSet(map(chr, [65,11]+range(0,8)+range(14,32)+
range(33,92)+
range(93,256))),
'D': CharSet(map(chr, [32]+range(0,48)+range(58,256)))
}
if __name__ == '__main__':
import sys
expr = sys.argv[1]
for el in expand_re(expr):
print el
============================================================
Was that about what you had in mind?
Delurkingly-yrs
--
Nick Mathewson <nickm at mit.edu> http://www.mit.edu/~nickm/
More information about the Python-list
mailing list