pattern-based string expansion

Nick Mathewson nickm at mit.edu
Thu Apr 20 00:12:26 EDT 2000


On Thu, 20 Apr 2000 02:51:50 GMT, sue <smalleys at gte.net> wrote:
>I'm not looking for all possible strings with a wildcard,
>just a pseudo-RE-based expansion.  The RE would not support
>open-ended arguments (* or +), but would have ?,[] and so
>on.
>The expansion needs to be ordered.
>
>I've thought about hacking the glob(3) code to work on an
>input RE, but I was hoping this already existed somewhere.

Well, the code below will probably only be useful to a hacker, but if
you're prepared to hack glob(3), this should be a snap.  

It's a very very simplified version of (a subset of (a more general
regex-manipulation module for (a project that I'm working on))).  I
didn't need an expansion function for my project, but it was pretty
easy to hack one up.

============================================================
#!/usr/bin/python

import string

def expand_re(s):
    '''Given a regular expression, returns all a list of all possible 
       expansions.  It supports (), |, [], ?, \[dwsDWS], and '.'.
       The backslash character escapes any other character.  

       Example:
           expand_re('spam( spam( spam[!?]?)?)?|Bloody Vikings!')
              => [ 'spam spam spam!', 
                   'spam spam spam?', 
                   'spam spam spam',
	           'spam spam',
		   'spam',
		   'Bloody Vikings!' ]

       BUG: Only mildly tested.	      
       BUG: Doesn't catch bad regular expressions, or give good error
            messages.
       BUG: Isn't very fast or efficient.
       BUG: Can't handle character escapes within [].
       BUG: Can't handle [^...].
       BUG: Doesn't support unicode. 
       BUG: Doesn't support case-insensitive matching.
       BUG: Always assumes that . should match newline.
       BUG: Doesn't support +, *, {n,m}.  In order to do these, we'd need
            a function to only consider expansions up to a certain length,
            or to only return the first n expansions.
       '''

    re, pos = _parseRE(s,0)
    return re.get_expansions()

def _parseRE(s, pos):
    '''Does the dirty work of parsing a regular expression from string 's',
       starting at position 'pos'.  Returns the Seq/Alt/Opt/Charset object
       corresponding to s, along with the position immediately following
       
       returns element, end tuple'''

    # First, we find all the elements in s.  Elements are:
    #  (groups)
    #  [character sets]     (also '.', \d, and friends.)
    #  characters
    #  optional-elements?
    #  |
    elements = []
    while pos < len(s):
	ch = s[pos]
	if ch == '(':
	    group, pos = _parseRE(s, pos+1)
	    pos = pos-1
	    elements.append(group)
	elif ch == ')':
	    pos = pos + 1
	    break
	elif ch == '\\':
	    next = _special.get(s[pos+1], None)
	    if next:
		elements.append(next)
	    else:
		elements.append(s[pos+1])
	    pos = pos + 1
	elif ch == '[':
	    chrs = []
	    while 1:
		pos = pos + 1
		ch = s[pos]		
		if ch == ']':
		    break
		chrs.append(ch)
	    elements.append(CharSet(string.join(chrs,'')))
	elif ch == '?':
	    elements[-1] = Alt([elements[-1], ''])
	elif ch == '|':
	    elements.append(None) #HACKHACK
	elif ch == '.':
	    elements.append(_allChars)
	else:
	    elements.append(ch)
	pos = pos + 1

    # Now we compact all adjacent characters into strings.  This will help
    # our efficiency in the end, I think.  The revised list is stored in
    # 'els'.
    els = []
    cur_str = None
    for el in elements:
	if type(el) == type(''):
	    if cur_str is not None:
		cur_str = cur_str + el
	    else:
		cur_str = el
	else:
	    if cur_str is not None:
		els.append(cur_str)
		cur_str = None
	    els.append(el)
    if cur_str:
	els.append(cur_str)

    # Now we go through the list to look for vertical bars -- encoded as
    # None.  We find all alternatives within the current group.
    groups = []
    cur_group = []
    for el in els:
	if el is None:
	    groups.append(cur_group)
	    cur_group = []
	else:
	    cur_group.append(el)
    groups.append(cur_group)
    
    # If there's more than one alternative, return an Alt object.  Otherwise,
    # a Seq.
    if len(groups) == 1:
	return Seq(groups[0]), pos
    else:
	return Alt(map(Seq,groups)), pos

class Seq:
    '''This class represents a series of regular expression elements, one 
       following another'''
    def __init__(self, members):
	self.members = members

    def get_expansions(self):
	member_expansions = map(_expand, self.members)
	return _joinings(member_expansions)
    
class Alt:
    '''This class represents a choice between two or more elements.'''
    def __init__(self, alternatives):
	self.alternatives = alternatives

    def get_expansions(self):
	member_expansions = map(_expand, self.alternatives)
	ex = []
	for el in member_expansions:
	    # This is a hack; I'd rather say 'ex.extend(el)', but not
            # everyone is using Python >=1.5.2
	    ex[len(ex):len(ex)] = el
	return ex

class CharSet:
    '''This class represents a group of possible characters, 
       such as [abc] or \d' '''
    def __init__(self, characters):
	self.characters = list(characters)

    def get_expansions(self):
	return self.characters    

def _expand(element):
    '''Returns the expansion of an element.'''
    if type(element) == type(''):
	return [ element ]
    else:
	return element.get_expansions()

def _joinings(lst):
    ''' Takes a list of lists of strings, such as [ ['a1','a2'], ['b1, b2']].
        Returns a new list consisting of the joins of one member from list1,
        one from list2, and so forth. (Such as ['a1b1','a1b2','a2b1','a1b2'])
	'''
    prefixes = ['']
    for sublist in lst:
	next_prefixes = []
	if len(sublist) == 0:
	    return []
	for prefix in prefixes:
	    for el in sublist:
		next_prefixes.append(prefix + el)
	prefixes = next_prefixes 
    return prefixes

_allChars = CharSet(string.join(map(chr,range(0,255)),''))
_special = { 
             't' : '\t',
	     'n' : '\n',
	     'r' : '\r',
	     'f' : '\f',
	     'e' : '\e',
             's' : CharSet(' \t\n\f\r'),
	     'd' : CharSet(string.digits),
	     'w' : CharSet(string.letters + string.digits + "_"),
	     'W':  CharSet(map(chr, [32,96] + range(0,48) + range(58,65) +
			       range(91,95) + range(123,256))),
	     'S':  CharSet(map(chr, [65,11]+range(0,8)+range(14,32)+
			       range(33,92)+
			       range(93,256))),
	     'D':  CharSet(map(chr, [32]+range(0,48)+range(58,256)))
	     }

if __name__ == '__main__':
    import sys
    expr = sys.argv[1]
    for el in expand_re(expr):
	print el
============================================================

Was that about what you had in mind?

Delurkingly-yrs
-- 
Nick Mathewson     <nickm at mit.edu>     http://www.mit.edu/~nickm/



More information about the Python-list mailing list