convert list of strings to set of regexes; convert list of strings to trie

Wed Jul 21 03:30:17 EDT 2004

Klaus Neuner <klaus_neuner82 at yahoo.de> wrote:
>  I need a function that converts a list into a set of regexes. Like so:
> 
>    string_list = ["blaa", "blab", "raaa", "rabb"]
> 
>    print string_list2regexes(string_list)
> 
> 
>  This should return something like:
> 
>    ["bla(a|b)", "ra(aa|bb)"]

The program below does exactly this...  Run it like this

$ ./words-to-regexp.pl 5
Extracting 5 letter words
blaa
blab
raaa
rabb

(Giving the list of words to stdin)

It produced this regexp which matches just the above strings.

Re: bla[ab]|ra(?:aa|bb)

For your input.  Eg find a regexp to match all the 1&2 letter words
in the dictionary...

./words-to-regexp.pl 2 < /usr/share/dict/words

Re: [aa]|a[cdghlmmnrssttuy]|[bb]|b[aeeikry]|[cc]|c[adfilmorssu]|[dd]|d[bor]|[ee]|e[dhmrssux]|[ff]|f[aemr]|[gg]|g[adeeos]|[hh]|h[aeefgiooz]|[ii]|i[dfnnorstt]|[jj]|j[or]|[kk]|k[crsw]|[ll]|l[aaeiorstu]|[mm]|m[abdeginorsstuy]|[nn]|n[abdeiopu]|[oo]|o[bfhknrswxz]|[pp]|p[aabdhimotu]|[qqrr]|r[abdeehnsux]|[ss]|s[bcehimnort]|[tt]|t[abchiilmosy]|[uu]|u[hprs]|[vv]|v[as]|[ww]|w[emu]|[xx]|xe|[yy]|y[abe]|[zz]|z[nr]

Yes its in perl, but its almost entirely regexps! I haven't got the
time to pythonise it at the moment - hope you enjoy a challenge ;-)

#!/usr/bin/perl -w
#
# The challenge - to write a function which given a list of words
# returns a regexp which will match those and only those words.

use strict;
my $LIMIT = shift || 3;
$|=1;

print "Extracting $LIMIT letter words\n";

my @list = ();
while (<>)
{
    chomp;
    next if $_ eq "";
    push @list, lc($_) if length($_) <= $LIMIT;
}

print "Extracted ", scalar(@list), " words\n";

my ($re, $old_re) = ("", "1");

while ($re ne $old_re)
{
    $old_re = $re;
    print "-" x 60, "\n";
    $re = word_list_to_regexp( @list );
    check_word_list_to_regexp($re, \@list);
    print "Length: ", length($re), "\n";
}
exit;

############################################################
# Converts a list of words into a regexp which will
# match those words and those numbers only.
#
# It does this by constructing a regexp and then progressively
# simplifying it - recursively if necessary.  It uses regexp's to
# transform the regexp of course!  This is almost a general purpose
# regexp optimiser.
#
# We assume that the caller will bound the regexp with ^( and )$ or
# \W(?: and )\W or whatever takes their fancy
#
# Set $DEBUG to 1 if you want to print lots of info and check the
# regexp works after each transformation.
#
# Warning: code contains heavy regexps - lift with care ;-)
# Caution: Code may use exponential time and space ;-(
############################################################

sub word_list_to_regexp
{
    my (@list) = @_;
    my $DEBUG = 1;

    # The basic regexp with |'s on the start and end to make our life
    # easier
    # Should uniq here too...
    $re = join("|", sort { 
	#length($a) <=> length($b) || 
	$a cmp $b 
	} @list);

    $re = "|$re|";

    # Transform the regexp in stages, making sure at all time the
    # regexp is correct if $DEBUG is set

    check_word_list_to_regexp($re, \@list) if $DEBUG;

    # 1) Concatenate all the single characters a|b|c into [abc]'s
    $re =~ s{ \| ( \w (?: \| \w )+ ) (?= \| ) }
    {
	my ( $string ) = ( $1 );
	print "string = '$string'\n" if $DEBUG;
	"|[" . join("", split m{\|}, $string) . "]"
    }gex;

    check_word_list_to_regexp($re, \@list) if $DEBUG;

    # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)]
    $re =~ s{ \| ( (\w+)(\w+) (?: \| \2\w+ )+ ) (?= \| ) }
    {
	my ( $string, $prefix ) = ( $1, $2 );
	print "prefix = '$prefix', string = '$string'\n" if $DEBUG;
	"|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")"
    }gex;

    check_word_list_to_regexp($re, \@list) if $DEBUG;

    # 3) Find all the aX|bX|cX and change to (a|b|c)X]
    $re =~ s{ \| ( (\w+?)(.+) (?: \| \w+\3 )+ ) (?= \| ) }
    {
	my ( $string, $postfix ) = ( $1, $3 );
	print "postfix = '$postfix', string = '$string'\n" if $DEBUG;
	$string =~ s{ \Q$postfix\E  (?= \| | $ ) }{}gx;
	print "...string = '$string'\n" if $DEBUG;
	"|(?:$string)$postfix"
    }gex;

    check_word_list_to_regexp($re, \@list) if $DEBUG;

    # 4) Change (?:a|b|c) into [abc]
    $re =~ s{ \(\?\: ( \w (?: \| \w )+ ) \) }
    {
	my ( $string ) = ( $1 );
	print "string = '$string'\n" if $DEBUG;
	"[" . join("", split m{\|}, $string) . "]"
    }gex;

    check_word_list_to_regexp($re, \@list) if $DEBUG;

    # 5) Optimise [abc] into [a-c] or \d
    # This doesn't optimise all the cases only the complete continuous
    # range in the [ ... ]
#    $re =~ s{ \[ ( \w{3,} ) \] }
#    {
#	my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) );
#	print "match ['$string']...range [$start-$end]\n" if $DEBUG;
#	if ($end - $start + 1 == length $string)
#	{
#	    $start == 0 && $end == 9 ? '\d' : "[$start-$end]";
#	}
#	else
#	{
#	    "[$string]";
#	}
#    }gex;

    check_word_list_to_regexp($re, \@list) if $DEBUG;

    my $re_length;

    do
    {
    $re_length = length($re);

    # 6) recurse on any sequences left (?:ab|cd|ef)
    $re =~ s{ \(\?\: ( \w+ (?: \| \w+ )+ ) \) }
    {
	my ( $string ) = ( $1 );
	if (length($string) < length($re) - 4)
	{
	    print "**** Recursing on '$string'\n" if $DEBUG;
	    "(?:" . word_list_to_regexp(split m{\|}, $string) . ")";
	}
	else
	{
	    "(?:$string)";
	}
    }gex;

    # 6a) recurse on any sequences left |ab|cd|ef|
    $re =~ s{ \| ( \w+ (?: \| \w+ )+ ) \| }
    {
	my ( $string ) = ( $1 );
	if (length($string) < length($re) - 2)
	{
	    print "**** Recursing on '$string'\n" if $DEBUG;
	    "|" . word_list_to_regexp(split m{\|}, $string) . "|";
	}
	else
	{
	    "|$string|";
	}
    }gex;
    }
    until (length($re) == $re_length);

    check_word_list_to_regexp($re, \@list) if $DEBUG;

    # 7) fix the | on each end
    $re =~ s{^\|}{};
    $re =~ s{\|$}{};

    print "**** Returning '$re'\n" if $DEBUG;

    return $re;
}

############################################################
# Test subroutine to check the regexp performs as advertised
#
# Call with a regexp and a reference to a list of numbers
# it will check that the regexp matches all the list and
# doesn't match some others (obviously it can't check them
# all can it!) die-ing on any failures.
############################################################

sub check_word_list_to_regexp
{
    my ($re, $list) = @_;
    my %list = map { $_ => 1 } @$list;
    print "Re: $re\n";

    # Put some other test cases in
    $list{$_} += 0 for (0..999);
    $list{int(rand()*1000)} += 0 for (0..99);
    $list{int(rand()*10000)} += 0 for (0..99);
    $list{int(rand()*100000)} += 0 for (0..99);

    # print join(", ", map {"$_ => $list{$_}"} keys %list), "\n";
    $re =~ s{^\|}{};		# fix | on start and end
    $re =~ s{\|$}{};
    $re = "^(?:$re)\$";		# put in ^(?: ... )$
    $re = qr{$re};		# compile the regexp for speed

    # Check all the keys in list against the regexp - some should pass
    # and some should fail
    for my $item (keys %list)
    {
        if ($list{$item} xor ($item =~ /$re/))
        {
            die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n";
        }
        else
        {
            # print "OK '$re' for '$item'\n";
        }
    }
}

-- 
Nick Craig-Wood <nick at craig-wood.com> -- http://www.craig-wood.com/nick