newb: comparing two strings

Peter Otten __peter__ at web.de
Fri May 19 02:09:42 EDT 2006


manstey wrote:

> Is there a clever way to see if two strings of the same length vary by
> only one character, and what the character is in both strings.
> 
> E.g. str1=yaqtil str2=yaqtel
> 
> they differ at str1[4] and the difference is ('i','e')
> 
> But if there was str1=yiqtol and str2=yaqtel, I am not interested.
> 
> can anyone suggest a simple way to do this?
> 
> My next problem is, I have a list of 300,000+ words and I want to find
> every pair of such strings. I thought I would first sort on length of
> string, but how do I iterate through the following:

Not sure if it can handle 300000 words, but here is something to play with.

import sys

def find_similars(words, lookup=None, dupes=None):
    """Group words that are identical except for the character at one position.

    Every word is lower-cased and filed under one key per character
    position.  The key is the pair ``(text before the position, text after
    the position)``, so two words share a key only if they have the same
    length and agree everywhere except possibly at that position.

    Args:
        words: Iterable of strings to examine.
        lookup: Optional dict mapping keys to lists of words.  It is updated
            in place, so the same dict can be passed to several calls to
            accumulate results.  A fresh dict is used when omitted.
        dupes: Optional set of lower-cased words that were already filed.
            It is updated in place; words whose lower-cased form is already
            in it are ignored.  A fresh set is used when omitted.

    Returns:
        A generator over the lists in ``lookup`` that hold more than one
        word.  Words appear in their original spelling, in the order they
        were encountered.
    """
    if lookup is None:
        lookup = {}
    if dupes is None:
        dupes = set()
    for word in words:
        low_word = word.lower()
        if low_word in dupes:
            continue
        dupes.add(low_word)
        # Every position gets its own key, even inside a run of repeated
        # letters: ("b", "ok") and ("bo", "k") are different keys, so skipping
        # the second "o" of "book" would hide its match with "bock".
        for i in range(len(low_word)):
            key = low_word[:i], low_word[i + 1:]
            lookup.setdefault(key, []).append(word)
    # values() exists on both Python 2 and 3; itervalues() is Python 2 only.
    return (group for group in lookup.values() if len(group) > 1)

def _iter_words(filenames):
    """Yield every line of each named file, stripped of surrounding whitespace."""
    for fn in filenames:
        # The with-block closes each file once its lines have been consumed.
        with open(fn) as infile:
            for line in infile:
                yield line.strip()


def main():
    """Command-line entry point: print each group of similar words.

    Words are read one per line from the files named on the command line,
    or from standard input when no file is given.  With ``-n``/``--limit``
    at most LIMIT words are processed.  Each group returned by
    ``find_similars`` is printed on its own line, sorted and separated by
    single spaces.
    """
    import optparse
    parser = optparse.OptionParser()
    parser.usage += " infile[s]"
    parser.add_option("-n", "--limit", type="int",
                      help="process at most LIMIT words")
    options, args = parser.parse_args()
    if args:
        words = _iter_words(args)
    else:
        words = (w.strip() for w in sys.stdin)
    if options.limit is not None:
        from itertools import islice
        # The cap comes from the parsed --limit option.
        words = islice(words, options.limit)

    for group in find_similars(words):
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print(" ".join(sorted(group)))

# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
 
Peter



More information about the Python-list mailing list