Fuzzy string matching?

Tim Peters tim_one at email.msn.com
Sat Aug 28 04:05:21 EDT 1999


[John Machin, disses Python's soundex implementation]

[Skip Montanaro]
> ...
> Guido would like sondex.c out of the core.  Tim Peters sent a Python
> replacement to me.  I was supposed to give it some life.  It's still
> sitting in my inbox.

So what does it need to come to life?  It's code; it works <wink>.

> I'll try to take care of it next week (bug fixes and all).

It didn't have the bugs John's worried about.

> (I realize that doesn't make Soundex any better for the fuzzy
> matching algorithm, but I should at least fix any obvious bugs...)

I find Knuth's description of the algorithm ambiguous in several respects, and
there are many incompatible soundex algorithms "out there".  The one I sent
before probably should have treated non-letters as if they didn't exist, so
I'll attach a changed version that does so.

soundex-is-a-pragmatic-hack-so-right-vs-wrong-is-fuzzy-ly y'rs  - tim

NDIGITS = 3

import string
_upper = string.upper
_translate = string.translate

# P for Punctuation -- all characters treated as if they weren't there
_tran = ["P"] * 256
def _setcode(letters, value):
    for ch in letters:
        _tran[ord(_upper(ch))] = _tran[ord(ch)] = value
_setcode("bfpv", "1")
_setcode("cgjkqsxz", "2")
_setcode("dt", "3")
_setcode("l", "4")
_setcode("mn", "5")
_setcode("r", "6")
# B for Break -- these guys break runs
_setcode("aeiouy", "B")
# I for Invisible -- they're invisible except as first char
_setcode("hw", "I")

_punctuation = filter(lambda i: _tran[i] == "P", range(256))
_punctuation = string.join(map(chr, _punctuation), "")
assert len(_punctuation) == 256 - 52, "Soundex initialization failed"

_tran = string.join(_tran, "")
del string, _setcode

def soundex(name):
    """name -> Soundex code, following Knuth Vol 3 Ed 2 pg 394."""

    while name and _tran[ord(name[0])] == "P":
        name = name[1:]
    if not name:
        raise ValueError("soundex requires non-empty name argument")
    coded = _translate(name, _tran, _punctuation)
    out = _upper(name[0])
    lastrealcode = coded[0]
    ignore_same = 1
    for thiscode in coded[1:]:
        if thiscode == "B":
            ignore_same = 0
            continue
        if thiscode == "I":
            continue
        if ignore_same and lastrealcode == thiscode:
            continue
        out = out + thiscode
        lastrealcode = thiscode
        ignore_same = 1
        if len(out) > NDIGITS:
            break
    if len(out) < NDIGITS + 1:
        out = out + "0" * (NDIGITS + 1 - len(out))
    return out

def _test():
    global nerrors
    def check(name, expected):
        global nerrors
        got = soundex(name)
        if got != expected:
            nerrors = nerrors + 1
            print "error in Soundex test: name", name, \
                  "expected", expected, "got", got
    nerrors = 0
    check("Euler", "E460")
    check("Ellery", "E460")
    check("guass", "G200")
    check("gauss", "G200")
    check("Ghosh", "G200")
    check("HILBERT", "H416")
    check("Heilbronn", "H416")
    check("Knuth", "K530")
    check("K ** n  U123t9247h   ", "K530")
    check("!Kant", "K530")
    check("Lloyd", "L300")
    check("Liddy", "L300")
    check("Lukasiewicz", "L222")
    check("Lissajous", "L222")
    check("Wachs", "W200")
    check("Waugh", "W200")
    check("HYHYH", "H000")
    check("kkkkkkkwwwwkkkkkhhhhhkkkkemmnmnhmn", "K500")
    check("Rogers", "R262")
    check("Rodgers", "R326")
    check("Sinclair", "S524")
    check("St. Clair", "S324")
    check("Tchebysheff", "T212")
    check("@@=T-c=h,e+b#y{s}h\000e\377f:f|", "T212")
    check(" C h e b y s h e v ", "C121")

    if nerrors:
        raise SystemError("soundex test failed with " + `nerrors` +
                          " errors")

if __name__ == "__main__":
    _test()



a



More information about the Python-list mailing list