Most efficient method to search text?

Wed Oct 16 20:22:23 EDT 2002

On Wed, 16 Oct 2002 13:35:09 GMT, Michael Hudson <mwh at python.net> wrote:

>robin.siebler at corp.palm.com (Robin Siebler) writes:
>
>> I wrote a script to search a slew of files for certain words and
>> names.  However, I am sure that there has to be a faster/better way to
>> do it.
>
>I should point out that I'm not really replying in order to help you
>-- if I do, that's a bonus.  I've actually wanted to sit down and
>write this code out for a while, and this seemed like a good excuse.
>
>Here's a way to quickly (I hope! Haven't done any benchmarks) tell if
>one of a bunch of words is contained in a chunk of text, assuming the
>words are known beforehand:
>
>import string
>
>wordchars = string.letters
>allchars = map(chr, range(256))
>hittoken = 'hit'
>
>def _compile(wordlist, d, skipalong):
>    t = {}
>    for word in wordlist:
>        t.setdefault(word[0], []).append(word[1:])
>    for k, v in t.iteritems():
>        d[k] = {}
>        if '' in v:
>            for c in allchars:
>                d[k][c] = hittoken
>            for c in wordchars:
>                d[k][c] = skipalong
>        else:
>            for c in allchars:
>                d[k][c] = skipalong
>        _compile([y for y in v if y <> ''], d[k], skipalong)
>
>def compile(wordlist):
>    root = {}
>    skipalong = {}
>    for c in allchars:
>        skipalong[c] = root
>    for c in wordchars:
>        skipalong[c] = skipalong
>    for c in allchars:
>        root[c] = root
>    _compile(wordlist, root, skipalong)
>    return root
>
>def match(set, text):
>    _hittoken = hittoken
>    for c in text + '\000':
>        set = set[c]
>        if set is _hittoken:
>            return 1
>    else:
>        return 0
>
>You probably don't want to try and print the kind of things compile()
>returns...
>
>I don't really feel like explaining it, either, but it shouldn't be
>too hard to grasp the principles if you've heard of a dfa.
>
>You use it like this:
>
>>>> set = robin.compile(['Tim', 'Guido', 'Barry'])
>>>> files = glob.glob(os.path.expanduser("~/src/python/dist/src/Lib/*.py"))
>>>> for file in files:
>...  if robin.match(s, open(file).read()):
>...      print os.path.basename(file)
>... 
>cgi.py
>doctest.py
>getpass.py
>gettext.py
>imputil.py
>profile.py
>pstats.py
>pty.py
>pydoc.py
>random.py
>rfc822.py
>site.py
>smtpd.py
>tabnanny.py
>telnetlib.py
>threading.py
>tokenize.py
>whrandom.py
>xmlrpclib.py
>sets.py
>heapq.py
>>>> 
>
>There are surely cleverer Boyer-Moore tricks you can pull, but I
>personally think this code is really really neat.  You should be able
>to write lexers very like this (though not full-on regexps --
>backtracking ain't gonna fit, here).
>
That's pretty cool, but for just finding words I just thought of an alternative
has_word, I think ;-) Given a wordlist and string to search, the wordsearch part could
be a long one liner, but I think this is a little clearer ;-) Don't know about the timing:
No guarantees...

#!/usr/bin/python
# __PyPAN_Py has_word.py -- searches string and returns words found of given list
#
# May also be run at command line to search for words in globbed files:
# Usage: python has_word.py [word]+ -- [fileglob]+'
#
def has_word(
    wordlist,   # 'word' or ['word', 'word2', 'etc']
    aString,    # string to find words in
    trans = ''.join([c.isalnum() and c or ' ' for c in map(chr,range(256))])
):
    """
    Search for any of a list of words in a string and return list of words found.
    """
    if isinstance(wordlist, str): wordlist = [wordlist] # allow single word for convenience
    string_words = dict( map(None, aString.translate(trans).split(),[]))
    return [w for w in wordlist if w in string_words]   # and 1 or 0 for bool return

if __name__ == '__main__':
    import sys, os, glob
    try:
        eow = sys.argv.index('--')
        words = sys.argv[1:eow]
        if eow<0 or not words: raise ValueError
        globs = sys.argv[eow+1:]
        for g in globs:
            files = glob.glob(g) # "~/src/python/dist/src/Lib/*.py"))
            for filename in files:
                found = has_word(words, file(filename).read())
                if found:
                    print '%16s: %s' % (os.path.basename(filename), found)
    except:
        print 'Usage: python has_word.py [word]+ -- [fileglob]+'
# __PyPAN__

Which may be used like:

[17:07] C:\pywk\ut>python has_word.py Tim Guido Barry Michael -- d:\python22\lib\*.py
          cgi.py: ['Guido', 'Michael']
      doctest.py: ['Tim']
      getpass.py: ['Guido']
      gettext.py: ['Barry']
      imputil.py: ['Tim', 'Guido']
      profile.py: ['Guido']
       pstats.py: ['Guido']
          pty.py: ['Guido']
        pydoc.py: ['Guido']
       random.py: ['Tim', 'Guido']
       rfc822.py: ['Guido']
         site.py: ['Guido']
        smtpd.py: ['Barry']
     tabnanny.py: ['Tim']
    telnetlib.py: ['Guido']
    threading.py: ['Tim']
     tokenize.py: ['Tim']
     whrandom.py: ['Guido']

Regards,
Bengt Richter