[Python-checkins] python/nondist/sandbox/spambayes GBayes.py,1.10,1.11

montanaro@users.sourceforge.net
Fri, 23 Aug 2002 07:25:41 -0700


Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv4428a

Modified Files:
	GBayes.py 
Log Message:
Allow command-line specification of tokenize functions:
    run w/ the -t flag to override the default tokenize function
    run w/ the -H flag to list the available tokenize functions
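
Typical invocations would look roughly like this (the mailbox names are
placeholders, and the exact mix of other options depends on the run):

    % python GBayes.py -H
    % python GBayes.py -t wordpairs_fold -g good.mbox -s spam.mbox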

When adding a new tokenizer, give it a docstring that is a short description
and add a key/value pair to the tokenizers dict.  The key is the name the
user specifies on the command line; the value is the tokenize function
itself.
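
For instance, a new tokenizer would be wired in roughly like this (the
"words_nodigits" name and function below are made up for illustration, not
part of this checkin; they reuse the module-level _token_re):

    def tokenize_words_nodigits(string):
        r"""tokenize w/ re '[\w$\-]+', drop purely numeric tokens"""
        return [w for w in _token_re.findall(string) if not w.isdigit()]

    # key = what the user passes to -t, value = the function; the docstring
    # above is what -H prints for this entry (or add the pair directly to
    # the tokenizers literal)
    tokenizers["words_nodigits"] = tokenize_words_nodigits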

Added two new tokenizers - tokenize_wordpairs_foldcase and
tokenize_words_and_pairs.  It's not obvious that either is better than any
of the preexisting functions.

Should probably add info to the pickle indicating which tokenizing function
was used to build it.  That function could then be the default for spam
detection runs.
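
One possible shape for that (a sketch only -- these helpers don't exist in
the checkin, and the wrapper dict layout is just an assumption):

    import cPickle

    def save_classifier(classifier, tokenizer_name, fname):
        # record which tokenizers-dict key built this classifier so a later
        # detection run can default to the same tokenize function
        fp = open(fname, "wb")
        cPickle.dump({"tokenizer": tokenizer_name,
                      "classifier": classifier}, fp, 1)
        fp.close()

    def load_classifier(fname):
        fp = open(fname, "rb")
        d = cPickle.load(fp)
        fp.close()
        return d["classifier"], tokenizers.get(d["tokenizer"],
                                               tokenize_words_foldcase)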

The next step is to drive this with spam/non-spam corpora, selecting each of
the various tokenizer functions in turn and presenting the results in
tabular form.
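
The driver would look something like this (a sketch; train_and_score() is a
hypothetical wrapper around the existing training/classification code that
returns an error rate):

    def compare_tokenizers(spamdir, hamdir):
        keys = tokenizers.keys()
        keys.sort()
        print "%-20s %s" % ("tokenizer", "error rate")
        for name in keys:
            rate = train_and_score(tokenizers[name], spamdir, hamdir)
            print "%-20s %10.4f" % (name, rate)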


Index: GBayes.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/GBayes.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** GBayes.py	23 Aug 2002 03:10:42 -0000	1.10
--- GBayes.py	23 Aug 2002 14:25:39 -0000	1.11
***************
*** 35,39 ****
      -o file
          with -m, output all messages, with marks, to file
! 
  When called without any options or arguments, a short self-test is run.
  """
--- 35,42 ----
      -o file
          with -m, output all messages, with marks, to file
!     -t func
!         tokenize using function 'func'.  for a list of functions, run w/ -H.
!     -H
!         describe all available tokenizing functions, then exit
  When called without any options or arguments, a short self-test is run.
  """
***************
*** 338,359 ****
  
  def tokenize_words_foldcase(string):
      return _token_re.findall(string.lower())
  
  def tokenize_words(string):
      return _token_re.findall(string)
  
  def tokenize_split_foldcase(string):
      return string.lower().split()
  
  def tokenize_split(string):
      return string.split()
  
  # Do an N-gram generator instead.  Fold case and collapse runs of whitespace.
  # Probably a good idea to fold punctuation characters into one (or just a
  # few) representatives too.
  def tokenize_5gram_foldcase_wscollapse(string, N=5):
      normalized = " ".join(string.lower().split())
!     for i in xrange(len(normalized)-N+1):
!         yield normalized[i : i+N]
  
  def tokenize_ngram(string, N):
--- 341,384 ----
  
  def tokenize_words_foldcase(string):
+     r"""tokenize w/ re '[\w$\-]+', fold case"""
      return _token_re.findall(string.lower())
  
  def tokenize_words(string):
+     r"""tokenize w/ re '[\w$\-]+'"""
      return _token_re.findall(string)
  
  def tokenize_split_foldcase(string):
+     r"""tokenize using simple string split(), fold case"""
      return string.lower().split()
  
  def tokenize_split(string):
+     r"""tokenize using simple string split()"""
      return string.split()
  
+ def tokenize_wordpairs_foldcase(string):
+     r"""tokenize w/ re '[\w$\-]+' -> 'w1 w2', 'w3 w4', ..., fold case"""
+     lst = _token_re.findall(string.lower())
+     for i in range(0, len(lst), 2):
+         yield " ".join(lst[i:i+2])
+ 
+ def tokenize_words_and_pairs(string):
+     r"""tokenize w/ re '[\w$\-]+' -> w1, w2, 'w1 w2', w3, w4, 'w3 w4' ..."""
+     lst = _token_re.findall(string.lower())
+     lst.append("")
+     for i in range(0, len(lst)-1, 2):
+         a = lst[i]
+         b = lst[i+1]
+         yield a
+         if b:
+             yield b
+             yield "%s %s" % (a, b)
+ 
  # Do an N-gram generator instead.  Fold case and collapse runs of whitespace.
  # Probably a good idea to fold punctuation characters into one (or just a
  # few) representatives too.
  def tokenize_5gram_foldcase_wscollapse(string, N=5):
+     r"""tokenize w/ 5-char runs, fold case, normalize whitespace"""
      normalized = " ".join(string.lower().split())
!     return tokenize_ngram(normalized, N)
  
  def tokenize_ngram(string, N):
***************
*** 362,374 ****
  
  def tokenize_5gram(string):
      return tokenize_ngram(string, 5)
  
  def tokenize_10gram(string):
      return tokenize_ngram(string, 10)
  
  def tokenize_15gram(string):
      return tokenize_ngram(string, 15)
  
! tokenize = tokenize_words_foldcase
  
  spam1 = """
--- 387,415 ----
  
  def tokenize_5gram(string):
+     r"""tokenize w/ strict 5-char runs"""
      return tokenize_ngram(string, 5)
  
  def tokenize_10gram(string):
+     r"""tokenize w/ strict 10-char runs"""
      return tokenize_ngram(string, 10)
  
  def tokenize_15gram(string):
+     r"""tokenize w/ strict 15-char runs"""
      return tokenize_ngram(string, 15)
  
! # add user-visible string as key and function as value - function's docstring
! # serves as help string when -H is used, so keep it brief!
! tokenizers = {
!     "5gram": tokenize_5gram,
!     "5gram_fold_normws": tokenize_5gram_foldcase_wscollapse,
!     "10gram": tokenize_10gram,
!     "15gram": tokenize_15gram,
!     "word_and_pairs": tokenize_words_and_pairs,
!     "wordpairs_fold": tokenize_wordpairs_foldcase,
!     "split": tokenize_split,
!     "split_fold": tokenize_split_foldcase,
!     "words": tokenize_words,
!     "words_fold": tokenize_words_foldcase,
!     }
  
  spam1 = """
***************
*** 584,596 ****
  
  def usage(code, msg=''):
-     print >> sys.stderr, __doc__ % globals()
      if msg:
          print >> sys.stderr, msg
      sys.exit(code)
  
  
  def main():
      try:
!         opts, args = getopt.getopt(sys.argv[1:], 'hg:s:u:p:c:m:o:')
      except getopt.error, msg:
          usage(1, msg)
--- 625,659 ----
  
  def usage(code, msg=''):
      if msg:
          print >> sys.stderr, msg
+         print >> sys.stderr
+     print >> sys.stderr, __doc__ % globals()
      sys.exit(code)
  
  
+ def describe_tokenizers(tokenize):
+     print >> sys.stderr, "Possible tokenizing functions are:"
+     keys = tokenizers.keys()
+     keys.sort()
+     maxlen = max(map(len, keys))
+     default = "unknown"
+     for k in keys:
+         func = tokenizers[k]
+         if tokenize == func:
+             default = k
+         doc = func.__doc__ or "???"
+         if maxlen + 4 + len(doc) > 78:
+             sp = "\n"+" "*5
+         else:
+             sp = " "*(maxlen-len(k)+1)
+         print >> sys.stderr, "  %s:%s%s" % (k, sp, doc)
+     if default:
+         print >> sys.stderr, "Default tokenizer is", default
+     sys.exit(0)
+ 
+ 
  def main():
      try:
!         opts, args = getopt.getopt(sys.argv[1:], 'hHg:s:u:p:c:m:o:t:')
      except getopt.error, msg:
          usage(1, msg)
***************
*** 602,608 ****
--- 665,674 ----
  
      threshold = count = good = spam = unknown = pck = mark = output = None
+     tokenize = tokenize_words_foldcase
      for opt, arg in opts:
          if opt == '-h':
              usage(0)
+         elif opt == '-H':
+             describe_tokenizers(tokenize)
          elif opt == '-g':
              good = arg
***************
*** 611,614 ****
--- 677,684 ----
          elif opt == '-u':
              unknown = arg
+         elif opt == '-t':
+             tokenize = tokenizers.get(arg)
+             if tokenize is None:
+                 usage(1, "Unrecognized tokenize function: %s" % arg)
          elif opt == '-p':
              pck = arg