[Spambayes-checkins] spambayes/contrib spamcounts.py,1.6,1.7

Mon Apr 24 00:30:48 CEST 2006

Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv4248

Modified Files:
	spamcounts.py 
Log Message:
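I think this reduces the sensitivity of this code to classifier changes: it
now goes through the storage object's own accessors (_wordinfokeys(),
_wordinfoget()) and its spamprob() method instead of reading the database
directly and recomputing the spam probability by hand, and it opens the
database via open_storage().  It could still be better.  Awaiting someone's
verdict on storage.database_type().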


Index: spamcounts.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/spamcounts.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** spamcounts.py	12 Jan 2004 08:36:15 -0000	1.6
--- spamcounts.py	23 Apr 2006 22:30:46 -0000	1.7
***************
*** 27,31 ****
  from spambayes.Options import options, get_pathname_option
  from spambayes.tokenizer import tokenize
! from spambayes.storage import STATE_KEY
  
  prog = sys.argv[0]
--- 27,31 ----
  from spambayes.Options import options, get_pathname_option
  from spambayes.tokenizer import tokenize
! from spambayes.storage import STATE_KEY, database_type, open_storage
  
  prog = sys.argv[0]
***************
*** 36,57 ****
      print >> sys.stderr, __doc__.strip() % globals()
  
- # From msgs on spambayes mailing list, spam prob is calculated thusly:
- ## hc = ham token count
- ## nh = total number of ham messages
- ## sc = spam token count
- ## ns = total number of spam messages
- ## hr = ham ratio = hc / nh
- ## sr = spam ratio = sc / ns
- ## p = base spam probability = sr / (sr + hr)
- ## S = unknown word strength (static factor = 0.45 by default)
- ## x = unknown word probability (static factor = 0.5 by default)
- ## n = total number of messages the token appeared in = hc + sc
- ## sp = final spam probability = ((S * x) + (n * p)) / (S + n)
- 
- 
  def print_spamcounts(tokens, db, use_re):
      if use_re:
          s = sets.Set()
!         keys = db.keys()
          for pat in tokens:
              for k in keys:
--- 36,43 ----
      print >> sys.stderr, __doc__.strip() % globals()
  
  def print_spamcounts(tokens, db, use_re):
      if use_re:
          s = sets.Set()
!         keys = db._wordinfokeys()
          for pat in tokens:
              for k in keys:
***************
*** 60,67 ****
          tokens = list(s)
  
-     S = options["Classifier", "unknown_word_strength"]
-     x = options["Classifier", "unknown_word_prob"]
-     _, ns, nh = db[STATE_KEY]
- 
      writer = csv.writer(sys.stdout)
      writer.writerow(("token", "nspam", "nham", "spam prob"))
--- 46,49 ----
***************
*** 72,87 ****
          seen.add(t)
  
!         try:
!             sc, hc = db.get(t, (0, 0))
!         except ValueError:
!             _, sc, hc = db.get(t, (0, 0, 0))
          if sc == hc == 0:
              continue
  
!         hr = hc / nh
!         sr = sc / ns
!         p = sr / (sr + hr)
!         n = hc + sc
!         sp = ((S * x) + (n * p)) / (S + n)
  
          writer.writerow((t, sc, hc, sp))
--- 54,62 ----
          seen.add(t)
  
!         sc, hc = db._wordinfoget(t).__getstate__()
          if sc == hc == 0:
              continue
  
!         sp = db.spamprob([t])
  
          writer.writerow((t, sc, hc, sp))
***************
*** 117,126 ****
          return 1
  
!     dbname = os.path.expanduser(dbname)
!     print >> sys.stderr, "db:", dbname
!     if ispickle:
!         db = pickle.load(file(dbname))
!     else:
!         db = shelve.open(dbname, flag='r')
  
      if tokenizestdin:
--- 92,97 ----
          return 1
  
!     dbname, usedb = database_type(opts)
!     db = open_storage(dbname, usedb)
  
      if tokenizestdin: