[Spambayes-checkins] spambayes/contrib spamcounts.py,1.6,1.7
Skip Montanaro
montanaro at users.sourceforge.net
Mon Apr 24 00:30:48 CEST 2006
Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv4248
Modified Files:
spamcounts.py
Log Message:
I think this makes the code less sensitive to classifier changes, although
it could still be better. Awaiting someone's verdict on
storage.database_type().
Index: spamcounts.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/spamcounts.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** spamcounts.py 12 Jan 2004 08:36:15 -0000 1.6
--- spamcounts.py 23 Apr 2006 22:30:46 -0000 1.7
***************
*** 27,31 ****
from spambayes.Options import options, get_pathname_option
from spambayes.tokenizer import tokenize
! from spambayes.storage import STATE_KEY
prog = sys.argv[0]
--- 27,31 ----
from spambayes.Options import options, get_pathname_option
from spambayes.tokenizer import tokenize
! from spambayes.storage import STATE_KEY, database_type, open_storage
prog = sys.argv[0]
***************
*** 36,57 ****
print >> sys.stderr, __doc__.strip() % globals()
- # From msgs on spambayes mailing list, spam prob is calculated thusly:
- ## hc = ham token count
- ## nh = total number of ham messages
- ## sc = spam token count
- ## ns = total number of spam messages
- ## hr = ham ratio = hc / nh
- ## sr = spam ratio = sc / ns
- ## p = base spam probability = sr / (sr + hr)
- ## S = unknown word strength (static factor = 0.45 by default)
- ## x = unknown word probability (static factor = 0.5 by default)
- ## n = total number of messages the token appeared in = hc + sc
- ## sp = final spam probability = ((S * x) + (n * p)) / (S + n)
-
-
def print_spamcounts(tokens, db, use_re):
if use_re:
s = sets.Set()
! keys = db.keys()
for pat in tokens:
for k in keys:
--- 36,43 ----
print >> sys.stderr, __doc__.strip() % globals()
def print_spamcounts(tokens, db, use_re):
if use_re:
s = sets.Set()
! keys = db._wordinfokeys()
for pat in tokens:
for k in keys:
***************
*** 60,67 ****
tokens = list(s)
- S = options["Classifier", "unknown_word_strength"]
- x = options["Classifier", "unknown_word_prob"]
- _, ns, nh = db[STATE_KEY]
-
writer = csv.writer(sys.stdout)
writer.writerow(("token", "nspam", "nham", "spam prob"))
--- 46,49 ----
***************
*** 72,87 ****
seen.add(t)
! try:
! sc, hc = db.get(t, (0, 0))
! except ValueError:
! _, sc, hc = db.get(t, (0, 0, 0))
if sc == hc == 0:
continue
! hr = hc / nh
! sr = sc / ns
! p = sr / (sr + hr)
! n = hc + sc
! sp = ((S * x) + (n * p)) / (S + n)
writer.writerow((t, sc, hc, sp))
--- 54,62 ----
seen.add(t)
! sc, hc = db._wordinfoget(t).__getstate__()
if sc == hc == 0:
continue
! sp = db.spamprob([t])
writer.writerow((t, sc, hc, sp))
***************
*** 117,126 ****
return 1
! dbname = os.path.expanduser(dbname)
! print >> sys.stderr, "db:", dbname
! if ispickle:
! db = pickle.load(file(dbname))
! else:
! db = shelve.open(dbname, flag='r')
if tokenizestdin:
--- 92,97 ----
return 1
! dbname, usedb = database_type(opts)
! db = open_storage(dbname, usedb)
if tokenizestdin:
More information about the Spambayes-checkins mailing list