[Spambayes-checkins]
spambayes Options.py,1.27,1.28 classifier.py,1.18,1.19
Tim Peters
tim_one@users.sourceforge.net
Mon, 23 Sep 2002 20:29:51 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv9255
Modified Files:
Options.py classifier.py
Log Message:
New option use_central_limit2 is Gary Robinson's logarithmic variation of
the central-limit code.
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.27
retrieving revision 1.28
diff -C2 -d -r1.27 -r1.28
*** Options.py 23 Sep 2002 22:41:52 -0000 1.27
--- Options.py 24 Sep 2002 03:29:48 -0000 1.28
***************
*** 203,206 ****
--- 203,210 ----
# square roots. An NxN test grid should work fine.
use_central_limit: False
+
+ # Same as use_central_limit, except takes logarithms of probabilities and
+ # probability complements (p and 1-p) instead.
+ use_central_limit2: False
"""
***************
*** 251,254 ****
--- 255,259 ----
'use_central_limit': boolean_cracker,
+ 'use_central_limit2': boolean_cracker,
},
}
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -C2 -d -r1.18 -r1.19
*** classifier.py 23 Sep 2002 21:19:08 -0000 1.18
--- classifier.py 24 Sep 2002 03:29:48 -0000 1.19
***************
*** 743,744 ****
--- 743,838 ----
if options.use_central_limit:
spamprob = central_limit_spamprob
+
+
+
+
+ def central_limit_compute_population_stats2(self, msgstream, is_spam):
+ from math import ldexp, log
+
+ sum = sumsq = 0
+ seen = {}
+ for msg in msgstream:
+ for prob, word, record in self._getclues(msg):
+ if word in seen:
+ continue
+ seen[word] = 1
+ if is_spam:
+ prob = log(prob)
+ else:
+ prob = log(1.0 - prob)
+ prob = long(ldexp(prob, 64))
+ sum += prob
+ sumsq += prob * prob
+ n = len(seen)
+
+ if is_spam:
+ self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq
+ spamsum = self.spamsum
+ self.spammean = ldexp(spamsum, -64) / self.spamn
+ spamvar = self.spamsumsq * self.spamn - spamsum**2
+ self.spamvar = ldexp(spamvar, -128) / (self.spamn ** 2)
+ print 'spammean', self.spammean, 'spamvar', self.spamvar
+ else:
+ self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq
+ hamsum = self.hamsum
+ self.hammean = ldexp(hamsum, -64) / self.hamn
+ hamvar = self.hamsumsq * self.hamn - hamsum**2
+ self.hamvar = ldexp(hamvar, -128) / (self.hamn ** 2)
+ print 'hammean', self.hammean, 'hamvar', self.hamvar
+
+ if options.use_central_limit2:
+ compute_population_stats = central_limit_compute_population_stats2
+
+ def central_limit_spamprob2(self, wordstream, evidence=False):
+ """Return best-guess probability that wordstream is spam.
+
+ wordstream is an iterable object producing words.
+ The return value is a float in [0.0, 1.0].
+
+ If optional arg evidence is True, the return value is a pair
+ probability, evidence
+ where evidence is a list of (word, probability) pairs.
+ """
+
+ from math import sqrt, log
+
+ clues = self._getclues(wordstream)
+ hsum = ssum = 0.0
+ for prob, word, record in clues:
+ ssum += log(prob)
+ hsum += log(1.0 - prob)
+ if record is not None:
+ record.killcount += 1
+ n = len(clues)
+ if n == 0:
+ return 0.5
+ hmean = hsum / n
+ smean = ssum / n
+
+ # If this sample is drawn from the spam population, its mean is
+ # distributed around spammean with variance spamvar/n. Likewise
+ # for if it's drawn from the ham population. Compute a normalized
+ # z-score (how many stddevs is it away from the population mean?)
+ # against both populations, and then it's ham or spam depending
+ # on which population it matches better.
+ zham = (hmean - self.hammean) / sqrt(self.hamvar / n)
+ zspam = (smean - self.spammean) / sqrt(self.spamvar / n)
+ stat = abs(zham) - abs(zspam) # > 0 for spam, < 0 for ham
+
+ # Normalize into [0, 1]. I'm arbitrarily clipping it to fit in
+ # [-20, 20] first. 20 is a massive z-score difference.
+ if stat < -20.0:
+ stat = -20.0
+ elif stat > 20.0:
+ stat = 20.0
+ stat = 0.5 + stat / 40.0
+
+ if evidence:
+ clues = [(word, prob) for prob, word, record in clues]
+ clues.sort(lambda a, b: cmp(a[1], b[1]))
+ return stat, clues
+ else:
+ return stat
+
+ if options.use_central_limit2:
+ spamprob = central_limit_spamprob2