[Spambayes-checkins] spambayes Options.py,1.27,1.28 classifier.py,1.18,1.19

Tim Peters tim_one@users.sourceforge.net
Mon, 23 Sep 2002 20:29:51 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv9255

Modified Files:
	Options.py classifier.py 
Log Message:
New option use_central_limit2 is Gary Robinson's logarithmic variation of
the central-limit code.


Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.27
retrieving revision 1.28
diff -C2 -d -r1.27 -r1.28
*** Options.py	23 Sep 2002 22:41:52 -0000	1.27
--- Options.py	24 Sep 2002 03:29:48 -0000	1.28
***************
*** 203,206 ****
--- 203,210 ----
  # square roots.  An NxN test grid should work fine.
  use_central_limit: False
+ 
+ # Same as use_central_limit, except takes logarithms of probabilities and
+ # probability complements (p and 1-p) instead.
+ use_central_limit2: False
  """
  
***************
*** 251,254 ****
--- 255,259 ----
  
                     'use_central_limit': boolean_cracker,
+                    'use_central_limit2': boolean_cracker,
                     },
  }

Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -C2 -d -r1.18 -r1.19
*** classifier.py	23 Sep 2002 21:19:08 -0000	1.18
--- classifier.py	24 Sep 2002 03:29:48 -0000	1.19
***************
*** 743,744 ****
--- 743,838 ----
      if options.use_central_limit:
          spamprob = central_limit_spamprob
+ 
+ 
+ 
+ 
+     def central_limit_compute_population_stats2(self, msgstream, is_spam):
+         from math import ldexp, log
+ 
+         sum = sumsq = 0
+         seen = {}
+         for msg in msgstream:
+             for prob, word, record in self._getclues(msg):
+                 if word in seen:
+                     continue
+                 seen[word] = 1
+                 if is_spam:
+                     prob = log(prob)
+                 else:
+                     prob = log(1.0 - prob)
+                 prob = long(ldexp(prob, 64))
+                 sum += prob
+                 sumsq += prob * prob
+         n = len(seen)
+ 
+         if is_spam:
+             self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq
+             spamsum = self.spamsum
+             self.spammean = ldexp(spamsum, -64) / self.spamn
+             spamvar = self.spamsumsq * self.spamn - spamsum**2
+             self.spamvar = ldexp(spamvar, -128) / (self.spamn ** 2)
+             print 'spammean', self.spammean, 'spamvar', self.spamvar
+         else:
+             self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq
+             hamsum = self.hamsum
+             self.hammean = ldexp(hamsum, -64) / self.hamn
+             hamvar = self.hamsumsq * self.hamn - hamsum**2
+             self.hamvar = ldexp(hamvar, -128) / (self.hamn ** 2)
+             print 'hammean', self.hammean, 'hamvar', self.hamvar
+ 
+     if options.use_central_limit2:
+         compute_population_stats = central_limit_compute_population_stats2
+ 
+     def central_limit_spamprob2(self, wordstream, evidence=False):
+         """Return best-guess probability that wordstream is spam.
+ 
+         wordstream is an iterable object producing words.
+         The return value is a float in [0.0, 1.0].
+ 
+         If optional arg evidence is True, the return value is a pair
+             probability, evidence
+         where evidence is a list of (word, probability) pairs.
+         """
+ 
+         from math import sqrt, log
+ 
+         clues = self._getclues(wordstream)
+         hsum = ssum = 0.0
+         for prob, word, record in clues:
+             ssum += log(prob)
+             hsum += log(1.0 - prob)
+             if record is not None:
+                 record.killcount += 1
+         n = len(clues)
+         if n == 0:
+             return 0.5
+         hmean = hsum / n
+         smean = ssum / n
+ 
+         # If this sample is drawn from the spam population, its mean is
+         # distributed around spammean with variance spamvar/n.  Likewise
+         # for if it's drawn from the ham population.  Compute a normalized
+         # z-score (how many stddevs is it away from the population mean?)
+         # against both populations, and then it's ham or spam depending
+         # on which population it matches better.
+         zham = (hmean - self.hammean) / sqrt(self.hamvar / n)
+         zspam = (smean - self.spammean) / sqrt(self.spamvar / n)
+         stat = abs(zham) - abs(zspam)  # > 0 for spam, < 0 for ham
+ 
+         # Normalize into [0, 1].  I'm arbitrarily clipping it to fit in
+         # [-20, 20] first.  20 is a massive z-score difference.
+         if stat < -20.0:
+             stat = -20.0
+         elif stat > 20.0:
+             stat = 20.0
+         stat = 0.5 + stat / 40.0
+ 
+         if evidence:
+             clues = [(word, prob) for prob, word, record in clues]
+             clues.sort(lambda a, b: cmp(a[1], b[1]))
+             return stat, clues
+         else:
+             return stat
+ 
+     if options.use_central_limit2:
+         spamprob = central_limit_spamprob2