[Spambayes-checkins]
spambayes Options.py,1.52,1.53 classifier.py,1.40,1.41
Tim Peters
tim_one@users.sourceforge.net
Fri, 18 Oct 2002 14:38:18 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv29331
Modified Files:
Options.py classifier.py
Log Message:
Dropped tim_combining. Rearranged the remaining combining schemes to
bring their code closer together. Added an '*n*' pseudo-clue to the
mixed_combining clue list.
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.52
retrieving revision 1.53
diff -C2 -d -r1.52 -r1.53
*** Options.py 18 Oct 2002 18:30:38 -0000 1.52
--- Options.py 18 Oct 2002 21:38:16 -0000 1.53
***************
*** 240,252 ****
robinson_minimum_prob_strength: 0.1
- # For the default scheme, use "tim-combining" of probabilities. Tim-
- # combining is a kind of cross between Paul Graham's and Gary Robinson's
- # combining schemes. Unlike Paul's, it's never crazy-certain, and compared
- # to Gary's, in Tim's tests it greatly increased the spread between mean
- # ham-scores and spam-scores, while simultaneously decreasing the variance
- # of both. Tim needed a higher spam_cutoff value for best results, but
- # spam_cutoff is less touchy than under Gary-combining.
- use_tim_combining: False
-
# For vectors of random, uniformly distributed probabilities, -2*sum(ln(p_i))
# follows the chi-squared distribution with 2*n degrees of freedom. That's
--- 240,243 ----
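
The chi-squared fact in the comment above can be checked directly with
a stdlib-only Monte Carlo sketch (illustrative only; n and trials are
arbitrary): for uniform p_i, -2*sum(ln(p_i)) should show the
chi-squared mean of 2*n and variance of 4*n.

    import random
    from math import log

    def fisher_stat(n):
        # -2 * sum of logs of n draws uniform on (0, 1]
        return -2.0 * sum(log(1.0 - random.random()) for _ in range(n))

    n, trials = 50, 20000
    xs = [fisher_stat(n) for _ in range(trials)]
    mean = sum(xs) / trials
    var = sum((x - mean) ** 2 for x in xs) / trials
    print(mean, 2 * n)   # sample mean lands near 100
    print(var, 4 * n)    # sample variance lands near 200
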
***************
*** 319,323 ****
'robinson_probability_s': float_cracker,
'robinson_minimum_prob_strength': float_cracker,
- 'use_tim_combining': boolean_cracker,
'use_chi_squared_combining': boolean_cracker,
--- 310,313 ----
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.40
retrieving revision 1.41
diff -C2 -d -r1.40 -r1.41
*** classifier.py 18 Oct 2002 18:30:38 -0000 1.40
--- classifier.py 18 Oct 2002 21:38:16 -0000 1.41
***************
*** 97,100 ****
--- 97,103 ----
self.wordinfo, self.nspam, self.nham = t[1:]
+ # spamprob() implementations. One of the following is aliased to
+ # spamprob, depending on option settings.
+
def gary_spamprob(self, wordstream, evidence=False):
"""Return best-guess probability that wordstream is spam.
***************
*** 166,170 ****
return prob
! spamprob = gary_spamprob # may be replaced later
def learn(self, wordstream, is_spam, update_probabilities=True):
--- 169,332 ----
return prob
! spamprob = gary_spamprob # may be replaced by one of the next ones
!
! # Across vectors of length n, containing random uniformly-distributed
! # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
! # with 2*n degrees of freedom. This has been proven (in some
! # appropriate sense) to be the most sensitive possible test for
! # rejecting the hypothesis that a vector of probabilities is uniformly
! # distributed. Gary Robinson's original scheme was monotonic *with*
! # this test, but skipped the details. Turns out that getting closer
! # to the theoretical roots gives a much sharper classification, with
! # a very small (in # of msgs), but also very broad (in range of scores),
! # "middle ground", where most of the mistakes live. In particular,
! # this scheme seems immune to all forms of "cancellation disease": if
! # there are many strong ham *and* spam clues, this reliably scores
! # close to 0.5. Most other schemes are extremely certain then -- and
! # often wrong.
! def chi2_spamprob(self, wordstream, evidence=False):
! """Return best-guess probability that wordstream is spam.
!
! wordstream is an iterable object producing words.
! The return value is a float in [0.0, 1.0].
!
! If optional arg evidence is True, the return value is a pair
! probability, evidence
! where evidence is a list of (word, probability) pairs.
! """
!
! from math import frexp, log as ln
!
! # We compute two chi-squared statistics, one for ham and one for
! # spam. The sum-of-the-logs business is more sensitive to probs
! # near 0 than to probs near 1, so the spam measure uses 1-p (so
! # that high-spamprob words have greatest effect), and the ham
! # measure uses p directly (so that low-spamprob words have greatest
! # effect).
! #
! # For optimization, sum-of-logs == log-of-product, and f.p.
! # multiplication is a lot cheaper than calling ln(). It's easy
! # to underflow to 0.0, though, so we simulate unbounded dynamic
! # range via frexp. The real product H = this H * 2**Hexp, and
! # likewise the real product S = this S * 2**Sexp.
! H = S = 1.0
! Hexp = Sexp = 0
!
! clues = self._getclues(wordstream)
! for prob, word, record in clues:
! if record is not None: # else wordinfo doesn't know about it
! record.killcount += 1
! S *= 1.0 - prob
! H *= prob
! if S < 1e-200: # prevent underflow
! S, e = frexp(S)
! Sexp += e
! if H < 1e-200: # prevent underflow
! H, e = frexp(H)
! Hexp += e
!
! # Compute the natural log of the product = sum of the logs:
! # ln(x * 2**i) = ln(x) + i * ln(2).
! S = ln(S) + Sexp * LN2
! H = ln(H) + Hexp * LN2
!
! n = len(clues)
! if n:
! S = 1.0 - chi2Q(-2.0 * S, 2*n)
! H = 1.0 - chi2Q(-2.0 * H, 2*n)
!
! # How to combine these into a single spam score? We originally
! # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A
! # systematic problem is that we could end up being near-certain
! # a thing was (for example) spam, even if S was small, provided
! # that H was much smaller.
! # Rob Hooft stared at these problems and invented the measure
! # we use now, the simpler S-H, scaled into [0., 1.].
! prob = (S-H + 1.0) / 2.0
! else:
! prob = 0.5
!
! if evidence:
! clues = [(w, p) for p, w, r in clues]
! clues.sort(lambda a, b: cmp(a[1], b[1]))
! clues.insert(0, ('*S*', S))
! clues.insert(0, ('*H*', H))
! return prob, clues
! else:
! return prob
!
! if options.use_chi_squared_combining:
! spamprob = chi2_spamprob
!
! # This is a weighted average of the other two. In extreme cases, they
! # often seem to disagree on how "certain" they are. Mixing softens
! # the extremes, pushing even some very hard cases into the middle ground.
! def mixed_spamprob(self, wordstream, evidence=False):
! """Return best-guess probability that wordstream is spam.
!
! wordstream is an iterable object producing words.
! The return value is a float in [0.0, 1.0].
!
! If optional arg evidence is True, the return value is a pair
! probability, evidence
! where evidence is a list of (word, probability) pairs.
! """
!
! from math import frexp, log as ln
!
! H = S = 1.0
! Hexp = Sexp = 0
!
! clues = self._getclues(wordstream)
! for prob, word, record in clues:
! if record is not None: # else wordinfo doesn't know about it
! record.killcount += 1
! S *= 1.0 - prob
! H *= prob
! if S < 1e-200: # prevent underflow
! S, e = frexp(S)
! Sexp += e
! if H < 1e-200: # prevent underflow
! H, e = frexp(H)
! Hexp += e
!
! n = len(clues)
! if n:
! nrecip = 1.0 / n
! P = 1.0 - S**nrecip * 2.0**(Sexp * nrecip)
! Q = 1.0 - H**nrecip * 2.0**(Hexp * nrecip)
!
! S = ln(S) + Sexp * LN2
! H = ln(H) + Hexp * LN2
! S = 1.0 - chi2Q(-2.0 * S, 2*n)
! H = 1.0 - chi2Q(-2.0 * H, 2*n)
!
! else:
! P = Q = S = H = 1.0
!
! gary_score = P/(P+Q)
! chi_score = (S-H + 1.0) / 2.0
!
! w = options.mixed_combining_chi_weight
! prob = w * chi_score + (1.0 - w) * gary_score
!
! if evidence:
! clues = [(w, p) for p, w, r in clues]
! clues.sort(lambda a, b: cmp(a[1], b[1]))
! extra = [('*chi_score*', chi_score),
! ('*gary_score*', gary_score),
! ('*S*', S),
! ('*H*', H),
! ('*P*', P),
! ('*Q*', Q),
! ('*n*', n),
! ]
! clues[0:0] = extra
! return prob, clues
! else:
! return prob
!
! if options.use_mixed_combining:
! spamprob = mixed_spamprob
def learn(self, wordstream, is_spam, update_probabilities=True):
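
The frexp bookkeeping both new methods share is easy to see in
isolation. A small sketch (the probabilities are invented; LN2 mirrors
the module constant the code above uses) of how the mantissa/exponent
split keeps ln(product) computable after the naive product underflows:

    from math import frexp, log as ln

    LN2 = ln(2)
    probs = [0.01] * 10000     # true product is far below float range

    naive = 1.0
    for p in probs:
        naive *= p             # ends up exactly 0.0

    prod, exp = 1.0, 0
    for p in probs:
        prod *= p
        if prod < 1e-200:      # renormalize before underflow
            prod, e = frexp(prod)
            exp += e

    # ln(x * 2**i) = ln(x) + i*ln(2), as in chi2_spamprob
    print(ln(prod) + exp * LN2)   # ~ -46051.7
    print(10000 * ln(0.01))       # same value, computed directly

The same representation gives mixed_spamprob its geometric means, via
(x * 2**e)**(1/n) = x**(1/n) * 2**(e/n).
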
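The chi2Q the new code calls as chi2Q(-2.0 * S, 2*n) isn't defined in
this diff. For even degrees of freedom the chi-squared survival
probability reduces to a Poisson tail sum, so a self-contained
stand-in (a sketch only; the real implementation may differ) looks
like:

    from math import exp

    def chi2q_sketch(x2, v):
        # Probability that chi-squared with v degrees of freedom
        # (v even) exceeds x2.
        assert v % 2 == 0
        m = x2 / 2.0
        term = total = exp(-m)
        for i in range(1, v // 2):
            term *= m / i
            total += term
        return min(total, 1.0)

    # A clue vector of length n=5 whose statistic sits right at its
    # mean of 2*n = 10 is unexceptional:
    print(chi2q_sketch(10.0, 10))   # ~0.44

chi2_spamprob then takes 1.0 minus these survival probabilities, so
large S and H mean "definitely not uniform" in the spam and ham
directions respectively.
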
***************
*** 358,610 ****
# Return (prob, word, record).
return [t[1:] for t in clues]
-
- #************************************************************************
- # Some options change so much behavior that it's better to write a
- # different method.
- # CAUTION: These end up overwriting methods of the same name above.
- # A subclass would be cleaner, but experiments will soon enough lead
- # to only one of the alternatives surviving.
-
- def tim_spamprob(self, wordstream, evidence=False):
- """Return best-guess probability that wordstream is spam.
-
- wordstream is an iterable object producing words.
- The return value is a float in [0.0, 1.0].
-
- If optional arg evidence is True, the return value is a pair
- probability, evidence
- where evidence is a list of (word, probability) pairs.
- """
-
- from math import frexp
-
- # The real H = this H times 2**Hexp. Likewise for S. We're
- # simulating unbounded dynamic float range by hand. If this pans
- # out, *maybe* we should store logarithms in the database instead
- # and just add them here. But I like keeping raw counts in the
- # database (they're easy to understand, manipulate and combine),
- # and there's no evidence that this simulation is a significant
- # expense.
- # S is a spamminess measure, and is the geometric mean of the
- # extreme-word spamprobs.
- # H is a hamminess measure, and is the geometric mean of 1 - the
- # extreme-word spamprobs.
- H = S = 1.0
- Hexp = Sexp = 0
- clues = self._getclues(wordstream)
- for prob, word, record in clues:
- if record is not None: # else wordinfo doesn't know about it
- record.killcount += 1
- S *= prob
- H *= 1.0 - prob
- if S < 1e-200: # move back into range
- S, e = frexp(S)
- Sexp += e
- if H < 1e-200: # move back into range
- H, e = frexp(H)
- Hexp += e
-
- S, e = frexp(S)
- Sexp += e
- H, e = frexp(H)
- Hexp += e
-
- num_clues = len(clues)
- if num_clues:
- # (x*2**e)**n = x**n * 2**(e*n).
- n = 1.0 / num_clues
- S = S**n * 2.0**(Sexp * n)
- H = H**n * 2.0**(Hexp * n)
- prob = S/(S+H)
- else:
- prob = 0.5
-
- if evidence:
- clues = [(w, p) for p, w, r in clues]
- clues.sort(lambda a, b: cmp(a[1], b[1]))
- return prob, clues
- else:
- return prob
-
- if options.use_tim_combining:
- spamprob = tim_spamprob
-
- # Across vectors of length n, containing random uniformly-distributed
- # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
- # with 2*n degrees of freedom. This has been proven (in some
- # appropriate sense) to be the most sensitive possible test for
- # rejecting the hypothesis that a vector of probabilities is uniformly
- # distributed. Gary Robinson's original scheme was monotonic *with*
- # this test, but skipped the details. Turns out that getting closer
- # to the theoretical roots gives a much sharper classification, with
- # a very small (in # of msgs), but also very broad (in range of scores),
- # "middle ground", where most of the mistakes live. In particular,
- # this scheme seems immune to all forms of "cancellation disease": if
- # there are many strong ham *and* spam clues, this reliably scores
- # close to 0.5. Most other schemes are extremely certain then -- and
- # often wrong.
- def chi2_spamprob(self, wordstream, evidence=False):
- """Return best-guess probability that wordstream is spam.
-
- wordstream is an iterable object producing words.
- The return value is a float in [0.0, 1.0].
-
- If optional arg evidence is True, the return value is a pair
- probability, evidence
- where evidence is a list of (word, probability) pairs.
- """
-
- from math import frexp, log as ln
-
- # We compute two chi-squared statistics, one for ham and one for
- # spam. The sum-of-the-logs business is more sensitive to probs
- # near 0 than to probs near 1, so the spam measure uses 1-p (so
- # that high-spamprob words have greatest effect), and the ham
- # measure uses p directly (so that low-spamprob words have greatest
- # effect).
- #
- # For optimization, sum-of-logs == log-of-product, and f.p.
- # multiplication is a lot cheaper than calling ln(). It's easy
- # to underflow to 0.0, though, so we simulate unbounded dynamic
- # range via frexp. The real product H = this H * 2**Hexp, and
- # likewise the real product S = this S * 2**Sexp.
- H = S = 1.0
- Hexp = Sexp = 0
-
- clues = self._getclues(wordstream)
- for prob, word, record in clues:
- if record is not None: # else wordinfo doesn't know about it
- record.killcount += 1
- S *= 1.0 - prob
- H *= prob
- if S < 1e-200: # prevent underflow
- S, e = frexp(S)
- Sexp += e
- if H < 1e-200: # prevent underflow
- H, e = frexp(H)
- Hexp += e
-
- # Compute the natural log of the product = sum of the logs:
- # ln(x * 2**i) = ln(x) + i * ln(2).
- S = ln(S) + Sexp * LN2
- H = ln(H) + Hexp * LN2
-
- n = len(clues)
- if n:
- S = 1.0 - chi2Q(-2.0 * S, 2*n)
- H = 1.0 - chi2Q(-2.0 * H, 2*n)
-
- # How to combine these into a single spam score? We originally
- # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A
- # systematic problem is that we could end up being near-certain
- # a thing was (for example) spam, even if S was small, provided
- # that H was much smaller.
- # Rob Hooft stared at these problems and invented the measure
- # we use now, the simpler S-H, scaled into [0., 1.].
- prob = (S-H + 1.0) / 2.0
- else:
- prob = 0.5
-
- if evidence:
- clues = [(w, p) for p, w, r in clues]
- clues.sort(lambda a, b: cmp(a[1], b[1]))
- clues.insert(0, ('*S*', S))
- clues.insert(0, ('*H*', H))
- return prob, clues
- else:
- return prob
-
- if options.use_chi_squared_combining:
- spamprob = chi2_spamprob
-
- def mixed_spamprob(self, wordstream, evidence=False):
- """Return best-guess probability that wordstream is spam.
-
- wordstream is an iterable object producing words.
- The return value is a float in [0.0, 1.0].
-
- If optional arg evidence is True, the return value is a pair
- probability, evidence
- where evidence is a list of (word, probability) pairs.
- """
-
- from math import frexp, log as ln
-
- # We compute two chi-squared statistics, one for ham and one for
- # spam. The sum-of-the-logs business is more sensitive to probs
- # near 0 than to probs near 1, so the spam measure uses 1-p (so
- # that high-spamprob words have greatest effect), and the ham
- # measure uses p directly (so that low-spamprob words have greatest
- # effect).
- #
- # For optimization, sum-of-logs == log-of-product, and f.p.
- # multiplication is a lot cheaper than calling ln(). It's easy
- # to underflow to 0.0, though, so we simulate unbounded dynamic
- # range via frexp. The real product H = this H * 2**Hexp, and
- # likewise the real product S = this S * 2**Sexp.
- H = S = 1.0
- Hexp = Sexp = 0
-
- clues = self._getclues(wordstream)
- for prob, word, record in clues:
- if record is not None: # else wordinfo doesn't know about it
- record.killcount += 1
- S *= 1.0 - prob
- H *= prob
- if S < 1e-200: # prevent underflow
- S, e = frexp(S)
- Sexp += e
- if H < 1e-200: # prevent underflow
- H, e = frexp(H)
- Hexp += e
-
- n = len(clues)
- if n:
- #P = 1.0 - P**(1./num_clues)
- #Q = 1.0 - Q**(1./num_clues)
- #
- # (x*2**e)**n = x**n * 2**(e*n)
- nrecip = 1.0 / n
- P = 1.0 - S**nrecip * 2.0**(Sexp * nrecip)
- Q = 1.0 - H**nrecip * 2.0**(Hexp * nrecip)
-
- # Compute the natural log of the product = sum of the logs:
- # ln(x * 2**i) = ln(x) + i * ln(2).
- S = ln(S) + Sexp * LN2
- H = ln(H) + Hexp * LN2
-
- S = 1.0 - chi2Q(-2.0 * S, 2*n)
- H = 1.0 - chi2Q(-2.0 * H, 2*n)
-
- else:
- P = Q = S = H = 1.0
-
- gary_score = P/(P+Q)
-
- # How to combine these into a single spam score? We originally
- # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A
- # systematic problem is that we could end up being near-certain
- # a thing was (for example) spam, even if S was small, provided
- # that H was much smaller.
- # Rob Hooft stared at these problems and invented the measure
- # we use now, the simpler S-H, scaled into [0., 1.].
- chi_score = (S-H + 1.0) / 2.0
-
- x = options.mixed_combining_chi_weight
- prob = x * chi_score + (1.0 - x) * gary_score
-
- if evidence:
- clues = [(w, p) for p, w, r in clues]
- clues.sort(lambda a, b: cmp(a[1], b[1]))
- clues.insert(0, ('*P*', P))
- clues.insert(0, ('*Q*', Q))
- clues.insert(0, ('*S*', S))
- clues.insert(0, ('*H*', H))
- clues.insert(0, ('*chi_score*', chi_score))
- clues.insert(0, ('*gary_score*', gary_score))
- return prob, clues
- else:
- return prob
-
- if options.use_mixed_combining:
- spamprob = mixed_spamprob
--- 520,521 ----
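
The combining comments above are easy to make concrete with a tiny
numeric sketch (all constants invented for illustration). When the
ham and spam tests both fire strongly, the old S/(S+H) measure goes
near-certain while Rob Hooft's (S-H+1)/2 stays in the middle ground,
and mixed_spamprob softens extremes with the same kind of convex
blend:

    S, H = 1e-10, 1e-30           # both tests say "not uniform"
    print(S / (S + H))            # ~1.0: crazy-certain despite tiny S
    print((S - H + 1.0) / 2.0)    # ~0.5: stays appropriately unsure

    # mixed_spamprob's blend, with an assumed weight of 0.9 for
    # options.mixed_combining_chi_weight:
    w = 0.9
    chi_score, gary_score = 0.5, 0.98
    print(w * chi_score + (1.0 - w) * gary_score)   # 0.548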