[spambayes-dev] Mozilla SpamBayes "porting"

Miguel Vargas miguel at vargas.com
Fri Feb 20 23:49:10 EST 2004


By the way, here's the latest incarnation of my code.  I'm still 
somewhat confused about the floor() function in chi2Q.  I understand why 
it's not needed, so why is it in the SpamBayes code?

/** This section comes from probability(self, record) and 
_getclues(self, wordstream)**/
     // Compute a smoothed spam probability for every token and mark tokens
     // whose probability is too close to 0.5 so later stages skip them.
     for (i = 0; i < count; ++i) {
         // tokens is an array of Token; each element carries both
         // token.mProbability and token.mDistance.
         Token& token = tokens[i];

         // A word missing from a table counts as zero occurrences.
         const char* word = token.mWord;
         Token* t = mGoodTokens.get(word);
         double hamcount = ((t != NULL) ? t->mCount : 0);
         t = mBadTokens.get(word);
         double spamcount = ((t != NULL) ? t->mCount : 0);

         // Relative frequency of the word in each corpus.
         double hamratio = hamcount / ngood;
         double spamratio = spamcount / nbad;
         double ratiosum = hamratio + spamratio;

         // A word seen in neither corpus would yield 0/0 (NaN); treat it
         // explicitly as neutral instead of relying on NaN comparisons.
         prob = (ratiosum > 0.0) ? (spamratio / ratiosum) : 0.5;

         // Pull the raw estimate toward 0.5; the fewer observations (n),
         // the stronger the pull.
         double n = hamcount + spamcount;
         prob = (0.225 + n * prob) / (.45 + n);

         // fabs(), not abs(): the integer abs() would truncate every
         // distance in [0, 0.5] to zero and discard all clues.
         double distance = fabs(prob - 0.5);
         if (distance >= .1) {
             goodclues++;
             token.mDistance = distance;
             token.mProbability = prob;
         } else {
             token.mDistance = -1; // ignore clue
         }
     }

     // Choose the window [first, last) of tokens to score.  With 150 or
     // fewer tokens the whole array is used as-is; otherwise the array is
     // sorted by token.mDistance (via compareTokens) and only the final 150
     // entries are kept.
     // NOTE(review): this presumes compareTokens orders ascending, so the
     // tail holds the largest distances -- confirm against compareTokens.
     PRUint32 last = count;
     PRUint32 first = 0;
     if (count > 150) {
         NS_QuickSort(tokens, count, sizeof(Token), compareTokens, NULL);
         first = count - 150;
     }


/** This section comes from chi2_spamprob(self, wordstream, 
evidence=False) **/
     // S accumulates the product of (1 - p) and H the product of p over all
     // usable clues.  To avoid underflow, each product is renormalised with
     // frexp() whenever it drops below 1e-200 and the removed power of two
     // is tracked separately in Sexp / Hexp.
     double H = 1.0, S = 1.0;
     // The exponents returned by frexp() for such tiny values are negative,
     // so the accumulators must be signed; an unsigned type would wrap and
     // turn the logarithms below into huge positive numbers.
     int Hexp = 0, Sexp = 0;
     goodclues = 0;
     int e;
     for (i = first; i < last; ++i) {
         // A distance of -1 marks a token that is to be ignored.
         if (tokens[i].mDistance != -1) {
             goodclues++;
             double value = tokens[i].mProbability;
             S *= (1.0 - value);
             H *= value;
             if (S < 1e-200) {
                 S = frexp(S, &e);
                 Sexp += e;
             }
             if (H < 1e-200) {
                 H = frexp(H, &e);
                 Hexp += e;
             }
         }
     }

     // Rebuild the natural log of each full product:
     // ln(mantissa * 2^exp) = ln(mantissa) + exp * ln(2).
     S = log(S) + Sexp * M_LN2;
     H = log(H) + Hexp * M_LN2;

     if (goodclues > 0) {
         // -2 * ln(product) is tested against chi-squared with 2 * goodclues
         // degrees of freedom.
         S = 1.0 - chi2Q(-2.0 * S, 2 * goodclues);
         H = 1.0 - chi2Q(-2.0 * H, 2 * goodclues);
         prob = (S - H + 1.0) / 2.0;
     } else {
         // No usable clues: report a neutral score.
         prob = 0.5;
     }

     PRBool isJunk = (prob >= 0.90);

------------------------------------
Here's the chi2Q function:
/**
 * Upper-tail probability of the chi-squared distribution: the probability
 * that a chi-squared variate with v degrees of freedom is >= x2.
 *
 * For even v the closed form is
 *     exp(-m) * sum_{i=0}^{v/2 - 1} m^i / i!      with m = x2 / 2,
 * so exactly v/2 terms are summed (the i = 0 term is the initial value).
 *
 * @param x2  the chi-squared statistic
 * @param v   degrees of freedom; must be even (asserted)
 * @return    the tail probability, capped at 1.0
 */
double chi2Q (double x2, PRUint32 v) {
    PRUint32 i;
    double m = x2 / 2.0;
    double sum = exp(-m);   /* i = 0 term */
    double term = sum;

    NS_ASSERTION(!(v & 1), "chi2Q called with odd value");

    /* Terms i = 1 .. v/2 - 1.  The bound is strict: using i <= v/2 adds one
       term too many (e.g. for v == 2 the result must be exactly exp(-m)). */
    for (i = 1; i < v / 2; ++i) {
        term *= m / i;
        sum += term;
    }

    /* Accumulated round-off can push the sum slightly above 1.0; cap it.
       dmin is defined elsewhere -- presumably the smaller of two doubles. */
    return dmin(sum, 1.0);
}



More information about the spambayes-dev mailing list