[spambayes-dev] Mozilla SpamBayes "porting"
Miguel Vargas
miguel at vargas.com
Fri Feb 20 23:49:10 EST 2004
By the way, here's the latest incarnation of my code. I'm still
somewhat confused about the floor() function in chi2Q. I understand why
it's not needed, so why is it in the SpamBayes code?
/** This section comes from probability(self, record) and
_getclues(self, wordstream)**/
// For every token: look up how often it was seen in good and bad mail,
// turn that into a spam probability, and keep it as a clue only when the
// probability is at least 0.1 away from the neutral value 0.5.
for (i = 0; i < count; ++i) {
    Token& token = tokens[i];
    // tokens is an array of Token, elements of a Token
    // include both token.mProbability and token.mDistance
    const char* word = token.mWord;
    Token* t = mGoodTokens.get(word);
    double hamcount = ((t != NULL) ? t->mCount : 0);
    t = mBadTokens.get(word);
    double spamcount = ((t != NULL) ? t->mCount : 0);
    double n = hamcount + spamcount;
    if (n > 0) {
        prob = (spamcount / nbad) / (hamcount / ngood + spamcount / nbad);
    } else {
        // Token never seen in either corpus: the ratio above would be 0/0
        // (NaN). Use the neutral value explicitly instead of relying on
        // NaN comparisons to reject the clue below.
        prob = 0.5;
    }
    // Pull the raw estimate towards 0.5 when there is little evidence
    // (0.225 == 0.45 * 0.5).
    prob = (0.225 + n * prob) / (.45 + n);
    // fabs, not abs: the integer abs() would truncate the difference to 0
    // and no token would ever qualify as a clue.
    double distance = fabs(prob - 0.5);
    if (distance >= .1) {
        goodclues++;
        token.mDistance = distance;
        token.mProbability = prob;
    } else {
        token.mDistance = -1; //ignore clue
    }
}
// sort the array by the token distances
// Only the last 150 entries are used when there are more than 150 tokens.
// NOTE(review): this presumes compareTokens orders by ascending mDistance,
// so the strongest clues end up at the tail -- confirm in compareTokens.
PRUint32 first, last = count;
if (count > 150) {
    first = count - 150;
    // This function sorts the array by token.mDistance
    NS_QuickSort(tokens, count, sizeof(Token), compareTokens, NULL);
} else {
    first = 0;
}
/** This section comes from chi2_spamprob(self, wordstream,
evidence=False) **/
// S accumulates the product of (1 - p) and H the product of p over the
// selected clues. To avoid underflow, whenever a product drops below
// 1e-200 its binary exponent is split off with frexp() and accumulated
// separately in Sexp / Hexp.
double H = 1.0, S = 1.0;
// The exponents returned by frexp() for tiny values are negative, so the
// accumulators must be signed; an unsigned type would wrap around and
// corrupt the logarithms computed below.
int Hexp = 0, Sexp = 0;
goodclues=0;
int e;
for (i = first; i < last; ++i) {
    if (tokens[i].mDistance != -1) {
        goodclues++;
        double value = tokens[i].mProbability;
        S *= (1.0 - value);
        H *= value;
        if ( S < 1e-200 ) {
            S = frexp(S, &e);
            Sexp += e;
        }
        if ( H < 1e-200 ) {
            H = frexp(H, &e);
            Hexp += e;
        }
    }
}
// Recombine mantissa and exponent: ln(x * 2**e) = ln(x) + e * ln(2).
S = log(S) + Sexp * M_LN2;
H = log(H) + Hexp * M_LN2;
if (goodclues>0) {
    // -2 * ln(product) is tested against a chi-squared distribution with
    // 2 * goodclues degrees of freedom.
    S = 1.0 - chi2Q(-2.0 * S, 2 * goodclues);
    H = 1.0 - chi2Q(-2.0 * H, 2 * goodclues);
    prob = (S-H +1.0) / 2.0;
} else {
    // No usable clues: report the neutral score.
    prob = 0.5;
}
PRBool isJunk = (prob >= 0.90);
------------------------------------
Here's the chi2Q function:
/**
 * Probability that a chi-squared variable with v degrees of freedom is
 * at least x2. v must be even.
 *
 * For even v the value has the closed form
 *     exp(-m) * (1 + m + m^2/2! + ... + m^(v/2 - 1)/(v/2 - 1)!),  m = x2 / 2
 * so exactly v/2 terms are summed: the initial exp(-m) plus v/2 - 1 more.
 *
 * @param x2  the chi-squared statistic
 * @param v   degrees of freedom (even)
 * @return    the tail probability, clamped to at most 1.0
 */
double chi2Q (double x2, PRUint32 v) {
    PRUint32 i;
    double m = x2 / 2.0;
    double sum = exp(-m);
    double term = sum;
    NS_ASSERTION(!(v & 1), "chi2Q called with odd value");
    // Strictly less than v/2: the i = 0 term is already in sum, so running
    // up to and including v/2 would add one term too many.
    for (i = 1; i < v / 2; ++i) {
        term *= m / i;
        sum += term;
    }
    // Accumulated rounding error can push the sum slightly above 1.0.
    return dmin(sum, 1.0);
}
More information about the spambayes-dev
mailing list