[Tutor] about a program

Sun Jun 8 09:27:01 2003

--0-1080038127-1055078789=:59825
Content-Type: multipart/alternative; boundary="0-1564231918-1055078789=:59825"

--0-1564231918-1055078789=:59825
Content-Type: text/plain; charset=us-ascii

Hi,

I was working on a program that verifies whether a given message is spam or not. the program uses statistical analysis based on Paul Graham's plan for spam. However I set up the alforithm of the program as follows:

1.read the mails spam or non-spam form respective directories(build_corpus())

2.coumpute the the frequency of each word

3.get a message to check for "spamness" and compute the probility of each word

by using the the frequency of the words and put in adictionary {'word':probality}

4.take the 15 most improbaple and put it in a list and combine them

5. if the sscore of the combination is greater than 90% then the message is spam

PROBLEM: I have a problem getting wrong values, so can anyone have a look the program set up and comment if I can figure out why I am getting the wrong values.

I know it is hard to follow someones's program setup but sometimes with a third party could be a lot of help.

the program is attached with this e-mail

thanks in advance

---------------------------------
Do you Yahoo!?
Free online calendar with sync to Outlook(TM).
--0-1564231918-1055078789=:59825
Content-Type: text/html; charset=us-ascii

<DIV>
<DIV><FONT size=2>
<P>Hi,</P>
<P>I was working on a program that verifies whether a given message is spam or not. the program uses statistical analysis based on Paul Graham's plan for spam. However I set up the alforithm of the program as follows:</P>
<P>1.read the mails spam or non-spam form respective directories(build_corpus())</P>
<P>2.coumpute the the frequency of each word</P>
<P>3.get a message to check for "spamness" and compute the probility of each word</P>
<P>by using the the frequency of the words and put in adictionary {'word':probality}</P>
<P>4.take the 15 most improbaple and put it in a list and combine them</P>
<P>5. if the sscore of the combination is greater than 90% then the message is spam</P>
<P>PROBLEM: I have a problem getting wrong values, so can anyone have a look the program set up and comment if I can figure out why I am getting the wrong values.</P>
<P>I know it is hard to follow someones's program setup but sometimes with a third party could be a lot of help.</P>
<P>the program is attached with this e-mail</P>
<P>thanks in advance</P></FONT></DIV></DIV><p><hr SIZE=1>
Do you Yahoo!?<br>
Free <a href="http://us.rd.yahoo.com/mail_us/tag/*http://calendar.yahoo.com">online calendar</a> with sync to Outlook(TM).
--0-1564231918-1055078789=:59825--
--0-1080038127-1055078789=:59825
Content-Type: text/plain; name="TestTest.py"
Content-Description: TestTest.py
Content-Disposition: inline; filename="TestTest.py"

import sys
import os
import re
import math
import cPickle
import string
#from utils import *
from glob import glob

def incr(hash,key):
    """ this function counts the frequency of word """
    hash[key]=hash.get(key,0)+1

#this function used glob which reads multiple files
def getwords(fn,addfreq):
    #print "Read %s..."%fn
    file=open(fn)
    text=file.read()
    file.close()
    for word in text.split():
        if len(word)<100:
            lw=word.lower()
            incr(addfreq,lw)
            incr(addfreq,'*')

class Classifier(ClassifierI):

    def __init__(self):
        self.spam={'*': 0}
        self.nonspam={'*': 0}

    def classify(self,token):
        return Token(LabeledText(token.type(),'spam'),token.loc())
    def labels(self):
   	    return ('spam','nonspam')

    #def hello(self):
        #print "hello"

    def generate_prob(self,word):
        """ This function computes the probability that a word occurs n times """
        # first change case

        lowerWord = word.lower()
        #print "lowerWord= %s" %(lowerWord)#

        # goodword frequency
        g = float(self.nonspam.get(lowerWord, 0) * 2)
        print "g = %5.3f" %(g)

        #print self.nonspam.get(lowerWord, 0)

        #print" g = %5.3f" %(g)#
        # bad word frequency
        b = float(self.spam.get(lowerWord, 0))

        # non-spammed counts    
        goodCount = self.nonspam['*']
        #print "good count = %d" %(goodCount)#

        # spammed counts
        badCount = self.spam['*']

        # Not seen before      
        if g == 0 and b == 0:
            return 0.2
        # Not frequent enough   
        if  g + b < 5:
            return 0.2

        bfreq = min (1.0 , b / badCount )
        gfreq = min (1.0 , g / goodCount )

        result = max(0.01, min(0.99, (bfreq /gfreq + bfreq)))

        return result

    def isSpam(self, Message):
        """    """
        #setup a regular expression
        word_like = re.compile( '[-\w\'$]+')
        temp_result = word_like.findall(Message)

        #this is the third hashtable that will be stored
        #the word and its calculated probability temp_dict{'word':probability}
        temp_dict = {}
        for word in temp_result:
            p = self.generate_prob(word)

            #print " p = %5.3f" %(p)#
            p2 = abs(p - 0.5)
            temp_dict[word] = p2

        #print temp_dict
        # call for report token which counts the frequency in
        # descending order and returns a list of 16 less probable word frequency
        list = report_tokens(temp_dict)          

        # call for combine fucntion which combines the probabilty of
        # 16 less probable word frequency and returns combination
        # of score greater than 0.90%
        score = combine(list)
        print score

        if score > 0.90:
            spam_message = "the message is spammed....."
            print spam_message 
            #return spam_message              
        else:
            non_spam_message = "the message is non spammed....."
            print non_spam_message
            #return non_spam_message

def build_corpus():
    """it scans several mails """
    print "Scanning the files in the directory......"
    count=Classifier()
    for file in glob("Spam/msg*.txt"):
        #print file
        getwords(file,count.spam)
    print "Spam: %d words known" % len(count.spam)
    for file in glob("NonSpam/1000*-*.txt"):
        #print file        
        getwords(file,count.nonspam)
    print "Nonspam: %d words known" % len(count.nonspam)
    return count

def main():
    build_corpus()

    save_Data()
    sw =Classifier()

    # for tseting
    #'Spam/msg101.txt'
    file = open('NonSpam/10002-nspm.txt') 
    text = file.read()            
    sw.isSpam(text)

if __name__=='__main__':
    main()
    #for arg in sys.argv[1:]:
    #global results
    #load_Data()

    #build_corpus()
    #save_Data()
    #sw =Classifier()

    # for tseting
    #'Spam/msg101.txt'
    #file = open('10002-nspm.txt') 
    #text = file.read()            
    #sw.isSpam(text)

#------------------------------------------
#a = Classifier()
#rint a.classify(Token("this is a sentence"))
#b = a.hello()
#b
#------------------------------

--0-1080038127-1055078789=:59825--