[Spambayes-checkins] spambayes Persistent.py,1.1,1.2 hammiebulk.py,1.1,1.2 Corpus.py,1.2,1.3 FileCorpus.py,1.2,1.3 Options.py,1.75,1.76 TestDriver.py,1.30,1.31 Tester.py,1.8,1.9 classifier.py,1.53,1.54 dbdict.py,1.1,1.2 hammie.py,1.40,1.41 hammiefilter.py,1.2,1.3 pop3proxy.py,1.18,1.19 Bayes.py,1.5,NONE

Neale Pickett <npickett@users.sourceforge.net>
Mon Nov 25 02:29:47 2002


Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv31682

Modified Files:
	Corpus.py FileCorpus.py Options.py TestDriver.py Tester.py 
	classifier.py dbdict.py hammie.py hammiefilter.py pop3proxy.py 
Added Files:
	Persistent.py hammiebulk.py 
Removed Files:
	Bayes.py 
Log Message:
* Merge from hammie-playground to HEAD.  See spambayes list for more
  details.




Index: Corpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Corpus.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** Corpus.py	16 Nov 2002 19:03:15 -0000	1.2
--- Corpus.py	25 Nov 2002 02:29:44 -0000	1.3
***************
*** 230,234 ****
  
          return msg
!         
  
  class ExpiryCorpus:
--- 230,234 ----
  
          return msg
! 
  
  class ExpiryCorpus:
***************
*** 272,276 ****
      def __init__(self):
          '''Constructor()'''
!         pass
  
      def load(self):
--- 272,278 ----
      def __init__(self):
          '''Constructor()'''
! 
!         self.payload = None
!         self.hdrtxt = None
  
      def load(self):
***************
*** 297,301 ****
          '''Instance as a printable string'''
  
!         return self.substance
  
      def name(self):
--- 299,303 ----
          '''Instance as a printable string'''
  
!         return self.getSubstance()
  
      def name(self):
***************
*** 311,322 ****
      def setSubstance(self, sub):
          '''set this message substance'''
!         
!         self.substance = sub
!         
      def getSubstance(self):
          '''Return this message substance'''
!         
!         return self.substance
!         
      def setSpamprob(self, prob):
          '''Score of the last spamprob calc, may not be persistent'''
--- 313,328 ----
      def setSubstance(self, sub):
          '''set this message substance'''
! 
!         bodyRE = re.compile(r"\r?\n(\r?\n)(.*)", re.DOTALL+re.MULTILINE)
!         bmatch = bodyRE.search(sub)
!         if bmatch:
!             self.payload = bmatch.group(2)
!             self.hdrtxt = sub[:bmatch.start(2)]
! 
      def getSubstance(self):
          '''Return this message substance'''
! 
!         return self.hdrtxt + self.payload
! 
      def setSpamprob(self, prob):
          '''Score of the last spamprob calc, may not be persistent'''
***************
*** 327,331 ****
          '''Returns substance as tokens'''
  
!         return tokenizer.tokenize(self.substance)
  
      def createTimeStamp(self):
--- 333,337 ----
          '''Returns substance as tokens'''
  
!         return tokenizer.tokenize(self.getSubstance())
  
      def createTimeStamp(self):
***************
*** 335,338 ****
--- 341,398 ----
          raise NotImplementedError
  
+     def getFrom(self):
+         '''Return the message's From header contents'''
+ 
+         if self.hdrtxt:
+             match = re.search(r'^From:(.*)$', self.hdrtxt, re.MULTILINE)
+             return match.group(1)
+         else:
+             return None
+ 
+     def getSubject(self):
+         '''Return the message's Subject header contents'''
+ 
+         if self.hdrtxt:
+             match = re.search(r'^Subject:(.*)$', self.hdrtxt, re.MULTILINE)
+             return match.group(1)
+         else:
+             return None
+ 
+     def getDate(self):
+         '''Return the message's Date header contents'''
+ 
+         if self.hdrtxt:
+             match = re.search(r'^Date:(.*)$', self.hdrtxt, re.MULTILINE)
+             return match.group(1)
+         else:
+             return None
+ 
+     def getHeadersList(self):
+         '''Return a list of message header tuples'''
+ 
+         hdrregex = re.compile(r'^([A-Za-z0-9-_]*): ?(.*)$', re.MULTILINE)
+         data = re.sub(r'\r?\n\r?\s', ' ', self.hdrtxt)
+         match = hdrregex.findall(data)
+ 
+         return match
+ 
+     def getHeaders(self):
+         '''Return message headers as text'''
+         
+         return self.hdrtxt
+ 
+     def getPayload(self):
+         '''Return the message body'''
+ 
+         return self.payload
+ 
+     def stripSBDHeader(self):
+         '''Removes the X-Spambayes-Disposition: header from the message'''
+ 
+         # This is useful for training, where a spammer may be spoofing
+         # our header, to make sure that our header doesn't become an
+         # overweight clue to hamminess
+ 
+         raise NotImplementedError
  
  

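For reference, a minimal sketch of what the new substance splitting and
header accessors do.  It assumes Corpus.py imports re (the new
setSubstance() depends on it); Message's load()/store() are abstract but
aren't needed here, and the message text is made up:

    import Corpus

    m = Corpus.Message()
    # setSubstance() splits the raw text at the first blank line into
    # self.hdrtxt and self.payload; the accessors regex over hdrtxt.
    m.setSubstance("From: a@b.com\nSubject: hi there\n\nSome body text\n")
    print(m.getSubject())      # -> ' hi there' (leading space is kept)
    print(m.getFrom())         # -> ' a@b.com'
    print(m.getPayload())      # -> 'Some body text\n'
    print(m.getHeadersList())  # -> list of (name, value) tuples
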
Index: FileCorpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/FileCorpus.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** FileCorpus.py	16 Nov 2002 19:06:27 -0000	1.2
--- FileCorpus.py	25 Nov 2002 02:29:44 -0000	1.3
***************
*** 86,90 ****
  
  import Corpus
! import Bayes
  import sys, os, gzip, fnmatch, getopt, errno, time, stat
  
--- 86,90 ----
  
  import Corpus
! import Persistent
  import sys, os, gzip, fnmatch, getopt, errno, time, stat
  
***************
*** 192,195 ****
--- 192,196 ----
          '''Constructor(message file name, corpus directory name)'''
  
+         Corpus.Message.__init__(self)
          self.file_name = file_name
          self.directory = directory
***************
*** 214,218 ****
                 raise
          else:
!            self.substance = fp.read()
             fp.close()
  
--- 215,219 ----
                 raise
          else:
!            self.setSubstance(fp.read())
             fp.close()
  
***************
*** 225,229 ****
          pn = self.pathname()
          fp = open(pn, 'wb')
!         fp.write(self.substance)
          fp.close()
  
--- 226,230 ----
          pn = self.pathname()
          fp = open(pn, 'wb')
!         fp.write(self.getSubstance())
          fp.close()
  
***************
*** 248,260 ****
  
          elip = ''
!         sub = self.substance
! 
          if Corpus.Verbose:
!             sub = self.substance
          else:
!             if len(self.substance) > 20:
!                 sub = self.substance[:20]
!                 if len(self.substance) > 40:
!                     sub += '...' + self.substance[-20:]
  
          pn = os.path.join(self.directory, self.file_name)
--- 249,261 ----
  
          elip = ''
!         sub = self.getSubstance()
!         
          if Corpus.Verbose:
!             sub = self.getSubstance()
          else:
!             if len(sub) > 20:
!                 sub = sub[:20]
!                 if len(sub) > 40:
!                     sub += '...' + sub[-20:]
  
          pn = os.path.join(self.directory, self.file_name)
***************
*** 304,308 ****
                  raise
          else:
!             self.substance = fp.read()
              fp.close()
  
--- 305,309 ----
                  raise
          else:
!             self.setSubstance(fp.read())
              fp.close()
  
***************
*** 316,320 ****
          pn = self.pathname()
          gz = gzip.open(pn, 'wb')
!         gz.write(self.substance)
          gz.flush()
          gz.close()
--- 317,321 ----
          pn = self.pathname()
          gz = gzip.open(pn, 'wb')
!         gz.write(self.getSubstance())
          gz.flush()
          gz.close()
***************
*** 342,354 ****
          print 'Executing with uncompressed files'
  
!     print '\n\nCreating two Bayes databases'
!     miscbayes = Bayes.PickledBayes('fctestmisc.bayes')
!     classbayes = Bayes.DBDictBayes('fctestclass.bayes')
  
      print '\n\nSetting up spam corpus'
      spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
!     spamtrainer = Bayes.SpamTrainer(miscbayes)
      spamcorpus.addObserver(spamtrainer)
!     anotherspamtrainer = Bayes.SpamTrainer(classbayes, Bayes.UPDATEPROBS)
      spamcorpus.addObserver(anotherspamtrainer)
  
--- 343,355 ----
          print 'Executing with uncompressed files'
  
!     print '\n\nCreating two Classifier databases'
!     miscbayes = Persistent.PickledClassifier('fctestmisc.bayes')
!     classbayes = Persistent.DBDictClassifier('fctestclass.bayes')
  
      print '\n\nSetting up spam corpus'
      spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
!     spamtrainer = Persistent.SpamTrainer(miscbayes)
      spamcorpus.addObserver(spamtrainer)
!     anotherspamtrainer = Persistent.SpamTrainer(classbayes, Persistent.UPDATEPROBS)
      spamcorpus.addObserver(anotherspamtrainer)
  
***************
*** 365,374 ****
                            'fctesthamcorpus', \
                            'MSG*')
!     hamtrainer = Bayes.HamTrainer(miscbayes)
      hamcorpus.addObserver(hamtrainer)
      hamtrainer.trainAll(hamcorpus)
  
! 
!     print '\n\nAdd a message to hamcorpus that does not match the filter'
      if useGzip:
          fmClass = GzipFileMessage
--- 366,374 ----
                            'fctesthamcorpus', \
                            'MSG*')
!     hamtrainer = Persistent.HamTrainer(miscbayes)
      hamcorpus.addObserver(hamtrainer)
      hamtrainer.trainAll(hamcorpus)
  
!     print '\n\nA couple of message related tests'
      if useGzip:
          fmClass = GzipFileMessage
***************
*** 377,380 ****
--- 377,383 ----
  
      m1 = fmClass('XMG00001', 'fctestspamcorpus')
+     m1.setSubstance(testmsg2())
+     
+     print '\n\nAdd a message to hamcorpus that does not match the filter'
  
      try:
***************
*** 417,421 ****
  
      print '\n\nTrain with an individual message'
!     anotherhamtrainer = Bayes.HamTrainer(classbayes)
      anotherhamtrainer.train(unsurecorpus['MSG00005'])
  
--- 420,424 ----
  
      print '\n\nTrain with an individual message'
!     anotherhamtrainer = Persistent.HamTrainer(classbayes)
      anotherhamtrainer.train(unsurecorpus['MSG00005'])
  
***************
*** 428,431 ****
--- 431,443 ----
      msg = spamcorpus['MSG00001']
      print msg
+     print '\n\nThis is some vital information in the message'
+     print 'Date header is',msg.getDate()
+     print 'Subject header is',msg.getSubject()
+     print 'From header is',msg.getFrom()
+     
+     print 'Header text is:',msg.getHeaders()
+     print 'Headers are:',msg.getHeadersList()
+     print 'Body is:',msg.getPayload()
+ 
  
  
***************
*** 526,538 ****
  
      m1 = fmClass('MSG00001', 'fctestspamcorpus')
!     m1.substance = tm1
      m1.store()
  
      m2 = fmClass('MSG00002', 'fctestspamcorpus')
!     m2.substance = tm2
      m2.store()
  
      m3 = fmClass('MSG00003', 'fctestunsurecorpus')
!     m3.substance = tm1
      m3.store()
  
--- 538,550 ----
  
      m1 = fmClass('MSG00001', 'fctestspamcorpus')
!     m1.setSubstance(tm1)
      m1.store()
  
      m2 = fmClass('MSG00002', 'fctestspamcorpus')
!     m2.setSubstance(tm2)
      m2.store()
  
      m3 = fmClass('MSG00003', 'fctestunsurecorpus')
!     m3.setSubstance(tm1)
      m3.store()
  
***************
*** 546,558 ****
  
      m4 = fmClass('MSG00004', 'fctestunsurecorpus')
!     m4.substance = tm1
      m4.store()
  
      m5 = fmClass('MSG00005', 'fctestunsurecorpus')
!     m5.substance = tm2
      m5.store()
  
      m6 = fmClass('MSG00006', 'fctestunsurecorpus')
!     m6.substance = tm2
      m6.store()
  
--- 558,570 ----
  
      m4 = fmClass('MSG00004', 'fctestunsurecorpus')
!     m4.setSubstance(tm1)
      m4.store()
  
      m5 = fmClass('MSG00005', 'fctestunsurecorpus')
!     m5.setSubstance(tm2)
      m5.store()
  
      m6 = fmClass('MSG00006', 'fctestunsurecorpus')
!     m6.setSubstance(tm2)
      m6.store()
  
***************
*** 583,587 ****
  Content-Type:text/plain; charset=us-ascii
  Content- Transfer- Encoding:7bit
- 
  Message-ID:<15814.42238.882013.702030@montanaro.dyndns.org>
  Date:Mon, 4 Nov 2002 10:49:02 -0600
--- 595,598 ----
***************
*** 644,648 ****
  Content-Type:text/plain; charset=us-ascii
  Content- Transfer- Encoding:7bit
- 
  X-Hammie- Disposition:Unsure
  
--- 655,658 ----

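The test driver above exercises the new Persistent module, whose source
is not part of this diff.  Pieced together from that usage, a minimal
training setup looks roughly like the following; FileMessageFactory
stands in for whatever factory fmFact is, and the paths are made up:

    import Persistent
    from FileCorpus import FileCorpus, FileMessageFactory

    bayes = Persistent.PickledClassifier('my.bayes')
    corpus = FileCorpus(FileMessageFactory(), 'spamdir', 'MSG*')
    trainer = Persistent.SpamTrainer(bayes)
    corpus.addObserver(trainer)    # messages added later train automatically
    trainer.trainAll(corpus)       # train on everything already there
    bayes.store()                  # flush to disk, as Hammie.store() does
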
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.75
retrieving revision 1.76
diff -C2 -d -r1.75 -r1.76
*** Options.py	20 Nov 2002 22:41:50 -0000	1.75
--- Options.py	25 Nov 2002 02:29:44 -0000	1.76
***************
*** 198,209 ****
  show_unsure: False
  
- # Near the end of Driver.test(), you can get a listing of the best
- # discriminators in the words from the training sets.  These are the
- # words whose WordInfo.killcount values are highest, meaning they most
- # often were among the most extreme clues spamprob() found.  The number
- # of best discriminators to show is given by show_best_discriminators;
- # set this <= 0 to suppress showing any of the best discriminators.
- show_best_discriminators: 30
- 
  # The maximum # of characters to display for a msg displayed due to the
  # show_xyz options above.
--- 198,201 ----
***************
*** 346,356 ****
  clue_mailheader_cutoff: 0.5
  
! # The default database path used by hammie
! persistent_storage_file: hammie.db
! 
! # hammie can use either a database (quick to score one message) or a pickle
! # (quick to train on huge amounts of messages). Set this to True to use a
! # database by default.
! persistent_use_database: False
  
  [pop3proxy]
--- 338,347 ----
  clue_mailheader_cutoff: 0.5
  
! [hammiefilter]
! # hammiefilter can use either a database (quick to score one message) or
! # a pickle (quick to train on huge amounts of messages). Set this to
! # True to use a database by default.
! hammiefilter_persistent_use_database: True
! hammiefilter_persistent_storage_file: ~/.hammiedb
  
  [pop3proxy]
***************
*** 368,371 ****
--- 359,364 ----
  pop3proxy_ham_cache: pop3proxy-ham-cache
  pop3proxy_unknown_cache: pop3proxy-unknown-cache
+ pop3proxy_persistent_use_database: False
+ pop3proxy_persistent_storage_file: hammie.db
  
  # Deprecated - use pop3proxy_servers and pop3proxy_ports instead.
***************
*** 411,415 ****
                     'show_histograms': boolean_cracker,
                     'percentiles': ('get', lambda s: map(float, s.split())),
-                    'show_best_discriminators': int_cracker,
                     'save_trained_pickles': boolean_cracker,
                     'save_histogram_pickles': boolean_cracker,
--- 404,407 ----
***************
*** 436,440 ****
                    },
      'Hammie': {'hammie_header_name': string_cracker,
-                'persistent_storage_file': string_cracker,
                 'clue_mailheader_cutoff': float_cracker,
                 'persistent_use_database': boolean_cracker,
--- 428,431 ----
***************
*** 447,450 ****
--- 438,444 ----
                 'hammie_debug_header_name': string_cracker,
                 },
+     'hammiefilter' : {'hammiefilter_persistent_use_database': boolean_cracker,
+                       'hammiefilter_persistent_storage_file': string_cracker,
+                       },
      'pop3proxy': {'pop3proxy_servers': string_cracker,
                    'pop3proxy_ports': string_cracker,
***************
*** 457,460 ****
--- 451,456 ----
                    'pop3proxy_ham_cache': string_cracker,
                    'pop3proxy_unknown_cache': string_cracker,
+                   'pop3proxy_persistent_use_database': boolean_cracker,
+                   'pop3proxy_persistent_storage_file': string_cracker,
                    },
      'html_ui': {'html_ui_port': int_cracker,

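With the [Hammie] persistence options split out per application,
overrides move into the new sections.  Assuming the usual
bayescustomize.ini override mechanism (not shown in this diff), a local
override would look like:

    [hammiefilter]
    hammiefilter_persistent_use_database: True
    hammiefilter_persistent_storage_file: ~/.hammiedb

    [pop3proxy]
    pop3proxy_persistent_use_database: False
    pop3proxy_persistent_storage_file: hammie.db
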
Index: TestDriver.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** TestDriver.py	19 Nov 2002 17:43:27 -0000	1.30
--- TestDriver.py	25 Nov 2002 02:29:44 -0000	1.31
***************
*** 305,324 ****
              printmsg(e, prob, clues)
  
-         if options.show_best_discriminators > 0:
-             print
-             print "    best discriminators:"
-             stats = [(-1, None)] * options.show_best_discriminators
-             smallest_killcount = -1
-             for w, r in c.wordinfo.iteritems():
-                 if r.killcount > smallest_killcount:
-                     heapreplace(stats, (r.killcount, w))
-                     smallest_killcount = stats[0][0]
-             stats.sort()
-             for count, w in stats:
-                 if count < 0:
-                     continue
-                 r = c.wordinfo[w]
-                 print "        %r %d %g" % (w, r.killcount, r.spamprob)
- 
          if options.show_histograms:
              printhist("this pair:", local_ham_hist, local_spam_hist)
--- 305,308 ----

Index: Tester.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Tester.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** Tester.py	7 Nov 2002 22:30:04 -0000	1.8
--- Tester.py	25 Nov 2002 02:29:44 -0000	1.9
***************
*** 60,68 ****
          if hamstream is not None:
              for example in hamstream:
!                 learn(example, False, False)
          if spamstream is not None:
              for example in spamstream:
!                 learn(example, True, False)
!         self.classifier.update_probabilities()
  
      # Untrain the classifier on streams of ham and spam.  Updates
--- 60,67 ----
          if hamstream is not None:
              for example in hamstream:
!                 learn(example, False)
          if spamstream is not None:
              for example in spamstream:
!                 learn(example, True)
  
      # Untrain the classifier on streams of ham and spam.  Updates
***************
*** 73,81 ****
          if hamstream is not None:
              for example in hamstream:
!                 unlearn(example, False, False)
          if spamstream is not None:
              for example in spamstream:
!                 unlearn(example, True, False)
!         self.classifier.update_probabilities()
  
      # Run prediction on each sample in stream.  You're swearing that stream
--- 72,79 ----
          if hamstream is not None:
              for example in hamstream:
!                 unlearn(example, False)
          if spamstream is not None:
              for example in spamstream:
!                 unlearn(example, True)
  
      # Run prediction on each sample in stream.  You're swearing that stream

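These changes track the classifier's new lazy probabilities (see
classifier.py below): learn() and unlearn() no longer take an
update_probabilities flag, and nothing needs to call
update_probabilities() afterwards.  A minimal sketch of the new calling
convention, with made-up message text:

    from classifier import Classifier
    from tokenizer import tokenize

    spam_text = "Subject: cheap pills\n\nbuy now"
    ham_text = "Subject: lunch\n\nnoon works for me"

    c = Classifier()
    c.learn(tokenize(spam_text), True)    # no third argument any more
    c.learn(tokenize(ham_text), False)
    prob = c.spamprob(tokenize("Subject: pills\n\nbuy pills now"))
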
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.53
retrieving revision 1.54
diff -C2 -d -r1.53 -r1.54
*** classifier.py	18 Nov 2002 18:23:09 -0000	1.53
--- classifier.py	25 Nov 2002 02:29:44 -0000	1.54
***************
*** 1,2 ****
--- 1,3 ----
+ #! /usr/bin/env python
  # An implementation of a Bayes-like spam classifier.
  #
***************
*** 32,36 ****
  
  import math
- import time
  from sets import Set
  
--- 33,36 ----
***************
*** 47,92 ****
  LN2 = math.log(2)       # used frequently by chi-combining
  
! PICKLE_VERSION = 1
  
- class WordInfo(object):
-     __slots__ = ('atime',     # when this record was last used by scoring(*)
-                  'spamcount', # # of spams in which this word appears
-                  'hamcount',  # # of hams in which this word appears
-                  'killcount', # # of times this made it to spamprob()'s nbest
-                  'spamprob',  # prob(spam | msg contains this word)
-                 )
  
      # Invariant:  For use in a classifier database, at least one of
      # spamcount and hamcount must be non-zero.
-     #
-     # (*)atime is the last access time, a UTC time.time() value.  It's the
-     # most recent time this word was used by scoring (i.e., by spamprob(),
-     # not by training via learn()); or, if the word has never been used by
-     # scoring, the time the word record was created (i.e., by learn()).
-     # One good criterion for identifying junk (word records that have no
-     # value) is to delete words that haven't been used for a long time.
-     # Perhaps they were typos, or unique identifiers, or relevant to a
-     # once-hot topic or scam that's fallen out of favor.  Whatever, if
-     # a word is no longer being used, it's just wasting space.
  
!     def __init__(self, atime, spamprob=options.unknown_word_prob):
!         self.atime = atime
!         self.spamcount = self.hamcount = self.killcount = 0
!         self.spamprob = spamprob
  
      def __repr__(self):
!         return "WordInfo%r" % repr((self.atime, self.spamcount,
!                                     self.hamcount, self.killcount,
!                                     self.spamprob))
  
      def __getstate__(self):
!         return (self.atime, self.spamcount, self.hamcount, self.killcount,
!                 self.spamprob)
  
      def __setstate__(self, t):
!         (self.atime, self.spamcount, self.hamcount, self.killcount,
!          self.spamprob) = t
  
! class Bayes:
      # Defining __slots__ here made Jeremy's life needlessly difficult when
      # trying to hook this all up to ZODB as a persistent object.  There's
--- 47,116 ----
  LN2 = math.log(2)       # used frequently by chi-combining
  
! PICKLE_VERSION = 4
! 
! class MetaInfo(object):
!     """Information about the corpora.
! 
!     Contains nham and nspam, used for calculating probabilities.  Also
!     has a revision, incremented every time nham or nspam is adjusted.
!     Nothing uses this, currently, but it's there if you want it.
! 
!     """
!     def __init__(self):
!         self.__setstate__((PICKLE_VERSION, 0, 0))
! 
!     def __repr__(self):
!         return "MetaInfo%r" % repr((self._nspam,
!                                     self._nham,
!                                     self.revision))
! 
!     def __getstate__(self):
!         return (PICKLE_VERSION, self._nspam, self._nham)
! 
!     def __setstate__(self, t):
!         if t[0] != PICKLE_VERSION:
!             raise ValueError("Can't unpickle -- version %s unknown" % t[0])
!         (self._nspam, self._nham) = t[1:]
!         self.revision = 0
! 
!     def incr_rev(self):
!         self.revision += 1
! 
!     def get_nham(self):
!         return self._nham
!     def set_nham(self, val):
!         self._nham = val
!         self.incr_rev()
!     nham = property(get_nham, set_nham)
! 
!     def get_nspam(self):
!         return self._nspam
!     def set_nspam(self, val):
!         self._nspam = val
!         self.incr_rev()
!     nspam = property(get_nspam, set_nspam)
! 
! 
  
  
+ class WordInfo(object):
      # Invariant:  For use in a classifier database, at least one of
      # spamcount and hamcount must be non-zero.
  
!     def __init__(self):
!         self.__setstate__((0, 0))
  
      def __repr__(self):
!         return "WordInfo%r" % repr((self.spamcount,
!                                     self.hamcount))
  
      def __getstate__(self):
!         return (self.spamcount,
!                 self.hamcount)
  
      def __setstate__(self, t):
!         (self.spamcount, self.hamcount) = t
  
! 
! class Classifier:
      # Defining __slots__ here made Jeremy's life needlessly difficult when
      # trying to hook this all up to ZODB as a persistent object.  There's
***************
*** 105,117 ****
      def __init__(self):
          self.wordinfo = {}
!         self.nspam = self.nham = 0
  
      def __getstate__(self):
!         return PICKLE_VERSION, self.wordinfo, self.nspam, self.nham
  
      def __setstate__(self, t):
          if t[0] != PICKLE_VERSION:
              raise ValueError("Can't unpickle -- version %s unknown" % t[0])
!         self.wordinfo, self.nspam, self.nham = t[1:]
  
      # spamprob() implementations.  One of the following is aliased to
--- 129,156 ----
      def __init__(self):
          self.wordinfo = {}
!         self.meta = MetaInfo()
!         self.probcache = {}
  
      def __getstate__(self):
!         return PICKLE_VERSION, self.wordinfo, self.meta
  
      def __setstate__(self, t):
          if t[0] != PICKLE_VERSION:
              raise ValueError("Can't unpickle -- version %s unknown" % t[0])
!         self.wordinfo, self.meta = t[1:]
! 
!     # Slacker's way out--pass calls to nham/nspam up to the meta class
! 
!     def get_nham(self):
!         return self.meta.nham
!     def set_nham(self, val):
!         self.meta.nham = val
!     nham = property(get_nham, set_nham)
! 
!     def get_nspam(self):
!         return self.meta.nspam
!     def set_nspam(self, val):
!         self.meta.nspam = val
!     nspam = property(get_nspam, set_nspam)
  
      # spamprob() implementations.  One of the following is aliased to
***************
*** 145,150 ****
          clues = self._getclues(wordstream)
          for prob, word, record in clues:
-             if record is not None:  # else wordinfo doesn't know about it
-                 record.killcount += 1
              P *= 1.0 - prob
              Q *= prob
--- 184,187 ----
***************
*** 234,239 ****
          clues = self._getclues(wordstream)
          for prob, word, record in clues:
-             if record is not None:  # else wordinfo doesn't know about it
-                 record.killcount += 1
              S *= 1.0 - prob
              H *= prob
--- 271,274 ----
***************
*** 278,282 ****
          spamprob = chi2_spamprob
  
!     def learn(self, wordstream, is_spam, update_probabilities=True):
          """Teach the classifier by example.
  
--- 313,317 ----
          spamprob = chi2_spamprob
  
!     def learn(self, wordstream, is_spam):
          """Teach the classifier by example.
  
***************
*** 285,324 ****
          else that it's definitely not spam.
  
!         If optional arg update_probabilities is False (the default is True),
!         don't update word probabilities.  Updating them is expensive, and if
!         you're going to pass many messages to learn(), it's more efficient
!         to pass False here and call update_probabilities() once when you're
!         done -- or to call learn() with update_probabilities=True when
!         passing the last new example.  The important thing is that the
!         probabilities get updated before calling spamprob() again.
          """
  
          self._add_msg(wordstream, is_spam)
-         if update_probabilities:
-             self.update_probabilities()
  
!     def unlearn(self, wordstream, is_spam, update_probabilities=True):
          """In case of pilot error, call unlearn ASAP after screwing up.
  
          Pass the same arguments you passed to learn().
          """
- 
          self._remove_msg(wordstream, is_spam)
-         if update_probabilities:
-             self.update_probabilities()
  
!     def update_probabilities(self):
!         """Update the word probabilities in the spam database.
  
!         This computes a new probability for every word in the database,
!         so can be expensive.  learn() and unlearn() update the probabilities
!         each time by default.  Thay have an optional argument that allows
!         to skip this step when feeding in many messages, and in that case
!         you should call update_probabilities() after feeding the last
!         message and before calling spamprob().
          """
  
!         nham = float(self.nham or 1)
!         nspam = float(self.nspam or 1)
  
          if options.experimental_ham_spam_imbalance_adjustment:
--- 320,371 ----
          else that it's definitely not spam.
  
!         Word probabilities are no longer updated eagerly.  They are
!         computed on demand by probability() and cached; learn() just
!         clears that cache, so it is cheap to call it in a loop over
!         many messages.
! 
          """
  
          self._add_msg(wordstream, is_spam)
  
!     def unlearn(self, wordstream, is_spam):
          """In case of pilot error, call unlearn ASAP after screwing up.
  
          Pass the same arguments you passed to learn().
          """
          self._remove_msg(wordstream, is_spam)
  
!     def probability(self, record):
!         """Compute, store, and return prob(msg is spam | msg contains word).
  
!         This is the Graham calculation, but stripped of biases, and
!         stripped of clamping into 0.01 thru 0.99.  The Bayesian
!         adjustment following keeps them in a sane range, and one
!         that naturally grows the more evidence there is to back up
!         a probability.
          """
  
!         spamcount = record.spamcount
!         hamcount = record.hamcount
!         
!         # Try the cache first
!         try:
!             return self.probcache[spamcount][hamcount]
!         except KeyError:
!             pass
! 
!         nham = float(self.meta.nham or 1)
!         nspam = float(self.meta.nspam or 1)
! 
!         assert hamcount <= nham
!         hamratio = hamcount / nham
! 
!         assert spamcount <= nspam
!         spamratio = spamcount / nspam
! 
!         prob = spamratio / (hamratio + spamratio)
  
          if options.experimental_ham_spam_imbalance_adjustment:
***************
*** 331,405 ****
          StimesX = S * options.unknown_word_prob
  
-         for word, record in self.wordinfo.iteritems():
-             # Compute p(word) = prob(msg is spam | msg contains word).
-             # This is the Graham calculation, but stripped of biases, and
-             # stripped of clamping into 0.01 thru 0.99.  The Bayesian
-             # adjustment following keeps them in a sane range, and one
-             # that naturally grows the more evidence there is to back up
-             # a probability.
-             hamcount = record.hamcount
-             assert hamcount <= nham
-             hamratio = hamcount / nham
  
!             spamcount = record.spamcount
!             assert spamcount <= nspam
!             spamratio = spamcount / nspam
! 
!             prob = spamratio / (hamratio + spamratio)
  
!             # Now do Robinson's Bayesian adjustment.
!             #
!             #         s*x + n*p(w)
!             # f(w) = --------------
!             #           s + n
!             #
!             # I find this easier to reason about like so (equivalent when
!             # s != 0):
!             #
!             #        x - p
!             #  p +  -------
!             #       1 + n/s
!             #
!             # IOW, it moves p a fraction of the distance from p to x, and
!             # less so the larger n is, or the smaller s is.
  
!             # Experimental:
!             # Picking a good value for n is interesting:  how much empirical
!             # evidence do we really have?  If nham == nspam,
!             # hamcount + spamcount makes a lot of sense, and the code here
!             # does that by default.
!             # But if, e.g., nham is much larger than nspam, p(w) can get a
!             # lot closer to 0.0 than it can get to 1.0.  That in turn makes
!             # strong ham words (high hamcount) much stronger than strong
!             # spam words (high spamcount), and that makes the accidental
!             # appearance of a strong ham word in spam much more damaging than
!             # the accidental appearance of a strong spam word in ham.
!             # So we don't give hamcount full credit when nham > nspam (or
!             # spamcount when nspam > nham):  instead we knock hamcount down
!             # to what it would have been had nham been equal to nspam.  IOW,
!             # we multiply hamcount by nspam/nham when nspam < nham; or, IOOW,
!             # we don't "believe" any count to an extent more than
!             # min(nspam, nham) justifies.
  
!             n = hamcount * spam2ham  +  spamcount * ham2spam
!             prob = (StimesX + n * prob) / (S + n)
  
!             if record.spamprob != prob:
!                 record.spamprob = prob
!                 # The next seemingly pointless line appears to be a hack
!                 # to allow a persistent db to realize the record has changed.
!                 self.wordinfo[word] = record
  
!     def clearjunk(self, oldesttime):
!         """Forget useless wordinfo records.  This can shrink the database size.
  
!         A record for a word will be retained only if the word was accessed
!         at or after oldesttime.
          """
  
!         wordinfo = self.wordinfo
!         tonuke = [w for w, r in wordinfo.iteritems() if r.atime < oldesttime]
!         for w in tonuke:
!             del wordinfo[w]
  
      # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
--- 378,440 ----
          StimesX = S * options.unknown_word_prob
  
  
!         # Now do Robinson's Bayesian adjustment.
!         #
!         #         s*x + n*p(w)
!         # f(w) = --------------
!         #           s + n
!         #
!         # I find this easier to reason about like so (equivalent when
!         # s != 0):
!         #
!         #        x - p
!         #  p +  -------
!         #       1 + n/s
!         #
!         # IOW, it moves p a fraction of the distance from p to x, and
!         # less so the larger n is, or the smaller s is.
  
!         # Experimental:
!         # Picking a good value for n is interesting:  how much empirical
!         # evidence do we really have?  If nham == nspam,
!         # hamcount + spamcount makes a lot of sense, and the code here
!         # does that by default.
!         # But if, e.g., nham is much larger than nspam, p(w) can get a
!         # lot closer to 0.0 than it can get to 1.0.  That in turn makes
!         # strong ham words (high hamcount) much stronger than strong
!         # spam words (high spamcount), and that makes the accidental
!         # appearance of a strong ham word in spam much more damaging than
!         # the accidental appearance of a strong spam word in ham.
!         # So we don't give hamcount full credit when nham > nspam (or
!         # spamcount when nspam > nham):  instead we knock hamcount down
!         # to what it would have been had nham been equal to nspam.  IOW,
!         # we multiply hamcount by nspam/nham when nspam < nham; or, IOOW,
!         # we don't "believe" any count to an extent more than
!         # min(nspam, nham) justifies.
  
!         n = hamcount * spam2ham  +  spamcount * ham2spam
!         prob = (StimesX + n * prob) / (S + n)
  
!         # Update the cache
!         try:
!             self.probcache[spamcount][hamcount] = prob
!         except KeyError:
!             self.probcache[spamcount] = {hamcount: prob}
  
!         return prob
  
!     def update_probabilities(self):
!         """Update the word probabilities in the spam database.
  
!         This computes a new probability for every word in the database,
!         which can be expensive.  learn() and unlearn() clear the
!         probability cache each time, and it is rebuilt lazily as
!         probabilities are looked up.  If for some reason you need to
!         update all the probabilities in one step (say, for
!         benchmarking), you can call this method.
          """
  
!         for word, record in self.wordinfo.iteritems():
!             self.probability(record)
  
      # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
***************
*** 424,439 ****
      # to exploit it.
      def _add_msg(self, wordstream, is_spam):
          if is_spam:
!             self.nspam += 1
          else:
!             self.nham += 1
  
          wordinfo = self.wordinfo
          wordinfoget = wordinfo.get
-         now = time.time()
          for word in Set(wordstream):
              record = wordinfoget(word)
              if record is None:
!                 record = self.WordInfoClass(now)
  
              if is_spam:
--- 459,474 ----
      # to exploit it.
      def _add_msg(self, wordstream, is_spam):
+         self.probcache = {}    # nuke the prob cache
          if is_spam:
!             self.meta.nspam += 1
          else:
!             self.meta.nham += 1
  
          wordinfo = self.wordinfo
          wordinfoget = wordinfo.get
          for word in Set(wordstream):
              record = wordinfoget(word)
              if record is None:
!                 record = self.WordInfoClass()
  
              if is_spam:
***************
*** 441,456 ****
              else:
                  record.hamcount += 1
              # Needed to tell a persistent DB that the content changed.
              wordinfo[word] = record
  
      def _remove_msg(self, wordstream, is_spam):
          if is_spam:
!             if self.nspam <= 0:
                  raise ValueError("spam count would go negative!")
!             self.nspam -= 1
          else:
!             if self.nham <= 0:
                  raise ValueError("non-spam count would go negative!")
!             self.nham -= 1
  
          wordinfo = self.wordinfo
--- 476,494 ----
              else:
                  record.hamcount += 1
+ 
              # Needed to tell a persistent DB that the content changed.
              wordinfo[word] = record
  
+ 
      def _remove_msg(self, wordstream, is_spam):
+         self.probcache = {}    # nuke the prob cache
          if is_spam:
!             if self.meta.nspam <= 0:
                  raise ValueError("spam count would go negative!")
!             self.meta.nspam -= 1
          else:
!             if self.meta.nham <= 0:
                  raise ValueError("non-spam count would go negative!")
!             self.meta.nham -= 1
  
          wordinfo = self.wordinfo
***************
*** 468,472 ****
                      del wordinfo[word]
                  else:
!                     # Needed to tell a persistent DB that the content changed.
                      wordinfo[word] = record
  
--- 506,511 ----
                      del wordinfo[word]
                  else:
!                     # Needed to tell a persistent DB that the content
!                     # changed.
                      wordinfo[word] = record
  
***************
*** 479,483 ****
  
          wordinfoget = self.wordinfo.get
-         now = time.time()
          for word in Set(wordstream):
              record = wordinfoget(word)
--- 518,521 ----
***************
*** 485,490 ****
                  prob = unknown
              else:
!                 record.atime = now
!                 prob = record.spamprob
              distance = abs(prob - 0.5)
              if distance >= mindist:
--- 523,527 ----
                  prob = unknown
              else:
!                 prob = self.probability(record)
              distance = abs(prob - 0.5)
              if distance >= mindist:
***************
*** 496,497 ****
--- 533,537 ----
          # Return (prob, word, record).
          return [t[1:] for t in clues]
+ 
+ 
+ Bayes = Classifier

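The word-probability loop from update_probabilities() has become a
per-record probability() with a two-level cache keyed on
(spamcount, hamcount).  The arithmetic is unchanged; here is a
standalone sketch of it with made-up counts (s and x stand for the
option-derived strength S and unknown_word_prob; the exact option name
behind S is not shown in this hunk):

    s, x = 0.45, 0.5              # assumed strength and background prob
    nham, nspam = 400.0, 200.0    # corpus sizes (floats, as in the code)
    hamcount, spamcount = 10, 3   # counts for one word

    # Graham ratio, stripped of biases and clamping.
    hamratio = hamcount / nham
    spamratio = spamcount / nspam
    p = spamratio / (hamratio + spamratio)

    # Don't believe any count more than min(nham, nspam) justifies
    # (the experimental imbalance adjustment; otherwise both factors
    # are 1.0).
    spam2ham = min(nspam / nham, 1.0)
    ham2spam = min(nham / nspam, 1.0)
    n = hamcount * spam2ham + spamcount * ham2spam

    # Robinson's Bayesian adjustment: f(w) = (s*x + n*p) / (s + n).
    prob = (s * x + n * p) / (s + n)
    print(prob)
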
Index: dbdict.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/dbdict.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** dbdict.py	19 Nov 2002 23:31:44 -0000	1.1
--- dbdict.py	25 Nov 2002 02:29:44 -0000	1.2
***************
*** 1,6 ****
  #! /usr/bin/env python
  
  from __future__ import generators
! import dbhash
  try:
      import cPickle as pickle
--- 1,55 ----
  #! /usr/bin/env python
  
+ """DBDict.py - Dictionary access to dbhash
+ 
+ Classes:
+     DBDict - wraps a dbhash file
+ 
+ Abstract:
+     DBDict class wraps a dbhash file with a reasonably complete set
+     of dictionary access methods.  DBDicts can be iterated like a dictionary.
+     
+     The constructor accepts a class which is used specifically to
+     pickle/unpickle instances of that class.  When an instance of that
+     class is stored, __setitem__ prepends a 'W' to its pickled state
+     (taken from __getstate__), and when __getitem__ encounters that
+     'W', it constructs the class (with no constructor arguments) and
+     executes __setstate__ on the constructed instance.
+ 
+     DBDict accepts an iterskip argument on the constructor.  This is a
+     tuple of hash keys that will be skipped (not seen) during
+     iteration.  This is for iteration only.  Methods such as keys()
+     will return the entire complement of keys in the dbm hash, even if
+     they're in iterskip.  An iterkeys() method is provided for
+     iterating with skipped keys, and itervalues() is provided for
+     iterating values with skipped keys.
+ 
+         >>> d = DBDict('/tmp/goober.db', MODE_CREATE, dict, ('skipme', 'skipmetoo'))
+         >>> d['skipme'] = 'booga'
+         >>> d['countme'] = 'wakka'
+         >>> print d.keys()
+         ['skipme', 'countme']
+         >>> for k in d.iterkeys():
+         ...     print k
+         countme
+         >>> for v in d.itervalues():
+         ...     print v
+         wakka
+         >>> for k,v in d.iteritems():
+         ...     print k,v
+         countme wakka
+ 
+ To Do:
+     """
+ 
+ # This module is part of the spambayes project, which is Copyright 2002
+ # The Python Software Foundation and is covered by the Python Software
+ # Foundation license.
+ 
+ __author__ = "Neale Pickett <neale@woozle.org>, \
+               Tim Stone <tim@fourstonesExpressions.com>"
+ __credits__ = "Tim Peters (author of DBDict class), \
+                all the spambayes contributors."
  from __future__ import generators
! 
  try:
      import cPickle as pickle
***************
*** 8,11 ****
--- 57,72 ----
      import pickle
  
+ import dbhash
+ import errno
+ import copy
+ import shutil
+ import os
+ 
+ MODE_CREATE = 'c'       # create file if necessary, open for readwrite
+ MODE_NEW = 'n'          # always create new file, open for readwrite
+ MODE_READWRITE = 'w'    # open existing file for readwrite
+ MODE_READONLY = 'r'     # open existing file for read only
+ 
+ 
  class DBDict:
      """Database Dictionary.
***************
*** 19,23 ****
      like .keys() still list everything.  For instance:
  
!     >>> d = DBDict('goober.db', 'c', ('skipme', 'skipmetoo'))
      >>> d['skipme'] = 'booga'
      >>> d['countme'] = 'wakka'
--- 80,84 ----
      like .keys() still list everything.  For instance:
  
!     >>> d = DBDict('goober.db', MODE_CREATE, dict, ('skipme', 'skipmetoo'))
      >>> d['skipme'] = 'booga'
      >>> d['countme'] = 'wakka'
***************
*** 30,36 ****
      """
  
!     def __init__(self, dbname, mode, iterskip=()):
          self.hash = dbhash.open(dbname, mode)
!         self.iterskip = iterskip
  
      def __getitem__(self, key):
--- 91,121 ----
      """
  
!     def __init__(self, dbname, mode, wclass, iterskip=()):
          self.hash = dbhash.open(dbname, mode)
!         if iterskip:
!             self.iterskip = iterskip
!         else:
!             self.iterskip = ()
!         self.wclass = wclass
! 
!     def __getitem__(self, key):
!         v = self.hash[key]
!         if v[0] == 'W':
!             val = pickle.loads(v[1:])
!             # We could be sneaky, like pickle.Unpickler.load_inst,
!             # but I think that's overly confusing.
!             obj = self.wclass()
!             obj.__setstate__(val)
!             return obj
!         else:
!             return pickle.loads(v)
! 
!     def __setitem__(self, key, val):
!         if isinstance(val, self.wclass):
!             val = val.__getstate__()
!             v = 'W' + pickle.dumps(val, 1)
!         else:
!             v = pickle.dumps(val, 1)
!         self.hash[key] = v
  
      def __getitem__(self, key):
***************
*** 79,82 ****
--- 164,168 ----
      def itervalues(self):
          return self.__iter__(lambda k: k[1])
+ 
  
  open = DBDict

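A sketch of the new wclass round-trip, using a stand-in class
(classifier.WordInfo is the obvious customer; the old in-hammie DBDict
used it).  The database path and keys here are made up:

    import dbdict

    class WordInfo:
        # Stand-in: anything with __getstate__/__setstate__ and a
        # no-argument constructor gets the 'W' treatment.
        def __init__(self):
            self.spamcount = self.hamcount = 0
        def __getstate__(self):
            return (self.spamcount, self.hamcount)
        def __setstate__(self, t):
            (self.spamcount, self.hamcount) = t

    d = dbdict.DBDict('demo.db', dbdict.MODE_CREATE, WordInfo,
                      ('saved state',))
    w = WordInfo()
    w.spamcount = 3
    d['viagra'] = w               # pickled as 'W' + state tuple
    d['saved state'] = (1, 2)     # anything else pickles as-is
    print(d['viagra'].spamcount)  # -> 3
    for key, val in d.iteritems():
        print(key)                # 'saved state' is skipped
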
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.40
retrieving revision 1.41
diff -C2 -d -r1.40 -r1.41
*** hammie.py	18 Nov 2002 18:13:54 -0000	1.40
--- hammie.py	25 Nov 2002 02:29:44 -0000	1.41
***************
*** 1,56 ****
  #! /usr/bin/env python
  
- # A driver for the classifier module and Tim's tokenizer that you can
- # call from procmail.
- 
- """Usage: %(program)s [options]
- 
- Where:
-     -h
-         show usage and exit
-     -g PATH
-         mbox or directory of known good messages (non-spam) to train on.
-         Can be specified more than once, or use - for stdin.
-     -s PATH
-         mbox or directory of known spam messages to train on.
-         Can be specified more than once, or use - for stdin.
-     -u PATH
-         mbox of unknown messages.  A ham/spam decision is reported for each.
-         Can be specified more than once.
-     -r
-         reverse the meaning of the check (report ham instead of spam).
-         Only meaningful with the -u option.
-     -p FILE
-         use file as the persistent store.  loads data from this file if it
-         exists, and saves data to this file at the end.
-         Default: %(DEFAULTDB)s
-     -d
-         use the DBM store instead of cPickle.  The file is larger and
-         creating it is slower, but checking against it is much faster,
-         especially for large word databases. Default: %(USEDB)s
-     -D
-         the reverse of -d: use the cPickle instead of DBM
-     -f
-         run as a filter: read a single message from stdin, add an
-         %(DISPHEADER)s header, and write it to stdout.  If you want to
-         run from procmail, this is your option.
- """
- 
- from __future__ import generators
- 
- import sys
- import os
- import types
- import getopt
- import mailbox
- import glob
- import email
- import errno
- import anydbm
- import cPickle as pickle
  
  import mboxutils
! import classifier
  from Options import options
  
  try:
--- 1,10 ----
  #! /usr/bin/env python
  
  
+ import dbdict
  import mboxutils
! import Persistent
  from Options import options
+ from tokenizer import tokenize
  
  try:
***************
*** 61,224 ****
  
  
! program = sys.argv[0] # For usage(); referenced by docstring above
! 
! # Name of the header to add in filter mode
! DISPHEADER = options.hammie_header_name
! DEBUGHEADER = options.hammie_debug_header_name
! DODEBUG = options.hammie_debug_header
! 
! # Default database name
! DEFAULTDB = options.persistent_storage_file
! 
! # Probability at which a message is considered spam
! SPAM_THRESHOLD = options.spam_cutoff
! HAM_THRESHOLD = options.ham_cutoff
! 
! # Probability limit for a clue to be added to the DISPHEADER
! SHOWCLUE = options.clue_mailheader_cutoff
! 
! # Use a database? If False, use a pickle
! USEDB = options.persistent_use_database
! 
! # Tim's tokenizer kicks far more booty than anything I would have
! # written.  Score one for analysis ;)
! from tokenizer import tokenize
! 
! class DBDict:
! 
!     """Database Dictionary.
! 
!     This wraps an anydbm to make it look even more like a dictionary.
! 
!     Call it with the name of your database file.  Optionally, you can
!     specify a list of keys to skip when iterating.  This only affects
!     iterators; things like .keys() still list everything.  For instance:
! 
!     >>> d = DBDict('/tmp/goober.db', ('skipme', 'skipmetoo'))
!     >>> d['skipme'] = 'booga'
!     >>> d['countme'] = 'wakka'
!     >>> print d.keys()
!     ['skipme', 'countme']
!     >>> for k in d.iterkeys():
!     ...     print k
!     countme
! 
!     """
! 
!     def __init__(self, dbname, mode, iterskip=()):
!         self.hash = anydbm.open(dbname, mode)
!         self.iterskip = iterskip
! 
!     def __getitem__(self, key):
!         v = self.hash[key]
!         if v[0] == 'W':
!             val = pickle.loads(v[1:])
!             # We could be sneaky, like pickle.Unpickler.load_inst,
!             # but I think that's overly confusing.
!             obj = classifier.WordInfo(0)
!             obj.__setstate__(val)
!             return obj
!         else:
!             return pickle.loads(v)
! 
!     def __setitem__(self, key, val):
!         if isinstance(val, classifier.WordInfo):
!             val = val.__getstate__()
!             v = 'W' + pickle.dumps(val, 1)
!         else:
!             v = pickle.dumps(val, 1)
!         self.hash[key] = v
! 
!     def __delitem__(self, key, val):
!         del(self.hash[key])
! 
!     def __iter__(self, fn=None):
!         k = self.hash.first()
!         while k != None:
!             key = k[0]
!             val = self.__getitem__(key)
!             if key not in self.iterskip:
!                 if fn:
!                     yield fn((key, val))
!                 else:
!                     yield (key, val)
!             try:
!                 k = self.hash.next()
!             except KeyError:
!                 break
! 
!     def __contains__(self, name):
!         return self.has_key(name)
! 
!     def __getattr__(self, name):
!         # Pass the buck
!         return getattr(self.hash, name)
! 
!     def get(self, key, dfl=None):
!         if self.has_key(key):
!             return self[key]
!         else:
!             return dfl
! 
!     def iteritems(self):
!         return self.__iter__()
! 
!     def iterkeys(self):
!         return self.__iter__(lambda k: k[0])
! 
!     def itervalues(self):
!         return self.__iter__(lambda k: k[1])
! 
! 
! class PersistentBayes(classifier.Bayes):
! 
!     """A persistent Bayes classifier.
! 
!     This is just like classifier.Bayes, except that the dictionary is a
!     database.  You take less disk this way and you can pretend it's
!     persistent.  The tradeoffs vs. a pickle are: 1. it's slower
!     training, but faster checking, and 2. it needs less memory to run,
!     but takes more space on the hard drive.
  
!     On destruction, an instantiation of this class will write its state
!     to a special key.  When you instantiate a new one, it will attempt
!     to read these values out of that key again, so you can pick up where
!     you left off.
  
      """
  
-     # XXX: Would it be even faster to remember (in a list) which keys
-     # had been modified, and only recalculate those keys?  No sense in
-     # going over the entire word database if only 100 words are
-     # affected.
- 
-     # XXX: Another idea: cache stuff in memory.  But by then maybe we
-     # should just use ZODB.
- 
-     def __init__(self, dbname, mode):
-         classifier.Bayes.__init__(self)
-         self.statekey = "saved state"
-         self.wordinfo = DBDict(dbname, mode, (self.statekey,))
-         self.dbmode = mode
- 
-         self.restore_state()
- 
-     def __del__(self):
-         #super.__del__(self)
-         self.save_state()
- 
-     def save_state(self):
-         if self.dbmode != 'r':
-             self.wordinfo[self.statekey] = (self.nham, self.nspam)
- 
-     def restore_state(self):
-         if self.wordinfo.has_key(self.statekey):
-             self.nham, self.nspam = self.wordinfo[self.statekey]
- 
- 
- class Hammie:
- 
-     """A spambayes mail filter"""
- 
      def __init__(self, bayes):
          self.bayes = bayes
--- 15,26 ----
  
  
! class Hammie:
!     """A spambayes mail filter.
  
!     This implements the basic functionality needed to score, filter, or
!     train.  
  
      """
  
      def __init__(self, bayes):
          self.bayes = bayes
***************
*** 256,269 ****
          """
  
!         try:
!             return self._scoremsg(msg, evidence)
!         except:
!             print msg
!             import traceback
!             traceback.print_exc()
  
!     def filter(self, msg, header=DISPHEADER, spam_cutoff=SPAM_THRESHOLD,
!                ham_cutoff=HAM_THRESHOLD, debugheader=DEBUGHEADER,
!                debug=DODEBUG):
          """Score (judge) a message and add a disposition header.
  
--- 58,66 ----
          """
  
!         return self._scoremsg(msg, evidence)
  
!     def filter(self, msg, header=None, spam_cutoff=None,
!                ham_cutoff=None, debugheader=None,
!                debug=None):
          """Score (judge) a message and add a disposition header.
  
***************
*** 283,286 ****
--- 80,94 ----
          """
  
+         if header is None:
+             header = options.hammie_header_name
+         if spam_cutoff is None:
+             spam_cutoff = options.spam_cutoff
+         if ham_cutoff is None:
+             ham_cutoff = options.ham_cutoff
+         if debugheader is None:
+             debugheader = options.hammie_debug_header_name
+         if debug is None:
+             debug = options.hammie_debug_header
+ 
          msg = mboxutils.get_message(msg)
          try:
***************
*** 323,327 ****
          """
  
!         self.bayes.learn(tokenize(msg), is_spam, False)
  
      def train_ham(self, msg):
--- 131,135 ----
          """
  
!         self.bayes.learn(tokenize(msg), is_spam)
  
      def train_ham(self, msg):
***************
*** 349,510 ****
          self.train(msg, True)
  
!     def update_probabilities(self):
!         """Update probability values.
  
!         You would want to call this after a training session.  It's
!         pretty slow, so if you have a lot of messages to train, wait
!         until you're all done before calling this.
  
          """
  
!         self.bayes.update_probabilities()
! 
! 
! def train(hammie, msgs, is_spam):
!     """Train bayes with all messages from a mailbox."""
!     mbox = mboxutils.getmbox(msgs)
!     i = 0
!     for msg in mbox:
!         i += 1
!         # XXX: Is the \r a Unixism?  I seem to recall it working in DOS
!         # back in the day.  Maybe it's a line-printer-ism ;)
!         sys.stdout.write("\r%6d" % i)
!         sys.stdout.flush()
!         hammie.train(msg, is_spam)
!     print
! 
! def score(hammie, msgs, reverse=0):
!     """Score (judge) all messages from a mailbox."""
!     # XXX The reporting needs work!
!     mbox = mboxutils.getmbox(msgs)
!     i = 0
!     spams = hams = 0
!     for msg in mbox:
!         i += 1
!         prob, clues = hammie.score(msg, True)
!         if hasattr(msg, '_mh_msgno'):
!             msgno = msg._mh_msgno
!         else:
!             msgno = i
!         isspam = (prob >= SPAM_THRESHOLD)
!         if isspam:
!             spams += 1
!             if not reverse:
!                 print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
!                 print hammie.formatclues(clues)
!         else:
!             hams += 1
!             if reverse:
!                 print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
!                 print hammie.formatclues(clues)
!     return (spams, hams)
! 
! def createbayes(pck=DEFAULTDB, usedb=False, mode='r'):
!     """Create a Bayes instance for the given pickle (which
!     doesn't have to exist).  Create a PersistentBayes if
!     usedb is True."""
!     if usedb:
!         bayes = PersistentBayes(pck, mode)
!     else:
!         bayes = None
!         try:
!             fp = open(pck, 'rb')
!         except IOError, e:
!             if e.errno <> errno.ENOENT: raise
!         else:
!             bayes = pickle.load(fp)
!             fp.close()
!         if bayes is None:
!             bayes = classifier.Bayes()
!     return bayes
! 
! def usage(code, msg=''):
!     """Print usage message and sys.exit(code)."""
!     if msg:
!         print >> sys.stderr, msg
!         print >> sys.stderr
!     print >> sys.stderr, __doc__ % globals()
!     sys.exit(code)
! 
! def main():
!     """Main program; parse options and go."""
!     try:
!         opts, args = getopt.getopt(sys.argv[1:], 'hdDfg:s:p:u:r')
!     except getopt.error, msg:
!         usage(2, msg)
! 
!     if not opts:
!         usage(2, "No options given")
! 
!     pck = DEFAULTDB
!     good = []
!     spam = []
!     unknown = []
!     reverse = 0
!     do_filter = False
!     usedb = USEDB
!     mode = 'r'
!     for opt, arg in opts:
!         if opt == '-h':
!             usage(0)
!         elif opt == '-g':
!             good.append(arg)
!             mode = 'c'
!         elif opt == '-s':
!             spam.append(arg)
!             mode = 'c'
!         elif opt == '-p':
!             pck = arg
!         elif opt == "-d":
!             usedb = True
!         elif opt == "-D":
!             usedb = False
!         elif opt == "-f":
!             do_filter = True
!         elif opt == '-u':
!             unknown.append(arg)
!         elif opt == '-r':
!             reverse = 1
!     if args:
!         usage(2, "Positional arguments not allowed")
! 
!     save = False
  
-     bayes = createbayes(pck, usedb, mode)
-     h = Hammie(bayes)
  
!     for g in good:
!         print "Training ham (%s):" % g
!         train(h, g, False)
!         save = True
  
!     for s in spam:
!         print "Training spam (%s):" % s
!         train(h, s, True)
!         save = True
  
!     if save:
!         h.update_probabilities()
!         if not usedb and pck:
!             fp = open(pck, 'wb')
!             pickle.dump(bayes, fp, 1)
!             fp.close()
  
!     if do_filter:
!         msg = sys.stdin.read()
!         filtered = h.filter(msg)
!         sys.stdout.write(filtered)
  
!     if unknown:
!         (spams, hams) = (0, 0)
!         for u in unknown:
!             if len(unknown) > 1:
!                 print "Scoring", u
!             s, g = score(h, u, reverse)
!             spams += s
!             hams += g
!         print "Total %d spam, %d ham" % (spams, hams)
  
  
  if __name__ == "__main__":
!     main()
--- 157,192 ----
          self.train(msg, True)
  
!     def store(self):
!         """Write out the persistent store.
  
!         This makes sure the persistent store reflects what is currently
!         in memory.  You would want to do this after a write and before
!         exiting.
  
          """
  
!         self.bayes.store()
  
  
! def open(filename, usedb=True, mode='r'):
!     """Open a file, returning a Hammie instance.
  
!     If usedb is False, open as a pickle instead of a DBDict.  mode is
!     used as the flag to open DBDict objects.  'c' for read-write (create
!     if needed), 'r' for read-only, 'w' for read-write.
  
!     """
  
!     if usedb:
!         b = Persistent.DBDictClassifier(filename, mode)
!     else:
!         b = Persistent.PickledClassifier(filename)
!     return Hammie(b)
  
  
  if __name__ == "__main__":
!     # Everybody's used to running hammie.py.  Why mess with success?  ;)
!     import hammiebulk
! 
!     hammiebulk.main()

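For illustration, here is a minimal sketch of the new hammie module API
after this merge.  The database filename and message text are made up
for the example; the function and method names are the ones in the diff
above:

    import sys
    import hammie

    # 'c' opens read-write, creating the store if it doesn't exist yet.
    h = hammie.open("hammie.db", usedb=True, mode='c')
    h.train_spam("From: x@example.com\n\nBuy stuff!")  # made-up message
    h.store()                       # flush the training to disk

    # Later, score and annotate a message read from stdin:
    h = hammie.open("hammie.db", usedb=True, mode='r')
    print h.filter(sys.stdin.read())
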
Index: hammiefilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** hammiefilter.py	18 Nov 2002 18:14:04 -0000	1.2
--- hammiefilter.py	25 Nov 2002 02:29:44 -0000	1.3
***************
*** 52,95 ****
      sys.exit(code)
  
! def jar_pickle(h):
!     if not options.persistent_use_database:
!         import pickle
!         fp = open(options.persistent_storage_file, 'wb')
!         pickle.dump(h.bayes, fp, 1)
!         fp.close()
!     
! 
! def hammie_open(mode):
!     b = hammie.createbayes(options.persistent_storage_file,
!                            options.persistent_use_database,
!                            mode)
!     return hammie.Hammie(b)
  
! def newdb():
!     h = hammie_open('n')
!     jar_pickle(h)
!     print "Created new database in", options.persistent_storage_file
  
! def filter():
!     h = hammie_open('r')
!     msg = sys.stdin.read()
!     print h.filter(msg)
  
! def train_ham():
!     h = hammie_open('w')
!     msg = sys.stdin.read()
!     h.train_ham(msg)
!     h.update_probabilities()
!     jar_pickle(h)    
  
! def train_spam():
!     h = hammie_open('w')
!     msg = sys.stdin.read()
!     h.train_spam(msg)
!     h.update_probabilities()
!     jar_pickle(h)    
  
  def main():
!     action = filter
      opts, args = getopt.getopt(sys.argv[1:], 'hngs')
      for opt, arg in opts:
--- 52,91 ----
      sys.exit(code)
  
! class HammieFilter(object):
!     def __init__(self):
!         options = Options.options
!         options.mergefiles(['/etc/hammierc',
!                             os.path.expanduser('~/.hammierc')])
!         
!         self.dbname = options.hammiefilter_persistent_storage_file
!         self.dbname = os.path.expanduser(self.dbname)
!         self.usedb = options.hammiefilter_persistent_use_database
!         
  
!     def newdb(self):
!         h = hammie.open(self.dbname, self.usedb, 'n')
!         h.store()
!         print "Created new database in", self.dbname
  
!     def filter(self):
!         h = hammie.open(self.dbname, self.usedb, 'r')
!         msg = sys.stdin.read()
!         print h.filter(msg)
  
!     def train_ham(self):
!         h = hammie.open(self.dbname, self.usedb, 'c')
!         msg = sys.stdin.read()
!         h.train_ham(msg)
!         h.store()
  
!     def train_spam(self):
!         h = hammie.open(self.dbname, self.usedb, 'c')
!         msg = sys.stdin.read()
!         h.train_spam(msg)
!         h.store()
  
  def main():
!     h = HammieFilter()
!     action = h.filter
      opts, args = getopt.getopt(sys.argv[1:], 'hngs')
      for opt, arg in opts:
***************
*** 97,114 ****
              usage(0)
          elif opt == '-g':
!             action = train_ham
          elif opt == '-s':
!             action = train_spam
          elif opt == "-n":
!             action = newdb
! 
!     # hammiefilter overrides
!     config_overrides = """[Hammie]
! persistent_storage_file = %s
! persistent_use_database = True
! """ % os.path.expanduser('~/.hammiedb')
!     options.mergefilelike(StringIO.StringIO(config_overrides))
!     options.mergefiles(['/etc/hammierc',
!                         os.path.expanduser('~/.hammierc')])
  
      action()
--- 93,101 ----
              usage(0)
          elif opt == '-g':
!             action = h.train_ham
          elif opt == '-s':
!             action = h.train_spam
          elif opt == "-n":
!             action = h.newdb
  
      action()

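The getopt handling above boils down to "pick a bound method, then call
it".  A standalone sketch of that dispatch pattern, with the class and
option names invented purely for illustration:

    import sys, getopt

    class Tool(object):
        def filter(self):
            print "filtering stdin"
        def train_ham(self):
            print "training on ham"

    def main():
        t = Tool()
        action = t.filter               # default action
        opts, args = getopt.getopt(sys.argv[1:], 'g')
        for opt, arg in opts:
            if opt == '-g':
                action = t.train_ham    # bound methods are first-class
        action()

    if __name__ == "__main__":
        main()
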
Index: pop3proxy.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -C2 -d -r1.18 -r1.19
*** pop3proxy.py	20 Nov 2002 22:41:50 -0000	1.18
--- pop3proxy.py	25 Nov 2002 02:29:44 -0000	1.19
***************
*** 119,123 ****
  import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect
  import socket, asyncore, asynchat, cgi, urlparse, webbrowser
! import Bayes, tokenizer, mboxutils
  from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory
  from email.Iterators import typed_subpart_iterator
--- 119,123 ----
  import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect
  import socket, asyncore, asynchat, cgi, urlparse, webbrowser
! import Persistent, tokenizer, mboxutils
  from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory
  from email.Iterators import typed_subpart_iterator
***************
*** 819,822 ****
--- 819,825 ----
          stateDict = state.__dict__
          stateDict.update(state.bayes.__dict__)
+         # so the property() isn't as cool as we thought.  -ntp
+         stateDict['nham'] = state.bayes.nham
+         stateDict['nspam'] = state.bayes.nspam
          body = (self.pageSection % ('Status', self.summary % stateDict)+
                  self.pageSection % ('Train on proxied messages', self.review)+
***************
*** 1119,1123 ****
  # This keeps the global state of the module - the command-line options,
  # statistics like how many mails have been classified, the handle of the
! # log file, the Bayes and FileCorpus objects, and so on.
  class State:
      def __init__(self):
--- 1122,1126 ----
  # This keeps the global state of the module - the command-line options,
  # statistics like how many mails have been classified, the handle of the
! # log file, the Classifier and FileCorpus objects, and so on.
  class State:
      def __init__(self):
***************
*** 1162,1167 ****
  
          # Load up the other settings from Option.py / bayescustomize.ini
!         self.databaseFilename = options.persistent_storage_file
!         self.useDB = options.persistent_use_database
          self.uiPort = options.html_ui_port
          self.launchUI = options.html_ui_launch_browser
--- 1165,1170 ----
  
          # Load up the other settings from Option.py / bayescustomize.ini
!         self.databaseFilename = options.pop3proxy_persistent_storage_file
!         self.useDB = options.pop3proxy_persistent_use_database
          self.uiPort = options.html_ui_port
          self.launchUI = options.html_ui_launch_browser
***************
*** 1200,1206 ****
              self.databaseFilename = '_pop3proxy_test.pickle'   # Never saved
          if self.useDB:
!             self.bayes = Bayes.DBDictBayes(self.databaseFilename)
          else:
!             self.bayes = Bayes.PickledBayes(self.databaseFilename)
          print "Done."
  
--- 1203,1209 ----
              self.databaseFilename = '_pop3proxy_test.pickle'   # Never saved
          if self.useDB:
!             self.bayes = Persistent.DBDictClassifier(self.databaseFilename)
          else:
!             self.bayes = Persistent.PickledClassifier(self.databaseFilename)
          print "Done."
  
***************
*** 1227,1232 ****
  
              # Create the Trainers.
!             self.spamTrainer = Bayes.SpamTrainer(self.bayes)
!             self.hamTrainer = Bayes.HamTrainer(self.bayes)
              self.spamCorpus.addObserver(self.spamTrainer)
              self.hamCorpus.addObserver(self.hamTrainer)
--- 1230,1235 ----
  
              # Create the Trainers.
!             self.spamTrainer = Persistent.SpamTrainer(self.bayes)
!             self.hamTrainer = Persistent.HamTrainer(self.bayes)
              self.spamCorpus.addObserver(self.spamTrainer)
              self.hamCorpus.addObserver(self.hamTrainer)

--- Bayes.py DELETED ---
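
For anyone tracking the rename: the persistence classes from the
deleted Bayes.py now live in Persistent.py.  A minimal sketch of the
new spellings, with a hypothetical database filename:

    import Persistent

    # anydbm-backed store (was Bayes.DBDictBayes):
    bayes = Persistent.DBDictClassifier("hammie.db")
    # pickle-backed store (was Bayes.PickledBayes):
    # bayes = Persistent.PickledClassifier("hammie.pik")

    # Trainers observe a corpus and learn from messages added to it:
    spamTrainer = Persistent.SpamTrainer(bayes)
    hamTrainer = Persistent.HamTrainer(bayes)
    # e.g. spamCorpus.addObserver(spamTrainer), as pop3proxy does above.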