From anadelonbrin at users.sourceforge.net Thu Apr 3 15:30:38 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Thu Apr 3 18:30:44 2003
Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.66,1.67
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv3792
Modified Files:
pop3proxy.py
Log Message:
Expire messages from the unknown cache as well as the ham
and spam caches. Also spin off threads to expire messages
(if necessary) each time a client connects to the proxy.
Index: pop3proxy.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v
retrieving revision 1.66
retrieving revision 1.67
diff -C2 -d -r1.66 -r1.67
*** pop3proxy.py 12 Mar 2003 03:29:58 -0000 1.66
--- pop3proxy.py 3 Apr 2003 23:30:34 -0000 1.67
***************
*** 141,144 ****
--- 141,145 ----
import socket, asyncore, asynchat, cgi
import mailbox, email.Header
+ from thread import start_new_thread
from email.Iterators import typed_subpart_iterator
import spambayes
***************
*** 391,394 ****
--- 392,399 ----
make multiple calls, or will cope with the headers being
different.
+
+ o USER:
+ o Does no processing based on the USER command itself, but
+ expires any old messages in the three caches.
"""
***************
*** 396,400 ****
POP3ProxyBase.__init__(self, clientSocket, serverName, serverPort)
self.handlers = {'STAT': self.onStat, 'LIST': self.onList,
! 'RETR': self.onRetr, 'TOP': self.onTop}
state.totalSessions += 1
state.activeSessions += 1
--- 401,406 ----
POP3ProxyBase.__init__(self, clientSocket, serverName, serverPort)
self.handlers = {'STAT': self.onStat, 'LIST': self.onList,
! 'RETR': self.onRetr, 'TOP': self.onTop,
! 'USER': self.onUser}
state.totalSessions += 1
state.activeSessions += 1
***************
*** 571,574 ****
--- 577,589 ----
return self.onRetr(command, args, response)
+ def onUser(self, command, args, response):
+ """Spins off three separate threads that expires any old messages
+ in the three caches, but does not do any processing of the USER
+ command itself."""
+ start_new_thread(state.spamCorpus.removeExpiredMessages, ())
+ start_new_thread(state.hamCorpus.removeExpiredMessages, ())
+ start_new_thread(state.unknownCorpus.removeExpiredMessages, ())
+ return response
+
def onUnknown(self, command, args, response):
"""Default handler; returns the server's response verbatim."""
***************
*** 1298,1304 ****
'[0123456789]*', cacheSize=20)
! # Expire old messages from the trained corpuses.
self.spamCorpus.removeExpiredMessages()
self.hamCorpus.removeExpiredMessages()
# Create the Trainers.
--- 1313,1324 ----
'[0123456789]*', cacheSize=20)
! # Given that (hopefully) users will get to the stage
! # where they do not need to do any more regular training to
! # be satisfied with spambayes' performance, we expire old
! # messages from not only the trained corpora, but the unknown
! # as well.
self.spamCorpus.removeExpiredMessages()
self.hamCorpus.removeExpiredMessages()
+ self.unknownCorpus.removeExpiredMessages()
# Create the Trainers.
From anadelonbrin at users.sourceforge.net Thu Apr 3 16:00:53 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Thu Apr 3 19:00:56 2003
Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.67,1.68
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv13747
Modified Files:
pop3proxy.py
Log Message:
Oops. Fix bug from the last commit.
Index: pop3proxy.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v
retrieving revision 1.67
retrieving revision 1.68
diff -C2 -d -r1.67 -r1.68
*** pop3proxy.py 3 Apr 2003 23:30:34 -0000 1.67
--- pop3proxy.py 4 Apr 2003 00:00:48 -0000 1.68
***************
*** 1309,1313 ****
options.pop3proxy_ham_cache,
'[0123456789]*', cacheSize=20)
! self.unknownCorpus = FileCorpus(factory,
options.pop3proxy_unknown_cache,
'[0123456789]*', cacheSize=20)
--- 1309,1313 ----
options.pop3proxy_ham_cache,
'[0123456789]*', cacheSize=20)
! self.unknownCorpus = ExpiryFileCorpus(age, factory,
options.pop3proxy_unknown_cache,
'[0123456789]*', cacheSize=20)
From anadelonbrin at users.sourceforge.net Thu Apr 3 17:11:59 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Thu Apr 3 20:12:04 2003
Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.68,1.69
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv3132
Modified Files:
pop3proxy.py
Log Message:
Add 'show clues' button to the review messages page
as requested by Paul Moore.
Index: pop3proxy.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v
retrieving revision 1.68
retrieving revision 1.69
diff -C2 -d -r1.68 -r1.69
*** pop3proxy.py 4 Apr 2003 00:00:48 -0000 1.68
--- pop3proxy.py 4 Apr 2003 01:11:57 -0000 1.69
***************
*** 906,909 ****
--- 906,911 ----
row.subject.href="view?key=%s&corpus=%s" % (key, label)
row.from_ = messageInfo.fromHeader
+ subj = cgi.escape(messageInfo.subjectHeader)
+ row.classify.href="showclues?key=%s&subject=%s" % (key, subj)
setattr(row, 'class', ['stripe_on', 'stripe_off'][stripe]) # Grr!
row = str(row).replace('TYPE', label).replace('KEY', key)
***************
*** 1084,1092 ****
self._writePostamble()
! def onClassify(self, file, text, which):
! """Classify an uploaded or pasted message."""
! message = file or text
! message = message.replace('\r\n', '\n').replace('\r', '\n') # For Macs
!
cluesTable = self.html.cluesTable.clone()
cluesRow = cluesTable.cluesRow.clone()
--- 1086,1090 ----
self._writePostamble()
! def _buildCluesTable(self, message, subject=None):
cluesTable = self.html.cluesTable.clone()
cluesRow = cluesTable.cluesRow.clone()
***************
*** 1099,1103 ****
results = self.html.classifyResults.clone()
results.probability = probability
! results.cluesBox = self._buildBox("Clues:", 'status.gif', cluesTable)
results.classifyAnother = self._buildClassifyBox()
self._writePreamble("Classify")
--- 1097,1125 ----
results = self.html.classifyResults.clone()
results.probability = probability
! if subject is None:
! heading = "Clues:"
! else:
! heading = "Clues for: " + subject
! results.cluesBox = self._buildBox(heading, 'status.gif', cluesTable)
! return results
!
! def onShowclues(self, key, subject):
! """Show clues for a message - linked from the Review page."""
! self._writePreamble("Message clues", parent=('review', 'Review'))
! message = state.unknownCorpus.get(key).getSubstance()
! message = message.replace('\r\n', '\n').replace('\r', '\n') # For Macs
! if message:
! results = self._buildCluesTable(message, subject)
! del results.classifyAnother
! self.write(results)
! else:
! self.write("Can't find message %r. Maybe it expired.
" % key)
! self._writePostamble()
!
! def onClassify(self, file, text, which):
! """Classify an uploaded or pasted message."""
! message = file or text
! message = message.replace('\r\n', '\n').replace('\r', '\n') # For Macs
! results = self._buildCluesTable(message)
results.classifyAnother = self._buildClassifyBox()
self._writePreamble("Classify")
From anadelonbrin at users.sourceforge.net Thu Apr 3 17:12:00 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Thu Apr 3 20:12:05 2003
Subject: [Spambayes-checkins] spambayes/spambayes/resources ui.html, 1.6,
1.7 ui_html.py, 1.6, 1.7
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes/resources
In directory sc8-pr-cvs1:/tmp/cvs-serv3132/spambayes/resources
Modified Files:
ui.html ui_html.py
Log Message:
Add 'show clues' button to the review messages page
as requested by Paul Moore.
Index: ui.html
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/ui.html,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** ui.html 13 Mar 2003 03:24:57 -0000 1.6
--- ui.html 4 Apr 2003 01:11:57 -0000 1.7
***************
*** 224,227 ****
--- 224,228 ----
id="spam" value='spam'/>
+ Show clues |
| | |
Index: ui_html.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/ui_html.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** ui_html.py 13 Mar 2003 03:24:58 -0000 1.6
--- ui_html.py 4 Apr 2003 01:11:57 -0000 1.7
***************
*** 5,80 ****
import zlib
! data = zlib.decompress("x[{oܸ\021\016<\025l]Iz]&S\037\\\\{p\010\022wW$\0226ޙ!)Q\017;\
! \036\000\004^H\016yfHϾp>]߿c?y9\016㿞\027\013b49bge\
! bd<\013jViBGb\010\026Gg\036=ym\005ߊP})Q,y(fccJ\
! \023[LM'\030/b\034r\023\001[^8,*Φl~+\0270\016R\026)m\021+\
! >M\016އ2Ŕf\002?!5}\014\026#ky\017j^M^LKl\033\024ㆹÍWkq!z\
! bC3d'O4w\0052dûU!,NYΣ(Vg/D,ӯbB\026(\016\
! ̡)`L-\0229o!i+M\014\021e\017\001NNH%5,JR(\016Y\
! B\000hl)B<ךG|;l\023Gj\015z;\0209\006\035\031.K.&\003|\002`C7p\
! e4L\017$'BrhAlPA'j:7\032\004ESp\035sQfz?[)Bg:V\025x\
! !9<;KO6\033[W1\033\033\003$D8;\024,Y<\001i)s\010p\015P^0\
! \025+oFx3ī2(\0218x-\024\000\012-jf\"\013vdy!*T\026\
! P:P\002cB,a\002\007W{6Ӏe4:eU\017d8\021\007l\026H\0045\027Eb\031gx\026,gRD\
! Kj0b6bطnE\001w\034^߃חU\0111^#(~\"\010\005 xP\005\017QabSY\
! 'U$*Sq\012Jh\013\002\000$[wQ\020'D[jkX\037%rv\010X\030\015\012\023\
! 걡LsKXHw`F,K\037\0060w\037G\003\"C\033\034Xi\033LB\
! I\035\007k,cp#M㒇k-Z4*\023\"*Q6\013gEeF\0218\014+: y/\004,\020'e\
! [\006a\003F8w\0255R]i\020'kndqGA\036v5ֽX-\003\004\002\004e\027 _\0310\
! 4\031\033Y^)26\027m@<и\022uПyG\023olL3\006\\\034a5*\004۶*xrl\
! Z=Z%=cjjז\030lg\031HW\003S7j_|摧?x\001\002:\007zY(ga;\
! CggL\"3\016W(o\037\002\030w\024\020\0105\014Q\022\036\022ݕL^_Abfw\007\023u:\
! \026?\010ctq\034$\021Q5zkD\027dY.i\014z\037Vo%i1m)\023U\014AVk\
! ǠU\002|%\0078\027\022Xy{!V~\000M\0370Y\0117\036TS[\0262eeXp\005,l\
! o}E\030~\004]!\004x\000C\023\027[U\031Hyț\037\020儓#1r-:`\012&vlJ\023ߐk\
! p٪
W$S\004Mn;B:$/`)jnP\022\030 l<߈\004%v^\027X\033aA\017\011|\
! Ďq1z\006\002\005Yn\000Q\006Ó\003LgI#|\003\\P$\011\0141\033`\006\
! ӒV3zp3\0300\024M\010\010(1(9E\020\034G\003HO\\\017Х\
! %kY*RL3ć0-t0\036|5\025tZ/g\0042\014\021y\004\014=\011h\010:C!vaWQ\016!+BtU9.\0112\010]4uVO;s m1W\
! !\016'\0246\026tz\006ߚB+\033Wq;M,S\016\003H仼'\021u\002\
! \006D@䌝BܖJ\015h\0017\025\003=\037\003E\025+''\004~~R\032۰\
! ij*,\034\024I]'PDi<\025!%um\017\037/\002\016eCBEHE3\0077\
! YGlȲ#1\013E\015#\035݁Z\001,@8\025\011ct\\oJ!\026\017\027l\003!ܞExGK\
! \003`f\002Y,\001\026\013LA]\025rU\0149R!\037\007\010\013\024\017*7\031#\030=\024Iw\
! @c2K\020\006\017A\027q\031\"bcF\023\001F%`j\0114CZIphRC\007@\002l]*s\
! R1\001\002\037/\004\036\035G\021\0253\037(A\014\010̣NkO\020@x48\\\011\003\036\
! \005\026>uCZ\034\033\022?T\0375KVq@Q\\\"@ȩ{@vpR\
! 2>@ԹH[hܰs,L\\xO\0272\011fi͔\011\
! {8\016 KS\023/\036dU\012x|s~3JD\\,8cOm)֟uti\033\
! ۋlF\004ⷿowߪ\022\0208\025\010)y;==\017\034BvEgg\004[7\
! \000\022\006iJ$gtGJ\033QC\0304Ar>`\022\003kj\036xM\
! Aʲwp;nW'v\021ڮ\016\003\023nYT\022\000X\006^\007\0010Z/\001Qo\013\
! >ln\035Qsh8~\030xM\033\027\000\"*o^`)v5j\024\026>\012J\020ۈ\016|\012\
! *\030\033\014>\017zf-@4\001{f\017o=;CI`Ezs ao/\033̍:8\017\
! f\012\031}ط\023\006\026>zM\034c,\002jfAo\000\0160OIQ\030\036b\015{U\
! ^(/x\024K\033%ld\004q8!VC[;:x;@xJ,`jteQ\
! \001wE³;4\001\027>%;PA_;aOV\026w%k`Ú\037C_R+DQ\010\
! ͫ\036\011y\010\004(ZRZ%*yÈ+n]eL\005tMq8,>x\021\000IKz}\
! OOﭪg#I\020#\017\005\035B\010J>6k\031\022 ,L**@O\011ԭ?\"l\024e\007܀=\
! 3\037O`_LOwvLRGC{\006\007差-魿ՓԬBmYe=\035\
! ='Zj?BۅC\013䌎R\030e\036:%8sZ\012{\003P\037D\\\002@&J\024a;\
! \033SWW~_YIsNJ;d݅a\000\020՜hH]\022\"u:x\
! f8֫1{zG-\022)4\036vXcF_\021\004JVqU\016}f\013\0050\002\034֕N\
! \024BYᅛG\0155\027/_=B躐\013!\0232KRC}\013hFa\014\011\0318\
! \013&}G&bF\020\021\036\021sSb:S,\020cD;dČIL\036Wt\016)N@\
! ԃ9:`I|'\026,Y}^jΔ-\0206\007>x\031xDL\011ܡ\
! }SZ\004\033!@VFȚ*l6\033}\004MН^>\0146`b\
! .G\025@\021ku{uv\036]\012\0202\007u\016Ca\031K\000\004KP\034L^\012(C,\
! \004\000et\010d(\002\025}GT7/j\032\007ɂNc~\\seџysi\
! _yw\031>@\03580߸\025\034*\030\013\003=Rv-2t4f4\013x;70W/\016\0064\013\013\025e\016\
! 渫F\002\014i\037WEB\016Q9&\0224\024P\012~\035@b)BA7Gn1\
! >W~TRA\032Y\035[cq(-\027^k\"\005\002k\013\024B\003\013\004{Wui\021&\
! >b/T$7PJ2tzxxtqk\021\030b2ZlfJk\025{E\026\004YJZ\033 ~\
! fl\022\004}&gC<3\030\015KS\010\026\020o\024^\013snIŐ\035*^^f)!;\
! \027ɋ1;LG\\JxٌJ;H\026\000~;xZYfk\010XiPu\000a\
! G#jSx\000טM1\001̈́=\007\0112;\030\013[\0225SN\000MV\005\002J=")
### end
--- 5,81 ----
import zlib
! data = zlib.decompress("x[{o\033\021\016-Q[$Ңԇ&\027ֵ8\034\016\001$ֻ>,}\014]Ê\
! C\037@.p8\0147\037.f?\\w7ᄏ`p׳rv\033^\014F'l$\
! T\011ë&\"S\020?\013YDp7/Ky<[\037rBdK\020!u1y/s\
! \025n'TI1f\037=c<<:bF\021[A\024rY̳Ll~a=ab W\
! |\034\036/T1\010~\026\017\022V-'Z\001U\032\030-6X\000C\")\0307\035o\\ǹB\
! ņftIO\036i\006s$4ɜ/W*\020fZ\004tR\0362YM_V_#\004\032'/ϿU?s\";\
! .T\012M#U$C6f_*\012\025w\022߸\020\010wvZ'/!\026yWBW\002m+\036\
! %dZ\016bgZ2oGmdXA\017^>s\007\"@%r輇\017W \025oP\033\
! ƛ(}4{\011T\016-*Tӳ__MZvPt:\"(2*Oc)S\
! \\yE[~/к8\034JBD_r\001\004%\022C\0221Ʌb\
! 3V;Y\010\005\036v\005\006RFM\026*\024A\005=\035P,e\002/ւEJ\037$<^\012\021i\
! \022jI-%F\014i#}[Nd|\007\003x}U@*Ar\034AKO\027B(\004D\007c\005*6L\014~*i\
! TF^\024DV&Q)\026nA\020@h.\012h\027k^45s,Ǐ);^F,\014\006S`(\012\
! aYc\027*NU\002aqZ7\"\012߁1\032!2\027\003?O\015`A\003\"c5\033\034Xi\033L\
! =\007k*%8فq\027k-J4*\021\"Q6sgE%F\021O\031Z\013:\"y\005,\020'e7\
! b\015p\007t\007a*k\0124\"Njݨ잂<ԽBY#\004b\0116/@B24a\
! i268IZ\026d\016u\027\025T\000\001@RT\036C6NFИVg\01469g[U:ªU\010m8V\
! qchuޫ\021\\[\012b]gl#]\005|ݨS\015G}#O1\002\014\005t\0162+9\
! \016MJEr\031g\0351/+C\010`<<8ܓR\026@f`\012\016d\014\012\022\033Իk7>x\036\025\
! _m.E\030\014e\034.\006D\015]y\022_\\e!Y'~\030[SƴѦO\022x-\
! A+\007EK\016p\025@#Y\0014\011T'V_{PMm勌\027Ϥf\
! \013|$ #\0302\025E\003\030\000?Z\012@ZC-PN89\022#B\024G\011\035`7\
! \032p*q\025\024!&bSۖK(\033\00446[&xm;7\"\002o]\004ZXC\001\037\
! :z=y^̀P Cw7I-Pp\027\020 \012ߠwuxr\000i6io}\020Q\004#\027*`\002\
! ӒV5zp\012\030f\020\010t(\022\024\002\"\010\016B#u\001G.\007\
! \0225\013^ƙf\017a\014\021/}`xȧh\026\015+h\006_\010e\030$#x*/1Wn\024h\
! OM\007?@z`Ih\012V'\015\024ǀ57{;SG訃wҋ\025\005\007\005.,\010#.buAv\003\027\
! YAl\006{hޡjD\031>{&/zB]\0256?禂wM{*x^B\002-F\
! \0177g:@'\001\031MAk\015\027<&ypn/%gD\010f\025\005\023E \035B\003Wa\
! z\0035LA E|2\033wjN\004S\005.\004]izi\012UhYb\006;r)\
! 1X9IRƘ+3\026\037\036v\035\037DkXI/k+ЅISk%5\007rޤ\034s\
! \035nhqJQ+cAg)\022q\030{T?Rv9d[\017\"ڝG\013\023\
! 4 zL\006:gl-\0376/D\014\016\026pS13Y;P\004\\XA\030rrB`G?'ui\006\011;\
! hFf\014^;PYB>\020\0120'\033bEHzI\035+\003s@CU^#+\021Rь\
! -}VQ$$*sBQhHGw Va\000\013%\020\016uE\030\035&[R胫\005@j\010\027kA\
! f)\0010E\001X\002,\026\023**\011\031R!\037\007\010K\024\017*0\031#\030=\024I\
! w@c2K\020\032\017A2_,dCF\023\001F%`j\0114\027\032z%\016\034\005\036+\004T\
! ԥcX\003(\005?^\0347\011>%=Z!\025\017RT \006lNk;\010$z;\\\011\003\036\
! \001\026s\027:A^cYX\016V\015\011+O\006JH8P\0378q\015Т]>\\\024\
! c{\014< x\013\0201u\006Z-w.\013\003o`8|LY&Z3U}\
! \001C(H9g\024&>%;l\001\034\017=Κo\006HV\005SvzX\017Ԕ\"i5;\
! s&ل\011o{U9 p*`\003\021\034y;=\003\037\034BvE);V\015\
! /D~5s\022ɴPFd\027\020\006\016\016\017?&H\007\031̟g\0154_9\030\016N6\
! HYN7\037f~4|=Bap\"]91J\002\000\013Z3< \036\023F$8m\
! '-ۍ#*p\016j\024\015\033\026qmS>\037AQ\005\013w\032=~\005%mD\007>\005\
! \025D\0064\017zf̓@\001{fo=;CI`Izs ao/\032̭\030;8\017\
! f2o\017['X\015,}[XKe\022a`537\000\007'$T(;T\014l_`\015\";U\
! N(x(`\022c2?]\037P\012^\016&P:\005Lܐ\016Y\006(\035u\030\
! gS\002v\026sp'N\013G9nG@\003(,px\017#o:f\
! \037Ztd\021\006\027Q<\005jO\\^\017ﭳZP)ux\036:G=G[w\0248F\015dviGe\
! VF|.\"/x\037\016g~vٜg\007\0147oW\036B&\030@m\021|ǘ^\023'MY(m\
! ^`u\037\"]8\031`\031\016rVDSl\026}]0\013mˢ\036mG\022Vgf=Y\
! >\015&rΔo)(9'uҜc哓\015!{Z[z_Du7ty\032\036\
! ~8\030,/Tfw٫\002\013RӶ\032Պc\0130Q~pQf\031A\014sܳ\036G̚H\
! P@)U\000ŨSn,ݾC]d/Qⵡ:5:Q; \032WD\013\0048c\003\
! dB\003\013\004;Wui!&b//.PmV_hμ#0T8ַ\
! .-|\023d\0128\007w3c X3\032=;)8ǰfC\000_όo\000\032J?k\
! 2瞕\000X\005U\002`g#vb;eѩ1(pl\006pa\000ە\
! ں4-&\021D\0070\000Î2ܦ\000Rb\002\011{\016\0228gv0\026\0125&%S!\000\
! \033(Ǘ\004\015B\006h")
### end
From anadelonbrin at users.sourceforge.net Sun Apr 6 18:15:53 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Sun Apr 6 20:15:57 2003
Subject: [Spambayes-checkins] spambayes FAQ.txt,NONE,1.1
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv13697
Added Files:
FAQ.txt
Log Message:
Frequently asked questions about the spambayes project.
--- NEW FILE: FAQ.txt ---
Q: Hey! Why don't you implement cool tokenizer trick X? I think it
would really foil those spammers!
A: Have you run your tokenizer trick against a set of messages to see if
it actually works? Many times what seems like a good idea turns out
not to help much, and sometimes even hurts. If you have a good idea,
you've run it against a batch of messages and can prove that it
helps, paste the code for your technique and the proof to the mailing
list. If you're not a coder, but are really keen on your idea, post
a feature request on the project page, and wait for someone else to
code it for you (but make sure you do some testing when it's done).
Otherwise, you will likely get a message from Tim Peters about
why you need to test your idea :)
Q: I just got a spam, but the system said it was "unsure". Why
couldn't it tell that it was spam - it's obvious?
A: It may be obvious to you, but the classifier only works on
the information it has been given. Maybe this is "new" (you've
never seen this particular flavour of spam before), or maybe
there aren't enough clues in the message which the system is
aware of as strong spam clues.
Q: OK, I trained on that message. But I just got *another* one,
and the stupid system still thinks it's unsure. Why did it
ignore me???
A: It didn't, but you may need to train on a few more of this type
of message to get it classified as "spam". The classification
algorithm weights its results based on the number of times it
has seen a particular clue, so that clues unique to this type
of message may need a few more instances to become "convincing".
From anadelonbrin at users.sourceforge.net Sun Apr 6 18:53:48 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Sun Apr 6 20:53:51 2003
Subject: [Spambayes-checkins] spambayes FAQ.txt,1.1,1.2
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv27837
Modified Files:
FAQ.txt
Log Message:
Add FAQ about training from scratch.
Index: FAQ.txt
===================================================================
RCS file: /cvsroot/spambayes/spambayes/FAQ.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** FAQ.txt 7 Apr 2003 00:15:51 -0000 1.1
--- FAQ.txt 7 Apr 2003 00:53:45 -0000 1.2
***************
*** 28,29 ****
--- 28,41 ----
has seen a particular clue, so that clues unique to this type
of message may need a few more instances to become "convincing".
+
+ Q: I've mucked up my training and I want to start all over again,
+ but there isn't an option for this anywhere. What do I do?
+ A: Because training from scratch is a very rare occurrence, and because
+ deleting all your training information is something you don't want
+ to do by accident, there isn't an option for this. However, you
+ can quite simply do this manually. All the training data is stored
+ in a file, usually called hammie.db, and if you delete (or rename)
+ this, then you will start training from scratch. If you are using
+ the web interface for the POP3 proxy, the configuration page tells
+ you what this file is called (and where it is) down towards the
+ bottom of the page.
From anadelonbrin at users.sourceforge.net Mon Apr 7 01:26:29 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Mon Apr 7 03:26:38 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,NONE,1.1
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv12257
Added Files:
imapfilter.py
Log Message:
First steps towards an IMAP spambayes solution.
Currently will do (very) basic filtering and training.
--- NEW FILE: imapfilter.py ---
#!/usr/bin/env python
"""An IMAP filter. An IMAP message box is scanned and all non-scored
messages are scored and (where necessary) filtered.
It is suggested that this filter is set to run at certain intervals.
Note that it is (currently) fairly slow, so this should not be too
often. An alternative to this would be to keep the filter running
and logged in, and periodically check for new mail.
The original filter design owed much to isbg by Roger Binns
(http://www.rogerbinns.com/isbg).
"""
# This module is part of the spambayes project, which is Copyright 2002-3
# The Python Software Foundation and is covered by the Python Software
# Foundation license.
__author__ = "Tony Meyer "
__credits__ = "All the Spambayes folk."
# This code will benefit immensely from
# (a) The new message class, which can hold information such as
# whether a message has been seen before
# (b) The new header stuff, which will abstract out adding all
# the headers
# Compatibility shim: referencing the names True and False raises NameError
# when they are not defined; in that case bind them to the integers 1 and 0
# so the rest of the module can use them.
try:
True, False
except NameError:
# Maintain compatibility with Python 2.2
True, False = 1, 0
import imaplib
import os
import re
import socket
import sys
import time

from spambayes.Options import options
from spambayes import tokenizer, storage
class IMAPFilter(object):
    """Train on, score and file the messages held on an IMAP server.

    All configuration (server, credentials, folder names, header names and
    cutoffs) is read from the shared ``options`` object.  Typical use is
    Login(), Train(), Filter(), Logout(), in that order.
    """

    def __init__(self):
        """Connect to the configured server and load the classifier."""
        self.imap = imaplib.IMAP4(options.imap_server, options.imap_port)
        if options.verbose:
            # No newline here: "Done." below completes the same line.
            sys.stdout.write("Loading database... ")
        filename = options.pop3proxy_persistent_storage_file
        filename = os.path.expanduser(filename)
        if options.pop3proxy_persistent_use_database:
            self.bayes = storage.DBDictClassifier(filename)
        else:
            self.bayes = storage.PickledClassifier(filename)
        if options.verbose:
            print("Done.")
        # Unique names for cached messages - see getNewMessageName() below.
        self.lastBaseMessageName = ''
        self.uniquifier = 2

    def Login(self):
        """Log in with the configured username and password."""
        lgn = self.imap.login(options.imap_username, options.imap_password)
        self._check(lgn, 'login')

    def _check(self, response, command):
        """Exit the program unless `response` reports success.

        response -- sequence whose first item is the status string
        command -- short command name, used only in the error message
        """
        if response[0] != "OK":
            print("Invalid response to %s:\n%s" % (command, response))
            sys.exit(-1)

    def _getUIDs(self, low, high):
        """Return the UIDs (as strings) of messages numbered low..high.

        An empty list is returned when `high` is less than `low`.
        Response lines that do not look like "N (UID M)" are ignored.
        """
        if high < low:
            return []
        # Request the UID of every message in the sequence range.
        msg_range = str(low) + ":" + str(high)
        res = self.imap.fetch(msg_range, "UID")
        self._check(res, 'fetch')
        r = re.compile(r"[0-9]+ \(UID ([0-9]+)\)")
        uids = []
        for i in res[1]:
            mo = r.match(i)
            if mo is not None:
                uids.append(mo.group(1))
        return uids

    def getNewMessageName(self):
        """Return a name for a message based on the current time."""
        # The message name is the time it arrived, with a uniquifier
        # appended if two arrive within one clock tick of each other.
        # (This is completely taken from the same function in pop3proxy's
        # State class.)
        messageName = "%10.10d" % long(time.time())
        if messageName == self.lastBaseMessageName:
            messageName = "%s-%d" % (messageName, self.uniquifier)
            self.uniquifier += 1
        else:
            self.lastBaseMessageName = messageName
            self.uniquifier = 2
        return messageName

    def _selectFolder(self, name, read_only):
        """Select folder `name` and return the server's response."""
        folder = self.imap.select(name, read_only)
        self._check(folder, 'select')
        return folder

    def RetrieveMessage(self, uid):
        """Return the text of message `uid`, or "" if it cannot be read."""
        response = self.imap.uid("FETCH", uid, "(RFC822.PEEK)")
        self._check(response, 'uid fetch')
        try:
            messageText = response[1][0][1]
        except (IndexError, TypeError):
            # The data part was empty or not shaped as expected; report
            # it and carry on (best effort) rather than aborting the run.
            print("Could not retrieve message (id %s)" % uid)
            messageText = ""
        return messageText

    def TrainFolder(self, folder_name, isSpam):
        """Train the classifier on every message in `folder_name`.

        isSpam -- True to train the messages as spam, False as ham
        """
        response = self._selectFolder(folder_name, True)
        uids = self._getUIDs(1, int(response[1][0]))
        for uid in uids:
            messageText = self.RetrieveMessage(uid)
            if not messageText:
                # Retrieval failed (already reported); training on an
                # empty message would only skew the message counts.
                continue
            self.bayes.learn(tokenizer.tokenize(messageText), isSpam)

    def Train(self):
        """Train on all configured ham and spam folders, then store."""
        if options.verbose:
            t = time.time()
        # Folder lists are whitespace separated.  split() with no argument
        # yields nothing for an empty setting and ignores repeated spaces,
        # so both lists are handled the same way.
        for fol in options.imap_ham_train_folders.split():
            self.TrainFolder(fol, False)
        for fol in options.imap_spam_train_folders.split():
            self.TrainFolder(fol, True)
        self.bayes.store()
        if options.verbose:
            print("Training took %s seconds." % (time.time() - t))

    def Filter(self):
        """Score every message in the inbox and file it accordingly."""
        if options.verbose:
            t = time.time()
        inbox = self._selectFolder(options.imap_inbox, False)
        # the number of messages are returned
        # get all the corresponding UIDs
        uids = self._getUIDs(1, int(inbox[1][0]))
        for uid in uids:
            messageText = self.RetrieveMessage(uid)
            if not messageText:
                # Retrieval failed (already reported); nothing to score.
                continue
            (prob, clues) = self.bayes.spamprob(
                tokenizer.tokenize(messageText), evidence=True)
            messageText = self._addHeaders(messageText, prob, clues)
            #uid = self._updateMessage(uid, messageText)
            self._filterMessage(uid, prob)
        if options.verbose:
            print("Filtering took %s seconds." % (time.time() - t))

    def Logout(self):
        """Expunge (if configured) and log out."""
        # sign off
        if options.imap_expunge:
            self.imap.expunge()
        self.imap.logout()

    def _addHeaders(self, messageText, prob, clues):
        """Return `messageText` with the configured spambayes headers added.

        prob -- overall spam probability of the message
        clues -- sequence of (word, score) pairs behind that probability
        """
        if options.pop3proxy_strip_incoming_mailids:
            s = re.compile(options.pop3proxy_mailid_header_name + \
                           ': [\d-]+[\\r]?[\\n]?')
            messageText = s.sub('', messageText)

        # A message without a blank-line separator has headers only.
        parts = re.split(r'\n\r?\n', messageText, 1)
        headers = parts[0]
        if len(parts) == 2:
            body = parts[1]
        else:
            body = ""
        messageName = self.getNewMessageName()
        headers += '\n'
        if options.pop3proxy_add_mailid_to.find("header") != -1:
            headers += options.pop3proxy_mailid_header_name \
                       + ": " + messageName + "\r\n"
        if options.pop3proxy_add_mailid_to.find("body") != -1:
            # NOTE(review): the last three characters of the body are
            # replaced by the id line plus ".\r\n"; this looks carried
            # over from POP3 handling - confirm it is wanted for IMAP.
            body = body[:len(body)-3] + \
                   options.pop3proxy_mailid_header_name + ": " \
                   + messageName + "\r\n.\r\n"

        if options.pop3proxy_include_prob:
            headers += '%s: %s\r\n' % (options.pop3proxy_prob_header_name,
                                       prob)
        if options.pop3proxy_include_thermostat:
            thermostat = '**********'
            headers += '%s: %s\r\n' % \
                       (options.pop3proxy_thermostat_header_name,
                        thermostat[int(prob*10):])
        if options.pop3proxy_include_evidence:
            cutoff = options.clue_mailheader_cutoff
            headers += options.pop3proxy_evidence_header_name + ": "
            # Report each clue with its own score, not the overall prob.
            headers += "; ".join(["%r: %.2f" % (word, score)
                                  for word, score in clues
                                  if (word[0] == '*' or
                                      score <= cutoff or
                                      score >= 1.0 - cutoff)])
            headers += "\r\n"
        headers += "\r\n"
        return headers + body

    def _updateMessage(self, uid, messageText):
        """Replace message `uid` with `messageText`; return the search data."""
        # we can't actually update the message with IMAP
        # XXX (someone tell me if this is wrong!)
        # so what we do is create a new message and delete the old one
        # we return the new uid, which we obtain by searching for the
        # spambayes id
        res = self.imap.append(options.imap_inbox, None,
                               self._extractTimeFromMessage(messageText),
                               messageText)
        self._check(res, "append")
        res = self.imap.uid("STORE", uid, "+FLAGS.SILENT", "(\\Deleted)")
        self._check(res, "uid store")
        res = self.imap.uid("SEARCH", "(TEXT)", messageText)
        self._check(res, "uid search")
        return res[1][0]

    def _extractTimeFromMessage(self, messageText):
        """Return the timestamp to use for a new copy of a message."""
        # When we create a new copy of a message, we need to specify
        # a timestamp for the message.  Ideally, this would be the
        # timestamp from the message itself, but for the moment, we
        # just use the current time.
        return imaplib.Time2Internaldate(time.time())

    def _moveMessage(self, uid, dest):
        """Append a copy of message `uid` to `dest`, then flag the
        original as deleted."""
        # The IMAP copy command makes an alias, not a whole new
        # copy, so what we need to do (sigh) is create a new message
        # in the correct folder, and delete the old one
        # XXX (someone tell me if this is wrong, too!)
        response = self.imap.uid("FETCH", uid, "(RFC822.PEEK)")
        self._check(response, 'uid fetch')
        messageText = response[1][0][1]
        response = self.imap.append(dest, None,
                                    self._extractTimeFromMessage(messageText),
                                    messageText)
        self._check(response, "append")
        # Check the STORE result itself, not the earlier append response.
        response = self.imap.uid("STORE", uid, "+FLAGS.SILENT",
                                 "(\\Deleted)")
        self._check(response, "uid store")

    def _filterMessage(self, uid, prob):
        """Move message `uid` to the spam or unsure folder based on `prob`.

        Messages scoring below the ham cutoff are left where they are.
        """
        if prob < options.ham_cutoff:
            # we leave ham alone
            pass
        elif prob > options.spam_cutoff:
            self._moveMessage(uid, options.imap_spam_folder)
        else:
            self._moveMessage(uid, options.imap_unsure_folder)
def _main():
    """Run one verbose train-and-filter pass against the IMAP server."""
    options.verbose = True
    imap_filter = IMAPFilter()
    # Order matters: authenticate, learn from the training folders,
    # score the inbox, then sign off.
    imap_filter.Login()
    imap_filter.Train()
    imap_filter.Filter()
    imap_filter.Logout()

if __name__ == '__main__':
    _main()
From anadelonbrin at users.sourceforge.net Mon Apr 7 01:26:54 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Mon Apr 7 03:26:57 2003
Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.24,1.25
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv12257/spambayes
Modified Files:
Options.py
Log Message:
First steps towards an IMAP spambayes solution.
Currently will do (very) basic filtering and training.
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.24
retrieving revision 1.25
diff -C2 -d -r1.24 -r1.25
*** Options.py 14 Mar 2003 02:14:37 -0000 1.24
--- Options.py 7 Apr 2003 07:26:17 -0000 1.25
***************
*** 415,418 ****
--- 415,434 ----
smtpproxy_shutdown_address = spambayes_shutdown@localhost
+ [imap]
+ imap_server:
+ # the default IMAP port is 143, or 993 if using SSL
+ imap_port: 143
+ imap_username:
+ imap_password:
+ imap_expunge: False
+ imap_inbox: inbox
+ imap_unsure_folder:
+ imap_spam_folder:
+ # comma delimited list of folders that will be examined for messages
+ # to train as ham
+ imap_ham_train_folders:
+ # as for imap_ham_train_folders, but scan for messages to train as spam
+ imap_spam_train_folders:
+
[html_ui]
html_ui_port: 8880
***************
*** 533,536 ****
--- 549,563 ----
'smtpproxy_ports' : string_cracker,
},
+ 'imap': {'imap_server' : string_cracker,
+ 'imap_port' : int_cracker,
+ 'imap_username' : string_cracker,
+ 'imap_password' : string_cracker,
+ 'imap_inbox' : string_cracker,
+ 'imap_unsure_folder' : string_cracker,
+ 'imap_spam_folder' : string_cracker,
+ 'imap_ham_train_folders' : string_cracker,
+ 'imap_spam_train_folders' : string_cracker,
+ 'imap_expunge' : boolean_cracker,
+ },
'html_ui': {'html_ui_port': int_cracker,
'html_ui_launch_browser': boolean_cracker,
From timstone4 at users.sourceforge.net Mon Apr 7 19:21:30 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Mon Apr 7 21:21:34 2003
Subject: [Spambayes-checkins] spambayes/spambayes message.py,NONE,1.1
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv14085
Added Files:
message.py
Log Message:
An extension of email.message which includes methods that are useful for
spambayes. Specifically, message training and classification is remembered
in a simple pickle, by message id. This allows for convenient untraining by
any client that uses it to handle messages.
--- NEW FILE: message.py ---
#! /usr/bin/env python
'''message.py - Core Spambayes classes.
Classes:
Message - an email.Message.Message, extended with spambayes methods
MessageInfoDB - persistent state storage for Message
Abstract:
MessageInfoDB is a simple shelve persistency class for the persistent
state of a Message object. For the moment, the db name is hard-coded,
but we'll have to do this a different way. Mark Hammond's idea is to
have a master database, that simply keeps track of the names and instances
of other databases, such as the wordinfo and msginfo databases. The
MessageInfoDB currently does not provide iterators, but should at some
point. This would allow us to, for example, see how many messages
have been trained differently than their classification, for fp/fn
assessment purposes.
Message is an extension of the email package Message class, to include
persistent message information and Spambayes specific header manipulations.
The persistent state -currently- consists of the message id, its current
classification, and its current training. The payload is not persisted.
Payload persistence is left to whatever mail client software is being used.
Usage:
A typical classification usage pattern would be something like:
>>>msg = spambayes.message.Message()
>>>msg.setPayload(substance) # substance comes from somewhere else
>>>id = msg.setIdFromPayload()
>>>if id is None:
>>> msg.setId(time()) # or some unique identifier
>>>msg.delSBHeaders() # never include sb headers in a classification
>>># bayes object is your responsibility
>>>(prob, clues) = bayes.spamprob(msg.asTokens(), evidence=True)
>>>msg.addSBHeaders(prob, clues)
A typical usage pattern to train as spam would be something like:
>>>msg = spambayes.message.Message()
>>>msg.setPayload(substance) # substance comes from somewhere else
>>>id = msg.setId(msgid) # id is a fname, outlook msg id, something...
>>>msg.delSBHeaders() # never include sb headers in a train
>>>if msg.isTrndHam():
>>> bayes.unlearn(msg.asTokens(), False) # untrain the ham
>>>bayes.learn(msg.asTokens(), True) # train as spam
>>>msg.trndAsSpam()
To Do:
o Master DB module
o Suggestions?
'''
# This module is part of the spambayes project, which is Copyright 2002
# The Python Software Foundation and is covered by the Python Software
# Foundation license.
# A future statement may only be preceded by the module docstring, comments,
# blank lines and other future statements, so it has to come before the
# __author__ / __credits__ assignments.
from __future__ import generators

__author__ = "Tim Stone "
__credits__ = "Mark Hammond, Tony Meyers, all the spambayes contributors."
try:
True, False
except NameError:
# Maintain compatibility with Python 2.2
True, False = 1, 0
def bool(val):
return not not val
import sys
import email.Message
import email.Parser
from spambayes.tokenizer import tokenize
from spambayes.Options import options
from cStringIO import StringIO
from spambayes import dbmstorage
import shelve
# Make shelve use binary pickles by default: keep a handle on the pickler
# shelve currently uses, then install a wrapper whose `binary` argument
# defaults to 1.
oldShelvePickler = shelve.Pickler


def binaryDefaultPickler(f, binary=1):
    '''Create a pickler writing to f through the saved shelve pickler.

    binary is forwarded unchanged as the second positional argument.
    '''
    pickler = oldShelvePickler(f, binary)
    return pickler


shelve.Pickler = binaryDefaultPickler
class MessageInfoDB:
    '''Shelf-backed store of per-message state, keyed by message id.

    Entries are looked up through the getId() method of the message
    object handed to each accessor.
    '''

    def __init__(self, db_name, mode='c'):
        '''Open db_name through dbmstorage and wrap it in a shelve.Shelf.'''
        self.db_name = db_name
        self.mode = mode
        self.dbm = dbmstorage.open(db_name, mode)
        self.db = shelve.Shelf(self.dbm)

    def store(self):
        '''Flush pending changes by syncing the shelf.'''
        self.db.sync()

    def _getState(self, msg):
        '''Return the entry stored under msg's id, or None if there is none.'''
        try:
            found = self.db[msg.getId()]
        except KeyError:
            found = None
        return found

    def _setState(self, msg):
        '''Store msg itself under its own id.'''
        key = msg.getId()
        self.db[key] = msg

    def _delState(self, msg):
        '''Remove the entry for msg's id; KeyError propagates if absent.'''
        key = msg.getId()
        del self.db[key]
# this should come from a mark hammond idea of a master db
msginfoDB = MessageInfoDB("spambayes.messageinfo.db")
class Message(email.Message.Message):
    '''An email.Message.Message extended for Spambayes.

    On top of the standard message this keeps three pieces of persistent
    state: the message id (self.id), the classification (self.c: 's', 'h',
    'u' or None) and the training (self.t: 's', 'h' or None).  Only that
    triple is pickled (see __getstate__); the payload is not.  Every
    change of classification or training is written to the module-level
    msginfoDB through modified().
    '''

    def __init__(self):
        email.Message.Message.__init__(self)

        # persistent state
        self.id = None
        self.c = None
        self.t = None

        # non-persistent state includes all of email.Message.Message state

    def setPayload(self, payload):
        '''Parse the raw message text in payload into headers and body.'''
        prs = email.Parser.HeaderParser()
        # Both steps must read from the SAME file object: the body step
        # consumes whatever the header step left unread.  Giving it a
        # fresh StringIO made it start over from the top, so the header
        # block ended up duplicated inside the body.
        fp = StringIO(payload)
        prs._parseheaders(self, fp)
        # we may want to do some header parsing error handling here
        # to try to extract important headers regardless of malformations
        prs._parsebody(self, fp)

    def setIdFromPayload(self):
        '''Set the id from the mail-id header, if the message carries one.

        Returns the id, or None when the header is absent.  Raises
        ValueError (from setId) if an id has already been set.
        '''
        try:
            msgid = self[options.pop3proxy_mailid_header_name]
            # Message.__getitem__ answers None for a missing header rather
            # than raising KeyError, so absence has to be tested for
            # explicitly; otherwise setId(None) raises ValueError.
            if msgid is None:
                return None
            self.setId(msgid)
        except KeyError:
            return None

        return self.id

    def setId(self, id):
        '''Assign the message id and load any state persisted under it.

        Raises ValueError if an id is already set or if id is None.
        '''
        if self.id:
            raise ValueError("MsgId has already been set, cannot be changed")

        # we should probably enforce type(id) is StringType.
        # the database will insist upon it, but at that point, it's harder
        # to diagnose
        if id is None:
            raise ValueError("MsgId must not be None")

        self.id = id
        # Restore what was remembered about this id.  The lookup result
        # used to be thrown away, so isTrained()/isClassified() were
        # always false on a freshly built message.  The fields are set
        # directly (not via modified()) to avoid rewriting the database
        # with the values just read from it.
        stored = msginfoDB._getState(self)
        if stored is not None:
            self.c = stored.getClassification()
            self.t = stored.getTraining()

    def getId(self):
        '''Return the message id, or None if it has not been set.'''
        return self.id

    def addSBHeaders(self, prob, clues):
        '''Add hammie header, and remember message's classification.  Also,
        add optional headers if needed.

        prob is compared against options.ham_cutoff and
        options.spam_cutoff to pick the disposition; clues is iterated as
        (word, score) pairs for the optional evidence header.
        '''
        if prob < options.ham_cutoff:
            disposition = options.header_ham_string
            self.clsfyAsHam()
        elif prob > options.spam_cutoff:
            disposition = options.header_spam_string
            self.clsfyAsSpam()
        else:
            disposition = options.header_unsure_string
            self.clsfyAsUnsure()

        self[options.hammie_header_name] = disposition

        if options.pop3proxy_include_prob:
            self[options.pop3proxy_prob_header_name] = prob

        if options.pop3proxy_include_thermostat:
            thermostat = '**********'
            self[options.pop3proxy_thermostat_header_name] = \
                thermostat[:int(prob*10)]

        if options.pop3proxy_include_evidence:
            # Keep the '*'-prefixed clues plus those whose score lies
            # within clue_mailheader_cutoff of either end of the scale.
            evd = "; ".join(["%r: %.2f" % (word, score)
                             for word, score in clues
                             if (word[0] == '*' or
                                 score <= options.clue_mailheader_cutoff or
                                 score >= 1.0 - options.clue_mailheader_cutoff)])
            self[options.pop3proxy_evidence_header_name] = evd

        if options.pop3proxy_add_mailid_to.find("header") != -1:
            self[options.pop3proxy_mailid_header_name] = self.id

        # This won't work for now, because email.Message does not isolate message body
        # This is also not consistent with the function of this method...
        # if options.pop3proxy_add_mailid_to.find("body") != -1:
        #     body = body[:len(body)-3] + \
        #            options.pop3proxy_mailid_header_name + ": " \
        #            + messageName + "\r\n.\r\n"

    def delSBHeaders(self):
        '''Remove every header that addSBHeaders (or test mode) may add.'''
        del self[options.hammie_header_name]
        del self[options.pop3proxy_mailid_header_name]
        del self[options.hammie_header_name + "-ID"]  # test mode header
        del self[options.pop3proxy_prob_header_name]
        del self[options.pop3proxy_thermostat_header_name]
        del self[options.pop3proxy_evidence_header_name]

    def asTokens(self):
        '''Return the tokenizer's output for the flattened message.'''
        # use as_string() here because multipart/digest will return
        # a list of message objects if get_payload() is used
        return tokenize(self.as_string())

    def modified(self):
        '''Write the current state to msginfoDB, provided an id is set.'''
        if self.id:    # only persist if key is present
            msginfoDB._setState(self)

    def isClsfdSpam(self):
        return self.c == 's'

    def isClsfdHam(self):
        return self.c == 'h'

    def isClsfdUnsure(self):
        return self.c == 'u'

    def isClassified(self):
        return self.c is not None

    def clsfyAsSpam(self):
        self.c = 's'
        self.modified()

    def clsfyAsHam(self):
        self.c = 'h'
        self.modified()

    def clsfyAsUnsure(self):
        self.c = 'u'
        self.modified()

    def getClassification(self):
        '''Return 's', 'h', 'u', or None when not classified.'''
        return self.c

    def isTrndSpam(self):
        return self.t == 's'

    def isTrndHam(self):
        return self.t == 'h'

    def trndAsSpam(self):
        self.t = 's'
        self.modified()

    def trndAsHam(self):
        self.t = 'h'
        self.modified()

    def notTrained(self):
        self.t = None
        self.modified()

    def isTrained(self):
        return self.t is not None

    def getTraining(self):
        '''Return 's', 'h', or None when not trained.'''
        return self.t

    def __repr__(self):
        return "core.Message%r" % repr(self.__getstate__())

    def __getstate__(self):
        # Only the persistent triple is pickled; the payload is not.
        return (self.id, self.c, self.t)

    def __setstate__(self, t):
        (self.id, self.c, self.t) = t
From timstone4 at users.sourceforge.net Mon Apr 7 19:25:04 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Mon Apr 7 21:25:13 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.1,1.2
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv14906
Modified Files:
imapfilter.py
Log Message:
Changed to use the message class. Untested at this point. Your turn, Tony.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** imapfilter.py 7 Apr 2003 07:26:24 -0000 1.1
--- imapfilter.py 8 Apr 2003 01:24:17 -0000 1.2
***************
*** 49,55 ****
filename = os.path.expanduser(filename)
if options.pop3proxy_persistent_use_database:
! self.bayes = storage.DBDictClassifier(filename)
else:
! self.bayes = storage.PickledClassifier(filename)
if options.verbose:
print "Done."
--- 49,55 ----
filename = os.path.expanduser(filename)
if options.pop3proxy_persistent_use_database:
! self.classifier = storage.DBDictClassifier(filename)
else:
! self.classifier = storage.PickledClassifier(filename)
if options.verbose:
print "Done."
***************
*** 109,113 ****
print "Could not retrieve message (id %s)" % uid
messageText = ""
! return messageText
def TrainFolder(self, folder_name, isSpam):
--- 109,120 ----
print "Could not retrieve message (id %s)" % uid
messageText = ""
!
! msg = spambayes.message.Message()
! msg.setPayload(messageText)
! msg.setId(uid)
!
! msg.delSBHeaders() # never include sb headers in a train
!
! return msg
def TrainFolder(self, folder_name, isSpam):
***************
*** 115,120 ****
uids = self._getUIDs(1, int(response[1][0]))
for uid in uids:
! messageText = self.RetrieveMessage(uid)
! self.bayes.learn(tokenizer.tokenize(messageText), isSpam)
def Train(self):
--- 122,139 ----
uids = self._getUIDs(1, int(response[1][0]))
for uid in uids:
! msg = self.RetrieveMessage(uid)
!
! if msg.isTrained():
! if isSpam and msg.isTrndHam():
! bayes.unlearn(msg.asTokens(), False) # untrain the ham
! elif not isSpam and msg.isTrndSpam():
! bayes.unlearn(msg.asTokens(), True)
!
! bayes.learn(msg.asTokens(), isSpam) # train as spam
!
! if isSpam:
! msg.trndAsSpam()
! else:
! msg.trndAsHam()
def Train(self):
***************
*** 129,133 ****
for fol in spam_training_folders:
self.TrainFolder(fol, True)
! self.bayes.store()
if options.verbose:
print "Training took", time.time() - t, "seconds."
--- 148,152 ----
for fol in spam_training_folders:
self.TrainFolder(fol, True)
! self.classifier.store()
if options.verbose:
print "Training took", time.time() - t, "seconds."
***************
*** 142,152 ****
for uid in uids:
! messageText = self.RetrieveMessage(uid)
! (prob, clues) = self.bayes.spamprob\
! (tokenizer.tokenize(messageText),
! evidence=True)
! messageText = self._addHeaders(messageText, prob, clues)
! #uid = self._updateMessage(uid, messageText)
! self._filterMessage(uid, prob)
if options.verbose:
print "Filtering took", time.time() - t, "seconds."
--- 161,169 ----
for uid in uids:
! msg = self.RetrieveMessage(uid)
! (prob, clues) = self.classifier.spamprob(msg.asTokens(), evidence=True)
! msg.addSBHeaders(prob, clues) # adds headers and remembers classification
! self._updateMessage(msg)
! self._filterMessage(msg)
if options.verbose:
print "Filtering took", time.time() - t, "seconds."
***************
*** 158,198 ****
self.imap.logout()
! def _addHeaders(self, messageText, prob, clues):
! if options.pop3proxy_strip_incoming_mailids == True:
! s = re.compile(options.pop3proxy_mailid_header_name + \
! ': [\d-]+[\\r]?[\\n]?')
! messageText = s.sub('', messageText)
!
! headers, body = re.split(r'\n\r?\n', messageText, 1)
! messageName = self.getNewMessageName()
! headers += '\n'
! if options.pop3proxy_add_mailid_to.find("header") != -1:
! headers += options.pop3proxy_mailid_header_name \
! + ": " + messageName + "\r\n"
! if options.pop3proxy_add_mailid_to.find("body") != -1:
! body = body[:len(body)-3] + \
! options.pop3proxy_mailid_header_name + ": " \
! + messageName + "\r\n.\r\n"
!
! if options.pop3proxy_include_prob:
! headers += '%s: %s\r\n' % (options.pop3proxy_prob_header_name,
! prob)
! if options.pop3proxy_include_thermostat:
! thermostat = '**********'
! headers += '%s: %s\r\n' % \
! (options.pop3proxy_thermostat_header_name,
! thermostat[int(prob*10):])
! if options.pop3proxy_include_evidence:
! headers += options.pop3proxy_evidence_header_name + ": "
! headers += "; ".join(["%r: %.2f" % (word, prob)
! for word, score in clues
! if (word[0] == '*' or
! score <= options.clue_mailheader_cutoff or
! score >= 1.0 - options.clue_mailheader_cutoff)])
! headers += "\r\n"
! headers += "\r\n"
! return headers + body
!
! def _updateMessage(self, uid, messageText):
# we can't actually update the message with IMAP
# XXX (someone tell me if this is wrong!)
--- 175,179 ----
self.imap.logout()
! def _updateMessage(self, msg):
# we can't actually update the message with IMAP
# XXX (someone tell me if this is wrong!)
***************
*** 201,214 ****
# spambayes id
res = self.imap.append(options.imap_inbox, None,
! self._extractTimeFromMessage(messageText),
! messageText)
self._check(res, "append")
! res = self.imap.uid("STORE", uid, "+FLAGS.SILENT", "(\\Deleted)")
self._check(res, "uid store")
! res = self.imap.uid("SEARCH", "(TEXT)", messageText)
self._check(res, "uid search")
return res[1][0]
! def _extractTimeFromMessage(self, messageText):
# When we create a new copy of a message, we need to specify
# a timestamp for the message. Ideally, this would be the
--- 182,195 ----
# spambayes id
res = self.imap.append(options.imap_inbox, None,
! self._extractTimeFromMessage(msg),
! msg.payload())
self._check(res, "append")
! res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)")
self._check(res, "uid store")
! res = self.imap.uid("SEARCH", "(TEXT)", msg.payload())
self._check(res, "uid search")
return res[1][0]
! def _extractTimeFromMessage(self, msg):
# When we create a new copy of a message, we need to specify
# a timestamp for the message. Ideally, this would be the
***************
*** 217,243 ****
return imaplib.Time2Internaldate(time.time())
! def _moveMessage(self, uid, dest):
# The IMAP copy command makes an alias, not a whole new
# copy, so what we need to do (sigh) is create a new message
# in the correct folder, and delete the old one
# XXX (someone tell me if this is wrong, too!)
! response = self.imap.uid("FETCH", uid, "(RFC822.PEEK)")
self._check(response, 'uid fetch')
! messageText = response[1][0][1]
! response = self.imap.append(dest, None,
! self._extractTimeFromMessage(messageText),
! messageText)
self._check(response, "append")
! res = self.imap.uid("STORE", uid, "+FLAGS.SILENT", "(\\Deleted)")
self._check(response, "uid store")
! def _filterMessage(self, uid, prob):
! if prob < options.ham_cutoff:
# we leave ham alone
pass
! elif prob > options.spam_cutoff:
! self._moveMessage(uid, options.imap_spam_folder)
else:
! self._moveMessage(uid, options.imap_unsure_folder)
if __name__ == '__main__':
--- 198,226 ----
return imaplib.Time2Internaldate(time.time())
! def _moveMessage(self, msg, dest):
# The IMAP copy command makes an alias, not a whole new
# copy, so what we need to do (sigh) is create a new message
# in the correct folder, and delete the old one
# XXX (someone tell me if this is wrong, too!)
! response = self.imap.uid("FETCH", msg.getId(), "(RFC822.PEEK)")
self._check(response, 'uid fetch')
!
! msg = spambayes.message.Message()
! msg.setPayload(response[1][0][1])
! msg.setId(_extractTimeFromMessage(msg))
!
! response = self.imap.append(dest, None, msg.getId(), msg.payload())
self._check(response, "append")
! res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)")
self._check(response, "uid store")
! def _filterMessage(self, msg, prob):
! if msg.isClsfdHam():
# we leave ham alone
pass
! elif msg.isClsfdSpam():
! self._moveMessage(msg, options.imap_spam_folder)
else:
! self._moveMessage(msg, options.imap_unsure_folder)
if __name__ == '__main__':
From anadelonbrin at users.sourceforge.net Tue Apr 8 01:35:44 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Tue Apr 8 03:35:50 2003
Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.1,1.2
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv3092/spambayes
Modified Files:
message.py
Log Message:
Fixes the spelling of my name ;)
Temp fix for infinite recursion error.
Adds a couple of little functions to simplify use.
Adds a changeId function.
Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** message.py 8 Apr 2003 01:21:28 -0000 1.1
--- message.py 8 Apr 2003 07:35:42 -0000 1.2
***************
*** 69,73 ****
__author__ = "Tim Stone "
! __credits__ = "Mark Hammond, Tony Meyers, all the spambayes contributors."
from __future__ import generators
--- 69,73 ----
__author__ = "Tim Stone "
! __credits__ = "Mark Hammond, Tony Meyer, all the spambayes contributors."
from __future__ import generators
***************
*** 94,102 ****
import shelve
# Make shelve use binary pickles by default.
! oldShelvePickler = shelve.Pickler
! def binaryDefaultPickler(f, binary=1):
! return oldShelvePickler(f, binary)
! shelve.Pickler = binaryDefaultPickler
--- 94,104 ----
import shelve
+ # XXX Tim, what do you want to do here? This
+ # XXX recurses infinately at the moment
# Make shelve use binary pickles by default.
! #oldShelvePickler = shelve.Pickler
! #def binaryDefaultPickler(f, binary=1):
! # return oldShelvePickler(f, binary)
! #shelve.Pickler = binaryDefaultPickler
***************
*** 155,163 ****
return self.id
def setId(self, id):
if self.id:
raise ValueError, "MsgId has already been set, cannot be changed"
!
# we should probably enforce type(id) is StringType.
# the database will insist upon it, but at that point, it's harder
--- 157,179 ----
return self.id
+
+ def changeID(self, id):
+ # We cannot re-set an id (see below). However there are
+ # occasionally times when the id for a message will change,
+ # for example, on an IMAP server (or possibly an exchange
+ # server), the server may change the ids that we are using
+ # We enforce that this must be an explicit *change* rather
+ # than simply re-setting, by having this as a separate
+ # function
+ if not self.id:
+ raise ValueError, "MsgID has not been set, cannot be changed"
+ self._setId(id)
def setId(self, id):
if self.id:
raise ValueError, "MsgId has already been set, cannot be changed"
! self._setId(id)
!
! def _setId(self, id):
# we should probably enforce type(id) is StringType.
# the database will insist upon it, but at that point, it's harder
***************
*** 274,277 ****
--- 290,306 ----
self.modified()
+ def isTrndAs(self, isSpam):
+ if self.t == 'h' and not isSpam:
+ return True
+ if self.t == 's' and isSpam:
+ return True
+ return False
+
+ def trndAs(self, isSpam):
+ if isSpam:
+ self.t = 's'
+ else:
+ self.t = 'h'
+
def notTrained(self):
self.t = None
From anadelonbrin at users.sourceforge.net Tue Apr 8 01:37:31 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Tue Apr 8 03:37:35 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.2,1.3
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv4387
Modified Files:
imapfilter.py
Log Message:
Introduces an IMAPMessage class based on the
spambayes Message class.
Introduces an iterable IMAPFolder class.
Changes the code to use all of this.
Changed to allow multiple folders to filter.
Training seems to work, although filtering isn't. I'll get to it.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** imapfilter.py 8 Apr 2003 01:24:17 -0000 1.2
--- imapfilter.py 8 Apr 2003 07:37:28 -0000 1.3
***************
*** 1,4 ****
--- 1,6 ----
#!/usr/bin/env python
+ from __future__ import generators
+
"""An IMAP filter. An IMAP message box is scanned and all non-scored
messages are scored and (where necessary) filtered.
***************
*** 18,28 ****
__author__ = "Tony Meyer "
! __credits__ = "All the Spambayes folk."
! # This code will benefit immensely from
! # (a) The new message class, which can hold information such as
! # whether a message has been seen before
! # (b) The new header stuff, which will abstract out adding all
! # the headers
try:
--- 20,31 ----
__author__ = "Tony Meyer "
! __credits__ = "Tim Stone, All the Spambayes folk."
! # Tony thinks it would be nice if there was a web ui to
! # this for the initial setup (i.e. like pop3proxy), which offered
! # a list of folders to filter/train/etc. It could then record a
! # uid for the folder rather than a name, and it avoids the problems
! # with different imap servers having different naming styles
! # a list is retrieved via imap.list()
try:
***************
*** 37,47 ****
import re
import time
from spambayes.Options import options
! from spambayes import tokenizer, storage
class IMAPFilter(object):
def __init__(self):
! self.imap = imaplib.IMAP4(options.imap_server, options.imap_port)
if options.verbose:
print "Loading database...",
--- 40,138 ----
import re
import time
+ import sys
from spambayes.Options import options
! from spambayes import tokenizer, storage, message
!
! # global IMAPlib object
! imap = None
!
! class IMAPMessage(message.Message):
! # response checking is necessary throughout this class
! def __init__(self, folder_id, folder_name, message_id):
! message.Message.__init__(self)
! self.setId(message_id)
! self.folder_id = folder_id
! self.folder_name = folder_name
!
! def extractTime(self):
! # When we create a new copy of a message, we need to specify
! # a timestamp for the message. Ideally, this would be the
! # timestamp from the message itself, but for the moment, we
! # just use the current time.
! return imaplib.Time2Internaldate(time.time())
!
! def Update(self):
! # we can't actually update the message with IMAP
! # so what we do is create a new message and delete the old one
! response = imap.append(self.folder_name, None,
! self.extractTime(), self.get_payload())
! response = imap.select(self.folder_name, False)
! response = imap.uid("STORE", self.getId(), "+FLAGS.SILENT",
! "(\\Deleted)")
! # we need to update the uid, as it will have changed
! response = imap.uid("SEARCH", "(TEXT)", self.get_payload())
! self.changeId(response[1][0])
!
!
! class IMAPFolder(object):
! # response checking is necessary throughout this class
! def __init__(self, folder_name, readOnly=True):
! self.name = folder_name
! # Convert folder name to a uid
! self.uid = None
! response = imap.select(self.name, readOnly)
! responses = imap.response("OK")[1]
! for response in responses:
! if response[:13] == "[UIDVALIDITY ":
! r = re.compile(r"(?P\d+)")
! self.uid = r.search(response[13:]).group('uid')
! # We really want to use RFC822.PEEK here, as that doesn't effect
! # the status of the message. Unfortunately, it appears that not
! # all IMAP servers support this, even though it is in RFC1730
! self.rfc822_command = "(RFC822.PEEK)"
! response = imap.fetch("1:1", self.rfc822_command)
! if response[0] != "OK":
! self.rfc822_command = "(RFC822)"
!
! def __iter__(self):
! '''IMAPFolder is iterable'''
! for key in self.keys():
! try:
! yield self[key]
! except KeyError:
! pass
!
! def keys(self):
! '''Returns uids for all the messages in the folder'''
! # request message range
! response = imap.select(self.name, True)
! total_messages = response[1][0]
! if total_messages == '0':
! return []
! response = imap.fetch("1:" + total_messages, "UID")
! r = re.compile(r"[0-9]+ \(UID ([0-9]+)\)")
! uids = []
! for i in response[1]:
! mo = r.match(i)
! if mo is not None:
! uids.append(mo.group(1))
! return uids
!
! def __getitem__(self, key):
! '''Return message matching the given uid'''
! response = imap.uid("FETCH", key, self.rfc822_command)
! messageText = response[1][0][1]
! # we return an instance of *our* message class, not the
! # raw rfc822 message
! msg = IMAPMessage(self.uid, self.name, key)
! msg.setPayload(messageText)
! return msg
!
class IMAPFilter(object):
def __init__(self):
! global imap
! imap = imaplib.IMAP4(options.imap_server, options.imap_port)
if options.verbose:
print "Loading database...",
***************
*** 54,64 ****
if options.verbose:
print "Done."
- # Unique names for cached messages - see getNewMessageName() below.
- self.lastBaseMessageName = ''
- self.uniquifier = 2
-
- def Login(self):
- lgn = self.imap.login(options.imap_username, options.imap_password)
- self._check(lgn, 'login')
def _check(self, response, command):
--- 145,148 ----
***************
*** 67,139 ****
sys.exit(-1)
- def _getUIDs(self, low, high):
- # Retreive a list of uids corresponding to the given range
- if high < low: return []
- # request message range
- range = str(low) + ":" + str(high)
- res = self.imap.fetch(range, "UID")
- self._check(res, 'fetch')
- r = re.compile(r"[0-9]+ \(UID ([0-9]+)\)")
- res2 = []
- for i in res[1]:
- mo = r.match(i)
- if mo is not None:
- res2.append(mo.group(1))
- return res2
-
- def getNewMessageName(self):
- # The message name is the time it arrived, with a uniquifier
- # appended if two arrive within one clock tick of each other.
- # (This is completely taken from the same function in pop3proxy's
- # State class.)
- messageName = "%10.10d" % long(time.time())
- if messageName == self.lastBaseMessageName:
- messageName = "%s-%d" % (messageName, self.uniquifier)
- self.uniquifier += 1
- else:
- self.lastBaseMessageName = messageName
- self.uniquifier = 2
- return messageName
-
def _selectFolder(self, name, read_only):
! folder = self.imap.select(name, read_only)
self._check(folder, 'select')
return folder
! def RetrieveMessage(self, uid):
! response = self.imap.uid("FETCH", uid, "(RFC822.PEEK)")
! self._check(response, 'uid fetch')
! try:
! messageText = response[1][0][1]
! except:
! print "Could not retrieve message (id %s)" % uid
! messageText = ""
!
! msg = spambayes.message.Message()
! msg.setPayload(messageText)
! msg.setId(uid)
!
! msg.delSBHeaders() # never include sb headers in a train
!
! return msg
def TrainFolder(self, folder_name, isSpam):
! response = self._selectFolder(folder_name, True)
! uids = self._getUIDs(1, int(response[1][0]))
! for uid in uids:
! msg = self.RetrieveMessage(uid)
!
if msg.isTrained():
! if isSpam and msg.isTrndHam():
! bayes.unlearn(msg.asTokens(), False) # untrain the ham
! elif not isSpam and msg.isTrndSpam():
! bayes.unlearn(msg.asTokens(), True)
!
! bayes.learn(msg.asTokens(), isSpam) # train as spam
!
! if isSpam:
! msg.trndAsSpam()
! else:
! msg.trndAsHam()
def Train(self):
--- 151,175 ----
sys.exit(-1)
def _selectFolder(self, name, read_only):
! folder = imap.select(name, read_only)
self._check(folder, 'select')
return folder
! def Login(self):
! lgn = imap.login(options.imap_username, options.imap_password)
! self._check(lgn, 'login')
def TrainFolder(self, folder_name, isSpam):
! folder = IMAPFolder(folder_name)
! for msg in folder:
if msg.isTrained():
! if msg.isTrndAs(isSpam):
! # already trained, nothing for us to do here
! # (we don't want to train the same message twice)
! continue
! if msg.isTrained():
! self.classifier.unlearn(msg.asTokens(), not isSpam)
! self.classifier.learn(msg.asTokens(), isSpam)
! msg.trndAs(isSpam)
def Train(self):
***************
*** 155,169 ****
if options.verbose:
t = time.time()
! inbox = self._selectFolder(options.imap_inbox, False)
! # the number of messages are returned
! # get all the corresponding UIDs
! uids = self._getUIDs(1, int(inbox[1][0]))
!
! for uid in uids:
! msg = self.RetrieveMessage(uid)
! (prob, clues) = self.classifier.spamprob(msg.asTokens(), evidence=True)
! msg.addSBHeaders(prob, clues) # adds headers and remembers classification
! self._updateMessage(msg)
! self._filterMessage(msg)
if options.verbose:
print "Filtering took", time.time() - t, "seconds."
--- 191,204 ----
if options.verbose:
t = time.time()
! for filter_folder in options.imap_filter_folders.split():
! folder = IMAPFolder(filter_folder, False)
! for msg in folder:
! (prob, clues) = self.classifier.spamprob(msg.asTokens(),
! evidence=True)
! # add headers and remember classification
! msg.addSBHeaders(prob, clues)
! # XXX updating is disabled for the moment
! # msg.Update()
! self._filterMessage(msg)
if options.verbose:
print "Filtering took", time.time() - t, "seconds."
***************
*** 172,193 ****
# sign off
if options.imap_expunge:
! self.imap.expunge()
! self.imap.logout()
!
! def _updateMessage(self, msg):
! # we can't actually update the message with IMAP
! # XXX (someone tell me if this is wrong!)
! # so what we do is create a new message and delete the old one
! # we return the new uid, which we obtain by searching for the
! # spambayes id
! res = self.imap.append(options.imap_inbox, None,
! self._extractTimeFromMessage(msg),
! msg.payload())
! self._check(res, "append")
! res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)")
! self._check(res, "uid store")
! res = self.imap.uid("SEARCH", "(TEXT)", msg.payload())
! self._check(res, "uid search")
! return res[1][0]
def _extractTimeFromMessage(self, msg):
--- 207,212 ----
# sign off
if options.imap_expunge:
! imap.expunge()
! imap.logout()
def _extractTimeFromMessage(self, msg):
***************
*** 198,221 ****
return imaplib.Time2Internaldate(time.time())
! def _moveMessage(self, msg, dest):
# The IMAP copy command makes an alias, not a whole new
# copy, so what we need to do (sigh) is create a new message
# in the correct folder, and delete the old one
! # XXX (someone tell me if this is wrong, too!)
! response = self.imap.uid("FETCH", msg.getId(), "(RFC822.PEEK)")
self._check(response, 'uid fetch')
!
! msg = spambayes.message.Message()
msg.setPayload(response[1][0][1])
! msg.setId(_extractTimeFromMessage(msg))
! response = self.imap.append(dest, None, msg.getId(), msg.payload())
self._check(response, "append")
! res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)")
self._check(response, "uid store")
! def _filterMessage(self, msg, prob):
if msg.isClsfdHam():
# we leave ham alone
pass
elif msg.isClsfdSpam():
--- 217,246 ----
return imaplib.Time2Internaldate(time.time())
! def _moveMessage(self, old_msg, dest):
# The IMAP copy command makes an alias, not a whole new
# copy, so what we need to do (sigh) is create a new message
# in the correct folder, and delete the old one
! # XXX (someone tell me if this is wrong)
! response = imap.uid("FETCH", old_msg.getId(), "(RFC822)")
self._check(response, 'uid fetch')
! msg = message.Message()
msg.setPayload(response[1][0][1])
! #response = imap.uid("SEARCH", "(TEXT)", msg.get_payload())
! #self._check(response, "search")
! #self.changeId(response[1][0])
! response = imap.append(dest, None,
! self._extractTimeFromMessage(msg),
! msg.get_payload())
self._check(response, "append")
! self._selectFolder(old_msg.folder_name, False)
! response = imap.uid("STORE", old_msg.getId(), "+FLAGS.SILENT",
! "(\\Deleted)")
self._check(response, "uid store")
! def _filterMessage(self, msg):
if msg.isClsfdHam():
# we leave ham alone
+ print "untouched"
pass
elif msg.isClsfdSpam():
***************
*** 227,230 ****
--- 252,256 ----
options.verbose = True
imap_filter = IMAPFilter()
+ # imap_filter.imap.debug = 10
imap_filter.Login()
imap_filter.Train()
From anadelonbrin at users.sourceforge.net Tue Apr 8 01:37:33 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Tue Apr 8 03:37:37 2003
Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.25,1.26
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv4387/spambayes
Modified Files:
Options.py
Log Message:
Introduces an IMAPMessage class based on the
spambayes Message class.
Introduces an iterable IMAPFolder class.
Changes the code to use all of this.
Changed to allow multiple folders to filter.
Training seems to work, although filtering isn't. I'll get to it.
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.25
retrieving revision 1.26
diff -C2 -d -r1.25 -r1.26
*** Options.py 7 Apr 2003 07:26:17 -0000 1.25
--- Options.py 8 Apr 2003 07:37:29 -0000 1.26
***************
*** 422,426 ****
imap_password:
imap_expunge: False
! imap_inbox: inbox
imap_unsure_folder:
imap_spam_folder:
--- 422,426 ----
imap_password:
imap_expunge: False
! imap_filter_folders: INBOX
imap_unsure_folder:
imap_spam_folder:
***************
*** 553,557 ****
'imap_username' : string_cracker,
'imap_password' : string_cracker,
! 'imap_inbox' : string_cracker,
'imap_unsure_folder' : string_cracker,
'imap_spam_folder' : string_cracker,
--- 553,557 ----
'imap_username' : string_cracker,
'imap_password' : string_cracker,
! 'imap_filter_folders' : string_cracker,
'imap_unsure_folder' : string_cracker,
'imap_spam_folder' : string_cracker,
From timstone4 at users.sourceforge.net Tue Apr 8 09:24:46 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Tue Apr 8 11:24:49 2003
Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.2,1.3
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv24156
Modified Files:
message.py
Log Message:
Added a couple more methods to support copying one message to another
Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** message.py 8 Apr 2003 07:35:42 -0000 1.2
--- message.py 8 Apr 2003 15:24:43 -0000 1.3
***************
*** 187,190 ****
--- 187,195 ----
def getId(self):
return self.id
+
+ def copy(self, old_msg):
+ self.setPayload(old_msg.payload()) # this is expensive...
+ self.setClassification(old_msg.getClassification())
+ self.setTraining(old_msg.getTraining())
def addSBHeaders(self, prob, clues):
***************
*** 275,278 ****
--- 280,290 ----
def getClassification(self):
return self.c
+
+ def setClassification(self, cls):
+ if cls == 's' or cls == 'h' or cls == 'u' or cls is None:
+ self.c = cls
+ self.modified()
+ else:
+ raise ValueError
def isTrndSpam(self):
***************
*** 312,316 ****
def getTraining(self):
return self.t
!
def __repr__(self):
return "core.Message%r" % repr(self.__getstate__())
--- 324,335 ----
def getTraining(self):
return self.t
!
! def setTraining(self, trn):
! if trn == 's' or trn == 'h' or trn is None:
! self.t = trn
! self.modified()
! else:
! raise ValueError
!
def __repr__(self):
return "core.Message%r" % repr(self.__getstate__())
From timstone4 at users.sourceforge.net Tue Apr 8 09:28:06 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Tue Apr 8 11:28:10 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.3,1.4
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv25441
Modified Files:
imapfilter.py
Log Message:
Added logic to ensure that classification and training memory is preserved
when IMAP messages are altered (i.e. deleted and added with a new id),
and when messages are retrained. Again... unable to test, so your turn,
Tony. I'm gonna have to get an IMAP thingy if I'm going to do much work
on this.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** imapfilter.py 8 Apr 2003 07:37:28 -0000 1.3
--- imapfilter.py 8 Apr 2003 15:28:03 -0000 1.4
***************
*** 75,78 ****
--- 75,95 ----
self.changeId(response[1][0])
+ def Delete(self):
+ self._selectFolder(self.folder_name, False)
+ response = imap.uid("STORE", self.getId(), "+FLAGS.SILENT",
+ "(\\Deleted)")
+ self._check(response, "uid store")
+
+ # XXX there should actually be a delete from the msgid database here...
+ self.notTrained()
+ self.notClassified()
+
+ def Append(self):
+ response = imap.append(self.folder_name, None,
+ self.getId(),
+ self.get_payload())
+ self._check(response, "append")
+
+
class IMAPFolder(object):
***************
*** 135,138 ****
--- 152,159 ----
global imap
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
+
+ self.spam_folder = IMAPFolder(options.imap_spam_folder)
+ self.unsure_folder = IMAPFolder(options.imap_unsure_folder)
+
if options.verbose:
print "Loading database...",
***************
*** 163,175 ****
folder = IMAPFolder(folder_name)
for msg in folder:
! if msg.isTrained():
! if msg.isTrndAs(isSpam):
! # already trained, nothing for us to do here
! # (we don't want to train the same message twice)
! continue
! if msg.isTrained():
! self.classifier.unlearn(msg.asTokens(), not isSpam)
! self.classifier.learn(msg.asTokens(), isSpam)
! msg.trndAs(isSpam)
def Train(self):
--- 184,200 ----
folder = IMAPFolder(folder_name)
for msg in folder:
! # XXX I've rewritten this logic. It looks a bit strange,
! # because of the msg.notTrained call immediately before the
! # test for isTrained, but this is safer. Once the message has
! # been untrained, it's training memory should reflect that
! # on the off chance that for some reason the training breaks,
! # which happens on occasion (the tokenizer is not yet perfect)
! if msg.isTrndAs(not isSpam):
! self.classifier.unlearn(msg.asTokens(), not isSpam)
! msg.notTrained()
!
! if not msg.isTrained():
! self.classifier.learn(msg.asTokens(), isSpam)
! msg.trndAs(isSpam)
def Train(self):
***************
*** 210,220 ****
imap.logout()
- def _extractTimeFromMessage(self, msg):
- # When we create a new copy of a message, we need to specify
- # a timestamp for the message. Ideally, this would be the
- # timestamp from the message itself, but for the moment, we
- # just use the current time.
- return imaplib.Time2Internaldate(time.time())
-
def _moveMessage(self, old_msg, dest):
# The IMAP copy command makes an alias, not a whole new
--- 235,238 ----
***************
*** 222,241 ****
# in the correct folder, and delete the old one
# XXX (someone tell me if this is wrong)
! response = imap.uid("FETCH", old_msg.getId(), "(RFC822)")
! self._check(response, 'uid fetch')
! msg = message.Message()
! msg.setPayload(response[1][0][1])
#response = imap.uid("SEARCH", "(TEXT)", msg.get_payload())
#self._check(response, "search")
#self.changeId(response[1][0])
! response = imap.append(dest, None,
! self._extractTimeFromMessage(msg),
! msg.get_payload())
! self._check(response, "append")
! self._selectFolder(old_msg.folder_name, False)
! response = imap.uid("STORE", old_msg.getId(), "+FLAGS.SILENT",
! "(\\Deleted)")
! self._check(response, "uid store")
def _filterMessage(self, msg):
--- 240,275 ----
# in the correct folder, and delete the old one
# XXX (someone tell me if this is wrong)
!
! # XXX I've redone this logic to use the IMAPMessage class. It
! # may be a bit of overkill, but it allows us to maintain the
! # proper training and classification memory for the message
! # as it's moved
!
! #response = imap.uid("FETCH", old_msg.getId(), "(RFC822)")
! #self._check(response, 'uid fetch')
! #msg = message.Message()
! #msg.setPayload(response[1][0][1])
!
! msg = IMAPMessage(dest.uid, dest.folder_name, None)
! msg.setId(msg.extractTime()) # this is kinda silly
! msg.copy(old_msg)
!
#response = imap.uid("SEARCH", "(TEXT)", msg.get_payload())
#self._check(response, "search")
#self.changeId(response[1][0])
! #response = imap.append(dest.folder_name, None,
! # msg.getId(),
! # msg.get_payload())
! #self._check(response, "append")
!
! msg.Append()
!
! #self._selectFolder(old_msg.folder_name, False)
! #response = imap.uid("STORE", old_msg.getId(), "+FLAGS.SILENT",
! # "(\\Deleted)")
! #self._check(response, "uid store")
!
! old_msg.Delete()
def _filterMessage(self, msg):
***************
*** 245,251 ****
pass
elif msg.isClsfdSpam():
! self._moveMessage(msg, options.imap_spam_folder)
else:
! self._moveMessage(msg, options.imap_unsure_folder)
if __name__ == '__main__':
--- 279,287 ----
pass
elif msg.isClsfdSpam():
! #XXX I actually think move should be a method on IMAPMessage
! #but I'm running out of time.
! self._moveMessage(msg, self.spam_folder)
else:
! self._moveMessage(msg, self.unsure_folder)
if __name__ == '__main__':
From timstone4 at users.sourceforge.net Tue Apr 8 21:25:27 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Tue Apr 8 23:25:30 2003
Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.3,1.4
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv8117
Modified Files:
message.py
Log Message:
A few corrections.
Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** message.py 8 Apr 2003 15:24:43 -0000 1.3
--- message.py 9 Apr 2003 03:25:24 -0000 1.4
***************
*** 82,85 ****
--- 82,86 ----
import sys
+ import types
import email.Message
***************
*** 158,162 ****
return self.id
! def changeID(self, id):
# We cannot re-set an id (see below). However there are
# occasionally times when the id for a message will change,
--- 159,163 ----
return self.id
! def changeId(self, id):
# We cannot re-set an id (see below). However there are
# occasionally times when the id for a message will change,
***************
*** 181,187 ****
--- 182,192 ----
if id is None:
raise ValueError, "MsgId must not be None"
+
+ if not type(id) in types.StringTypes:
+ raise TypeError, "Id must be a string"
self.id = id
msginfoDB._getState(self)
+ self.modified() # id has changed, force storage
def getId(self):
From anadelonbrin at users.sourceforge.net Wed Apr 9 00:14:31 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Wed Apr 9 02:14:39 2003
Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.4,1.5
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv20980/spambayes
Modified Files:
message.py
Log Message:
Changes the message class so that the base class is more
abstract and introduces a sub-class to add header add/remove
functions. Changes the set/get classify/training information
methods to a simpler version.
As per messages on the list - if you don't like this version, Tim, feel free to change it back! :)
Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** message.py 9 Apr 2003 03:25:24 -0000 1.4
--- message.py 9 Apr 2003 06:14:29 -0000 1.5
***************
*** 126,133 ****
del self.db[msg.getId()]
! # this should come from a mark hammond idea of a master db
msginfoDB = MessageInfoDB("spambayes.messageinfo.db")
-
class Message(email.Message.Message):
'''An email.Message.Message extended for Spambayes'''
--- 126,132 ----
del self.db[msg.getId()]
! # this should come from a Mark Hammond idea of a master db
msginfoDB = MessageInfoDB("spambayes.messageinfo.db")
class Message(email.Message.Message):
'''An email.Message.Message extended for Spambayes'''
***************
*** 151,183 ****
prs._parsebody(self, StringIO(payload))
- def setIdFromPayload(self):
- try:
- self.setId(self[options.pop3proxy_mailid_header_name])
- except KeyError:
- return None
-
- return self.id
-
- def changeId(self, id):
- # We cannot re-set an id (see below). However there are
- # occasionally times when the id for a message will change,
- # for example, on an IMAP server (or possibly an exchange
- # server), the server may change the ids that we are using
- # We enforce that this must be an explicit *change* rather
- # than simply re-setting, by having this as a separate
- # function
- if not self.id:
- raise ValueError, "MsgID has not been set, cannot be changed"
- self._setId(id)
-
def setId(self, id):
if self.id:
raise ValueError, "MsgId has already been set, cannot be changed"
- self._setId(id)
! def _setId(self, id):
! # we should probably enforce type(id) is StringType.
! # the database will insist upon it, but at that point, it's harder
! # to diagnose
if id is None:
raise ValueError, "MsgId must not be None"
--- 150,160 ----
prs._parsebody(self, StringIO(payload))
def setId(self, id):
if self.id:
raise ValueError, "MsgId has already been set, cannot be changed"
! # XXX This isn't really needed since type(None) is not
! # XXX in types.StringTypes - do we still want it for the
! # XXX more informative error message?
if id is None:
raise ValueError, "MsgId must not be None"
***************
*** 193,201 ****
return self.id
! def copy(self, old_msg):
! self.setPayload(old_msg.payload()) # this is expensive...
! self.setClassification(old_msg.getClassification())
! self.setTraining(old_msg.getTraining())
def addSBHeaders(self, prob, clues):
'''Add hammie header, and remember message's classification. Also,
--- 170,218 ----
return self.id
! def asTokens(self):
! # use as_string() here because multipart/digest will return
! # a list of message objects if get_payload() is used
! return tokenize(self.as_string())
+ def modified(self):
+ if self.id: # only persist if key is present
+ msginfoDB._setState(self)
+
+ def GetClassification(self):
+ return self.c
+ def GetTrained(self):
+ return self.t
+ def RememberClassification(self, cls):
+ self.c = cls
+ self.modified()
+ def RememberTrained(self, isSpam):
+ self.t = isSpam
+ self.modified()
+
+ def __repr__(self):
+ return "core.Message%r" % repr(self.__getstate__())
+
+ def __getstate__(self):
+ return (self.id, self.c, self.t)
+
+ def __setstate__(self, t):
+ (self.id, self.c, self.t) = t
+
+ # XXX I can't think of a good name. Someone change
+ # XXX HeaderMessage to something better before it gets used
+ # XXX all over the place.
+ class HeaderMessage(Message):
+ '''Adds routines to add/remove headers for Spambayes'''
+ def __init__(self):
+ Message.__init__(self)
+
+ def setIdFromPayload(self):
+ try:
+ self.setId(self[options.pop3proxy_mailid_header_name])
+ except KeyError:
+ return None
+
+ return self.id
+
def addSBHeaders(self, prob, clues):
'''Add hammie header, and remember message's classification. Also,
***************
*** 204,215 ****
if prob < options.ham_cutoff:
disposition = options.header_ham_string
- self.clsfyAsHam()
elif prob > options.spam_cutoff:
disposition = options.header_spam_string
- self.clsfyAsSpam()
else:
disposition = options.header_unsure_string
! self.clsfyAsUnsure()
!
self[options.hammie_header_name] = disposition
--- 221,229 ----
if prob < options.ham_cutoff:
disposition = options.header_ham_string
elif prob > options.spam_cutoff:
disposition = options.header_spam_string
else:
disposition = options.header_unsure_string
! self.RememberClassification(disposition)
self[options.hammie_header_name] = disposition
***************
*** 228,232 ****
score <= options.clue_mailheader_cutoff or
score >= 1.0 - options.clue_mailheader_cutoff)])
-
self[options.pop3proxy_evidence_header_name] = evd
--- 242,245 ----
***************
*** 241,245 ****
# + messageName + "\r\n.\r\n"
-
def delSBHeaders(self):
del self[options.hammie_header_name]
--- 254,257 ----
***************
*** 249,346 ****
del self[options.pop3proxy_thermostat_header_name]
del self[options.pop3proxy_evidence_header_name]
-
- def asTokens(self):
- # use as_string() here because multipart/digest will return
- # a list of message objects if get_payload() is used
- return tokenize(self.as_string())
-
- def modified(self):
- if self.id: # only persist if key is present
- msginfoDB._setState(self)
-
- def isClsfdSpam(self):
- return self.c == 's'
-
- def isClsfdHam(self):
- return self.c == 'h'
-
- def isClsfdUnsure(self):
- return self.c == 'u'
-
- def isClassified(self):
- return not self.c is None
-
- def clsfyAsSpam(self):
- self.c = 's'
- self.modified()
-
- def clsfyAsHam(self):
- self.c = 'h'
- self.modified()
-
- def clsfyAsUnsure(self):
- self.c = 'u'
- self.modified()
-
- def getClassification(self):
- return self.c
-
- def setClassification(self, cls):
- if cls == 's' or cls == 'h' or cls == 'u' or cls is None:
- self.c = cls
- self.modified()
- else:
- raise ValueError
-
- def isTrndSpam(self):
- return self.t == 's'
-
- def isTrndHam(self):
- return self.t == 'h'
-
- def trndAsSpam(self):
- self.t = 's'
- self.modified()
-
- def trndAsHam(self):
- self.t = 'h'
- self.modified()
-
- def isTrndAs(self, isSpam):
- if self.t == 'h' and not isSpam:
- return True
- if self.t == 's' and isSpam:
- return True
- return False
-
- def trndAs(self, isSpam):
- if isSpam:
- self.t = 's'
- else:
- self.t = 'h'
-
- def notTrained(self):
- self.t = None
- self.modified()
-
- def isTrained(self):
- return not self.t is None
-
- def getTraining(self):
- return self.t
-
- def setTraining(self, trn):
- if trn == 's' or trn == 'h' or trn is None:
- self.t = trn
- self.modified()
- else:
- raise ValueError
-
- def __repr__(self):
- return "core.Message%r" % repr(self.__getstate__())
-
- def __getstate__(self):
- return (self.id, self.c, self.t)
-
- def __setstate__(self, t):
- (self.id, self.c, self.t) = t
\ No newline at end of file
--- 261,262 ----
From anadelonbrin at users.sourceforge.net Wed Apr 9 00:16:18 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Wed Apr 9 02:16:22 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.4,1.5
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv21599
Modified Files:
imapfilter.py
Log Message:
Updates the IMAPFilter to reflect the changes in the message class.
Lots of changes all over the place, integrating (and changing!)
Tim's code.
WARNING: It still seems to train fine (although maybe the saving
is also broken), but the filtering is still buggy. Over to people with
more time today.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** imapfilter.py 8 Apr 2003 15:28:03 -0000 1.4
--- imapfilter.py 9 Apr 2003 06:16:15 -0000 1.5
***************
*** 29,32 ****
--- 29,39 ----
# a list is retrieved via imap.list()
+ # IMAPFolder objects get created all over the place, and don't persist
+ # at all. It would probably be good to change this, especially if
+ # the filter doesn't run just once
+
+ # All the imap responses should be checked - [0] should be "OK"
+ # otherwise an error will occur and who knows what will happen
+
try:
True, False
***************
*** 48,53 ****
imap = None
! class IMAPMessage(message.Message):
! # response checking is necessary throughout this class
def __init__(self, folder_id, folder_name, message_id):
message.Message.__init__(self)
--- 55,59 ----
imap = None
! class IMAPMessage(message.HeaderMessage):
def __init__(self, folder_id, folder_name, message_id):
message.Message.__init__(self)
***************
*** 55,58 ****
--- 61,65 ----
self.folder_id = folder_id
self.folder_name = folder_name
+ self.previous_folder = None
def extractTime(self):
***************
*** 63,98 ****
return imaplib.Time2Internaldate(time.time())
! def Update(self):
# we can't actually update the message with IMAP
# so what we do is create a new message and delete the old one
response = imap.append(self.folder_name, None,
self.extractTime(), self.get_payload())
- response = imap.select(self.folder_name, False)
- response = imap.uid("STORE", self.getId(), "+FLAGS.SILENT",
- "(\\Deleted)")
# we need to update the uid, as it will have changed
response = imap.uid("SEARCH", "(TEXT)", self.get_payload())
! self.changeId(response[1][0])
!
! def Delete(self):
! self._selectFolder(self.folder_name, False)
! response = imap.uid("STORE", self.getId(), "+FLAGS.SILENT",
! "(\\Deleted)")
! self._check(response, "uid store")
!
! # XXX there should actually be a delete from the msgid database here...
! self.notTrained()
! self.notClassified()
!
! def Append(self):
! response = imap.append(self.folder_name, None,
! self.getId(),
! self.get_payload())
! self._check(response, "append")
!
!
class IMAPFolder(object):
- # response checking is necessary throughout this class
def __init__(self, folder_name, readOnly=True):
self.name = folder_name
--- 70,100 ----
return imaplib.Time2Internaldate(time.time())
! def MoveTo(self, dest):
! # The move just changes where we think we are,
! # and we do an actual move on save (to avoid doing
! # this more than once)
! if self.previous_folder is not None:
! self.previous_folder = self.folder_name
! self.folder_name = dest
!
! def Save(self):
# we can't actually update the message with IMAP
# so what we do is create a new message and delete the old one
response = imap.append(self.folder_name, None,
self.extractTime(), self.get_payload())
# we need to update the uid, as it will have changed
+ # XXX there will be problems here if the message *has not*
+ # XXX changed, as the message to be deleted will be found first
+ # XXX (if they are in the same folder)
response = imap.uid("SEARCH", "(TEXT)", self.get_payload())
! old_id = self.id
! self.id = response[1][0]
! if self.previous_folder is not None:
! response = imap.select(self.previous_folder, False)
! self.previous_folder = None
! # this line is raising an error, but WHY?
! #response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)")
class IMAPFolder(object):
def __init__(self, folder_name, readOnly=True):
self.name = folder_name
***************
*** 147,150 ****
--- 149,184 ----
return msg
+ def Train(self, classifier, isSpam):
+ '''Train folder as spam/ham'''
+ for msg in self:
+ if msg.GetTrained() == isSpam:
+ classifier.unlearn(msg.asTokens(), not isSpam)
+ # Once the message has been untrained, it's training memory
+ # should reflect that on the off chance that for some reason
+ # the training breaks, which happens on occasion (the
+ # tokenizer is not yet perfect)
+ msg.RememberTrained(None)
+
+ if msg.GetTrained() is not None:
+ classifier.learn(msg.asTokens(), isSpam)
+ msg.RememberTrained(isSpam)
+
+ def FilterMessage(self, msg):
+ if msg.GetClassification() == options.header_ham_string:
+ # we leave ham alone
+ pass
+ elif msg.GetClassification() == options.header_spam_string:
+ msg.MoveTo(options.imap_spam_folder)
+ else:
+ msg.MoveTo(options.imap_unsure_folder)
+
+ def Filter(self, classifier):
+ for msg in self:
+ (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True)
+ # add headers and remember classification
+ msg.addSBHeaders(prob, clues)
+ self.FilterMessage(msg)
+ msg.Save()
+
class IMAPFilter(object):
***************
*** 153,159 ****
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
- self.spam_folder = IMAPFolder(options.imap_spam_folder)
- self.unsure_folder = IMAPFolder(options.imap_unsure_folder)
-
if options.verbose:
print "Loading database...",
--- 187,190 ----
***************
*** 167,200 ****
print "Done."
- def _check(self, response, command):
- if response[0] != "OK":
- print "Invalid response to %s:\n%s" % (command, response)
- sys.exit(-1)
-
- def _selectFolder(self, name, read_only):
- folder = imap.select(name, read_only)
- self._check(folder, 'select')
- return folder
-
def Login(self):
lgn = imap.login(options.imap_username, options.imap_password)
- self._check(lgn, 'login')
-
- def TrainFolder(self, folder_name, isSpam):
- folder = IMAPFolder(folder_name)
- for msg in folder:
- # XXX I've rewritten this logic. It looks a bit strange,
- # because of the msg.notTrained call immediately before the
- # test for isTrained, but this is safer. Once the message has
- # been untrained, it's training memory should reflect that
- # on the off chance that for some reason the training breaks,
- # which happens on occasion (the tokenizer is not yet perfect)
- if msg.isTrndAs(not isSpam):
- self.classifier.unlearn(msg.asTokens(), not isSpam)
- msg.notTrained()
-
- if not msg.isTrained():
- self.classifier.learn(msg.asTokens(), isSpam)
- msg.trndAs(isSpam)
def Train(self):
--- 198,204 ----
print "Done."
def Login(self):
+ '''Log in to the IMAP server'''
lgn = imap.login(options.imap_username, options.imap_password)
def Train(self):
***************
*** 204,212 ****
ham_training_folders = options.imap_ham_train_folders.split()
for fol in ham_training_folders:
! self.TrainFolder(fol, False)
if options.imap_spam_train_folders != "":
spam_training_folders = options.imap_spam_train_folders.split(' ' )
for fol in spam_training_folders:
! self.TrainFolder(fol, True)
self.classifier.store()
if options.verbose:
--- 208,218 ----
ham_training_folders = options.imap_ham_train_folders.split()
for fol in ham_training_folders:
! folder = IMAPFolder(fol)
! folder.Train(self.classifier, False)
if options.imap_spam_train_folders != "":
spam_training_folders = options.imap_spam_train_folders.split(' ' )
for fol in spam_training_folders:
! folder = IMAPFolder(fol)
! folder.Train(self.classifier, True)
self.classifier.store()
if options.verbose:
***************
*** 218,288 ****
for filter_folder in options.imap_filter_folders.split():
folder = IMAPFolder(filter_folder, False)
! for msg in folder:
! (prob, clues) = self.classifier.spamprob(msg.asTokens(),
! evidence=True)
! # add headers and remember classification
! msg.addSBHeaders(prob, clues)
! # XXX updating is disabled for the moment
! # msg.Update()
! self._filterMessage(msg)
if options.verbose:
print "Filtering took", time.time() - t, "seconds."
def Logout(self):
! # sign off
if options.imap_expunge:
imap.expunge()
imap.logout()
- def _moveMessage(self, old_msg, dest):
- # The IMAP copy command makes an alias, not a whole new
- # copy, so what we need to do (sigh) is create a new message
- # in the correct folder, and delete the old one
- # XXX (someone tell me if this is wrong)
-
- # XXX I've redone this logic to use the IMAPMessage class. It
- # may be a bit of overkill, but it allows us to maintain the
- # proper training and classification memory for the message
- # as it's moved
-
- #response = imap.uid("FETCH", old_msg.getId(), "(RFC822)")
- #self._check(response, 'uid fetch')
- #msg = message.Message()
- #msg.setPayload(response[1][0][1])
-
- msg = IMAPMessage(dest.uid, dest.folder_name, None)
- msg.setId(msg.extractTime()) # this is kinda silly
- msg.copy(old_msg)
-
- #response = imap.uid("SEARCH", "(TEXT)", msg.get_payload())
- #self._check(response, "search")
- #self.changeId(response[1][0])
-
- #response = imap.append(dest.folder_name, None,
- # msg.getId(),
- # msg.get_payload())
- #self._check(response, "append")
-
- msg.Append()
-
- #self._selectFolder(old_msg.folder_name, False)
- #response = imap.uid("STORE", old_msg.getId(), "+FLAGS.SILENT",
- # "(\\Deleted)")
- #self._check(response, "uid store")
-
- old_msg.Delete()
-
- def _filterMessage(self, msg):
- if msg.isClsfdHam():
- # we leave ham alone
- print "untouched"
- pass
- elif msg.isClsfdSpam():
- #XXX I actually think move should be a method on IMAPMessage
- #but I'm running out of time.
- self._moveMessage(msg, self.spam_folder)
- else:
- self._moveMessage(msg, self.unsure_folder)
-
if __name__ == '__main__':
options.verbose = True
--- 224,237 ----
for filter_folder in options.imap_filter_folders.split():
folder = IMAPFolder(filter_folder, False)
! folder.Filter(self.classifier)
if options.verbose:
print "Filtering took", time.time() - t, "seconds."
def Logout(self):
! '''Log out of the IMAP server'''
if options.imap_expunge:
imap.expunge()
imap.logout()
if __name__ == '__main__':
options.verbose = True
***************
*** 290,294 ****
# imap_filter.imap.debug = 10
imap_filter.Login()
! imap_filter.Train()
imap_filter.Filter()
imap_filter.Logout()
--- 239,243 ----
# imap_filter.imap.debug = 10
imap_filter.Login()
! #imap_filter.Train()
imap_filter.Filter()
imap_filter.Logout()
From montanaro at users.sourceforge.net Thu Apr 10 07:28:31 2003
From: montanaro at users.sourceforge.net (Skip Montanaro)
Date: Thu Apr 10 09:28:35 2003
Subject: [Spambayes-checkins] spambayes mailsort.py,1.6,1.7
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv8670
Modified Files:
mailsort.py
Log Message:
correct misspelling of "Classifier"
Index: mailsort.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/mailsort.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** mailsort.py 16 Feb 2003 17:05:07 -0000 1.6
--- mailsort.py 10 Apr 2003 13:28:27 -0000 1.7
***************
*** 31,39 ****
def import_spambayes():
! global mboxutils, CdbClassifer, tokenize
if not os.environ.has_key('BAYESCUSTOMIZE'):
os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(CONFIG_FILE)
from spambayes import mboxutils
! from spambayes.cdb_classifier import CdbClassifer
from spambayes.tokenizer import tokenize
--- 31,39 ----
def import_spambayes():
! global mboxutils, CdbClassifier, tokenize
if not os.environ.has_key('BAYESCUSTOMIZE'):
os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(CONFIG_FILE)
from spambayes import mboxutils
! from spambayes.cdb_classifier import CdbClassifier
from spambayes.tokenizer import tokenize
***************
*** 88,92 ****
print "Creating", RC_DIR, "directory..."
os.mkdir(rc_dir)
! bayes = CdbClassifer()
print 'Training with ham...'
train(bayes, ham_name, False)
--- 88,92 ----
print "Creating", RC_DIR, "directory..."
os.mkdir(rc_dir)
! bayes = CdbClassifier()
print 'Training with ham...'
train(bayes, ham_name, False)
***************
*** 124,128 ****
msg = email.message_from_string(msgdata)
del msgdata
! bayes = CdbClassifer(open(DB_FILE, 'rb'))
prob = bayes.spamprob(tokenize(msg))
else:
--- 124,128 ----
msg = email.message_from_string(msgdata)
del msgdata
! bayes = CdbClassifier(open(DB_FILE, 'rb'))
prob = bayes.spamprob(tokenize(msg))
else:
***************
*** 139,143 ****
def print_message_score(msg_name, msg_fp):
msg = email.message_from_file(msg_fp)
! bayes = CdbClassifer(open(DB_FILE, 'rb'))
prob, evidence = bayes.spamprob(tokenize(msg), evidence=True)
print msg_name, prob
--- 139,143 ----
def print_message_score(msg_name, msg_fp):
msg = email.message_from_file(msg_fp)
! bayes = CdbClassifier(open(DB_FILE, 'rb'))
prob, evidence = bayes.spamprob(tokenize(msg), evidence=True)
print msg_name, prob
From montanaro at users.sourceforge.net Thu Apr 10 07:28:31 2003
From: montanaro at users.sourceforge.net (Skip Montanaro)
Date: Thu Apr 10 09:28:36 2003
Subject: [Spambayes-checkins] spambayes/spambayes cdb_classifier.py,1.1,1.2
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv8670/spambayes
Modified Files:
cdb_classifier.py
Log Message:
correct misspelling of "Classifier"
Index: cdb_classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/cdb_classifier.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** cdb_classifier.py 20 Jan 2003 03:14:32 -0000 1.1
--- cdb_classifier.py 10 Apr 2003 13:28:28 -0000 1.2
***************
*** 11,15 ****
from spambayes.classifier import Classifier
! class CdbClassifer(Classifier):
def __init__(self, cdbfile=None):
Classifier.__init__(self)
--- 11,15 ----
from spambayes.classifier import Classifier
! class CdbClassifier(Classifier):
def __init__(self, cdbfile=None):
Classifier.__init__(self)
From timstone4 at users.sourceforge.net Thu Apr 10 20:08:01 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Thu Apr 10 22:08:05 2003
Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.5,1.6
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv18493
Modified Files:
message.py
Log Message:
A start at addressing Mark's concerns/suggestions for this class. Eliminated
a bunch of YAGNI, moved some non-base methods into a subclass.
Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** message.py 9 Apr 2003 06:14:29 -0000 1.5
--- message.py 11 Apr 2003 02:07:57 -0000 1.6
***************
*** 5,8 ****
--- 5,9 ----
Classes:
Message - an email.Message.Message, extended with spambayes methods
+ SBHeaderMessage - A Message with spambayes header manipulations
MessageInfoDB - persistent state storage for Message
***************
*** 19,32 ****
assessment purposes.
! Message is an extension of the email package Message class, to include
! persistent message information and Spambayes specific header manipulations.
! The persistent state -currently- consists of the message id, its current
! classification, and its current training. The payload is not persisted.
! Payload persistence is left to whatever mail client software is being used.
Usage:
A typical classification usage pattern would be something like:
! >>>msg = spambayes.message.Message()
>>>msg.setPayload(substance) # substance comes from somewhere else
>>>id = msg.setIdFromPayload()
--- 20,37 ----
assessment purposes.
! Message is an extension of the email package Message class, to
! include persistent message information. The persistent state
! -currently- consists of the message id, its current
! classification, and its current training. The payload is not
! persisted. Payload persistence is left to whatever mail client
! software is being used.
!
! SBHeaderMessage extends Message to include spambayes header specific
! manipulations.
Usage:
A typical classification usage pattern would be something like:
! >>>msg = spambayes.message.SBHeaderMessage()
>>>msg.setPayload(substance) # substance comes from somewhere else
>>>id = msg.setIdFromPayload()
***************
*** 45,49 ****
A typical usage pattern to train as spam would be something like:
! >>>msg = spambayes.message.Message()
>>>msg.setPayload(substance) # substance comes from somewhere else
>>>id = msg.setId(msgid) # id is a fname, outlook msg id, something...
--- 50,54 ----
A typical usage pattern to train as spam would be something like:
! >>>msg = spambayes.message.SBHeaderMessage()
>>>msg.setPayload(substance) # substance comes from somewhere else
>>>id = msg.setId(msgid) # id is a fname, outlook msg id, something...
***************
*** 51,59 ****
>>>msg.delSBHeaders() # never include sb headers in a train
! >>>if msg.isTrndHam():
>>> bayes.unlearn(msg.asTokens(), False) # untrain the ham
>>>bayes.learn(msg.asTokens(), True) # train as spam
! >>>msg.trndAsSpam()
--- 56,64 ----
>>>msg.delSBHeaders() # never include sb headers in a train
! >>>if msg.getTraining() == False: # could be None, can't do boolean test
>>> bayes.unlearn(msg.asTokens(), False) # untrain the ham
>>>bayes.learn(msg.asTokens(), True) # train as spam
! >>>msg.rememberTraining(True)
***************
*** 95,106 ****
import shelve
- # XXX Tim, what do you want to do here? This
- # XXX recurses infinately at the moment
- # Make shelve use binary pickles by default.
- #oldShelvePickler = shelve.Pickler
- #def binaryDefaultPickler(f, binary=1):
- # return oldShelvePickler(f, binary)
- #shelve.Pickler = binaryDefaultPickler
-
class MessageInfoDB:
--- 100,103 ----
***************
*** 180,190 ****
def GetClassification(self):
! return self.c
! def GetTrained(self):
! return self.t
def RememberClassification(self, cls):
! self.c = cls
self.modified()
def RememberTrained(self, isSpam):
self.t = isSpam
self.modified()
--- 177,210 ----
def GetClassification(self):
! if self.c == 's':
! return options.header_spam_string
! if self.c == 'h':
! return options.header_ham_string
! if self.c == 'u':
! return options.header_unsure_string
!
! return None
!
def RememberClassification(self, cls):
! # this must store state independent of options settings, as they
! # may change, which would really screw this database up
!
! # an unrecoginzed string here is interpreted as unsure. Should
! # that condition actually raise an exception instead?
!
! if cls == options.header_spam_string:
! self.c = 's'
! elif cls == options.header_ham_string:
! self.c = 'h'
! else
! self.c = 'u'
!
self.modified()
+
+ def GetTrained(self):
+ return self.t
+
def RememberTrained(self, isSpam):
+ # isSpam == None means no training has been done
self.t = isSpam
self.modified()
***************
*** 199,207 ****
(self.id, self.c, self.t) = t
! # XXX I can't think of a good name. Someone change
! # XXX HeaderMessage to something better before it gets used
! # XXX all over the place.
! class HeaderMessage(Message):
! '''Adds routines to add/remove headers for Spambayes'''
def __init__(self):
Message.__init__(self)
--- 219,227 ----
(self.id, self.c, self.t) = t
!
! class SBHeaderMessage(Message):
! '''Message class that is cognizant of Spambayes headers.
! Adds routines to add/remove headers for Spambayes'''
!
def __init__(self):
Message.__init__(self)
From timstone4 at users.sourceforge.net Thu Apr 10 20:09:12 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Thu Apr 10 22:09:18 2003
Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.6,1.7
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv19044
Modified Files:
message.py
Log Message:
Missed a syntax error.
Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** message.py 11 Apr 2003 02:07:57 -0000 1.6
--- message.py 11 Apr 2003 02:09:09 -0000 1.7
***************
*** 197,201 ****
elif cls == options.header_ham_string:
self.c = 'h'
! else
self.c = 'u'
--- 197,201 ----
elif cls == options.header_ham_string:
self.c = 'h'
! else:
self.c = 'u'
From timstone4 at users.sourceforge.net Thu Apr 10 20:11:24 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Thu Apr 10 22:11:29 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.5,1.6
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv20052
Modified Files:
imapfilter.py
Log Message:
Made some changes to accommodate the new message class. Not tested
yet, but checked in on the chance that Tony wants to see it sooner than
I can get it tested.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** imapfilter.py 9 Apr 2003 06:16:15 -0000 1.5
--- imapfilter.py 11 Apr 2003 02:11:21 -0000 1.6
***************
*** 29,39 ****
# a list is retrieved via imap.list()
- # IMAPFolder objects get created all over the place, and don't persist
- # at all. It would probably be good to change this, especially if
- # the filter doesn't run just once
-
- # All the imap responses should be checked - [0] should be "OK"
- # otherwise an error will occur and who knows what will happen
-
try:
True, False
--- 29,32 ----
***************
*** 55,65 ****
imap = None
! class IMAPMessage(message.HeaderMessage):
! def __init__(self, folder_id, folder_name, message_id):
message.Message.__init__(self)
self.setId(message_id)
! self.folder_id = folder_id
! self.folder_name = folder_name
! self.previous_folder = None
def extractTime(self):
--- 48,62 ----
imap = None
! class IMAPMessage(message.SBHeaderMessage):
! # response checking is necessary throughout this class
! def __init__(self, folder, message_id):
message.Message.__init__(self)
self.setId(message_id)
! self.folder = folder
!
! def _check(self, response, command):
! if response[0] != "OK":
! print "Invalid response to %s:\n%s" % (command, response)
! sys.exit(-1)
def extractTime(self):
***************
*** 75,85 ****
# this more than once)
if self.previous_folder is not None:
! self.previous_folder = self.folder_name
! self.folder_name = dest
def Save(self):
# we can't actually update the message with IMAP
# so what we do is create a new message and delete the old one
! response = imap.append(self.folder_name, None,
self.extractTime(), self.get_payload())
# we need to update the uid, as it will have changed
--- 72,82 ----
# this more than once)
if self.previous_folder is not None:
! self.previous_folder = self.folder
! self.folder = dest
def Save(self):
# we can't actually update the message with IMAP
# so what we do is create a new message and delete the old one
! response = imap.append(self.folder.name, None,
self.extractTime(), self.get_payload())
# we need to update the uid, as it will have changed
***************
*** 91,100 ****
self.id = response[1][0]
if self.previous_folder is not None:
! response = imap.select(self.previous_folder, False)
self.previous_folder = None
# this line is raising an error, but WHY?
#response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)")
class IMAPFolder(object):
def __init__(self, folder_name, readOnly=True):
self.name = folder_name
--- 88,99 ----
self.id = response[1][0]
if self.previous_folder is not None:
! response = imap.select(self.previous_folder.name, False)
self.previous_folder = None
# this line is raising an error, but WHY?
#response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)")
+
class IMAPFolder(object):
+ # response checking is necessary throughout this class
def __init__(self, folder_name, readOnly=True):
self.name = folder_name
***************
*** 114,118 ****
if response[0] != "OK":
self.rfc822_command = "(RFC822)"
!
def __iter__(self):
'''IMAPFolder is iterable'''
--- 113,126 ----
if response[0] != "OK":
self.rfc822_command = "(RFC822)"
!
! def Select(self):
! imap.select(self.name, False)
! self._check(folder, 'select')
!
! def _check(self, response, command):
! if response[0] != "OK":
! print "Invalid response to %s:\n%s" % (command, response)
! sys.exit(-1)
!
def __iter__(self):
'''IMAPFolder is iterable'''
***************
*** 145,152 ****
# we return an instance of *our* message class, not the
# raw rfc822 message
! msg = IMAPMessage(self.uid, self.name, key)
msg.setPayload(messageText)
return msg
!
def Train(self, classifier, isSpam):
'''Train folder as spam/ham'''
--- 153,160 ----
# we return an instance of *our* message class, not the
# raw rfc822 message
! msg = IMAPMessage(self, key)
msg.setPayload(messageText)
return msg
!
def Train(self, classifier, isSpam):
'''Train folder as spam/ham'''
***************
*** 164,184 ****
msg.RememberTrained(isSpam)
! def FilterMessage(self, msg):
if msg.GetClassification() == options.header_ham_string:
# we leave ham alone
pass
elif msg.GetClassification() == options.header_spam_string:
! msg.MoveTo(options.imap_spam_folder)
else:
! msg.MoveTo(options.imap_unsure_folder)
!
! def Filter(self, classifier):
! for msg in self:
! (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True)
! # add headers and remember classification
! msg.addSBHeaders(prob, clues)
! self.FilterMessage(msg)
! msg.Save()
class IMAPFilter(object):
--- 172,190 ----
msg.RememberTrained(isSpam)
! def Filter(self, classifier, spamfolder, unsurefolder):
! for msg in self:
! (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True)
! # add headers and remember classification
! msg.addSBHeaders(prob, clues)
!
if msg.GetClassification() == options.header_ham_string:
# we leave ham alone
pass
elif msg.GetClassification() == options.header_spam_string:
! msg.MoveTo(spamfolder)
else:
! msg.MoveTo(unsurefolder)
+ msg.Save()
class IMAPFilter(object):
***************
*** 187,203 ****
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
! if options.verbose:
! print "Loading database...",
filename = options.pop3proxy_persistent_storage_file
filename = os.path.expanduser(filename)
if options.pop3proxy_persistent_use_database:
self.classifier = storage.DBDictClassifier(filename)
else:
self.classifier = storage.PickledClassifier(filename)
if options.verbose:
print "Done."
def Login(self):
- '''Log in to the IMAP server'''
lgn = imap.login(options.imap_username, options.imap_password)
--- 193,214 ----
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
! self.spam_folder = IMAPFolder(options.imap_spam_folder)
! self.unsure_folder = IMAPFolder(options.imap_unsure_folder)
!
filename = options.pop3proxy_persistent_storage_file
filename = os.path.expanduser(filename)
+
+ if options.verbose:
+ print "Loading database %s..." % (filename),
+
if options.pop3proxy_persistent_use_database:
self.classifier = storage.DBDictClassifier(filename)
else:
self.classifier = storage.PickledClassifier(filename)
+
if options.verbose:
print "Done."
def Login(self):
lgn = imap.login(options.imap_username, options.imap_password)
***************
*** 205,208 ****
--- 216,220 ----
if options.verbose:
t = time.time()
+
if options.imap_ham_train_folders != "":
ham_training_folders = options.imap_ham_train_folders.split()
***************
*** 210,213 ****
--- 222,226 ----
folder = IMAPFolder(fol)
folder.Train(self.classifier, False)
+
if options.imap_spam_train_folders != "":
spam_training_folders = options.imap_spam_train_folders.split(' ' )
***************
*** 215,219 ****
--- 228,234 ----
folder = IMAPFolder(fol)
folder.Train(self.classifier, True)
+
self.classifier.store()
+
if options.verbose:
print "Training took", time.time() - t, "seconds."
***************
*** 222,237 ****
if options.verbose:
t = time.time()
for filter_folder in options.imap_filter_folders.split():
folder = IMAPFolder(filter_folder, False)
! folder.Filter(self.classifier)
if options.verbose:
print "Filtering took", time.time() - t, "seconds."
def Logout(self):
! '''Log out of the IMAP server'''
if options.imap_expunge:
imap.expunge()
imap.logout()
if __name__ == '__main__':
options.verbose = True
--- 237,255 ----
if options.verbose:
t = time.time()
+
for filter_folder in options.imap_filter_folders.split():
folder = IMAPFolder(filter_folder, False)
! folder.Filter(self.classifier, self.spam_folder, self.unsure_folder)
!
if options.verbose:
print "Filtering took", time.time() - t, "seconds."
def Logout(self):
! # sign off
if options.imap_expunge:
imap.expunge()
imap.logout()
+
if __name__ == '__main__':
options.verbose = True
***************
*** 239,243 ****
# imap_filter.imap.debug = 10
imap_filter.Login()
! #imap_filter.Train()
imap_filter.Filter()
imap_filter.Logout()
--- 257,261 ----
# imap_filter.imap.debug = 10
imap_filter.Login()
! imap_filter.Train()
imap_filter.Filter()
imap_filter.Logout()
From timstone4 at users.sourceforge.net Sat Apr 12 20:02:57 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Sat Apr 12 22:02:59 2003
Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.7,1.8
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv25842
Modified Files:
message.py
Log Message:
A few corrections
Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** message.py 11 Apr 2003 02:09:09 -0000 1.7
--- message.py 13 Apr 2003 02:02:54 -0000 1.8
***************
*** 141,145 ****
def setPayload(self, payload):
! prs = email.Parser.HeaderParser()
prs._parseheaders(self, StringIO(payload))
# we may want to do some header parsing error handling here
--- 141,147 ----
def setPayload(self, payload):
! prs = email.Parser.Parser()
! # this is kindof a hack, due to the fact that the parser creates a
! # new message object, and we already have the message object
prs._parseheaders(self, StringIO(payload))
# we may want to do some header parsing error handling here
***************
*** 183,187 ****
if self.c == 'u':
return options.header_unsure_string
!
return None
--- 185,189 ----
if self.c == 'u':
return options.header_unsure_string
!
return None
***************
*** 266,269 ****
--- 268,274 ----
if options.pop3proxy_add_mailid_to.find("header") != -1:
self[options.pop3proxy_mailid_header_name] = self.id
+
+ # print self._headers
+ # print self.as_string()
# This won't work for now, because email.Message does not isolate message body
From timstone4 at users.sourceforge.net Sat Apr 12 20:04:48 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Sat Apr 12 22:04:52 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.6,1.7
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv26297
Modified Files:
imapfilter.py
Log Message:
Lots and lots of development work. This is the first basically functional
version of the imap filter, and it's actually been 'tested'. Tested is quoted
because IMAP seems to be a really flaky kind of interface, and until it's been
used on lots of imap servers, by lots of people, I won't be convinced that it's
really correct.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** imapfilter.py 11 Apr 2003 02:11:21 -0000 1.6
--- imapfilter.py 13 Apr 2003 02:04:45 -0000 1.7
***************
*** 13,16 ****
--- 13,43 ----
The original filter design owed much to isbg by Roger Binns
(http://www.rogerbinns.com/isbg).
+
+ Usage:
+ imapfilter [options]
+
+ note: option values with spaces in them must be enclosed
+ in double quotes
+
+ options:
+ -d dbname : pickled training database filename
+ -D dbname : dbm training database filename
+ -t : train contents of spam folder and ham folder
+ -c : classify inbox
+ -h : help
+
+ Examples:
+
+ Classify inbox, with dbm database
+ imapfilter -c -D bayes.db
+
+ Train Spam and Ham, then classify inbox, with dbm database
+ imapfilter -t -c -D bayes.db
+
+ Train Spam and Ham only, with pickled database
+ imapfilter -t -d bayes.db
+
+ To Do:
+ o Suggestions?
"""
***************
*** 41,44 ****
--- 68,73 ----
import time
import sys
+ import getopt
+ import email.Parser
from spambayes.Options import options
***************
*** 50,57 ****
class IMAPMessage(message.SBHeaderMessage):
# response checking is necessary throughout this class
! def __init__(self, folder, message_id):
message.Message.__init__(self)
! self.setId(message_id)
! self.folder = folder
def _check(self, response, command):
--- 79,91 ----
class IMAPMessage(message.SBHeaderMessage):
# response checking is necessary throughout this class
! def __init__(self):
message.Message.__init__(self)
! #XXX When a message object is created, an id and a folder should
! #XXX immediately be set. These cannot be passed in on the
! #XXX constructor, due to the quirky way that email.Parser.Parser
! #XXX does its thing.
! self.id = None
! self.folder = None
! self.previous_folder = None
def _check(self, response, command):
***************
*** 65,69 ****
# timestamp from the message itself, but for the moment, we
# just use the current time.
! return imaplib.Time2Internaldate(time.time())
def MoveTo(self, dest):
--- 99,108 ----
# timestamp from the message itself, but for the moment, we
# just use the current time.
! #XXX the imaplib time function returns a string like
! #XXX "12-Apr-2003 19:56:28 -0500" This seems like a bad message id.
! #XXX For one thing, it only resolves to one second. Even a cheap
! #XXX refractor telescope can resolve better than that ;)
! # return imaplib.Time2Internaldate(time.time())
! return time.time()
def MoveTo(self, dest):
***************
*** 71,95 ****
# and we do an actual move on save (to avoid doing
# this more than once)
! if self.previous_folder is not None:
self.previous_folder = self.folder
! self.folder = dest
def Save(self):
# we can't actually update the message with IMAP
# so what we do is create a new message and delete the old one
response = imap.append(self.folder.name, None,
! self.extractTime(), self.get_payload())
# we need to update the uid, as it will have changed
# XXX there will be problems here if the message *has not*
# XXX changed, as the message to be deleted will be found first
# XXX (if they are in the same folder)
! response = imap.uid("SEARCH", "(TEXT)", self.get_payload())
old_id = self.id
! self.id = response[1][0]
if self.previous_folder is not None:
response = imap.select(self.previous_folder.name, False)
self.previous_folder = None
! # this line is raising an error, but WHY?
! #response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)")
--- 110,143 ----
# and we do an actual move on save (to avoid doing
# this more than once)
! if self.previous_folder is None and not self.folder == dest:
self.previous_folder = self.folder
! self.folder = dest
def Save(self):
# we can't actually update the message with IMAP
# so what we do is create a new message and delete the old one
+ new_id = self.extractTime()
response = imap.append(self.folder.name, None,
! new_id, self.as_string())
! self._check(response, 'append')
# we need to update the uid, as it will have changed
# XXX there will be problems here if the message *has not*
# XXX changed, as the message to be deleted will be found first
# XXX (if they are in the same folder)
! #response = imap.uid("SEARCH", "(TEXT)", self.as_string())
! #self._check(response, 'search')
! #self.id = response[1][0]
!
old_id = self.id
! self.id = new_id
if self.previous_folder is not None:
response = imap.select(self.previous_folder.name, False)
+ self._check(response, 'folder select')
self.previous_folder = None
! response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)")
! self._check(response, 'store')
!
! #XXX We really should delete the old message from the msgid db.
! #XXX There is currently no interface to do this with.
***************
*** 153,162 ****
# we return an instance of *our* message class, not the
# raw rfc822 message
! msg = IMAPMessage(self, key)
! msg.setPayload(messageText)
return msg
def Train(self, classifier, isSpam):
'''Train folder as spam/ham'''
for msg in self:
if msg.GetTrained() == isSpam:
--- 201,218 ----
# we return an instance of *our* message class, not the
# raw rfc822 message
! #XXX I can't get parsing to work correctly if I pull the guts
! #XXX out of Parser.parse() and do that in the setPayload method
! #XXX of the message class. Why? I have **NO** idea.
! #msg = IMAPMessage(self, key)
! #msg.setPayload(messageText)
! msg = email.Parser.Parser(_class=IMAPMessage).parsestr(messageText)
! msg.folder = self
! msg.setId(key)
!
return msg
def Train(self, classifier, isSpam):
'''Train folder as spam/ham'''
+ num_trained = 0
for msg in self:
if msg.GetTrained() == isSpam:
***************
*** 170,216 ****
if msg.GetTrained() is not None:
classifier.learn(msg.asTokens(), isSpam)
msg.RememberTrained(isSpam)
def Filter(self, classifier, spamfolder, unsurefolder):
for msg in self:
! (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True)
! # add headers and remember classification
! msg.addSBHeaders(prob, clues)
!
! if msg.GetClassification() == options.header_ham_string:
! # we leave ham alone
! pass
! elif msg.GetClassification() == options.header_spam_string:
! msg.MoveTo(spamfolder)
! else:
! msg.MoveTo(unsurefolder)
! msg.Save()
class IMAPFilter(object):
! def __init__(self):
global imap
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
self.spam_folder = IMAPFolder(options.imap_spam_folder)
self.unsure_folder = IMAPFolder(options.imap_unsure_folder)
-
- filename = options.pop3proxy_persistent_storage_file
- filename = os.path.expanduser(filename)
! if options.verbose:
! print "Loading database %s..." % (filename),
- if options.pop3proxy_persistent_use_database:
- self.classifier = storage.DBDictClassifier(filename)
- else:
- self.classifier = storage.PickledClassifier(filename)
-
- if options.verbose:
- print "Done."
-
- def Login(self):
- lgn = imap.login(options.imap_username, options.imap_password)
-
def Train(self):
if options.verbose:
--- 226,263 ----
if msg.GetTrained() is not None:
classifier.learn(msg.asTokens(), isSpam)
+ num_trained += 1
msg.RememberTrained(isSpam)
+ return num_trained
+
def Filter(self, classifier, spamfolder, unsurefolder):
for msg in self:
! if msg.GetClassification() is None:
! (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True)
! # add headers and remember classification
! msg.addSBHeaders(prob, clues)
! cls = msg.GetClassification()
! if cls == options.header_ham_string:
! # we leave ham alone
! pass
! elif cls == options.header_spam_string:
! msg.MoveTo(spamfolder)
! else:
! msg.MoveTo(unsurefolder)
+ msg.Save()
+
class IMAPFilter(object):
! def __init__(self, classifier):
global imap
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
+ lgn = imap.login(options.imap_username, options.imap_password)
self.spam_folder = IMAPFolder(options.imap_spam_folder)
self.unsure_folder = IMAPFolder(options.imap_unsure_folder)
! self.classifier = classifier
def Train(self):
if options.verbose:
***************
*** 221,225 ****
for fol in ham_training_folders:
folder = IMAPFolder(fol)
! folder.Train(self.classifier, False)
if options.imap_spam_train_folders != "":
--- 268,272 ----
for fol in ham_training_folders:
folder = IMAPFolder(fol)
! num_ham_trained = folder.Train(self.classifier, False)
if options.imap_spam_train_folders != "":
***************
*** 227,236 ****
for fol in spam_training_folders:
folder = IMAPFolder(fol)
! folder.Train(self.classifier, True)
! self.classifier.store()
if options.verbose:
! print "Training took", time.time() - t, "seconds."
def Filter(self):
--- 274,285 ----
for fol in spam_training_folders:
folder = IMAPFolder(fol)
! num_spam_trained = folder.Train(self.classifier, True)
! if num_ham_trained or num_spam_trained:
! self.classifier.store()
if options.verbose:
! print "Training took %s seconds, %s messages were trained" \
! % (time.time() - t, num_ham_trained + num_spam_trained)
def Filter(self):
***************
*** 253,261 ****
if __name__ == '__main__':
! options.verbose = True
! imap_filter = IMAPFilter()
# imap_filter.imap.debug = 10
! imap_filter.Login()
! imap_filter.Train()
! imap_filter.Filter()
imap_filter.Logout()
--- 302,355 ----
if __name__ == '__main__':
!
! try:
! opts, args = getopt.getopt(sys.argv[1:], 'htcvd:D:')
! except getopt.error, msg:
! print >>sys.stderr, str(msg) + '\n\n' + __doc__
! sys.exit()
!
! bdbname = options.pop3proxy_persistent_storage_file
! useDBM = options.pop3proxy_persistent_use_database
! doTrain = False
! doClassify = False
!
! for opt, arg in opts:
! if opt == '-h':
! print >>sys.stderr, __doc__
! sys.exit()
! elif opt == '-d':
! useDBM = False
! bdbname = arg
! elif opt == '-D':
! useDBM = True
! bdbname = arg
! elif opt == '-t':
! doTrain = True
! elif opt == '-c':
! doClassify = True
! elif opt == '-v':
! options.verbose = True
!
!
! bdbname = os.path.expanduser(bdbname)
!
! if options.verbose:
! print "Loading database %s..." % (bdbname),
!
! if useDBM:
! classifier = storage.DBDictClassifier(bdbname)
! else:
! classifier = storage.PickledClassifier(bdbname)
!
! if options.verbose:
! print "Done."
!
! imap_filter = IMAPFilter(classifier)
# imap_filter.imap.debug = 10
! # imap_filter.Login()
! if doTrain:
! imap_filter.Train()
! if doClassify:
! imap_filter.Filter()
!
imap_filter.Logout()
From timstone4 at users.sourceforge.net Sun Apr 13 06:54:05 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Sun Apr 13 08:54:08 2003
Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.8,1.9
Message-ID:
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv29022
Modified Files:
message.py
Log Message:
Raised an error on RememberClassification if the classification to be
remembered is not recognizable.
Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** message.py 13 Apr 2003 02:02:54 -0000 1.8
--- message.py 13 Apr 2003 12:54:02 -0000 1.9
***************
*** 142,151 ****
def setPayload(self, payload):
prs = email.Parser.Parser()
# this is kindof a hack, due to the fact that the parser creates a
# new message object, and we already have the message object
! prs._parseheaders(self, StringIO(payload))
# we may want to do some header parsing error handling here
# to try to extract important headers regardless of malformations
! prs._parsebody(self, StringIO(payload))
def setId(self, id):
--- 142,152 ----
def setPayload(self, payload):
prs = email.Parser.Parser()
+ fp = StringIO(payload)
# this is kindof a hack, due to the fact that the parser creates a
# new message object, and we already have the message object
! prs._parseheaders(self, fp)
# we may want to do some header parsing error handling here
# to try to extract important headers regardless of malformations
! prs._parsebody(self, fp)
def setId(self, id):
***************
*** 153,159 ****
raise ValueError, "MsgId has already been set, cannot be changed"
- # XXX This isn't really needed since type(None) is not
- # XXX in types.StringTypes - do we still want it for the
- # XXX more informative error message?
if id is None:
raise ValueError, "MsgId must not be None"
--- 154,157 ----
***************
*** 170,175 ****
def asTokens(self):
- # use as_string() here because multipart/digest will return
- # a list of message objects if get_payload() is used
return tokenize(self.as_string())
--- 168,171 ----
***************
*** 199,204 ****
elif cls == options.header_ham_string:
self.c = 'h'
! else:
self.c = 'u'
self.modified()
--- 195,203 ----
elif cls == options.header_ham_string:
self.c = 'h'
! elif cls == options.header_unsure_string:
self.c = 'u'
+ else:
+ raise ValueError, \
+ "Classification must match header strings in options"
self.modified()
***************
*** 213,217 ****
def __repr__(self):
! return "core.Message%r" % repr(self.__getstate__())
def __getstate__(self):
--- 212,216 ----
def __repr__(self):
! return "spambayes.message.Message%r" % repr(self.__getstate__())
def __getstate__(self):
From timstone4 at users.sourceforge.net Sun Apr 13 06:54:57 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Sun Apr 13 08:55:01 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.7,1.8
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv29478
Modified Files:
imapfilter.py
Log Message:
Made the message constructor work the way I wanted it to originally. Added
a couple of options, refactored a few methods, general code cleanup.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** imapfilter.py 13 Apr 2003 02:04:45 -0000 1.7
--- imapfilter.py 13 Apr 2003 12:54:54 -0000 1.8
***************
*** 26,29 ****
--- 26,32 ----
-c : classify inbox
-h : help
+ -v : verbose mode
+ -e : sets expunge to the *opposite* of options.imap_expunge
+ -i debuglvl : a somewhat mysterious imaplib debugging level
Examples:
***************
*** 39,42 ****
--- 42,53 ----
To Do:
+ o Remove old msg from info database when saveing modified messages
+ o Use DELETE rather than storing //DELETED flag when saving modified messages
+ o Web UI for configuration and setup. # Tony thinks it would be
+ nice if there was a web ui to this for the initial setup (i.e. like
+ pop3proxy), which offered a list of folders to filter/train/etc. It
+ could then record a uid for the folder rather than a name, and it
+ avoids the problems with different imap servers having different
+ naming styles a list is retrieved via imap.list()
o Suggestions?
"""
***************
*** 49,59 ****
__credits__ = "Tim Stone, All the Spambayes folk."
- # Tony thinks it would be nice if there was a web ui to
- # this for the initial setup (i.e. like pop3proxy), which offered
- # a list of folders to filter/train/etc. It could then record a
- # uid for the folder rather than a name, and it avoids the problems
- # with different imap servers having different naming styles
- # a list is retrieved via imap.list()
-
try:
True, False
--- 60,63 ----
***************
*** 79,90 ****
class IMAPMessage(message.SBHeaderMessage):
# response checking is necessary throughout this class
! def __init__(self):
message.Message.__init__(self)
! #XXX When a message object is created, an id and a folder should
! #XXX immediately be set. These cannot be passed in on the
! #XXX constructor, due to the quirky way that email.Parser.Parser
! #XXX does its thing.
! self.id = None
! self.folder = None
self.previous_folder = None
--- 83,91 ----
class IMAPMessage(message.SBHeaderMessage):
# response checking is necessary throughout this class
! def __init__(self, folder, id):
message.Message.__init__(self)
!
! self.id = id
! self.folder = folder
self.previous_folder = None
***************
*** 110,114 ****
# and we do an actual move on save (to avoid doing
# this more than once)
! if self.previous_folder is None and not self.folder == dest:
self.previous_folder = self.folder
self.folder = dest
--- 111,115 ----
# and we do an actual move on save (to avoid doing
# this more than once)
! if self.previous_folder is None:
self.previous_folder = self.folder
self.folder = dest
***************
*** 201,212 ****
# we return an instance of *our* message class, not the
# raw rfc822 message
! #XXX I can't get parsing to work correctly if I pull the guts
! #XXX out of Parser.parse() and do that in the setPayload method
! #XXX of the message class. Why? I have **NO** idea.
! #msg = IMAPMessage(self, key)
! #msg.setPayload(messageText)
! msg = email.Parser.Parser(_class=IMAPMessage).parsestr(messageText)
! msg.folder = self
! msg.setId(key)
return msg
--- 202,208 ----
# we return an instance of *our* message class, not the
# raw rfc822 message
!
! msg = IMAPMessage(self, key)
! msg.setPayload(messageText)
return msg
***************
*** 253,257 ****
global imap
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
! lgn = imap.login(options.imap_username, options.imap_password)
self.spam_folder = IMAPFolder(options.imap_spam_folder)
--- 249,254 ----
global imap
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
!
! self.Login(options.imap_username, options.imap_password)
self.spam_folder = IMAPFolder(options.imap_spam_folder)
***************
*** 294,300 ****
print "Filtering took", time.time() - t, "seconds."
! def Logout(self):
# sign off
! if options.imap_expunge:
imap.expunge()
imap.logout()
--- 291,308 ----
print "Filtering took", time.time() - t, "seconds."
! def Login(self, uid, pw):
! try:
! lgn = imap.login(uid, pw)
! except imaplib.IMAP4.error, e:
! if str(e) == "permission denied":
! print "There was an error logging in to the IMAP server."
! print "The userid and/or password may be in error."
! sys.exit()
! else:
! raise
!
! def Logout(self, expunge):
# sign off
! if expunge:
imap.expunge()
imap.logout()
***************
*** 304,308 ****
try:
! opts, args = getopt.getopt(sys.argv[1:], 'htcvd:D:')
except getopt.error, msg:
print >>sys.stderr, str(msg) + '\n\n' + __doc__
--- 312,316 ----
try:
! opts, args = getopt.getopt(sys.argv[1:], 'htcvei:d:D:')
except getopt.error, msg:
print >>sys.stderr, str(msg) + '\n\n' + __doc__
***************
*** 313,316 ****
--- 321,326 ----
doTrain = False
doClassify = False
+ doExpunge = options.imap_expunge
+ imapDebug = 0
for opt, arg in opts:
***************
*** 330,351 ****
elif opt == '-v':
options.verbose = True
! bdbname = os.path.expanduser(bdbname)
!
! if options.verbose:
! print "Loading database %s..." % (bdbname),
!
! if useDBM:
! classifier = storage.DBDictClassifier(bdbname)
! else:
! classifier = storage.PickledClassifier(bdbname)
! if options.verbose:
! print "Done."
imap_filter = IMAPFilter(classifier)
! # imap_filter.imap.debug = 10
! # imap_filter.Login()
if doTrain:
imap_filter.Train()
--- 340,365 ----
elif opt == '-v':
options.verbose = True
+ elif opt == '-e':
+ doExpunge = not doExpunge
+ elif opt == '-i:':
+ imapDebug = int(arg)
! bdbname = os.path.expanduser(bdbname)
!
! if options.verbose:
! print "Loading database %s..." % (bdbname),
!
! if useDBM:
! classifier = storage.DBDictClassifier(bdbname)
! else:
! classifier = storage.PickledClassifier(bdbname)
! if options.verbose:
! print "Done."
imap_filter = IMAPFilter(classifier)
! imap.debug = imapDebug
!
if doTrain:
imap_filter.Train()
***************
*** 353,355 ****
imap_filter.Filter()
! imap_filter.Logout()
--- 367,369 ----
imap_filter.Filter()
! imap_filter.Logout(doExpunge)
From timstone4 at users.sourceforge.net Sun Apr 13 14:45:33 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Sun Apr 13 16:45:37 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.8,1.9
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv20262
Modified Files:
imapfilter.py
Log Message:
Cobbled together a hack to remove old message ids from the message info db.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** imapfilter.py 13 Apr 2003 12:54:54 -0000 1.8
--- imapfilter.py 13 Apr 2003 20:45:30 -0000 1.9
***************
*** 42,46 ****
To Do:
! o Remove old msg from info database when saveing modified messages
o Use DELETE rather than storing //DELETED flag when saving modified messages
o Web UI for configuration and setup. # Tony thinks it would be
--- 42,47 ----
To Do:
! o Find a better way to remove old msg from info database when saving
! modified messages
o Use DELETE rather than storing //DELETED flag when saving modified messages
o Web UI for configuration and setup. # Tony thinks it would be
***************
*** 131,135 ****
old_id = self.id
- self.id = new_id
if self.previous_folder is not None:
response = imap.select(self.previous_folder.name, False)
--- 132,135 ----
***************
*** 139,144 ****
self._check(response, 'store')
! #XXX We really should delete the old message from the msgid db.
! #XXX There is currently no interface to do this with.
--- 139,150 ----
self._check(response, 'store')
! #XXX This code to deletd the old message id from the message
! #XXX info db and manipulate the message id, is a *serious* hack.
! #XXX There's gotta be a better way to do this.
!
! message.msginfoDB._delState(self)
!
! self.id = str(new_id)
! self.modified()
From montanaro at users.sourceforge.net Sun Apr 13 16:24:27 2003
From: montanaro at users.sourceforge.net (Skip Montanaro)
Date: Sun Apr 13 18:24:30 2003
Subject: [Spambayes-checkins] spambayes mboxtrain.py,1.6,1.7
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv6250
Modified Files:
mboxtrain.py
Log Message:
train() clearly couldn't be correct the way it was written. Pass in
trainnew from main() and test for the existence of path. (Not tested - I
don't use this code. Just responding to a note on the spambayes list.)
Index: mboxtrain.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/mboxtrain.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** mboxtrain.py 21 Mar 2003 01:33:19 -0000 1.6
--- mboxtrain.py 13 Apr 2003 22:24:24 -0000 1.7
***************
*** 208,213 ****
(trained, counter))
! def train(h, path, is_spam, force):
! if os.path.isfile(path):
mbox_train(h, path, is_spam, force)
elif os.path.isdir(os.path.join(path, "cur")):
--- 208,215 ----
(trained, counter))
! def train(h, path, is_spam, force, trainnew):
! if not os.path.exists(path):
! raise ValueError("Nonexistent path: %s" % path)
! elif os.path.isfile(path):
mbox_train(h, path, is_spam, force)
elif os.path.isdir(os.path.join(path, "cur")):
***************
*** 277,286 ****
for g in good:
if loud: print "Training ham (%s):" % g
! train(h, g, False, force)
save = True
for s in spam:
if loud: print "Training spam (%s):" % s
! train(h, s, True, force)
save = True
--- 279,288 ----
for g in good:
if loud: print "Training ham (%s):" % g
! train(h, g, False, force, trainnew)
save = True
for s in spam:
if loud: print "Training spam (%s):" % s
! train(h, s, True, force, trainnew)
save = True
From anadelonbrin at users.sourceforge.net Sun Apr 13 16:52:16 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Sun Apr 13 18:52:19 2003
Subject: [Spambayes-checkins] spambayes/windows pop3proxy_service.py, 1.3,
1.4
Message-ID:
Update of /cvsroot/spambayes/spambayes/windows
In directory sc8-pr-cvs1:/tmp/cvs-serv23596/windows
Modified Files:
pop3proxy_service.py
Log Message:
Fix for UI showing incorrect server strings as discovered
and solved by Paul Moore.
Index: pop3proxy_service.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/windows/pop3proxy_service.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** pop3proxy_service.py 23 Mar 2003 07:21:52 -0000 1.3
--- pop3proxy_service.py 13 Apr 2003 22:52:13 -0000 1.4
***************
*** 102,105 ****
--- 102,106 ----
def ServerThread(self):
state = pop3proxy.state
+ state.buildServerStrings()
pop3proxy.main(state.servers, state.proxyPorts, state.uiPort, state.launchUI)
From anadelonbrin at users.sourceforge.net Sun Apr 13 18:15:53 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Sun Apr 13 20:15:57 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.9,1.10
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv31908
Modified Files:
imapfilter.py
Log Message:
Various speed improvements.
Some comment changes.
Various small changes.
Tested with two IMAP servers and seems to still work :)
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** imapfilter.py 13 Apr 2003 20:45:30 -0000 1.9
--- imapfilter.py 14 Apr 2003 00:15:50 -0000 1.10
***************
*** 6,14 ****
messages are scored and (where necessary) filtered.
- It is suggested that this filter is set to run at certain intervals.
- Note that it is (currently) fairly slow, so this should not be too
- often. An alternative to this would be to keep the filter running
- and logged in, and periodically check for new mail.
-
The original filter design owed much to isbg by Roger Binns
(http://www.rogerbinns.com/isbg).
--- 6,9 ----
***************
*** 45,49 ****
modified messages
o Use DELETE rather than storing //DELETED flag when saving modified messages
! o Web UI for configuration and setup. # Tony thinks it would be
nice if there was a web ui to this for the initial setup (i.e. like
pop3proxy), which offered a list of folders to filter/train/etc. It
--- 40,44 ----
modified messages
o Use DELETE rather than storing //DELETED flag when saving modified messages
! o Web UI for configuration and setup. Tony thinks it would be
nice if there was a web ui to this for the initial setup (i.e. like
pop3proxy), which offered a list of folders to filter/train/etc. It
***************
*** 51,54 ****
--- 46,58 ----
avoids the problems with different imap servers having different
naming styles a list is retrieved via imap.list()
+ o IMAPMessage and IMAPFolder currently carry out very simple checks
+ of responses received from IMAP commands, but if the response is not
+ "OK", then the filter terminates. Handling of these errors could be
+ much nicer.
+ o The filter is currently designed to be periodically run (with cron,
+ for example). It would probably be nicer if it was continually
+ running (like pop3proxy, for example) and periodically checked for
+ any new messages to process (with the RECENT command). The period
+ could be an option.
o Suggestions?
"""
***************
*** 82,87 ****
imap = None
class IMAPMessage(message.SBHeaderMessage):
- # response checking is necessary throughout this class
def __init__(self, folder, id):
message.Message.__init__(self)
--- 86,116 ----
imap = None
+ # global rfc822 fetch command
+ rfc822_command = "(RFC822.PEEK)"
+
+ # For efficiency, we remember which folder we are currently
+ # in, and only send a select command to the IMAP server if
+ # we want to *change* folders. This function is used by
+ # both IMAPMessage and IMAPFolder.
+ # Occasionally, we need to force a command, because we
+ # are interested in the response. Things would be much
+ # nicer if we cached this information somewhere.
+ # XXX If we wanted to be nice and tidy, this really belongs
+ # XXX in an IMAPUtilities class, or something like that.
+ current_folder = None
+ current_folder_readonly = None
+ def Select(folder, readOnly=True, force=False):
+ global current_folder
+ global current_folder_readonly
+ if current_folder != folder or current_folder_readonly != readOnly or force:
+ response = imap.select(folder, readOnly)
+ if response[0] != "OK":
+ print "Invalid response to %s:\n%s" % (command, response)
+ sys.exit(-1)
+ current_folder = folder
+ current_folder_readonly = readOnly
+ return response
+
class IMAPMessage(message.SBHeaderMessage):
def __init__(self, folder, id):
message.Message.__init__(self)
***************
*** 101,110 ****
# timestamp from the message itself, but for the moment, we
# just use the current time.
! #XXX the imaplib time function returns a string like
! #XXX "12-Apr-2003 19:56:28 -0500" This seems like a bad message id.
! #XXX For one thing, it only resolves to one second. Even a cheap
! #XXX refractor telescope can resolve better than that ;)
! # return imaplib.Time2Internaldate(time.time())
! return time.time()
def MoveTo(self, dest):
--- 130,134 ----
# timestamp from the message itself, but for the moment, we
# just use the current time.
! return imaplib.Time2Internaldate(time.time())
def MoveTo(self, dest):
***************
*** 119,125 ****
# we can't actually update the message with IMAP
# so what we do is create a new message and delete the old one
! new_id = self.extractTime()
response = imap.append(self.folder.name, None,
! new_id, self.as_string())
self._check(response, 'append')
# we need to update the uid, as it will have changed
--- 143,149 ----
# we can't actually update the message with IMAP
# so what we do is create a new message and delete the old one
! time_stamp = self.extractTime()
response = imap.append(self.folder.name, None,
! time_stamp, self.as_string())
self._check(response, 'append')
# we need to update the uid, as it will have changed
***************
*** 127,175 ****
# XXX changed, as the message to be deleted will be found first
# XXX (if they are in the same folder)
! #response = imap.uid("SEARCH", "(TEXT)", self.as_string())
! #self._check(response, 'search')
! #self.id = response[1][0]
old_id = self.id
! if self.previous_folder is not None:
! response = imap.select(self.previous_folder.name, False)
! self._check(response, 'folder select')
self.previous_folder = None
! response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)")
! self._check(response, 'store')
! #XXX This code to deletd the old message id from the message
! #XXX info db and manipulate the message id, is a *serious* hack.
! #XXX There's gotta be a better way to do this.
! message.msginfoDB._delState(self)
! self.id = str(new_id)
self.modified()
class IMAPFolder(object):
- # response checking is necessary throughout this class
def __init__(self, folder_name, readOnly=True):
self.name = folder_name
- # Convert folder name to a uid
- self.uid = None
- response = imap.select(self.name, readOnly)
- responses = imap.response("OK")[1]
- for response in responses:
- if response[:13] == "[UIDVALIDITY ":
- r = re.compile(r"(?P<uid>\d+)")
- self.uid = r.search(response[13:]).group('uid')
- # We really want to use RFC822.PEEK here, as that doesn't effect
- # the status of the message. Unfortunately, it appears that not
- # all IMAP servers support this, even though it is in RFC1730
- self.rfc822_command = "(RFC822.PEEK)"
- response = imap.fetch("1:1", self.rfc822_command)
- if response[0] != "OK":
- self.rfc822_command = "(RFC822)"
-
- def Select(self):
- imap.select(self.name, False)
- self._check(folder, 'select')
def _check(self, response, command):
--- 151,187 ----
# XXX changed, as the message to be deleted will be found first
# XXX (if they are in the same folder)
! # response = imap.uid("SEARCH", "(TEXT)", self.as_string())
! # self._check(response, 'search')
! # new_id = response[1][0]
! # XXX This fails at the moment and needs to be resolved,
! # XXX but it can't be properly checked until the header
! # XXX adding part of the message class works.
! # XXX For the moment, having a new empty-string id just
! # XXX mucks up our message database, not the training or
! # XXX filtering itself
! new_id = ""
old_id = self.id
! if self.previous_folder is None:
! self.folder.Select(False)
! else:
! self.previous_folder.Select(False)
self.previous_folder = None
! response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)")
! self._check(response, 'store')
! #XXX This code to delete the old message id from the message
! #XXX info db and manipulate the message id, is a *serious* hack.
! #XXX There's gotta be a better way to do this.
! message.msginfoDB._delState(self)
! self.id = new_id
self.modified()
class IMAPFolder(object):
def __init__(self, folder_name, readOnly=True):
self.name = folder_name
def _check(self, response, command):
***************
*** 189,193 ****
'''Returns uids for all the messages in the folder'''
# request message range
! response = imap.select(self.name, True)
total_messages = response[1][0]
if total_messages == '0':
--- 201,205 ----
'''Returns uids for all the messages in the folder'''
# request message range
! response = Select(self.name, True, True)
total_messages = response[1][0]
if total_messages == '0':
***************
*** 204,208 ****
def __getitem__(self, key):
'''Return message matching the given uid'''
! response = imap.uid("FETCH", key, self.rfc822_command)
messageText = response[1][0][1]
# we return an instance of *our* message class, not the
--- 216,229 ----
def __getitem__(self, key):
'''Return message matching the given uid'''
! global rfc822_command
! Select(self.name, True)
! # We really want to use RFC822.PEEK here, as that doesn't affect
! # the status of the message. Unfortunately, it appears that not
! # all IMAP servers support this, even though it is in RFC1730
! response = imap.uid("FETCH", key, rfc822_command)
! if response[0] != "OK":
! rfc822_command = "(RFC822)"
! response = imap.uid("FETCH", key, rfc822_command)
! self._check(response, "uid fetch")
messageText = response[1][0][1]
# we return an instance of *our* message class, not the
***************
*** 214,217 ****
--- 235,241 ----
return msg
+ def Select(self, readOnly):
+ return Select(self.name, readOnly)
+
def Train(self, classifier, isSpam):
'''Train folder as spam/ham'''
***************
*** 252,258 ****
class IMAPFilter(object):
! def __init__(self, classifier):
global imap
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
self.Login(options.imap_username, options.imap_password)
--- 276,283 ----
class IMAPFilter(object):
! def __init__(self, classifier, debug):
global imap
imap = imaplib.IMAP4(options.imap_server, options.imap_port)
+ imap.debug = imapDebug
self.Login(options.imap_username, options.imap_password)
***************
*** 274,278 ****
if options.imap_spam_train_folders != "":
! spam_training_folders = options.imap_spam_train_folders.split(' ' )
for fol in spam_training_folders:
folder = IMAPFolder(fol)
--- 299,303 ----
if options.imap_spam_train_folders != "":
! spam_training_folders = options.imap_spam_train_folders.split()
for fol in spam_training_folders:
folder = IMAPFolder(fol)
***************
*** 303,307 ****
if str(e) == "permission denied":
print "There was an error logging in to the IMAP server."
! print "The userid and/or password may be in error."
sys.exit()
else:
--- 328,332 ----
if str(e) == "permission denied":
print "There was an error logging in to the IMAP server."
! print "The userid and/or password may be incorrect."
sys.exit()
else:
***************
*** 348,355 ****
elif opt == '-e':
doExpunge = not doExpunge
! elif opt == '-i:':
imapDebug = int(arg)
-
bdbname = os.path.expanduser(bdbname)
--- 373,379 ----
elif opt == '-e':
doExpunge = not doExpunge
! elif opt == '-i':
imapDebug = int(arg)
bdbname = os.path.expanduser(bdbname)
***************
*** 365,370 ****
print "Done."
! imap_filter = IMAPFilter(classifier)
! imap.debug = imapDebug
if doTrain:
--- 389,393 ----
print "Done."
! imap_filter = IMAPFilter(classifier, imapDebug)
if doTrain:
From timstone4 at users.sourceforge.net Sun Apr 13 23:07:34 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Mon Apr 14 01:07:38 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.10,1.11
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv31805
Modified Files:
imapfilter.py
Log Message:
Fixed the docstring not printing on -h, changed -e operand to have a value
argument of y/n to control expunging, and added a -l value operand for
looping the filter with a sleep time interval.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** imapfilter.py 14 Apr 2003 00:15:50 -0000 1.10
--- imapfilter.py 14 Apr 2003 05:07:32 -0000 1.11
***************
*** 1,6 ****
#!/usr/bin/env python
- from __future__ import generators
-
"""An IMAP filter. An IMAP message box is scanned and all non-scored
messages are scored and (where necessary) filtered.
--- 1,4 ----
***************
*** 22,27 ****
-h : help
-v : verbose mode
! -e : sets expunge to the *opposite* of options.imap_expunge
-i debuglvl : a somewhat mysterious imaplib debugging level
Examples:
--- 20,26 ----
-h : help
-v : verbose mode
! -e y/n : sets expunge to the *opposite* of options.imap_expunge
-i debuglvl : a somewhat mysterious imaplib debugging level
+ -l minutes : period of time between filtering operations
Examples:
***************
*** 39,43 ****
o Find a better way to remove old msg from info database when saving
modified messages
- o Use DELETE rather than storing //DELETED flag when saving modified messages
o Web UI for configuration and setup. Tony thinks it would be
nice if there was a web ui to this for the initial setup (i.e. like
--- 38,41 ----
***************
*** 54,58 ****
running (like pop3proxy, for example) and periodically checked for
any new messages to process (with the RECENT command). The period
! could be an option.
o Suggestions?
"""
--- 52,56 ----
running (like pop3proxy, for example) and periodically checked for
any new messages to process (with the RECENT command). The period
! could be an option. This is partially done with the -l operand.
o Suggestions?
"""
***************
*** 65,68 ****
--- 63,68 ----
__credits__ = "Tim Stone, All the Spambayes folk."
+ from __future__ import generators
+
try:
True, False
***************
*** 343,347 ****
try:
! opts, args = getopt.getopt(sys.argv[1:], 'htcvei:d:D:')
except getopt.error, msg:
print >>sys.stderr, str(msg) + '\n\n' + __doc__
--- 343,347 ----
try:
! opts, args = getopt.getopt(sys.argv[1:], 'htcvl:e:i:d:D:')
except getopt.error, msg:
print >>sys.stderr, str(msg) + '\n\n' + __doc__
***************
*** 354,357 ****
--- 354,358 ----
doExpunge = options.imap_expunge
imapDebug = 0
+ sleepTime = 0
for opt, arg in opts:
***************
*** 372,378 ****
options.verbose = True
elif opt == '-e':
! doExpunge = not doExpunge
elif opt == '-i':
imapDebug = int(arg)
bdbname = os.path.expanduser(bdbname)
--- 373,388 ----
options.verbose = True
elif opt == '-e':
! if arg == 'y':
! doExpunge = True
! else:
! doExpunge = False
elif opt == '-i':
imapDebug = int(arg)
+ elif opt == '-l':
+ sleepTime = int(arg) * 60
+
+ if not (doClassify or doTrain):
+ print "-c and/or -t operands must be specified"
+ sys.exit()
bdbname = os.path.expanduser(bdbname)
***************
*** 390,398 ****
imap_filter = IMAPFilter(classifier, imapDebug)
! if doTrain:
! imap_filter.Train()
! if doClassify:
! imap_filter.Filter()
imap_filter.Logout(doExpunge)
--- 400,418 ----
imap_filter = IMAPFilter(classifier, imapDebug)
+ print sleepTime
+ while 1:
+ if doTrain:
+ if options.verbose:
+ print "Training"
+ imap_filter.Train()
+ if doClassify:
+ if options.verbose:
+ print "Classifying"
+ imap_filter.Filter()
! if sleepTime:
! time.sleep(sleepTime)
! else:
! break
imap_filter.Logout(doExpunge)
From timstone4 at users.sourceforge.net Sun Apr 13 23:15:32 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Mon Apr 14 01:15:35 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.11,1.12
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv3899
Modified Files:
imapfilter.py
Log Message:
Removed a debug message (print sleepTime) that had been left in by mistake.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** imapfilter.py 14 Apr 2003 05:07:32 -0000 1.11
--- imapfilter.py 14 Apr 2003 05:15:27 -0000 1.12
***************
*** 400,404 ****
imap_filter = IMAPFilter(classifier, imapDebug)
! print sleepTime
while 1:
if doTrain:
--- 400,404 ----
imap_filter = IMAPFilter(classifier, imapDebug)
!
while 1:
if doTrain:
From timstone4 at users.sourceforge.net Mon Apr 14 20:22:45 2003
From: timstone4 at users.sourceforge.net (Tim Stone)
Date: Mon Apr 14 22:22:48 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.12,1.13
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv22122
Modified Files:
imapfilter.py
Log Message:
Timestamp for new messages extracted correctly
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -d -r1.12 -r1.13
*** imapfilter.py 14 Apr 2003 05:15:27 -0000 1.12
--- imapfilter.py 15 Apr 2003 02:22:42 -0000 1.13
***************
*** 130,134 ****
# timestamp from the message itself, but for the moment, we
# just use the current time.
! return imaplib.Time2Internaldate(time.time())
def MoveTo(self, dest):
--- 130,137 ----
# timestamp from the message itself, but for the moment, we
# just use the current time.
! try:
! return self["Date"]
! except KeyError:
! return imaplib.Time2Internaldate(time.time())
def MoveTo(self, dest):
From anadelonbrin at users.sourceforge.net Wed Apr 16 18:08:20 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Wed Apr 16 20:08:23 2003
Subject: [Spambayes-checkins] spambayes imapfilter.py,1.13,1.14
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv32173
Modified Files:
imapfilter.py
Log Message:
Fix the invalid date problem reported by Oliver Maunder.
Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** imapfilter.py 15 Apr 2003 02:22:42 -0000 1.13
--- imapfilter.py 17 Apr 2003 00:08:17 -0000 1.14
***************
*** 20,24 ****
-h : help
-v : verbose mode
! -e y/n : sets expunge to the *opposite* of options.imap_expunge
-i debuglvl : a somewhat mysterious imaplib debugging level
-l minutes : period of time between filtering operations
--- 20,24 ----
-h : help
-v : verbose mode
! -e y/n : expunge/purge messages on exit (y) or not (n)
-i debuglvl : a somewhat mysterious imaplib debugging level
-l minutes : period of time between filtering operations
***************
*** 79,82 ****
--- 79,83 ----
import getopt
import email.Parser
+ from email.Utils import parsedate
from spambayes.Options import options
***************
*** 131,135 ****
# just use the current time.
try:
! return self["Date"]
except KeyError:
return imaplib.Time2Internaldate(time.time())
--- 132,136 ----
# just use the current time.
try:
! return imaplib.Time2Internaldate(time.mktime(parsedate(self["Date"])))
except KeyError:
return imaplib.Time2Internaldate(time.time())
***************
*** 150,167 ****
time_stamp, self.as_string())
self._check(response, 'append')
- # we need to update the uid, as it will have changed
- # XXX there will be problems here if the message *has not*
- # XXX changed, as the message to be deleted will be found first
- # XXX (if they are in the same folder)
- # response = imap.uid("SEARCH", "(TEXT)", self.as_string())
- # self._check(response, 'search')
- # new_id = response[1][0]
- # XXX This fails at the moment and needs to be resolved,
- # XXX but it can't be properly checked until the header
- # XXX adding part of the message class works.
- # XXX For the moment, having a new empty-string id just
- # XXX mucks up our message database, not the training or
- # XXX filtering itself
- new_id = ""
old_id = self.id
--- 151,154 ----
***************
*** 174,177 ****
--- 161,174 ----
self._check(response, 'store')
+ # We need to update the uid, as it will have changed
+ # XXX There will be problems here if the message *has not*
+ # XXX changed, as the message to be deleted will be found first
+ # XXX (if they are in the same folder)
+ self.folder.Select(True)
+ #response = imap.uid("SEARCH", "TEXT", self.as_string())
+ #self._check(response, 'search')
+ #new_id = response[1][0]
+ new_id = ""
+
#XXX This code to delete the old message id from the message
#XXX info db and manipulate the message id, is a *serious* hack.
***************
*** 179,183 ****
message.msginfoDB._delState(self)
-
self.id = new_id
self.modified()
--- 176,179 ----
***************
*** 404,408 ****
imap_filter = IMAPFilter(classifier, imapDebug)
! while 1:
if doTrain:
if options.verbose:
--- 400,404 ----
imap_filter = IMAPFilter(classifier, imapDebug)
! while True:
if doTrain:
if options.verbose:
From anadelonbrin at users.sourceforge.net Wed Apr 16 18:41:32 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Wed Apr 16 20:42:02 2003
Subject: [Spambayes-checkins] spambayes INTEGRATION.txt,1.7,1.8
Message-ID:
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv11441
Modified Files:
INTEGRATION.txt
Log Message:
Update to include basic information about the IMAP filter.
Also updated SMTP proxy information so that it is (hopefully!)
clearer.
Updated the POP3 proxy information so that users are
directed to configure spambayes via the web user interface
rather than mucking about in bayescustomize.ini.
Index: INTEGRATION.txt
===================================================================
RCS file: /cvsroot/spambayes/spambayes/INTEGRATION.txt,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** INTEGRATION.txt 11 Mar 2003 01:51:05 -0000 1.7
--- INTEGRATION.txt 17 Apr 2003 00:41:30 -0000 1.8
***************
*** 63,67 ****
--------
! There are six main components to the Spambayes system:
o A database. Loosely speaking, this is a collection of words and
--- 63,67 ----
--------
! There are eight main components to the Spambayes system:
o A database. Loosely speaking, this is a collection of words and
***************
*** 80,86 ****
o The POP3 proxy. This sits between your email client (Eudora, Outlook
! Express, etc) and your email server, and adds the classification header
! to emails as you download them. A typical user's email setup looks
! like this:
+-----------------+ +-------------+
--- 80,86 ----
o The POP3 proxy. This sits between your email client (Eudora, Outlook
! Express, etc) and your incoming email server, and adds the
! classification header to emails as you download them. A typical
! user's email setup looks like this:
+-----------------+ +-------------+
***************
*** 109,116 ****
servers, if you have more than one email account.
! o The web interface. This is a server that runs alongside the POP3 proxy
! and lets you control it through the web. You can upload emails to it
! for training or classification, query the probabilities database ("How
! many of my emails really *do* contain the word Viagra"?) and most
importantly, train it on the emails you've received. When you start
using the system, unless you train it using the Hammie script it will
--- 109,148 ----
servers, if you have more than one email account.
! o The SMTP proxy. This sits between your email client (Eudora, Outlook
! Express, etc) and your outgoing email server. Any mail sent to
! spambayes_spam@localhost or spambayes_ham@localhost is intercepted
! and trained appropriately. A typical user's email setup looks like
! this:
!
! +-----------------+ +-------------+
! | Outlook Express | Internet or intranet | |
! | (or similar) | <--------------------------> | SMTP server |
! | | | |
! +-----------------+ +-------------+
!
! The SMTP server runs either at your ISP for internet mail, or somewhere
! on your internal network for corporate mail. The SMTP proxy sits in the
! middle and checks for mail to train on as you send your email:
!
! +-----------------+ +------------+ +-------------+
! | Outlook Express | | Spambayes | | |
! | (or similar) | <----> | SMTP proxy | <----> | SMTP server |
! | | | | | |
! +-----------------+ +------------+ +-------------+
!
! So where you currently have your email client configured to talk to
! say, "smtp.my-isp.com", you instead configure the *proxy* to talk to
! "smtp.my-isp.com" and configure your email client to talk to the proxy.
! The SMTP proxy can live on your PC, or on the same machine as the SMTP
! server, or on a different machine entirely, it really doesn't matter.
! Say it's living on your PC, you'd configure your email client to talk
! to "localhost". You can configure the proxy to talk to multiple SMTP
! servers, if you have more than one email account.
!
! o The web interface. This is a server that runs alongside the POP3 proxy,
! SMTP proxy, and IMAP filter (see below) and lets you control it through
! the web. You can upload emails to it for training or classification,
! query the probabilities database ("How many of my emails really *do*
! contain the word Viagra?"), find particular messages, and most
importantly, train it on the emails you've received. When you start
using the system, unless you train it using the Hammie script it will
***************
*** 124,133 ****
to do it correct the odd mistake - it's very quick and easy.
! o The Outlook plug-in. For Outlook 2000 users (not Outlook Express) this
! lets you manage the whole thing from within Outlook. You set up a Ham
! folder and a Spam folder, and train it simply by dragging messages into
! those folders. Alternatively there are buttons to do the same thing.
! And it integrates into Outlook's filtering system to make it easy to
! file all the suspected spam into its own folder, for instance.
o The Hammie script. This does three jobs: command-line training,
--- 156,165 ----
to do it correct the odd mistake - it's very quick and easy.
! o The Outlook plug-in. For Outlook 2000 and Outlook XP users (not Outlook
! Express) this lets you manage the whole thing from within Outlook. You
! set up a Ham folder and a Spam folder, and train it simply by dragging
! messages into those folders. Alternatively there are buttons to do the
! same thing. And it integrates into Outlook's filtering system to make it
! easy to file all the suspected spam into its own folder, for instance.
o The Hammie script. This does three jobs: command-line training,
***************
*** 138,149 ****
hammiesrv.py.
Where things live
-----------------
! The Hammie script is called hammie.py. The POP3 proxy and the web
! interface live in pop3proxy.py. The Outlook plug-in lives in the
! Outlook2000 subdirectory - see the README.txt in that directory for more
! information on that.
As well as these components, there's also a whole pile of utility scripts,
--- 170,189 ----
hammiesrv.py.
+ o The IMAP filter. This is a cross between the POP3 proxy and the Outlook
+ plugin. If your mail sits on an IMAP server, you can use this to
+ filter your mail. You can designate folders that contain mail to train
+ as ham and folders that contain mail to train as spam, and the filter
+ does this for you. You can also designate folders to filter, along with
+ a folder for messages Spambayes is unsure about, and a folder for
+ suspected spam. When new mail arrives, the filter will move mail to the
+ appropriate location (ham is left in the original folder).
Where things live
-----------------
! The Hammie script is called hammie.py. The POP3 proxy lives in pop3proxy.py,
! and the smtpproxy lives in smtpproxy.py. The IMAP filter lives in
! imapfilter.py. The Outlook plug-in lives in the Outlook2000 subdirectory
! - see the README.txt in that directory for more information on that.
As well as these components, there's also a whole pile of utility scripts,
***************
*** 163,166 ****
--- 203,211 ----
them, all lives in Options.py. To change an option, create a
bayescustomize.ini and add the option to that - don't edit Options.py.
+ If you are using the POP3 proxy, SMTP proxy or IMAP filter, you can also
+ change most of the options you will need via the web user
+ interface. You will probably find this at http://localhost:8880/. To
+ configure the Outlook plugin, you should click on the Anti-Spam button on
+ the toolbar.
***************
*** 181,213 ****
--------------------------------------------------------
! The minimum you need to do to get started is create a bayescustomize.ini
! containing the following:
!
! [pop3proxy]
! pop3proxy_servers: pop3.my-isp.com
!
! where "pop3.my-isp.com" is wherever you currently have your email client
! configured to collect mail from. The proxy will run on port 110 - if you're
! already running a real POP3 proxy on that port, or you're running on a
! platform that won't let unprivileged processes use that port (eg. unix),
! you can use a different one by adding a line like this:
!
! pop3proxy_ports: 1110
!
! to the [pop3proxy] section of bayescustomize.ini.
!
! You can now run the proxy by running "python pop3proxy.py". This will
! print some status messages, which should include:
!
! BayesProxyListener listening on port 110.
! UserInterfaceListener listening on port 8880.
! What that means is that the POP3 proxy is ready for your email client to
! connect to it on port 110 and that the web interface is ready for your
! browser to connect to it. The address of the web interface is
! http://localhost:8880/ (or if you're running it on a different machine,
! replace 'localhost' with the name of the machine). You can have a look
! at the web interface now, but it won't be very interesting because the
! system hasn't seen any messages yet.
--- 226,238 ----
--------------------------------------------------------
! To set up the POP3 and SMTP proxies, run
! pop3proxy.py -b
! from the command line. The web interface should open in your default
! browser. You need to click on the "Configuration Link" to go to the setup
! page. The minimum you need to do to get started is enter the servers and
! ports information in the POP3 proxy and SMTP proxy sections.
! The POP3 proxy is then ready for your email client to connect to it on
! port 110 and the SMTP proxy is ready for connections on port 25.
***************
*** 215,222 ****
------------------------------------------
! You now need to configure your email client to talk to the proxy instead of
! the real email server. Change your equivalent of "pop3.my-isp.com" to
"localhost" (or to the name of the machine you're running the proxy on) in
! your email client's setup. Hit "Get new email" and look at the headers of
the emails (send yourself an email if you don't have any!) - there should
be an X-Spambayes-Classification header there. It probably says "unsure",
--- 240,248 ----
------------------------------------------
! You now need to configure your email client to talk to the proxies instead of
! the real email servers. Change your equivalent of "pop3.my-isp.com" to
"localhost" (or to the name of the machine you're running the proxy on) in
! your email client's setup, and do the same with your equivalent of
! "smtp.my-isp.com". Hit "Get new email" and look at the headers of
the emails (send yourself an email if you don't have any!) - there should
be an X-Spambayes-Classification header there. It probably says "unsure",
***************
*** 234,237 ****
--- 260,277 ----
"Total emails trained" has increased.
+ Alternatively, when you receive an incorrectly classified message, you can
+ forward it to the SMTP proxy for training. If the message should have been
+ classified as spam, forward or bounce the message to spambayes_spam@localhost,
+ and if the message should have been classified as ham, forward it to
+ spambayes_ham@localhost. You can still review the training through the web
+ interface, if you wish to do so.
+
+ Note that some mail clients (particularly Outlook Express) do not forward
+ all headers when you bounce, forward or redirect mail. For these clients,
+ you will need to set (via the web interface) the "add mail id to" option
+ to body, which will add a unique id to the body of each message you
+ receive. You can also use this id to find a particular message via the
+ web interface.
+
Once you've done this on a few spams and a few hams, you'll find that the
X-Spambayes-Classification header is getting it right most of the time. The
***************
*** 245,268 ****
messages to the web interface via the "Train" form on the Home page. You
can train on individual messages (which is tedious) or using mbox files.
-
- An alternative to training via the web interface is to run the SMTP proxy.
- Just as the POP3 proxy sits between your mail client and your POP3 server,
- the SMTP proxy sits between your mail client and your SMTP server. To run
- the server, start pop3proxy with the "-s" switch. You will need to setup
- your mail client just as with the POP3 proxy - change the outgoing mail
- (SMTP) server to localhost (or if you are running pop3proxy on a different
- machine, replace localhost with the name of the machine). In the web
- interface, set the SMTP options to the address and port of your SMTP
- server. You will also need to set the "add mail id to" option to "header".
- To train, you can now forward or bounce mail to spambayes_ham@localhost,
- or spambayes_spam@localhost (you can change these addresses via the web
- interface).
-
- Note that some mail clients (particularly Outlook Express) do not forward
- all headers when you bounce, forward or redirect mail. For these clients,
- you will need to set (via the web interface) the "add mail id to" option
- to body, which will add a unique id to the body of each message you
- receive. You can also use this id to find a particular message via the
- web interface.
--- 285,288 ----
From anadelonbrin at users.sourceforge.net Wed Apr 16 19:52:20 2003
From: anadelonbrin at users.sourceforge.net (Tony Meyer)
Date: Wed Apr 16 21:52:25 2003
Subject: [Spambayes-checkins]
website faq.ht, NONE, 1.1 applications.ht, 1.2,
1.3 background.ht, 1.13, 1.14 developer.ht, 1.7, 1.8 docs.ht,
1.8, 1.9 download.ht, 1.2, 1.3 index.ht, 1.11, 1.12 links.h,
1.5, 1.6
Message-ID:
Update of /cvsroot/spambayes/website
In directory sc8-pr-cvs1:/tmp/cvs-serv1396
Modified Files:
applications.ht background.ht developer.ht docs.ht download.ht
index.ht links.h
Added Files:
faq.ht
Log Message:
General update to cover the new applications,
the alpha2 release, and an increase in
documentation.
--- NEW FILE: faq.ht ---
Title: SpamBayes: Frequently Asked Questions
Author-Email: spambayes@python.org
Author: spambayes
Frequently Asked Questions
- Development
- Hey! Why don't you implement cool tokenizer trick X? I think it would really foil those spammers!
- This software is great! I want to implement it for all my users. Are there plans to develop a server-side spambayes solution?
- Using Spambayes
- I just got a spam, but the system said it was "unsure". Why couldn't it tell that it was spam - it's obvious?
- OK, I trained on that message. But I just got *another* one, and the stupid system still thinks it's unsure. Why did it ignore me???
- I've mucked up my training and I want to start all over again, but there isn't an option for this anywhere. What do I do?
If you have any suggestions about other questions and answers that should be included
here, please mail the list with them.
Have you run your tokenizer trick against a set of messages to see if
it actually works? Many times what seems like a good idea turns out
not to help much, and sometimes even hurts. If you have a good idea,
you've run it against a batch of messages and can prove that it
helps, paste the code for your technique and the proof to the mailing
list. If you're not a coder, but are really keen on your idea, post
a feature request on the project page, and wait for someone else to
code it for you (but make sure you do some testing when it's done).
Otherwise, you will likely get a message from Tim Peters about
why you need to test your idea :)
The problem with a server-side solution is that everyone has a
different idea of what is spam - that's the whole strength of the
bayesian-style filtering concept. If you are certain that *all*
of your users would agree on what is spam and what is not, then
this might work for you, but otherwise you really have to have
individual databases for each user. Either way, you should be
able to modify spambayes easily enough to fit into your setup.
Please let the list know if you do have success in this area, and
we'll update this answer.
It may be obvious to you, but the classifier only works on
the information it has been given. Maybe this is "new" (you've
never seen this particular flavour of spam before), or maybe
there aren't enough clues in the message which the system is
aware of as strong spam clues.
It didn't, but you may need to train on a few more of this type
of message to get it classified as "spam". The classification
algorithm weights its results based on the number of times it
has seen a particular clue, so that clues unique to this type
of message may need a few more instances to become "convincing".
Because training from scratch is a very rare occurrence, and because
deleting all your training information is something you don't want
to do by accident, there isn't an option for this. However, you
can quite simply do this manually. All the training data is stored
in a file, usually called hammie.db, and if you delete (or rename)
this, then you will start training from scratch. If you are using
the web interface for the POP3 proxy, the configuration page tells
you what this file is called (and where it is) down towards the
bottom of the page.
Index: applications.ht
===================================================================
RCS file: /cvsroot/spambayes/website/applications.ht,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** applications.ht 3 Mar 2003 22:24:39 -0000 1.2
--- applications.ht 17 Apr 2003 01:52:17 -0000 1.3
***************
*** 8,18 ****
getting there (and help is always appreciated).
! Outlook2000
! Sean True and Mark Hammond have developed an addin for Outlook2000 that
adds support for the spambayes classifier.
Requirements