From anadelonbrin at users.sourceforge.net Thu Apr 3 15:30:38 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Thu Apr 3 18:30:44 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.66,1.67 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv3792 Modified Files: pop3proxy.py Log Message: Expire messages from the unknown cache as well as the ham and spam caches. Also spin off threads to expire messages (if necessary) each time a client connects to the proxy. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.66 retrieving revision 1.67 diff -C2 -d -r1.66 -r1.67 *** pop3proxy.py 12 Mar 2003 03:29:58 -0000 1.66 --- pop3proxy.py 3 Apr 2003 23:30:34 -0000 1.67 *************** *** 141,144 **** --- 141,145 ---- import socket, asyncore, asynchat, cgi import mailbox, email.Header + from thread import start_new_thread from email.Iterators import typed_subpart_iterator import spambayes *************** *** 391,394 **** --- 392,399 ---- make multiple calls, or will cope with the headers being different. + + o USER: + o Does no processing based on the USER command itself, but + expires any old messages in the three caches. """ *************** *** 396,400 **** POP3ProxyBase.__init__(self, clientSocket, serverName, serverPort) self.handlers = {'STAT': self.onStat, 'LIST': self.onList, ! 'RETR': self.onRetr, 'TOP': self.onTop} state.totalSessions += 1 state.activeSessions += 1 --- 401,406 ---- POP3ProxyBase.__init__(self, clientSocket, serverName, serverPort) self.handlers = {'STAT': self.onStat, 'LIST': self.onList, ! 'RETR': self.onRetr, 'TOP': self.onTop, ! 
'USER': self.onUser} state.totalSessions += 1 state.activeSessions += 1 *************** *** 571,574 **** --- 577,589 ---- return self.onRetr(command, args, response) + def onUser(self, command, args, response): + """Spins off three separate threads that expires any old messages + in the three caches, but does not do any processing of the USER + command itself.""" + start_new_thread(state.spamCorpus.removeExpiredMessages, ()) + start_new_thread(state.hamCorpus.removeExpiredMessages, ()) + start_new_thread(state.unknownCorpus.removeExpiredMessages, ()) + return response + def onUnknown(self, command, args, response): """Default handler; returns the server's response verbatim.""" *************** *** 1298,1304 **** '[0123456789]*', cacheSize=20) ! # Expire old messages from the trained corpuses. self.spamCorpus.removeExpiredMessages() self.hamCorpus.removeExpiredMessages() # Create the Trainers. --- 1313,1324 ---- '[0123456789]*', cacheSize=20) ! # Given that (hopefully) users will get to the stage ! # where they do not need to do any more regular training to ! # be satisfied with spambayes' performance, we expire old ! # messages from not only the trained corpii, but the unknown ! # as well. self.spamCorpus.removeExpiredMessages() self.hamCorpus.removeExpiredMessages() + self.unknownCorpus.removeExpiredMessages() # Create the Trainers. From anadelonbrin at users.sourceforge.net Thu Apr 3 16:00:53 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Thu Apr 3 19:00:56 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.67,1.68 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv13747 Modified Files: pop3proxy.py Log Message: Opps. Fix bug from the last commit. 
Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.67 retrieving revision 1.68 diff -C2 -d -r1.67 -r1.68 *** pop3proxy.py 3 Apr 2003 23:30:34 -0000 1.67 --- pop3proxy.py 4 Apr 2003 00:00:48 -0000 1.68 *************** *** 1309,1313 **** options.pop3proxy_ham_cache, '[0123456789]*', cacheSize=20) ! self.unknownCorpus = FileCorpus(factory, options.pop3proxy_unknown_cache, '[0123456789]*', cacheSize=20) --- 1309,1313 ---- options.pop3proxy_ham_cache, '[0123456789]*', cacheSize=20) ! self.unknownCorpus = ExpiryFileCorpus(age, factory, options.pop3proxy_unknown_cache, '[0123456789]*', cacheSize=20) From anadelonbrin at users.sourceforge.net Thu Apr 3 17:11:59 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Thu Apr 3 20:12:04 2003 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.68,1.69 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv3132 Modified Files: pop3proxy.py Log Message: Add 'show clues' button to the review messages page as requested by Paul Moore. Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.68 retrieving revision 1.69 diff -C2 -d -r1.68 -r1.69 *** pop3proxy.py 4 Apr 2003 00:00:48 -0000 1.68 --- pop3proxy.py 4 Apr 2003 01:11:57 -0000 1.69 *************** *** 906,909 **** --- 906,911 ---- row.subject.href="view?key=%s&corpus=%s" % (key, label) row.from_ = messageInfo.fromHeader + subj = cgi.escape(messageInfo.subjectHeader) + row.classify.href="showclues?key=%s&subject=%s" % (key, subj) setattr(row, 'class', ['stripe_on', 'stripe_off'][stripe]) # Grr! row = str(row).replace('TYPE', label).replace('KEY', key) *************** *** 1084,1092 **** self._writePostamble() ! def onClassify(self, file, text, which): ! """Classify an uploaded or pasted message.""" ! 
message = file or text ! message = message.replace('\r\n', '\n').replace('\r', '\n') # For Macs ! cluesTable = self.html.cluesTable.clone() cluesRow = cluesTable.cluesRow.clone() --- 1086,1090 ---- self._writePostamble() ! def _buildCluesTable(self, message, subject=None): cluesTable = self.html.cluesTable.clone() cluesRow = cluesTable.cluesRow.clone() *************** *** 1099,1103 **** results = self.html.classifyResults.clone() results.probability = probability ! results.cluesBox = self._buildBox("Clues:", 'status.gif', cluesTable) results.classifyAnother = self._buildClassifyBox() self._writePreamble("Classify") --- 1097,1125 ---- results = self.html.classifyResults.clone() results.probability = probability ! if subject is None: ! heading = "Clues:" ! else: ! heading = "Clues for: " + subject ! results.cluesBox = self._buildBox(heading, 'status.gif', cluesTable) ! return results ! ! def onShowclues(self, key, subject): ! """Show clues for a message - linked from the Review page.""" ! self._writePreamble("Message clues", parent=('review', 'Review')) ! message = state.unknownCorpus.get(key).getSubstance() ! message = message.replace('\r\n', '\n').replace('\r', '\n') # For Macs ! if message: ! results = self._buildCluesTable(message, subject) ! del results.classifyAnother ! self.write(results) ! else: ! self.write("

Can't find message %r. Maybe it expired.

" % key) ! self._writePostamble() ! ! def onClassify(self, file, text, which): ! """Classify an uploaded or pasted message.""" ! message = file or text ! message = message.replace('\r\n', '\n').replace('\r', '\n') # For Macs ! results = self._buildCluesTable(message) results.classifyAnother = self._buildClassifyBox() self._writePreamble("Classify") From anadelonbrin at users.sourceforge.net Thu Apr 3 17:12:00 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Thu Apr 3 20:12:05 2003 Subject: [Spambayes-checkins] spambayes/spambayes/resources ui.html, 1.6, 1.7 ui_html.py, 1.6, 1.7 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes/resources In directory sc8-pr-cvs1:/tmp/cvs-serv3132/spambayes/resources Modified Files: ui.html ui_html.py Log Message: Add 'show clues' button to the review messages page as requested by Paul Moore. Index: ui.html =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/ui.html,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** ui.html 13 Mar 2003 03:24:57 -0000 1.6 --- ui.html 4 Apr 2003 01:11:57 -0000 1.7 *************** *** 224,227 **** --- 224,228 ---- id="spam" value='spam'/> + Show clues     Index: ui_html.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/resources/ui_html.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** ui_html.py 13 Mar 2003 03:24:58 -0000 1.6 --- ui_html.py 4 Apr 2003 01:11:57 -0000 1.7 *************** *** 5,80 **** import zlib ! data = zlib.decompress("x[{oܸ\021\016<\025l]Iz]&S\037\\\\{p\010\022wW$\0226ޙ!)Q\017;\ ! \036\000\004^H\016yfHϾp>]߿c?y9\016㿞\027\013b49bge\ ! bd<\013jViBGb\010\026Gg\036=ym\005ߊP})Q,y(fccJ\ ! \023[LM'\030/b\034r\023\001[^8,*Φl~+\0270\016R\026)m\021+\ ! >M\016އ2Ŕf\002?!5}\014\026#ky\017j^M^LKl\033\024ㆹÍWkq!z\ ! bC3d'O4w\0052dûU!,‰NYΣ(Vg/D,ӯbB\026(\016\ ! 
̡)`L-\0229o!i+M\014\021e\017\001NNH%5,JR(\016Y\ ! B\000hl)B<ךG|;l\023Gj\015z;\0209\006\035\031.K.&\003|\002`C7p\ ! e4L\017$'BrhAlPA'j:7\032\004ESp\035sQfz?[)Bg:V\025x\ ! !9<;KO6\033[W1\033\033\003$D8;\024,Y<\001i)s\010p\015P^0\ ! \025+oFx3ī2(\0218x-\024\000\012-jf\"\013vdy!*T\026\ ! P:P\002cB,a\002\007W{6Ӏe4:eU\017d8\021\007l\026H\0045\027Eb\031gx\026,gRD\ ! Kj0b6bطnE\001w\034^߃חU\0111^#(~\"\010\005 xP\005\017QabSY\ ! 'U$*Sq\012Jh\013\002\000$[wQ\020'D[jkX\037%rv\010X\030\015\012\023\ ! 걡LsKXHw`F,K\037\0060w\037G\003\"C\033\034Xi\033LB\ ! I\035\007k,cp#M㒇k-Z4*\023\"*Q6\013gEeF\0218\014+: y/\004,\020'e\ ! [\006a\003F8w\0255R]i\020'kndqGA\036v5ֽX-\003\004\002\004e\027 _\0310\ ! 4\031\033Y^)26\027m@<и\022uПyG\023olL3\006\\\034a5*\004۶*xrl\ ! Z=Z%=cjjז\030lg\031HW\003S7j_|摧?x\001\002:\007zY(ga;\ ! CggL\"3\016W(o\037\002\030w\024\020\0105\014Q\022\036\022ݕL^_Abfw\007\023u:\ ! \026?\010ctq\034$\021Q5zkD\027dY.i\014z֋\037Vo%i1m)\023U\014AVk\ ! ǠU\002|%\0078\027\022Xy{!V~\000M\0370Y\0117\036TS[\0262eeXp\005,l\ ! o}E\030~\004]!\004x\000C\023\027[U\031Hyț\037\020儓#1r-:`\012&vlJ\023ߐk\ ! p٪…W$S\004Mn;B:$/`)jnP\022\030 l<߈\004%v^\027X\033aA\017\011|\ ! Ďq1z\006\002\005Yn\000Q\006Ó\003LgI#|\003\\P$\011\0141\033`\006\ ! ӒV3޼zp3\0300\024M\010\010(1(9E\020\034G\003HO\\\017Х\ ! %kY*RL3ć0-t0\036|5\025tZ/g\0042\014\021y\004\014=\011h\010:C!vaWQ\016!+BtU9.\0112\010]4uVO;s m1W\ ! !\016'\0246\026tz\006ߚB+\033Wq;M,S\016\003H仼'\021u\002\ ! \006D@䌝BܖJ\015h\0017\025\003=\037\003E\025+''\004~~R\032۰\ ! ij*,\034\024I]'PDi<\025!%um\017\037/\002\016eCBEHE3\0077\ ! YGlȲ#1\013E\015#\035݁Z\001,@8\025\011ct\\oJ!\026\017\027l\003!ܞExGK\ ! \003`f\002Y,\001\026\013LA]\025rU\0149R!\037\007\010\013\024\017*7\031#\030=\024Iw\ ! @c2K\020\006\017A\027q\031\"bcF\023\001F%`j\0114CZIphRC\007@\002l]*s\ ! R1\001\002\037/\004\036\035G\021\0253\037(A\014\010̣NkO\020@x48\\\011\003\036\ ! \005\026>uCZ\034\033\022?T\0375KVq@Q\\\"@ȩ{@vpR\ ! 2>@ԹH[hܰs,L\\xO\0272\011fi͔\011\ ! 
{8\016 KS\023/\036dU\012x|s~3JD\\,8cOm)֟uti\033\ ! ۋlF\004ⷿowߪ\022\0208\025\010)y;==\017\034BvEgg\004[7\ ! \000\022\006iJ$gtGJ\033QC\0304Ar>`\022\003kj\036xM\ ! Aʲwp;nW'v\021ڮ\016\003\023nYT\022\000X\006^\007\0010Z/\001Qo\013\ ! >ln\035Qsh8~\030xM\033\027\000\"*o^`)v5j\024\026>\012J\020ۈ\016|\012\ ! *\030\033\014>\017zf-@4\001{f\017o=;CI`Ezs ao/\033̍:8\017\ ! f\012\031}ط\023\006\026>zM\034c,\002jfAo\000\0160OIQ\030\036b\015{U\ ! ^(/x\024K\033%ld\004q8!VC[;:x;@xJ,`jteQ\ ! \001wE³;4\001\027>%;PA_;aOV\026w%k`Ú\037C_R+DQ\010\ ! ͫ\036\011y\010\004(ZRZ%*yÈ+n]eL\005tMq8,>x\021\000IKz}\ ! OOﭪg#I\020#\017\005\035B\010J>6k\031\022 ,L**@O\011ԭ?\"l\024e\007܀=\ ! 3\037O`_LOwvLRGC{\006\007差-魿ՓԬBmYe=\035\ ! ='Zj?BۅC\013䌎R\030e\036:%8sZ\012{\003P\037D\\\002@&J\024a;\ ! \033SWW~_YIsNJ;d݅a\000\020՜hH]\022\"u:x\ ! f8֫1{zG-\022)4\036vXcF_\021\004JVqU\016}f\013\0050\002\034֕N\ ! \024BYᅛG\0155\027/_=B躐\013!\0232KRC}\013hFa\014\011\0318\ ! \013&}G&bF\020\021\036\021sSb:S,\020cD;dČIL\036Wt\016)񇜑N@\ ! ԃ9:`I|'\026,Y}^jΔ-\0206\007>x\031xD޽L\011ܡ\ ! }SZ\004\033!@VFȚ*l6\033}\004MН^>\0146`b\ ! .G\025@\021ku{uv\036]\012\0202\007u\016Ca\031K\000\004KP\034L^\012(C,\ ! \004\000et\010d(\002\025}GT7/j\032\007ɂNc~\\؜seџysi\ ! _yw\031>@\03580߸\025\034*\030\013\003=Rv-2t4f4\013x;70W/\016\0064\013\013\025e\016\ ! 渫F\002\014i\037WEB\016Q9&\0224\024P\012~\035@b)BA7Gn1\ ! >W~TRA\032Y\035[cq(-\027^k\"\005\002k\013\024B\003\013\004{Wui\021&\ ! >b/T$7PJ2tzxxtqk\021\030b2ZlfJk\025{E\026\004YJZ\033 ~\ ! fl\022\004}&gC<3\030\015KS\010\026\020o\024^\013snIŐ\035*^^f)!;\ ! \027ɋ1;LG\\JxٌJ;H\026\000~;xZYfk\010XiPu\000a\ ! G#jSx\000טM1\001̈́=\007\0112;\030\013[\0225SN\000MV\005\002J=") ### end --- 5,81 ---- import zlib ! data = zlib.decompress("x[{o\033\021\016-Q[$Ңԇ&\027ֵ8\034\016\001$ֻ>,}\014]Ê\ ! C\037@.p8\0147\037.f?\\w7ᄏ`p׳rv\033^\014F'l$\ ! T\011ë&\"S\020?\013YDp7/Ky<[\037rBdK\020!u1y/s\ ! \025n'TI1f\037=c<<:bF\021[A\024rY̳Ll~a=ab W\ ! |\034\036/T1\010~\026\017\022V-'Z\001U\032\030-6X\000C\")\0307\035o\\ǹB\ ! 
ņftIO\036i\006s$4ɜ/W*\020fZ\004tR\0362YM_V_#\004\032'/ϿU?s\";\ ! .T\012M#U$C6f_*\012\025w\022߸\020\010wvZ'/!\026yWBW\002m+\036\ ! %dZ\016bgZ2oGmdXA\017^>s\007\"@%r輇\017W \025oP\033\ ! ƛ(}4{\011T\016-*Tӳ__MZvPt:\"(2*Oc)S\ ! \\yE[~/к8\034JBD_r\001\004%\022C\0221Ʌb\ ! 3V;Y\010\005\036v\005\006RFM\026*\024A\005=\035P,e\002/ւEJ\037$<^\012\021i\ ! \022jI-%F\014i#}[Nd|\007\003x}U@*Ar\034AKO\027B(\004D\007c\005*6L\014~*i\ ! TF^\024DV&Q)\026nA\020@h.\012h\027k^45s,Ǐ);^F,\014\006S`(\012\ ! aYc\027*NU\002aqZ7\"\012߁1\032!2\027\003?O\015`A\003\"c5\033\034Xi\033L\ ! =\007k*%8فq\027k-J4*\021\"Q6sgE%F\021O\031Z\013:\"y\005,\020'e7\ ! b\015p\007t\007a*k\0124\"Njݨ잂<ԽBY#\004b\0116/@B24a\ ! i268IZ\026d\016u\027\025T\000\001@RT\036C6NFИVg\01469g[U:ªU\010m8V\ ! qchuޫ\021\\[\012b]gl#]\005|ݨS\015G}#O1\002\014\005t\0162+9\ ! \016MJEr\031g\0351/+C\010`<<8ܓR\026@f`\012\016d\014\012\022\033Իk7>x\036\025\ ! _m.E\030\014e\034.\006D\015]y\022_\\e!Y'~\030[SƴѦO\022x-\ ! A+\007EK\016p\025@#Y\0014\011T'V_{PMm勌\027Ϥf\ ! \013|$ #\0302\025E\003\030\000?Z\012@ZC-PN89\022#B\024G\011\035`7\ ! \032p*q\025\024!&bSۖK(\033\00446[&xm;7\"\002o]\004ZXC\001\037\ ! :z=y^̀P Cw7I-Pp\027\020 \012ߠwuxr\000i6io}\020Q\004#\027*`\002\ ! ӒV5޼zp\012\030f\020\010t(\022\024\002\"\010\016B#u\001G.\007\ ! \0225\013^ƙf\017a\014\021/}`xȧh\026\015+h\006_\010e\030$#x*/1Wn\024h\ ! OM\007?@z`Ih\012V'\015\024ǀ57{;SG訃wҋ\025\005\007\005.,\010#.buAv\003\027\ ! YAl\006{hޡjD\031>{&/zB]\0256?禂wM{*x^B\002-F\ ! \0177g:@'\001\031MAk\015\027<&ypn/%gD\010f\025\005\023E \035B\003Wa\ ! z\0035LA E|2\033wjN\004S\005.\004]izi\012UhYb\006;r)\ ! 1X9IRƘ+3\026\037\036v\035\037DkXI/k+ЅISk%5\007rޤ\034s\ ! \035nhqJQ+cAg)\022q\030{T?Rv9d[\017\"ڝG\013\023\ ! 4 zL\006:gl-\0376/D\014\016\026pS13Y;P\004\\XA\030rrB`G?'ui\006\011;\ ! hFf\014^;PYB>\020\0120'\033bEHzI\035+\003s@CU^#+\021Rь\ ! -}VQ$$*sBQhHGw Va\000\013%\020\016uE\030\035&[R胫\005@j\010\027kA\ ! f)\0010E\001X\002,\026\023**\011\031R!\037\007\010K\024\017*0\031#\030=\024I\ ! 
w@c2K\020\032\017A2_,dCF\023\001F%`j\0114\027\032z%\016\034\005\036+\004T\ ! ԥcX\003(\005?^\0347\011>%=Z!\025\017RT \006lNk;\010$z;\\\011\003\036\ ! \001\026s\027:A^cYX\016V\015\011+O\006JH8P\0378q\015Т]>\\\024\ ! c{\014< x\013\0201u\006Z-w.\013\003o`8|LY&Z3U}\ ! \001C(H9g\024&>%;l\001\034\017=Κo\006HV\005SvzX\017Ԕ\"i5;\ ! ؁s&ل\011o{U9 p*`\003\021\034y;=\003\037\034BvE);V\015\ ! /D~5s\022ɴPFd\027\020\006\016\016\017?&H\007\031̟g\0154_9\030\016N6\ ! HYN7\037f~4|=Bap\"]91J\002\000\013Z3< \036\023F$8m\ ! '-ۍ#*p\016j\024\015\033\026qmS>\037AQ\005\013w\032=~\005%mD\007>\005\ ! \025D\0064\017zf̓޲@\001{fo=;CI`Izs ao/\032̭\030;8\017\ ! f2o\017['X\015,}[XKe\022a`537\000\007'$T(;T\014l_`\015\";U\ ! N(x(`\022c2?]\037P\012^\016&P:\005Lܐ\016Y\006(\035u\030\ ! gS\002v\026sp'N\013G9nG@\003(,px\017#o:f\ ! \037Ztd\021\006\027Q<\005jO\\^\017ﭳZP)ux\036:G=G[w\0248F\015dviGe\ ! VF|.\"/x\037\016g~vٜg\007\0147oW\036B&\030@m\021|ǘ^\023'MY(m\ ! ^`u\037\"]8\031`\031\016rVDSl\026}]0\013mˢ\036mG\022Vgf=Y\ ! >\015&rΔo)(9'uҜc哓\015!{Z[z_Du7ty\032\036\ ! ~8\030,/Tfw٫\002\013RӶ\032Պc\0130Q~pQf\031A\014sܳ\036G̚H\ ! P@)U\000ŨSn,ݾC]d/Qⵡ:5:Q; \032WD\013\0048c\003\ ! dB\003\013\004;Wui!&b//.PmV_hμ#0T8ַ\ ! .-|\023d\0128\007w3c X3\032=;)8ǰfC\000_όo\000\032J?k\ ! 2瞕\000X\005U\002`g#vb;eѩ1(pl\006pa\000ە\ ! ں4-&\021D\0070\000Î2ܦ\000׭Rb\002\011{\016\0228gv0\026\0125&%S!\000\ ! \033(Ǘ\004\015B\006h") ### end From anadelonbrin at users.sourceforge.net Sun Apr 6 18:15:53 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Sun Apr 6 20:15:57 2003 Subject: [Spambayes-checkins] spambayes FAQ.txt,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv13697 Added Files: FAQ.txt Log Message: Frequently asked questions about the spambayes project. --- NEW FILE: FAQ.txt --- Q: Hey! Why don't you implement cool tokenizer trick X? I think it would really foil those spammers! 
A: Have you run your tokenizer trick against a set of messages to see if it actually works? Many times what seems like a good idea turns out not to help much, and sometimes even hurts. If you have a good idea, you've run it against a batch of messages and can prove that it helps, paste the code for your technique and the proof to the mailing list. If you're not a coder, but are really keen on your idea, post a feature request on the project page, and wait for someone else to code it for you (but make sure you do some testing when it's done). Otherwise, you will likely get a message from Tim Peters about why you need to test your idea :) Q: I just got a spam, but the system said it was "unsure". Why couldn't it tell that it was spam - it's obvious? A: It may be obvious to you, but the classifier only works on the information it has been given. Maybe this is "new" (you've never seen this particular flavour of spam before), or maybe there aren't enough clues in the message which the system is aware of as strong spam clues. Q: OK, I trained on that message. But I just got *another* one, and the stupid system still thinks it's unsure. Why did it ignore me??? A: It didn't, but you may need to train on a few more of this type of message to get it classified as "spam". The classification algorithm weights its results based on the number of times it has seen a particular clue, so that clues unique to this type of message may need a few more instances to become "convincing". From anadelonbrin at users.sourceforge.net Sun Apr 6 18:53:48 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Sun Apr 6 20:53:51 2003 Subject: [Spambayes-checkins] spambayes FAQ.txt,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv27837 Modified Files: FAQ.txt Log Message: Add FAQ about training from scratch. 
Index: FAQ.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/FAQ.txt,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** FAQ.txt 7 Apr 2003 00:15:51 -0000 1.1 --- FAQ.txt 7 Apr 2003 00:53:45 -0000 1.2 *************** *** 28,29 **** --- 28,41 ---- has seen a particular clue, so that clues unique to this type of message may need a few more instances to become "convincing". + + Q: I've mucked up my training and I want to start all over again, + but there isn't an option for this anywhere. What do I do? + A: Because training from scratch is a very rare occurrence, and because + deleting all your training information is something you don't want + to do by accident, there isn't an option for this. However, you + can quite simply do this manually. All the training data is stored + in a file, usually called hammie.db, and if you delete (or rename) + this, then you will start training from scratch. If you are using + the web interface for the POP3 proxy, the configuration page tells + you what this file is called (and where it is) down towards the + bottom of the page. From anadelonbrin at users.sourceforge.net Mon Apr 7 01:26:29 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Mon Apr 7 03:26:38 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv12257 Added Files: imapfilter.py Log Message: First steps towards an IMAP spambayes solution. Currently will do (very) basic filtering and training. --- NEW FILE: imapfilter.py --- #!/usr/bin/env python """An IMAP filter. An IMAP message box is scanned and all non-scored messages are scored and (where necessary) filtered. It is suggested that this filter is set to run at certain intervals. Note that it is (currently) fairly slow, so this should not be too often.
An alternative to this would be to keep the filter running and logged in, and periodically check for new mail. The original filter design owed much to isbg by Roger Binns (http://www.rogerbinns.com/isbg). """ # This module is part of the spambayes project, which is Copyright 2002-3 # The Python Software Foundation and is covered by the Python Software # Foundation license. __author__ = "Tony Meyer " __credits__ = "All the Spambayes folk." # This code will benefit immensely from # (a) The new message class, which can hold information such as # whether a message has been seen before # (b) The new header stuff, which will abstract out adding all # the headers try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 import socket import imaplib import os import re import time from spambayes.Options import options from spambayes import tokenizer, storage class IMAPFilter(object): def __init__(self): self.imap = imaplib.IMAP4(options.imap_server, options.imap_port) if options.verbose: print "Loading database...", filename = options.pop3proxy_persistent_storage_file filename = os.path.expanduser(filename) if options.pop3proxy_persistent_use_database: self.bayes = storage.DBDictClassifier(filename) else: self.bayes = storage.PickledClassifier(filename) if options.verbose: print "Done." # Unique names for cached messages - see getNewMessageName() below. 
self.lastBaseMessageName = '' self.uniquifier = 2 def Login(self): lgn = self.imap.login(options.imap_username, options.imap_password) self._check(lgn, 'login') def _check(self, response, command): if response[0] != "OK": print "Invalid response to %s:\n%s" % (command, response) sys.exit(-1) def _getUIDs(self, low, high): # Retreive a list of uids corresponding to the given range if high < low: return [] # request message range range = str(low) + ":" + str(high) res = self.imap.fetch(range, "UID") self._check(res, 'fetch') r = re.compile(r"[0-9]+ \(UID ([0-9]+)\)") res2 = [] for i in res[1]: mo = r.match(i) if mo is not None: res2.append(mo.group(1)) return res2 def getNewMessageName(self): # The message name is the time it arrived, with a uniquifier # appended if two arrive within one clock tick of each other. # (This is completely taken from the same function in pop3proxy's # State class.) messageName = "%10.10d" % long(time.time()) if messageName == self.lastBaseMessageName: messageName = "%s-%d" % (messageName, self.uniquifier) self.uniquifier += 1 else: self.lastBaseMessageName = messageName self.uniquifier = 2 return messageName def _selectFolder(self, name, read_only): folder = self.imap.select(name, read_only) self._check(folder, 'select') return folder def RetrieveMessage(self, uid): response = self.imap.uid("FETCH", uid, "(RFC822.PEEK)") self._check(response, 'uid fetch') try: messageText = response[1][0][1] except: print "Could not retrieve message (id %s)" % uid messageText = "" return messageText def TrainFolder(self, folder_name, isSpam): response = self._selectFolder(folder_name, True) uids = self._getUIDs(1, int(response[1][0])) for uid in uids: messageText = self.RetrieveMessage(uid) self.bayes.learn(tokenizer.tokenize(messageText), isSpam) def Train(self): if options.verbose: t = time.time() if options.imap_ham_train_folders != "": ham_training_folders = options.imap_ham_train_folders.split() for fol in ham_training_folders: self.TrainFolder(fol, 
False) if options.imap_spam_train_folders != "": spam_training_folders = options.imap_spam_train_folders.split(' ' ) for fol in spam_training_folders: self.TrainFolder(fol, True) self.bayes.store() if options.verbose: print "Training took", time.time() - t, "seconds." def Filter(self): if options.verbose: t = time.time() inbox = self._selectFolder(options.imap_inbox, False) # the number of messages are returned # get all the corresponding UIDs uids = self._getUIDs(1, int(inbox[1][0])) for uid in uids: messageText = self.RetrieveMessage(uid) (prob, clues) = self.bayes.spamprob\ (tokenizer.tokenize(messageText), evidence=True) messageText = self._addHeaders(messageText, prob, clues) #uid = self._updateMessage(uid, messageText) self._filterMessage(uid, prob) if options.verbose: print "Filtering took", time.time() - t, "seconds." def Logout(self): # sign off if options.imap_expunge: self.imap.expunge() self.imap.logout() def _addHeaders(self, messageText, prob, clues): if options.pop3proxy_strip_incoming_mailids == True: s = re.compile(options.pop3proxy_mailid_header_name + \ ': [\d-]+[\\r]?[\\n]?') messageText = s.sub('', messageText) headers, body = re.split(r'\n\r?\n', messageText, 1) messageName = self.getNewMessageName() headers += '\n' if options.pop3proxy_add_mailid_to.find("header") != -1: headers += options.pop3proxy_mailid_header_name \ + ": " + messageName + "\r\n" if options.pop3proxy_add_mailid_to.find("body") != -1: body = body[:len(body)-3] + \ options.pop3proxy_mailid_header_name + ": " \ + messageName + "\r\n.\r\n" if options.pop3proxy_include_prob: headers += '%s: %s\r\n' % (options.pop3proxy_prob_header_name, prob) if options.pop3proxy_include_thermostat: thermostat = '**********' headers += '%s: %s\r\n' % \ (options.pop3proxy_thermostat_header_name, thermostat[int(prob*10):]) if options.pop3proxy_include_evidence: headers += options.pop3proxy_evidence_header_name + ": " headers += "; ".join(["%r: %.2f" % (word, prob) for word, score in clues if 
(word[0] == '*' or score <= options.clue_mailheader_cutoff or score >= 1.0 - options.clue_mailheader_cutoff)]) headers += "\r\n" headers += "\r\n" return headers + body def _updateMessage(self, uid, messageText): # we can't actually update the message with IMAP # XXX (someone tell me if this is wrong!) # so what we do is create a new message and delete the old one # we return the new uid, which we obtain by searching for the # spambayes id res = self.imap.append(options.imap_inbox, None, self._extractTimeFromMessage(messageText), messageText) self._check(res, "append") res = self.imap.uid("STORE", uid, "+FLAGS.SILENT", "(\\Deleted)") self._check(res, "uid store") res = self.imap.uid("SEARCH", "(TEXT)", messageText) self._check(res, "uid search") return res[1][0] def _extractTimeFromMessage(self, messageText): # When we create a new copy of a message, we need to specify # a timestamp for the message. Ideally, this would be the # timestamp from the message itself, but for the moment, we # just use the current time. return imaplib.Time2Internaldate(time.time()) def _moveMessage(self, uid, dest): # The IMAP copy command makes an alias, not a whole new # copy, so what we need to do (sigh) is create a new message # in the correct folder, and delete the old one # XXX (someone tell me if this is wrong, too!) 
response = self.imap.uid("FETCH", uid, "(RFC822.PEEK)") self._check(response, 'uid fetch') messageText = response[1][0][1] response = self.imap.append(dest, None, self._extractTimeFromMessage(messageText), messageText) self._check(response, "append") res = self.imap.uid("STORE", uid, "+FLAGS.SILENT", "(\\Deleted)") self._check(response, "uid store") def _filterMessage(self, uid, prob): if prob < options.ham_cutoff: # we leave ham alone pass elif prob > options.spam_cutoff: self._moveMessage(uid, options.imap_spam_folder) else: self._moveMessage(uid, options.imap_unsure_folder) if __name__ == '__main__': options.verbose = True imap_filter = IMAPFilter() imap_filter.Login() imap_filter.Train() imap_filter.Filter() imap_filter.Logout() From anadelonbrin at users.sourceforge.net Mon Apr 7 01:26:54 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Mon Apr 7 03:26:57 2003 Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.24,1.25 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv12257/spambayes Modified Files: Options.py Log Message: First steps towards an IMAP spambayes solution. Currently will do (very) basic filtering and training. 
Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.24 retrieving revision 1.25 diff -C2 -d -r1.24 -r1.25 *** Options.py 14 Mar 2003 02:14:37 -0000 1.24 --- Options.py 7 Apr 2003 07:26:17 -0000 1.25 *************** *** 415,418 **** --- 415,434 ---- smtpproxy_shutdown_address = spambayes_shutdown@localhost + [imap] + imap_server: + # the default IMAP port is 143, or 993 if using SSL + imap_port: 143 + imap_username: + imap_password: + imap_expunge: False + imap_inbox: inbox + imap_unsure_folder: + imap_spam_folder: + # comma delimited list of folders that will be examined for messages + # to train as ham + imap_ham_train_folders: + # as for imap_ham_train_folders, but scan for messages to train as spam + imap_spam_train_folders: + [html_ui] html_ui_port: 8880 *************** *** 533,536 **** --- 549,563 ---- 'smtpproxy_ports' : string_cracker, }, + 'imap': {'imap_server' : string_cracker, + 'imap_port' : int_cracker, + 'imap_username' : string_cracker, + 'imap_password' : string_cracker, + 'imap_inbox' : string_cracker, + 'imap_unsure_folder' : string_cracker, + 'imap_spam_folder' : string_cracker, + 'imap_ham_train_folders' : string_cracker, + 'imap_spam_train_folders' : string_cracker, + 'imap_expunge' : boolean_cracker, + }, 'html_ui': {'html_ui_port': int_cracker, 'html_ui_launch_browser': boolean_cracker, From timstone4 at users.sourceforge.net Mon Apr 7 19:21:30 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Mon Apr 7 21:21:34 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv14085 Added Files: message.py Log Message: An extension of email.message which includes methods that are useful for spambayes. Specifically, message training and classification is remembered in a simple pickle, by message id. 
This allows for convenient untraining by any client that uses it to handle messages. --- NEW FILE: message.py --- #! /usr/bin/env python '''message.py - Core Spambayes classes. Classes: Message - an email.Message.Message, extended with spambayes methods MessageInfoDB - persistent state storage for Message Abstract: MessageInfoDB is a simple shelve persistency class for the persistent state of a Message object. For the moment, the db name is hard-coded, but we'll have to do this a different way. Mark Hammond's idea is to have a master database, that simply keeps track of the names and instances of other databases, such as the wordinfo and msginfo databases. The MessageInfoDB currently does not provide iterators, but should at some point. This would allow us to, for example, see how many messages have been trained differently than their classification, for fp/fn assessment purposes. Message is an extension of the email package Message class, to include persistent message information and Spambayes specific header manipulations. The persistent state -currently- consists of the message id, its current classification, and its current training. The payload is not persisted. Payload persistence is left to whatever mail client software is being used. Usage: A typical classification usage pattern would be something like: >>>msg = spambayes.message.Message() >>>msg.setPayload(substance) # substance comes from somewhere else >>>id = msg.setIdFromPayload() >>>if id is None: >>> msg.setId(time()) # or some unique identifier >>>msg.delSBHeaders() # never include sb headers in a classification >>># bayes object is your responsibility >>>(prob, clues) = bayes.spamprob(msg.asTokens(), evidence=True) >>>msg.addSBHeaders(prob, clues) A typical usage pattern to train as spam would be something like: >>>msg = spambayes.message.Message() >>>msg.setPayload(substance) # substance comes from somewhere else >>>id = msg.setId(msgid) # id is a fname, outlook msg id, something...
>>>msg.delSBHeaders() # never include sb headers in a train >>>if msg.isTrndHam(): >>> bayes.unlearn(msg.asTokens(), False) # untrain the ham >>>bayes.learn(msg.asTokens(), True) # train as spam >>>msg.trndAsSpam() To Do: o Master DB module o Suggestions? ''' # This module is part of the spambayes project, which is Copyright 2002 # The Python Software Foundation and is covered by the Python Software # Foundation license. __author__ = "Tim Stone " __credits__ = "Mark Hammond, Tony Meyers, all the spambayes contributors." from __future__ import generators try: True, False except NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 def bool(val): return not not val import sys import email.Message import email.Parser from spambayes.tokenizer import tokenize from spambayes.Options import options from cStringIO import StringIO from spambayes import dbmstorage import shelve # Make shelve use binary pickles by default. oldShelvePickler = shelve.Pickler def binaryDefaultPickler(f, binary=1): return oldShelvePickler(f, binary) shelve.Pickler = binaryDefaultPickler class MessageInfoDB: def __init__(self, db_name, mode='c'): self.mode = mode self.db_name = db_name self.dbm = dbmstorage.open(self.db_name, self.mode) self.db = shelve.Shelf(self.dbm) def store(self): self.db.sync() def _getState(self, msg): try: return self.db[msg.getId()] except KeyError: return None def _setState(self, msg): self.db[msg.getId()] = msg def _delState(self, msg): del self.db[msg.getId()] # this should come from a mark hammond idea of a master db msginfoDB = MessageInfoDB("spambayes.messageinfo.db") class Message(email.Message.Message): '''An email.Message.Message extended for Spambayes''' def __init__(self): email.Message.Message.__init__(self) # persistent state self.id = None self.c = None self.t = None # non-persistent state includes all of email.Message.Message state def setPayload(self, payload): prs = email.Parser.HeaderParser() prs._parseheaders(self, StringIO(payload)) # 
we may want to do some header parsing error handling here # to try to extract important headers regardless of malformations prs._parsebody(self, StringIO(payload)) def setIdFromPayload(self): try: self.setId(self[options.pop3proxy_mailid_header_name]) except KeyError: return None return self.id def setId(self, id): if self.id: raise ValueError, "MsgId has already been set, cannot be changed" # we should probably enforce type(id) is StringType. # the database will insist upon it, but at that point, it's harder # to diagnose if id is None: raise ValueError, "MsgId must not be None" self.id = id msginfoDB._getState(self) def getId(self): return self.id def addSBHeaders(self, prob, clues): '''Add hammie header, and remember message's classification. Also, add optional headers if needed.''' if prob < options.ham_cutoff: disposition = options.header_ham_string self.clsfyAsHam() elif prob > options.spam_cutoff: disposition = options.header_spam_string self.clsfyAsSpam() else: disposition = options.header_unsure_string self.clsfyAsUnsure() self[options.hammie_header_name] = disposition if options.pop3proxy_include_prob: self[options.pop3proxy_prob_header_name] = prob if options.pop3proxy_include_thermostat: thermostat = '**********' self[options.pop3proxy_thermostat_header_name] = \ thermostat[:int(prob*10)] if options.pop3proxy_include_evidence: evd = "; ".join(["%r: %.2f" % (word, score) for word, score in clues if (word[0] == '*' or score <= options.clue_mailheader_cutoff or score >= 1.0 - options.clue_mailheader_cutoff)]) self[options.pop3proxy_evidence_header_name] = evd if options.pop3proxy_add_mailid_to.find("header") != -1: self[options.pop3proxy_mailid_header_name] = self.id # This won't work for now, because email.Message does not isolate message body # This is also not consistent with the function of this method... 
# if options.pop3proxy_add_mailid_to.find("body") != -1: # body = body[:len(body)-3] + \ # options.pop3proxy_mailid_header_name + ": " \ # + messageName + "\r\n.\r\n" def delSBHeaders(self): del self[options.hammie_header_name] del self[options.pop3proxy_mailid_header_name] del self[options.hammie_header_name + "-ID"] # test mode header del self[options.pop3proxy_prob_header_name] del self[options.pop3proxy_thermostat_header_name] del self[options.pop3proxy_evidence_header_name] def asTokens(self): # use as_string() here because multipart/digest will return # a list of message objects if get_payload() is used return tokenize(self.as_string()) def modified(self): if self.id: # only persist if key is present msginfoDB._setState(self) def isClsfdSpam(self): return self.c == 's' def isClsfdHam(self): return self.c == 'h' def isClsfdUnsure(self): return self.c == 'u' def isClassified(self): return not self.c is None def clsfyAsSpam(self): self.c = 's' self.modified() def clsfyAsHam(self): self.c = 'h' self.modified() def clsfyAsUnsure(self): self.c = 'u' self.modified() def getClassification(self): return self.c def isTrndSpam(self): return self.t == 's' def isTrndHam(self): return self.t == 'h' def trndAsSpam(self): self.t = 's' self.modified() def trndAsHam(self): self.t = 'h' self.modified() def notTrained(self): self.t = None self.modified() def isTrained(self): return not self.t is None def getTraining(self): return self.t def __repr__(self): return "core.Message%r" % repr(self.__getstate__()) def __getstate__(self): return (self.id, self.c, self.t) def __setstate__(self, t): (self.id, self.c, self.t) = t From timstone4 at users.sourceforge.net Mon Apr 7 19:25:04 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Mon Apr 7 21:25:13 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv14906 Modified Files: imapfilter.py Log Message: Changed to use the 
message class. Untested at this point. Your turn, Tony. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** imapfilter.py 7 Apr 2003 07:26:24 -0000 1.1 --- imapfilter.py 8 Apr 2003 01:24:17 -0000 1.2 *************** *** 49,55 **** filename = os.path.expanduser(filename) if options.pop3proxy_persistent_use_database: ! self.bayes = storage.DBDictClassifier(filename) else: ! self.bayes = storage.PickledClassifier(filename) if options.verbose: print "Done." --- 49,55 ---- filename = os.path.expanduser(filename) if options.pop3proxy_persistent_use_database: ! self.classifier = storage.DBDictClassifier(filename) else: ! self.classifier = storage.PickledClassifier(filename) if options.verbose: print "Done." *************** *** 109,113 **** print "Could not retrieve message (id %s)" % uid messageText = "" ! return messageText def TrainFolder(self, folder_name, isSpam): --- 109,120 ---- print "Could not retrieve message (id %s)" % uid messageText = "" ! ! msg = spambayes.message.Message() ! msg.setPayload(messageText) ! msg.setId(uid) ! ! msg.delSBHeaders() # never include sb headers in a train ! ! return msg def TrainFolder(self, folder_name, isSpam): *************** *** 115,120 **** uids = self._getUIDs(1, int(response[1][0])) for uid in uids: ! messageText = self.RetrieveMessage(uid) ! self.bayes.learn(tokenizer.tokenize(messageText), isSpam) def Train(self): --- 122,139 ---- uids = self._getUIDs(1, int(response[1][0])) for uid in uids: ! msg = self.RetrieveMessage(uid) ! ! if msg.isTrained(): ! if isSpam and msg.isTrndHam(): ! bayes.unlearn(msg.asTokens(), False) # untrain the ham ! elif not isSpam and msg.isTrndSpam(): ! bayes.unlearn(msg.asTokens(), True) ! ! bayes.learn(msg.asTokens(), isSpam) # train as spam ! ! if isSpam: ! msg.trndAsSpam() ! else: ! 
msg.trndAsHam() def Train(self): *************** *** 129,133 **** for fol in spam_training_folders: self.TrainFolder(fol, True) ! self.bayes.store() if options.verbose: print "Training took", time.time() - t, "seconds." --- 148,152 ---- for fol in spam_training_folders: self.TrainFolder(fol, True) ! self.classifier.store() if options.verbose: print "Training took", time.time() - t, "seconds." *************** *** 142,152 **** for uid in uids: ! messageText = self.RetrieveMessage(uid) ! (prob, clues) = self.bayes.spamprob\ ! (tokenizer.tokenize(messageText), ! evidence=True) ! messageText = self._addHeaders(messageText, prob, clues) ! #uid = self._updateMessage(uid, messageText) ! self._filterMessage(uid, prob) if options.verbose: print "Filtering took", time.time() - t, "seconds." --- 161,169 ---- for uid in uids: ! msg = self.RetrieveMessage(uid) ! (prob, clues) = self.classifier.spamprob(msg.asTokens(), evidence=True) ! msg.addSBHeaders(prob, clues) # adds headers and remembers classification ! self._updateMessage(msg) ! self._filterMessage(msg) if options.verbose: print "Filtering took", time.time() - t, "seconds." *************** *** 158,198 **** self.imap.logout() ! def _addHeaders(self, messageText, prob, clues): ! if options.pop3proxy_strip_incoming_mailids == True: ! s = re.compile(options.pop3proxy_mailid_header_name + \ ! ': [\d-]+[\\r]?[\\n]?') ! messageText = s.sub('', messageText) ! ! headers, body = re.split(r'\n\r?\n', messageText, 1) ! messageName = self.getNewMessageName() ! headers += '\n' ! if options.pop3proxy_add_mailid_to.find("header") != -1: ! headers += options.pop3proxy_mailid_header_name \ ! + ": " + messageName + "\r\n" ! if options.pop3proxy_add_mailid_to.find("body") != -1: ! body = body[:len(body)-3] + \ ! options.pop3proxy_mailid_header_name + ": " \ ! + messageName + "\r\n.\r\n" ! ! if options.pop3proxy_include_prob: ! headers += '%s: %s\r\n' % (options.pop3proxy_prob_header_name, ! prob) ! if options.pop3proxy_include_thermostat: ! 
thermostat = '**********' ! headers += '%s: %s\r\n' % \ ! (options.pop3proxy_thermostat_header_name, ! thermostat[int(prob*10):]) ! if options.pop3proxy_include_evidence: ! headers += options.pop3proxy_evidence_header_name + ": " ! headers += "; ".join(["%r: %.2f" % (word, prob) ! for word, score in clues ! if (word[0] == '*' or ! score <= options.clue_mailheader_cutoff or ! score >= 1.0 - options.clue_mailheader_cutoff)]) ! headers += "\r\n" ! headers += "\r\n" ! return headers + body ! ! def _updateMessage(self, uid, messageText): # we can't actually update the message with IMAP # XXX (someone tell me if this is wrong!) --- 175,179 ---- self.imap.logout() ! def _updateMessage(self, msg): # we can't actually update the message with IMAP # XXX (someone tell me if this is wrong!) *************** *** 201,214 **** # spambayes id res = self.imap.append(options.imap_inbox, None, ! self._extractTimeFromMessage(messageText), ! messageText) self._check(res, "append") ! res = self.imap.uid("STORE", uid, "+FLAGS.SILENT", "(\\Deleted)") self._check(res, "uid store") ! res = self.imap.uid("SEARCH", "(TEXT)", messageText) self._check(res, "uid search") return res[1][0] ! def _extractTimeFromMessage(self, messageText): # When we create a new copy of a message, we need to specify # a timestamp for the message. Ideally, this would be the --- 182,195 ---- # spambayes id res = self.imap.append(options.imap_inbox, None, ! self._extractTimeFromMessage(msg), ! msg.payload()) self._check(res, "append") ! res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)") self._check(res, "uid store") ! res = self.imap.uid("SEARCH", "(TEXT)", msg.payload()) self._check(res, "uid search") return res[1][0] ! def _extractTimeFromMessage(self, msg): # When we create a new copy of a message, we need to specify # a timestamp for the message. Ideally, this would be the *************** *** 217,243 **** return imaplib.Time2Internaldate(time.time()) ! 
def _moveMessage(self, uid, dest): # The IMAP copy command makes an alias, not a whole new # copy, so what we need to do (sigh) is create a new message # in the correct folder, and delete the old one # XXX (someone tell me if this is wrong, too!) ! response = self.imap.uid("FETCH", uid, "(RFC822.PEEK)") self._check(response, 'uid fetch') ! messageText = response[1][0][1] ! response = self.imap.append(dest, None, ! self._extractTimeFromMessage(messageText), ! messageText) self._check(response, "append") ! res = self.imap.uid("STORE", uid, "+FLAGS.SILENT", "(\\Deleted)") self._check(response, "uid store") ! def _filterMessage(self, uid, prob): ! if prob < options.ham_cutoff: # we leave ham alone pass ! elif prob > options.spam_cutoff: ! self._moveMessage(uid, options.imap_spam_folder) else: ! self._moveMessage(uid, options.imap_unsure_folder) if __name__ == '__main__': --- 198,226 ---- return imaplib.Time2Internaldate(time.time()) ! def _moveMessage(self, msg, dest): # The IMAP copy command makes an alias, not a whole new # copy, so what we need to do (sigh) is create a new message # in the correct folder, and delete the old one # XXX (someone tell me if this is wrong, too!) ! response = self.imap.uid("FETCH", msg.getId(), "(RFC822.PEEK)") self._check(response, 'uid fetch') ! ! msg = spambayes.message.Message() ! msg.setPayload(response[1][0][1]) ! msg.setId(_extractTimeFromMessage(msg)) ! ! response = self.imap.append(dest, None, msg.getId(), msg.payload()) self._check(response, "append") ! res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)") self._check(response, "uid store") ! def _filterMessage(self, msg, prob): ! if msg.isClsfdHam(): # we leave ham alone pass ! elif msg.isClsfdSpam(): ! self._moveMessage(msg, options.imap_spam_folder) else: ! 
self._moveMessage(msg, options.imap_unsure_folder) if __name__ == '__main__': From anadelonbrin at users.sourceforge.net Tue Apr 8 01:35:44 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Tue Apr 8 03:35:50 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv3092/spambayes Modified Files: message.py Log Message: Fixes the spelling of my name ;) Temp fix for infinite recursion error. Adds a couple of little functions to simply use. Adds a changeId function. Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** message.py 8 Apr 2003 01:21:28 -0000 1.1 --- message.py 8 Apr 2003 07:35:42 -0000 1.2 *************** *** 69,73 **** __author__ = "Tim Stone " ! __credits__ = "Mark Hammond, Tony Meyers, all the spambayes contributors." from __future__ import generators --- 69,73 ---- __author__ = "Tim Stone " ! __credits__ = "Mark Hammond, Tony Meyer, all the spambayes contributors." from __future__ import generators *************** *** 94,102 **** import shelve # Make shelve use binary pickles by default. ! oldShelvePickler = shelve.Pickler ! def binaryDefaultPickler(f, binary=1): ! return oldShelvePickler(f, binary) ! shelve.Pickler = binaryDefaultPickler --- 94,104 ---- import shelve + # XXX Tim, what do you want to do here? This + # XXX recurses infinately at the moment # Make shelve use binary pickles by default. ! #oldShelvePickler = shelve.Pickler ! #def binaryDefaultPickler(f, binary=1): ! # return oldShelvePickler(f, binary) ! #shelve.Pickler = binaryDefaultPickler *************** *** 155,163 **** return self.id def setId(self, id): if self.id: raise ValueError, "MsgId has already been set, cannot be changed" ! 
# we should probably enforce type(id) is StringType. # the database will insist upon it, but at that point, it's harder --- 157,179 ---- return self.id + + def changeID(self, id): + # We cannot re-set an id (see below). However there are + # occasionally times when the id for a message will change, + # for example, on an IMAP server (or possibly an exchange + # server), the server may change the ids that we are using + # We enforce that this must be an explicit *change* rather + # than simply re-setting, by having this as a separate + # function + if not self.id: + raise ValueError, "MsgID has not been set, cannot be changed" + self._setId(id) def setId(self, id): if self.id: raise ValueError, "MsgId has already been set, cannot be changed" ! self._setId(id) ! ! def _setId(self, id): # we should probably enforce type(id) is StringType. # the database will insist upon it, but at that point, it's harder *************** *** 274,277 **** --- 290,306 ---- self.modified() + def isTrndAs(self, isSpam): + if self.t == 'h' and not isSpam: + return True + if self.t == 's' and isSpam: + return True + return False + + def trndAs(self, isSpam): + if isSpam: + self.t = 's' + else: + self.t = 'h' + def notTrained(self): self.t = None From anadelonbrin at users.sourceforge.net Tue Apr 8 01:37:31 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Tue Apr 8 03:37:35 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv4387 Modified Files: imapfilter.py Log Message: Introduces an IMAPMessage class based on the spambayes Message class. Introduces an iterable IMAPFolder class. Changes the code to use all of this. Changed to allow multiple folders to filter. Training seems to work, although filtering isn't. I'll get to it. 
Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** imapfilter.py 8 Apr 2003 01:24:17 -0000 1.2 --- imapfilter.py 8 Apr 2003 07:37:28 -0000 1.3 *************** *** 1,4 **** --- 1,6 ---- #!/usr/bin/env python + from __future__ import generators + """An IMAP filter. An IMAP message box is scanned and all non-scored messages are scored and (where necessary) filtered. *************** *** 18,28 **** __author__ = "Tony Meyer " ! __credits__ = "All the Spambayes folk." ! # This code will benefit immensely from ! # (a) The new message class, which can hold information such as ! # whether a message has been seen before ! # (b) The new header stuff, which will abstract out adding all ! # the headers try: --- 20,31 ---- __author__ = "Tony Meyer " ! __credits__ = "Tim Stone, All the Spambayes folk." ! # Tony thinks it would be nice if there was a web ui to ! # this for the initial setup (i.e. like pop3proxy), which offered ! # a list of folders to filter/train/etc. It could then record a ! # uid for the folder rather than a name, and it avoids the problems ! # with different imap servers having different naming styles ! # a list is retrieved via imap.list() try: *************** *** 37,47 **** import re import time from spambayes.Options import options ! from spambayes import tokenizer, storage class IMAPFilter(object): def __init__(self): ! self.imap = imaplib.IMAP4(options.imap_server, options.imap_port) if options.verbose: print "Loading database...", --- 40,138 ---- import re import time + import sys from spambayes.Options import options ! from spambayes import tokenizer, storage, message ! ! # global IMAPlib object ! imap = None ! ! class IMAPMessage(message.Message): ! # response checking is necessary throughout this class ! def __init__(self, folder_id, folder_name, message_id): ! 
message.Message.__init__(self) ! self.setId(message_id) ! self.folder_id = folder_id ! self.folder_name = folder_name ! ! def extractTime(self): ! # When we create a new copy of a message, we need to specify ! # a timestamp for the message. Ideally, this would be the ! # timestamp from the message itself, but for the moment, we ! # just use the current time. ! return imaplib.Time2Internaldate(time.time()) ! ! def Update(self): ! # we can't actually update the message with IMAP ! # so what we do is create a new message and delete the old one ! response = imap.append(self.folder_name, None, ! self.extractTime(), self.get_payload()) ! response = imap.select(self.folder_name, False) ! response = imap.uid("STORE", self.getId(), "+FLAGS.SILENT", ! "(\\Deleted)") ! # we need to update the uid, as it will have changed ! response = imap.uid("SEARCH", "(TEXT)", self.get_payload()) ! self.changeId(response[1][0]) ! ! ! class IMAPFolder(object): ! # response checking is necessary throughout this class ! def __init__(self, folder_name, readOnly=True): ! self.name = folder_name ! # Convert folder name to a uid ! self.uid = None ! response = imap.select(self.name, readOnly) ! responses = imap.response("OK")[1] ! for response in responses: ! if response[:13] == "[UIDVALIDITY ": ! r = re.compile(r"(?P<uid>\d+)") ! self.uid = r.search(response[13:]).group('uid') ! # We really want to use RFC822.PEEK here, as that doesn't effect ! # the status of the message. Unfortunately, it appears that not ! # all IMAP servers support this, even though it is in RFC1730 ! self.rfc822_command = "(RFC822.PEEK)" ! response = imap.fetch("1:1", self.rfc822_command) ! if response[0] != "OK": ! self.rfc822_command = "(RFC822)" ! ! def __iter__(self): ! '''IMAPFolder is iterable''' ! for key in self.keys(): ! try: ! yield self[key] ! except KeyError: ! pass ! ! def keys(self): ! '''Returns uids for all the messages in the folder''' ! # request message range ! response = imap.select(self.name, True) !
total_messages = response[1][0] ! if total_messages == '0': ! return [] ! response = imap.fetch("1:" + total_messages, "UID") ! r = re.compile(r"[0-9]+ \(UID ([0-9]+)\)") ! uids = [] ! for i in response[1]: ! mo = r.match(i) ! if mo is not None: ! uids.append(mo.group(1)) ! return uids ! ! def __getitem__(self, key): ! '''Return message matching the given uid''' ! response = imap.uid("FETCH", key, self.rfc822_command) ! messageText = response[1][0][1] ! # we return an instance of *our* message class, not the ! # raw rfc822 message ! msg = IMAPMessage(self.uid, self.name, key) ! msg.setPayload(messageText) ! return msg ! class IMAPFilter(object): def __init__(self): ! global imap ! imap = imaplib.IMAP4(options.imap_server, options.imap_port) if options.verbose: print "Loading database...", *************** *** 54,64 **** if options.verbose: print "Done." - # Unique names for cached messages - see getNewMessageName() below. - self.lastBaseMessageName = '' - self.uniquifier = 2 - - def Login(self): - lgn = self.imap.login(options.imap_username, options.imap_password) - self._check(lgn, 'login') def _check(self, response, command): --- 145,148 ---- *************** *** 67,139 **** sys.exit(-1) - def _getUIDs(self, low, high): - # Retreive a list of uids corresponding to the given range - if high < low: return [] - # request message range - range = str(low) + ":" + str(high) - res = self.imap.fetch(range, "UID") - self._check(res, 'fetch') - r = re.compile(r"[0-9]+ \(UID ([0-9]+)\)") - res2 = [] - for i in res[1]: - mo = r.match(i) - if mo is not None: - res2.append(mo.group(1)) - return res2 - - def getNewMessageName(self): - # The message name is the time it arrived, with a uniquifier - # appended if two arrive within one clock tick of each other. - # (This is completely taken from the same function in pop3proxy's - # State class.) 
- messageName = "%10.10d" % long(time.time()) - if messageName == self.lastBaseMessageName: - messageName = "%s-%d" % (messageName, self.uniquifier) - self.uniquifier += 1 - else: - self.lastBaseMessageName = messageName - self.uniquifier = 2 - return messageName - def _selectFolder(self, name, read_only): ! folder = self.imap.select(name, read_only) self._check(folder, 'select') return folder ! def RetrieveMessage(self, uid): ! response = self.imap.uid("FETCH", uid, "(RFC822.PEEK)") ! self._check(response, 'uid fetch') ! try: ! messageText = response[1][0][1] ! except: ! print "Could not retrieve message (id %s)" % uid ! messageText = "" ! ! msg = spambayes.message.Message() ! msg.setPayload(messageText) ! msg.setId(uid) ! ! msg.delSBHeaders() # never include sb headers in a train ! ! return msg def TrainFolder(self, folder_name, isSpam): ! response = self._selectFolder(folder_name, True) ! uids = self._getUIDs(1, int(response[1][0])) ! for uid in uids: ! msg = self.RetrieveMessage(uid) ! if msg.isTrained(): ! if isSpam and msg.isTrndHam(): ! bayes.unlearn(msg.asTokens(), False) # untrain the ham ! elif not isSpam and msg.isTrndSpam(): ! bayes.unlearn(msg.asTokens(), True) ! ! bayes.learn(msg.asTokens(), isSpam) # train as spam ! ! if isSpam: ! msg.trndAsSpam() ! else: ! msg.trndAsHam() def Train(self): --- 151,175 ---- sys.exit(-1) def _selectFolder(self, name, read_only): ! folder = imap.select(name, read_only) self._check(folder, 'select') return folder ! def Login(self): ! lgn = imap.login(options.imap_username, options.imap_password) ! self._check(lgn, 'login') def TrainFolder(self, folder_name, isSpam): ! folder = IMAPFolder(folder_name) ! for msg in folder: if msg.isTrained(): ! if msg.isTrndAs(isSpam): ! # already trained, nothing for us to do here ! # (we don't want to train the same message twice) ! continue ! if msg.isTrained(): ! self.classifier.unlearn(msg.asTokens(), not isSpam) ! self.classifier.learn(msg.asTokens(), isSpam) ! 
msg.trndAs(isSpam) def Train(self): *************** *** 155,169 **** if options.verbose: t = time.time() ! inbox = self._selectFolder(options.imap_inbox, False) ! # the number of messages are returned ! # get all the corresponding UIDs ! uids = self._getUIDs(1, int(inbox[1][0])) ! ! for uid in uids: ! msg = self.RetrieveMessage(uid) ! (prob, clues) = self.classifier.spamprob(msg.asTokens(), evidence=True) ! msg.addSBHeaders(prob, clues) # adds headers and remembers classification ! self._updateMessage(msg) ! self._filterMessage(msg) if options.verbose: print "Filtering took", time.time() - t, "seconds." --- 191,204 ---- if options.verbose: t = time.time() ! for filter_folder in options.imap_filter_folders.split(): ! folder = IMAPFolder(filter_folder, False) ! for msg in folder: ! (prob, clues) = self.classifier.spamprob(msg.asTokens(), ! evidence=True) ! # add headers and remember classification ! msg.addSBHeaders(prob, clues) ! # XXX updating is disabled for the moment ! # msg.Update() ! self._filterMessage(msg) if options.verbose: print "Filtering took", time.time() - t, "seconds." *************** *** 172,193 **** # sign off if options.imap_expunge: ! self.imap.expunge() ! self.imap.logout() ! ! def _updateMessage(self, msg): ! # we can't actually update the message with IMAP ! # XXX (someone tell me if this is wrong!) ! # so what we do is create a new message and delete the old one ! # we return the new uid, which we obtain by searching for the ! # spambayes id ! res = self.imap.append(options.imap_inbox, None, ! self._extractTimeFromMessage(msg), ! msg.payload()) ! self._check(res, "append") ! res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)") ! self._check(res, "uid store") ! res = self.imap.uid("SEARCH", "(TEXT)", msg.payload()) ! self._check(res, "uid search") ! return res[1][0] def _extractTimeFromMessage(self, msg): --- 207,212 ---- # sign off if options.imap_expunge: ! imap.expunge() ! 
imap.logout() def _extractTimeFromMessage(self, msg): *************** *** 198,221 **** return imaplib.Time2Internaldate(time.time()) ! def _moveMessage(self, msg, dest): # The IMAP copy command makes an alias, not a whole new # copy, so what we need to do (sigh) is create a new message # in the correct folder, and delete the old one ! # XXX (someone tell me if this is wrong, too!) ! response = self.imap.uid("FETCH", msg.getId(), "(RFC822.PEEK)") self._check(response, 'uid fetch') ! ! msg = spambayes.message.Message() msg.setPayload(response[1][0][1]) ! msg.setId(_extractTimeFromMessage(msg)) ! response = self.imap.append(dest, None, msg.getId(), msg.payload()) self._check(response, "append") ! res = self.imap.uid("STORE", msg.getId(), "+FLAGS.SILENT", "(\\Deleted)") self._check(response, "uid store") ! def _filterMessage(self, msg, prob): if msg.isClsfdHam(): # we leave ham alone pass elif msg.isClsfdSpam(): --- 217,246 ---- return imaplib.Time2Internaldate(time.time()) ! def _moveMessage(self, old_msg, dest): # The IMAP copy command makes an alias, not a whole new # copy, so what we need to do (sigh) is create a new message # in the correct folder, and delete the old one ! # XXX (someone tell me if this is wrong) ! response = imap.uid("FETCH", old_msg.getId(), "(RFC822)") self._check(response, 'uid fetch') ! msg = message.Message() msg.setPayload(response[1][0][1]) ! #response = imap.uid("SEARCH", "(TEXT)", msg.get_payload()) ! #self._check(response, "search") ! #self.changeId(response[1][0]) ! response = imap.append(dest, None, ! self._extractTimeFromMessage(msg), ! msg.get_payload()) self._check(response, "append") ! self._selectFolder(old_msg.folder_name, False) ! response = imap.uid("STORE", old_msg.getId(), "+FLAGS.SILENT", ! "(\\Deleted)") self._check(response, "uid store") ! 
def _filterMessage(self, msg): if msg.isClsfdHam(): # we leave ham alone + print "untouched" pass elif msg.isClsfdSpam(): *************** *** 227,230 **** --- 252,256 ---- options.verbose = True imap_filter = IMAPFilter() + # imap_filter.imap.debug = 10 imap_filter.Login() imap_filter.Train() From anadelonbrin at users.sourceforge.net Tue Apr 8 01:37:33 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Tue Apr 8 03:37:37 2003 Subject: [Spambayes-checkins] spambayes/spambayes Options.py,1.25,1.26 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv4387/spambayes Modified Files: Options.py Log Message: Introduces an IMAPMessage class based on the spambayes Message class. Introduces an iterable IMAPFolder class. Changes the code to use all of this. Changed to allow multiple folders to filter. Training seems to work, although filtering isn't. I'll get to it. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.25 retrieving revision 1.26 diff -C2 -d -r1.25 -r1.26 *** Options.py 7 Apr 2003 07:26:17 -0000 1.25 --- Options.py 8 Apr 2003 07:37:29 -0000 1.26 *************** *** 422,426 **** imap_password: imap_expunge: False ! imap_inbox: inbox imap_unsure_folder: imap_spam_folder: --- 422,426 ---- imap_password: imap_expunge: False ! imap_filter_folders: INBOX imap_unsure_folder: imap_spam_folder: *************** *** 553,557 **** 'imap_username' : string_cracker, 'imap_password' : string_cracker, ! 'imap_inbox' : string_cracker, 'imap_unsure_folder' : string_cracker, 'imap_spam_folder' : string_cracker, --- 553,557 ---- 'imap_username' : string_cracker, 'imap_password' : string_cracker, ! 
'imap_filter_folders' : string_cracker, 'imap_unsure_folder' : string_cracker, 'imap_spam_folder' : string_cracker, From timstone4 at users.sourceforge.net Tue Apr 8 09:24:46 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Tue Apr 8 11:24:49 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv24156 Modified Files: message.py Log Message: Added a couple more methods to support copying one message to another Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** message.py 8 Apr 2003 07:35:42 -0000 1.2 --- message.py 8 Apr 2003 15:24:43 -0000 1.3 *************** *** 187,190 **** --- 187,195 ---- def getId(self): return self.id + + def copy(self, old_msg): + self.setPayload(old_msg.payload()) # this is expensive... + self.setClassification(old_msg.getClassification()) + self.setTraining(old_msg.getTraining()) def addSBHeaders(self, prob, clues): *************** *** 275,278 **** --- 280,290 ---- def getClassification(self): return self.c + + def setClassification(self, cls): + if cls == 's' or cls == 'h' or cls == 'u' or cls is None: + self.c = cls + self.modified() + else: + raise ValueError def isTrndSpam(self): *************** *** 312,316 **** def getTraining(self): return self.t ! def __repr__(self): return "core.Message%r" % repr(self.__getstate__()) --- 324,335 ---- def getTraining(self): return self.t ! ! def setTraining(self, trn): ! if trn == 's' or trn == 'h' or trn is None: ! self.t = trn ! self.modified() ! else: ! raise ValueError ! 
def __repr__(self): return "core.Message%r" % repr(self.__getstate__()) From timstone4 at users.sourceforge.net Tue Apr 8 09:28:06 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Tue Apr 8 11:28:10 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv25441 Modified Files: imapfilter.py Log Message: Added logic to ensure that classification and training memory is preserved when IMAP messages are altered (i.e. deleted and added with a new id), and when messages are retrained. Again... unable to test, so your turn, Tony. I'm gonna have to get an IMAP thingy if I'm going to do much work on this . Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** imapfilter.py 8 Apr 2003 07:37:28 -0000 1.3 --- imapfilter.py 8 Apr 2003 15:28:03 -0000 1.4 *************** *** 75,78 **** --- 75,95 ---- self.changeId(response[1][0]) + def Delete(self): + self._selectFolder(self.folder_name, False) + response = imap.uid("STORE", self.getId(), "+FLAGS.SILENT", + "(\\Deleted)") + self._check(response, "uid store") + + # XXX there should actually be a delete from the msgid database here... + self.notTrained() + self.notClassified() + + def Append(self): + response = imap.append(self.folder_name, None, + self.getId(), + self.get_payload()) + self._check(response, "append") + + class IMAPFolder(object): *************** *** 135,138 **** --- 152,159 ---- global imap imap = imaplib.IMAP4(options.imap_server, options.imap_port) + + self.spam_folder = IMAPFolder(options.imap_spam_folder) + self.unsure_folder = IMAPFolder(options.imap_unsure_folder) + if options.verbose: print "Loading database...", *************** *** 163,175 **** folder = IMAPFolder(folder_name) for msg in folder: ! if msg.isTrained(): ! 
if msg.isTrndAs(isSpam): ! # already trained, nothing for us to do here ! # (we don't want to train the same message twice) ! continue ! if msg.isTrained(): ! self.classifier.unlearn(msg.asTokens(), not isSpam) ! self.classifier.learn(msg.asTokens(), isSpam) ! msg.trndAs(isSpam) def Train(self): --- 184,200 ---- folder = IMAPFolder(folder_name) for msg in folder: ! # XXX I've rewritten this logic. It looks a bit strange, ! # because of the msg.notTrained call immediately before the ! # test for isTrained, but this is safer. Once the message has ! # been untrained, it's training memory should reflect that ! # on the off chance that for some reason the training breaks, ! # which happens on occasion (the tokenizer is not yet perfect) ! if msg.isTrndAs(not isSpam): ! self.classifier.unlearn(msg.asTokens(), not isSpam) ! msg.notTrained() ! ! if not msg.isTrained(): ! self.classifier.learn(msg.asTokens(), isSpam) ! msg.trndAs(isSpam) def Train(self): *************** *** 210,220 **** imap.logout() - def _extractTimeFromMessage(self, msg): - # When we create a new copy of a message, we need to specify - # a timestamp for the message. Ideally, this would be the - # timestamp from the message itself, but for the moment, we - # just use the current time. - return imaplib.Time2Internaldate(time.time()) - def _moveMessage(self, old_msg, dest): # The IMAP copy command makes an alias, not a whole new --- 235,238 ---- *************** *** 222,241 **** # in the correct folder, and delete the old one # XXX (someone tell me if this is wrong) ! response = imap.uid("FETCH", old_msg.getId(), "(RFC822)") ! self._check(response, 'uid fetch') ! msg = message.Message() ! msg.setPayload(response[1][0][1]) #response = imap.uid("SEARCH", "(TEXT)", msg.get_payload()) #self._check(response, "search") #self.changeId(response[1][0]) ! response = imap.append(dest, None, ! self._extractTimeFromMessage(msg), ! msg.get_payload()) ! self._check(response, "append") ! 
self._selectFolder(old_msg.folder_name, False) ! response = imap.uid("STORE", old_msg.getId(), "+FLAGS.SILENT", ! "(\\Deleted)") ! self._check(response, "uid store") def _filterMessage(self, msg): --- 240,275 ---- # in the correct folder, and delete the old one # XXX (someone tell me if this is wrong) ! ! # XXX I've redone this logic to use the IMAPMessage class. It ! # may be a bit of overkill, but it allows us to maintain the ! # proper training and classification memory for the message ! # as it's moved ! ! #response = imap.uid("FETCH", old_msg.getId(), "(RFC822)") ! #self._check(response, 'uid fetch') ! #msg = message.Message() ! #msg.setPayload(response[1][0][1]) ! ! msg = IMAPMessage(dest.uid, dest.folder_name, None) ! msg.setId(msg.extractTime()) # this is kinda silly ! msg.copy(old_msg) ! #response = imap.uid("SEARCH", "(TEXT)", msg.get_payload()) #self._check(response, "search") #self.changeId(response[1][0]) ! #response = imap.append(dest.folder_name, None, ! # msg.getId(), ! # msg.get_payload()) ! #self._check(response, "append") ! ! msg.Append() ! ! #self._selectFolder(old_msg.folder_name, False) ! #response = imap.uid("STORE", old_msg.getId(), "+FLAGS.SILENT", ! # "(\\Deleted)") ! #self._check(response, "uid store") ! ! old_msg.Delete() def _filterMessage(self, msg): *************** *** 245,251 **** pass elif msg.isClsfdSpam(): ! self._moveMessage(msg, options.imap_spam_folder) else: ! self._moveMessage(msg, options.imap_unsure_folder) if __name__ == '__main__': --- 279,287 ---- pass elif msg.isClsfdSpam(): ! #XXX I actually think move should be a method on IMAPMessage ! #but I'm running out of time. ! self._moveMessage(msg, self.spam_folder) else: ! 
self._moveMessage(msg, self.unsure_folder) if __name__ == '__main__': From timstone4 at users.sourceforge.net Tue Apr 8 21:25:27 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Tue Apr 8 23:25:30 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv8117 Modified Files: message.py Log Message: A few corrections. Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** message.py 8 Apr 2003 15:24:43 -0000 1.3 --- message.py 9 Apr 2003 03:25:24 -0000 1.4 *************** *** 82,85 **** --- 82,86 ---- import sys + import types import email.Message *************** *** 158,162 **** return self.id ! def changeID(self, id): # We cannot re-set an id (see below). However there are # occasionally times when the id for a message will change, --- 159,163 ---- return self.id ! def changeId(self, id): # We cannot re-set an id (see below). 
However there are # occasionally times when the id for a message will change, *************** *** 181,187 **** --- 182,192 ---- if id is None: raise ValueError, "MsgId must not be None" + + if not type(id) in types.StringTypes: + raise TypeError, "Id must be a string" self.id = id msginfoDB._getState(self) + self.modified() # id has changed, force storage def getId(self): From anadelonbrin at users.sourceforge.net Wed Apr 9 00:14:31 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Wed Apr 9 02:14:39 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv20980/spambayes Modified Files: message.py Log Message: Changes the message class so that the base class is more abstract and introduces a sub-class to add header add/remove functions. Changes the set/get classify/training information methods to a simpler version. As per messages on list - if you don't like this version Tim feel free to change it back! :) Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** message.py 9 Apr 2003 03:25:24 -0000 1.4 --- message.py 9 Apr 2003 06:14:29 -0000 1.5 *************** *** 126,133 **** del self.db[msg.getId()] ! # this should come from a mark hammond idea of a master db msginfoDB = MessageInfoDB("spambayes.messageinfo.db") - class Message(email.Message.Message): '''An email.Message.Message extended for Spambayes''' --- 126,132 ---- del self.db[msg.getId()] ! 
# this should come from a Mark Hammond idea of a master db msginfoDB = MessageInfoDB("spambayes.messageinfo.db") class Message(email.Message.Message): '''An email.Message.Message extended for Spambayes''' *************** *** 151,183 **** prs._parsebody(self, StringIO(payload)) - def setIdFromPayload(self): - try: - self.setId(self[options.pop3proxy_mailid_header_name]) - except KeyError: - return None - - return self.id - - def changeId(self, id): - # We cannot re-set an id (see below). However there are - # occasionally times when the id for a message will change, - # for example, on an IMAP server (or possibly an exchange - # server), the server may change the ids that we are using - # We enforce that this must be an explicit *change* rather - # than simply re-setting, by having this as a separate - # function - if not self.id: - raise ValueError, "MsgID has not been set, cannot be changed" - self._setId(id) - def setId(self, id): if self.id: raise ValueError, "MsgId has already been set, cannot be changed" - self._setId(id) ! def _setId(self, id): ! # we should probably enforce type(id) is StringType. ! # the database will insist upon it, but at that point, it's harder ! # to diagnose if id is None: raise ValueError, "MsgId must not be None" --- 150,160 ---- prs._parsebody(self, StringIO(payload)) def setId(self, id): if self.id: raise ValueError, "MsgId has already been set, cannot be changed" ! # XXX This isn't really needed since type(None) is not ! # XXX in types.StringTypes - do we still want it for the ! # XXX more informative error message? if id is None: raise ValueError, "MsgId must not be None" *************** *** 193,201 **** return self.id ! def copy(self, old_msg): ! self.setPayload(old_msg.payload()) # this is expensive... ! self.setClassification(old_msg.getClassification()) ! self.setTraining(old_msg.getTraining()) def addSBHeaders(self, prob, clues): '''Add hammie header, and remember message's classification. 
Also, --- 170,218 ---- return self.id ! def asTokens(self): ! # use as_string() here because multipart/digest will return ! # a list of message objects if get_payload() is used ! return tokenize(self.as_string()) + def modified(self): + if self.id: # only persist if key is present + msginfoDB._setState(self) + + def GetClassification(self): + return self.c + def GetTrained(self): + return self.t + def RememberClassification(self, cls): + self.c = cls + self.modified() + def RememberTrained(self, isSpam): + self.t = isSpam + self.modified() + + def __repr__(self): + return "core.Message%r" % repr(self.__getstate__()) + + def __getstate__(self): + return (self.id, self.c, self.t) + + def __setstate__(self, t): + (self.id, self.c, self.t) = t + + # XXX I can't think of a good name. Someone change + # XXX HeaderMessage to something better before it gets used + # XXX all over the place. + class HeaderMessage(Message): + '''Adds routines to add/remove headers for Spambayes''' + def __init__(self): + Message.__init__(self) + + def setIdFromPayload(self): + try: + self.setId(self[options.pop3proxy_mailid_header_name]) + except KeyError: + return None + + return self.id + def addSBHeaders(self, prob, clues): '''Add hammie header, and remember message's classification. Also, *************** *** 204,215 **** if prob < options.ham_cutoff: disposition = options.header_ham_string - self.clsfyAsHam() elif prob > options.spam_cutoff: disposition = options.header_spam_string - self.clsfyAsSpam() else: disposition = options.header_unsure_string ! self.clsfyAsUnsure() ! self[options.hammie_header_name] = disposition --- 221,229 ---- if prob < options.ham_cutoff: disposition = options.header_ham_string elif prob > options.spam_cutoff: disposition = options.header_spam_string else: disposition = options.header_unsure_string ! 
self.RememberClassification(disposition) self[options.hammie_header_name] = disposition *************** *** 228,232 **** score <= options.clue_mailheader_cutoff or score >= 1.0 - options.clue_mailheader_cutoff)]) - self[options.pop3proxy_evidence_header_name] = evd --- 242,245 ---- *************** *** 241,245 **** # + messageName + "\r\n.\r\n" - def delSBHeaders(self): del self[options.hammie_header_name] --- 254,257 ---- *************** *** 249,346 **** del self[options.pop3proxy_thermostat_header_name] del self[options.pop3proxy_evidence_header_name] - - def asTokens(self): - # use as_string() here because multipart/digest will return - # a list of message objects if get_payload() is used - return tokenize(self.as_string()) - - def modified(self): - if self.id: # only persist if key is present - msginfoDB._setState(self) - - def isClsfdSpam(self): - return self.c == 's' - - def isClsfdHam(self): - return self.c == 'h' - - def isClsfdUnsure(self): - return self.c == 'u' - - def isClassified(self): - return not self.c is None - - def clsfyAsSpam(self): - self.c = 's' - self.modified() - - def clsfyAsHam(self): - self.c = 'h' - self.modified() - - def clsfyAsUnsure(self): - self.c = 'u' - self.modified() - - def getClassification(self): - return self.c - - def setClassification(self, cls): - if cls == 's' or cls == 'h' or cls == 'u' or cls is None: - self.c = cls - self.modified() - else: - raise ValueError - - def isTrndSpam(self): - return self.t == 's' - - def isTrndHam(self): - return self.t == 'h' - - def trndAsSpam(self): - self.t = 's' - self.modified() - - def trndAsHam(self): - self.t = 'h' - self.modified() - - def isTrndAs(self, isSpam): - if self.t == 'h' and not isSpam: - return True - if self.t == 's' and isSpam: - return True - return False - - def trndAs(self, isSpam): - if isSpam: - self.t = 's' - else: - self.t = 'h' - - def notTrained(self): - self.t = None - self.modified() - - def isTrained(self): - return not self.t is None - - def 
getTraining(self): - return self.t - - def setTraining(self, trn): - if trn == 's' or trn == 'h' or trn is None: - self.t = trn - self.modified() - else: - raise ValueError - - def __repr__(self): - return "core.Message%r" % repr(self.__getstate__()) - - def __getstate__(self): - return (self.id, self.c, self.t) - - def __setstate__(self, t): - (self.id, self.c, self.t) = t \ No newline at end of file --- 261,262 ---- From anadelonbrin at users.sourceforge.net Wed Apr 9 00:16:18 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Wed Apr 9 02:16:22 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv21599 Modified Files: imapfilter.py Log Message: Updates the IMAPFilter to reflect the changes in the message class. Lots of changes all over the place, integrating (and changing!) Tim's code. WARNING: It still seems to train fine (although maybe the saving is also broken), but the filtering is still buggy. Over to people with more time today. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** imapfilter.py 8 Apr 2003 15:28:03 -0000 1.4 --- imapfilter.py 9 Apr 2003 06:16:15 -0000 1.5 *************** *** 29,32 **** --- 29,39 ---- # a list is retrieved via imap.list() + # IMAPFolder objects get created all over the place, and don't persist + # at all. It would probably be good to change this, especially if + # the filter doesn't run just once + + # All the imap responses should be checked - [0] should be "OK" + # otherwise an error will occur and who knows what will happen + try: True, False *************** *** 48,53 **** imap = None ! class IMAPMessage(message.Message): ! 
# response checking is necessary throughout this class def __init__(self, folder_id, folder_name, message_id): message.Message.__init__(self) --- 55,59 ---- imap = None ! class IMAPMessage(message.HeaderMessage): def __init__(self, folder_id, folder_name, message_id): message.Message.__init__(self) *************** *** 55,58 **** --- 61,65 ---- self.folder_id = folder_id self.folder_name = folder_name + self.previous_folder = None def extractTime(self): *************** *** 63,98 **** return imaplib.Time2Internaldate(time.time()) ! def Update(self): # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one response = imap.append(self.folder_name, None, self.extractTime(), self.get_payload()) - response = imap.select(self.folder_name, False) - response = imap.uid("STORE", self.getId(), "+FLAGS.SILENT", - "(\\Deleted)") # we need to update the uid, as it will have changed response = imap.uid("SEARCH", "(TEXT)", self.get_payload()) ! self.changeId(response[1][0]) ! ! def Delete(self): ! self._selectFolder(self.folder_name, False) ! response = imap.uid("STORE", self.getId(), "+FLAGS.SILENT", ! "(\\Deleted)") ! self._check(response, "uid store") ! ! # XXX there should actually be a delete from the msgid database here... ! self.notTrained() ! self.notClassified() ! ! def Append(self): ! response = imap.append(self.folder_name, None, ! self.getId(), ! self.get_payload()) ! self._check(response, "append") ! ! class IMAPFolder(object): - # response checking is necessary throughout this class def __init__(self, folder_name, readOnly=True): self.name = folder_name --- 70,100 ---- return imaplib.Time2Internaldate(time.time()) ! def MoveTo(self, dest): ! # The move just changes where we think we are, ! # and we do an actual move on save (to avoid doing ! # this more than once) ! if self.previous_folder is not None: ! self.previous_folder = self.folder_name ! self.folder_name = dest ! ! 
def Save(self): # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one response = imap.append(self.folder_name, None, self.extractTime(), self.get_payload()) # we need to update the uid, as it will have changed + # XXX there will be problems here if the message *has not* + # XXX changed, as the message to be deleted will be found first + # XXX (if they are in the same folder) response = imap.uid("SEARCH", "(TEXT)", self.get_payload()) ! old_id = self.id ! self.id = response[1][0] ! if self.previous_folder is not None: ! response = imap.select(self.previous_folder, False) ! self.previous_folder = None ! # this line is raising an error, but WHY? ! #response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)") class IMAPFolder(object): def __init__(self, folder_name, readOnly=True): self.name = folder_name *************** *** 147,150 **** --- 149,184 ---- return msg + def Train(self, classifier, isSpam): + '''Train folder as spam/ham''' + for msg in self: + if msg.GetTrained() == isSpam: + classifier.unlearn(msg.asTokens(), not isSpam) + # Once the message has been untrained, it's training memory + # should reflect that on the off chance that for some reason + # the training breaks, which happens on occasion (the + # tokenizer is not yet perfect) + msg.RememberTrained(None) + + if msg.GetTrained() is not None: + classifier.learn(msg.asTokens(), isSpam) + msg.RememberTrained(isSpam) + + def FilterMessage(self, msg): + if msg.GetClassification() == options.header_ham_string: + # we leave ham alone + pass + elif msg.GetClassification() == options.header_spam_string: + msg.MoveTo(options.imap_spam_folder) + else: + msg.MoveTo(options.imap_unsure_folder) + + def Filter(self, classifier): + for msg in self: + (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True) + # add headers and remember classification + msg.addSBHeaders(prob, clues) + self.FilterMessage(msg) + msg.Save() + class 
IMAPFilter(object): *************** *** 153,159 **** imap = imaplib.IMAP4(options.imap_server, options.imap_port) - self.spam_folder = IMAPFolder(options.imap_spam_folder) - self.unsure_folder = IMAPFolder(options.imap_unsure_folder) - if options.verbose: print "Loading database...", --- 187,190 ---- *************** *** 167,200 **** print "Done." - def _check(self, response, command): - if response[0] != "OK": - print "Invalid response to %s:\n%s" % (command, response) - sys.exit(-1) - - def _selectFolder(self, name, read_only): - folder = imap.select(name, read_only) - self._check(folder, 'select') - return folder - def Login(self): lgn = imap.login(options.imap_username, options.imap_password) - self._check(lgn, 'login') - - def TrainFolder(self, folder_name, isSpam): - folder = IMAPFolder(folder_name) - for msg in folder: - # XXX I've rewritten this logic. It looks a bit strange, - # because of the msg.notTrained call immediately before the - # test for isTrained, but this is safer. Once the message has - # been untrained, it's training memory should reflect that - # on the off chance that for some reason the training breaks, - # which happens on occasion (the tokenizer is not yet perfect) - if msg.isTrndAs(not isSpam): - self.classifier.unlearn(msg.asTokens(), not isSpam) - msg.notTrained() - - if not msg.isTrained(): - self.classifier.learn(msg.asTokens(), isSpam) - msg.trndAs(isSpam) def Train(self): --- 198,204 ---- print "Done." def Login(self): + '''Log in to the IMAP server''' lgn = imap.login(options.imap_username, options.imap_password) def Train(self): *************** *** 204,212 **** ham_training_folders = options.imap_ham_train_folders.split() for fol in ham_training_folders: ! self.TrainFolder(fol, False) if options.imap_spam_train_folders != "": spam_training_folders = options.imap_spam_train_folders.split(' ' ) for fol in spam_training_folders: ! 
self.TrainFolder(fol, True) self.classifier.store() if options.verbose: --- 208,218 ---- ham_training_folders = options.imap_ham_train_folders.split() for fol in ham_training_folders: ! folder = IMAPFolder(fol) ! folder.Train(self.classifier, False) if options.imap_spam_train_folders != "": spam_training_folders = options.imap_spam_train_folders.split(' ' ) for fol in spam_training_folders: ! folder = IMAPFolder(fol) ! folder.Train(self.classifier, True) self.classifier.store() if options.verbose: *************** *** 218,288 **** for filter_folder in options.imap_filter_folders.split(): folder = IMAPFolder(filter_folder, False) ! for msg in folder: ! (prob, clues) = self.classifier.spamprob(msg.asTokens(), ! evidence=True) ! # add headers and remember classification ! msg.addSBHeaders(prob, clues) ! # XXX updating is disabled for the moment ! # msg.Update() ! self._filterMessage(msg) if options.verbose: print "Filtering took", time.time() - t, "seconds." def Logout(self): ! # sign off if options.imap_expunge: imap.expunge() imap.logout() - def _moveMessage(self, old_msg, dest): - # The IMAP copy command makes an alias, not a whole new - # copy, so what we need to do (sigh) is create a new message - # in the correct folder, and delete the old one - # XXX (someone tell me if this is wrong) - - # XXX I've redone this logic to use the IMAPMessage class. 
It - # may be a bit of overkill, but it allows us to maintain the - # proper training and classification memory for the message - # as it's moved - - #response = imap.uid("FETCH", old_msg.getId(), "(RFC822)") - #self._check(response, 'uid fetch') - #msg = message.Message() - #msg.setPayload(response[1][0][1]) - - msg = IMAPMessage(dest.uid, dest.folder_name, None) - msg.setId(msg.extractTime()) # this is kinda silly - msg.copy(old_msg) - - #response = imap.uid("SEARCH", "(TEXT)", msg.get_payload()) - #self._check(response, "search") - #self.changeId(response[1][0]) - - #response = imap.append(dest.folder_name, None, - # msg.getId(), - # msg.get_payload()) - #self._check(response, "append") - - msg.Append() - - #self._selectFolder(old_msg.folder_name, False) - #response = imap.uid("STORE", old_msg.getId(), "+FLAGS.SILENT", - # "(\\Deleted)") - #self._check(response, "uid store") - - old_msg.Delete() - - def _filterMessage(self, msg): - if msg.isClsfdHam(): - # we leave ham alone - print "untouched" - pass - elif msg.isClsfdSpam(): - #XXX I actually think move should be a method on IMAPMessage - #but I'm running out of time. - self._moveMessage(msg, self.spam_folder) - else: - self._moveMessage(msg, self.unsure_folder) - if __name__ == '__main__': options.verbose = True --- 224,237 ---- for filter_folder in options.imap_filter_folders.split(): folder = IMAPFolder(filter_folder, False) ! folder.Filter(self.classifier) if options.verbose: print "Filtering took", time.time() - t, "seconds." def Logout(self): ! '''Log out of the IMAP server''' if options.imap_expunge: imap.expunge() imap.logout() if __name__ == '__main__': options.verbose = True *************** *** 290,294 **** # imap_filter.imap.debug = 10 imap_filter.Login() ! imap_filter.Train() imap_filter.Filter() imap_filter.Logout() --- 239,243 ---- # imap_filter.imap.debug = 10 imap_filter.Login() ! 
#imap_filter.Train() imap_filter.Filter() imap_filter.Logout() From montanaro at users.sourceforge.net Thu Apr 10 07:28:31 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Thu Apr 10 09:28:35 2003 Subject: [Spambayes-checkins] spambayes mailsort.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv8670 Modified Files: mailsort.py Log Message: correct misspelling of "Classifier" Index: mailsort.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mailsort.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** mailsort.py 16 Feb 2003 17:05:07 -0000 1.6 --- mailsort.py 10 Apr 2003 13:28:27 -0000 1.7 *************** *** 31,39 **** def import_spambayes(): ! global mboxutils, CdbClassifer, tokenize if not os.environ.has_key('BAYESCUSTOMIZE'): os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(CONFIG_FILE) from spambayes import mboxutils ! from spambayes.cdb_classifier import CdbClassifer from spambayes.tokenizer import tokenize --- 31,39 ---- def import_spambayes(): ! global mboxutils, CdbClassifier, tokenize if not os.environ.has_key('BAYESCUSTOMIZE'): os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(CONFIG_FILE) from spambayes import mboxutils ! from spambayes.cdb_classifier import CdbClassifier from spambayes.tokenizer import tokenize *************** *** 88,92 **** print "Creating", RC_DIR, "directory..." os.mkdir(rc_dir) ! bayes = CdbClassifer() print 'Training with ham...' train(bayes, ham_name, False) --- 88,92 ---- print "Creating", RC_DIR, "directory..." os.mkdir(rc_dir) ! bayes = CdbClassifier() print 'Training with ham...' train(bayes, ham_name, False) *************** *** 124,128 **** msg = email.message_from_string(msgdata) del msgdata ! bayes = CdbClassifer(open(DB_FILE, 'rb')) prob = bayes.spamprob(tokenize(msg)) else: --- 124,128 ---- msg = email.message_from_string(msgdata) del msgdata ! 
bayes = CdbClassifier(open(DB_FILE, 'rb')) prob = bayes.spamprob(tokenize(msg)) else: *************** *** 139,143 **** def print_message_score(msg_name, msg_fp): msg = email.message_from_file(msg_fp) ! bayes = CdbClassifer(open(DB_FILE, 'rb')) prob, evidence = bayes.spamprob(tokenize(msg), evidence=True) print msg_name, prob --- 139,143 ---- def print_message_score(msg_name, msg_fp): msg = email.message_from_file(msg_fp) ! bayes = CdbClassifier(open(DB_FILE, 'rb')) prob, evidence = bayes.spamprob(tokenize(msg), evidence=True) print msg_name, prob From montanaro at users.sourceforge.net Thu Apr 10 07:28:31 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Thu Apr 10 09:28:36 2003 Subject: [Spambayes-checkins] spambayes/spambayes cdb_classifier.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv8670/spambayes Modified Files: cdb_classifier.py Log Message: correct misspelling of "Classifier" Index: cdb_classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/cdb_classifier.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** cdb_classifier.py 20 Jan 2003 03:14:32 -0000 1.1 --- cdb_classifier.py 10 Apr 2003 13:28:28 -0000 1.2 *************** *** 11,15 **** from spambayes.classifier import Classifier ! class CdbClassifer(Classifier): def __init__(self, cdbfile=None): Classifier.__init__(self) --- 11,15 ---- from spambayes.classifier import Classifier ! 
class CdbClassifier(Classifier): def __init__(self, cdbfile=None): Classifier.__init__(self) From timstone4 at users.sourceforge.net Thu Apr 10 20:08:01 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Thu Apr 10 22:08:05 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv18493 Modified Files: message.py Log Message: A start at addressing Mark's concerns/suggestions for this class. Eliminated a bunch of YAGNI, moved some non-base methods into a subclass. Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** message.py 9 Apr 2003 06:14:29 -0000 1.5 --- message.py 11 Apr 2003 02:07:57 -0000 1.6 *************** *** 5,8 **** --- 5,9 ---- Classes: Message - an email.Message.Message, extended with spambayes methods + SBHeaderMessage - A Message with spambayes header manipulations MessageInfoDB - persistent state storage for Message *************** *** 19,32 **** assessment purposes. ! Message is an extension of the email package Message class, to include ! persistent message information and Spambayes specific header manipulations. ! The persistent state -currently- consists of the message id, its current ! classification, and its current training. The payload is not persisted. ! Payload persistence is left to whatever mail client software is being used. Usage: A typical classification usage pattern would be something like: ! >>>msg = spambayes.message.Message() >>>msg.setPayload(substance) # substance comes from somewhere else >>>id = msg.setIdFromPayload() --- 20,37 ---- assessment purposes. ! Message is an extension of the email package Message class, to ! include persistent message information. The persistent state ! 
-currently- consists of the message id, its current ! classification, and its current training. The payload is not ! persisted. Payload persistence is left to whatever mail client ! software is being used. ! ! SBHeaderMessage extends Message to include spambayes header specific ! manipulations. Usage: A typical classification usage pattern would be something like: ! >>>msg = spambayes.message.SBHeaderMessage() >>>msg.setPayload(substance) # substance comes from somewhere else >>>id = msg.setIdFromPayload() *************** *** 45,49 **** A typical usage pattern to train as spam would be something like: ! >>>msg = spambayes.message.Message() >>>msg.setPayload(substance) # substance comes from somewhere else >>>id = msg.setId(msgid) # id is a fname, outlook msg id, something... --- 50,54 ---- A typical usage pattern to train as spam would be something like: ! >>>msg = spambayes.message.SBHeaderMessage() >>>msg.setPayload(substance) # substance comes from somewhere else >>>id = msg.setId(msgid) # id is a fname, outlook msg id, something... *************** *** 51,59 **** >>>msg.delSBHeaders() # never include sb headers in a train ! >>>if msg.isTrndHam(): >>> bayes.unlearn(msg.asTokens(), False) # untrain the ham >>>bayes.learn(msg.asTokens(), True) # train as spam ! >>>msg.trndAsSpam() --- 56,64 ---- >>>msg.delSBHeaders() # never include sb headers in a train ! >>>if msg.getTraining() == False: # could be None, can't do boolean test >>> bayes.unlearn(msg.asTokens(), False) # untrain the ham >>>bayes.learn(msg.asTokens(), True) # train as spam ! >>>msg.rememberTraining(True) *************** *** 95,106 **** import shelve - # XXX Tim, what do you want to do here? This - # XXX recurses infinately at the moment - # Make shelve use binary pickles by default. 
- #oldShelvePickler = shelve.Pickler - #def binaryDefaultPickler(f, binary=1): - # return oldShelvePickler(f, binary) - #shelve.Pickler = binaryDefaultPickler - class MessageInfoDB: --- 100,103 ---- *************** *** 180,190 **** def GetClassification(self): ! return self.c ! def GetTrained(self): ! return self.t def RememberClassification(self, cls): ! self.c = cls self.modified() def RememberTrained(self, isSpam): self.t = isSpam self.modified() --- 177,210 ---- def GetClassification(self): ! if self.c == 's': ! return options.header_spam_string ! if self.c == 'h': ! return options.header_ham_string ! if self.c == 'u': ! return options.header_unsure_string ! ! return None ! def RememberClassification(self, cls): ! # this must store state independent of options settings, as they ! # may change, which would really screw this database up ! ! # an unrecoginzed string here is interpreted as unsure. Should ! # that condition actually raise an exception instead? ! ! if cls == options.header_spam_string: ! self.c = 's' ! elif cls == options.header_ham_string: ! self.c = 'h' ! else ! self.c = 'u' ! self.modified() + + def GetTrained(self): + return self.t + def RememberTrained(self, isSpam): + # isSpam == None means no training has been done self.t = isSpam self.modified() *************** *** 199,207 **** (self.id, self.c, self.t) = t ! # XXX I can't think of a good name. Someone change ! # XXX HeaderMessage to something better before it gets used ! # XXX all over the place. ! class HeaderMessage(Message): ! '''Adds routines to add/remove headers for Spambayes''' def __init__(self): Message.__init__(self) --- 219,227 ---- (self.id, self.c, self.t) = t ! ! class SBHeaderMessage(Message): ! '''Message class that is cognizant of Spambayes headers. ! Adds routines to add/remove headers for Spambayes''' ! 
def __init__(self): Message.__init__(self) From timstone4 at users.sourceforge.net Thu Apr 10 20:09:12 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Thu Apr 10 22:09:18 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv19044 Modified Files: message.py Log Message: Missed a syntax error. Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** message.py 11 Apr 2003 02:07:57 -0000 1.6 --- message.py 11 Apr 2003 02:09:09 -0000 1.7 *************** *** 197,201 **** elif cls == options.header_ham_string: self.c = 'h' ! else self.c = 'u' --- 197,201 ---- elif cls == options.header_ham_string: self.c = 'h' ! else: self.c = 'u' From timstone4 at users.sourceforge.net Thu Apr 10 20:11:24 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Thu Apr 10 22:11:29 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv20052 Modified Files: imapfilter.py Log Message: Made some changes to accommodate the new message class. Not tested yet, but checked in on the chance that Tony wants to see it sooner than I can get it tested. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** imapfilter.py 9 Apr 2003 06:16:15 -0000 1.5 --- imapfilter.py 11 Apr 2003 02:11:21 -0000 1.6 *************** *** 29,39 **** # a list is retrieved via imap.list() - # IMAPFolder objects get created all over the place, and don't persist - # at all.
It would probably be good to change this, especially if - # the filter doesn't run just once - - # All the imap responses should be checked - [0] should be "OK" - # otherwise an error will occur and who knows what will happen - try: True, False --- 29,32 ---- *************** *** 55,65 **** imap = None ! class IMAPMessage(message.HeaderMessage): ! def __init__(self, folder_id, folder_name, message_id): message.Message.__init__(self) self.setId(message_id) ! self.folder_id = folder_id ! self.folder_name = folder_name ! self.previous_folder = None def extractTime(self): --- 48,62 ---- imap = None ! class IMAPMessage(message.SBHeaderMessage): ! # response checking is necessary throughout this class ! def __init__(self, folder, message_id): message.Message.__init__(self) self.setId(message_id) ! self.folder = folder ! ! def _check(self, response, command): ! if response[0] != "OK": ! print "Invalid response to %s:\n%s" % (command, response) ! sys.exit(-1) def extractTime(self): *************** *** 75,85 **** # this more than once) if self.previous_folder is not None: ! self.previous_folder = self.folder_name ! self.folder_name = dest def Save(self): # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one ! response = imap.append(self.folder_name, None, self.extractTime(), self.get_payload()) # we need to update the uid, as it will have changed --- 72,82 ---- # this more than once) if self.previous_folder is not None: ! self.previous_folder = self.folder ! self.folder = dest def Save(self): # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one ! response = imap.append(self.folder.name, None, self.extractTime(), self.get_payload()) # we need to update the uid, as it will have changed *************** *** 91,100 **** self.id = response[1][0] if self.previous_folder is not None: ! 
response = imap.select(self.previous_folder, False) self.previous_folder = None # this line is raising an error, but WHY? #response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)") class IMAPFolder(object): def __init__(self, folder_name, readOnly=True): self.name = folder_name --- 88,99 ---- self.id = response[1][0] if self.previous_folder is not None: ! response = imap.select(self.previous_folder.name, False) self.previous_folder = None # this line is raising an error, but WHY? #response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)") + class IMAPFolder(object): + # response checking is necessary throughout this class def __init__(self, folder_name, readOnly=True): self.name = folder_name *************** *** 114,118 **** if response[0] != "OK": self.rfc822_command = "(RFC822)" ! def __iter__(self): '''IMAPFolder is iterable''' --- 113,126 ---- if response[0] != "OK": self.rfc822_command = "(RFC822)" ! ! def Select(self): ! imap.select(self.name, False) ! self._check(folder, 'select') ! ! def _check(self, response, command): ! if response[0] != "OK": ! print "Invalid response to %s:\n%s" % (command, response) ! sys.exit(-1) ! def __iter__(self): '''IMAPFolder is iterable''' *************** *** 145,152 **** # we return an instance of *our* message class, not the # raw rfc822 message ! msg = IMAPMessage(self.uid, self.name, key) msg.setPayload(messageText) return msg ! def Train(self, classifier, isSpam): '''Train folder as spam/ham''' --- 153,160 ---- # we return an instance of *our* message class, not the # raw rfc822 message ! msg = IMAPMessage(self, key) msg.setPayload(messageText) return msg ! def Train(self, classifier, isSpam): '''Train folder as spam/ham''' *************** *** 164,184 **** msg.RememberTrained(isSpam) ! def FilterMessage(self, msg): if msg.GetClassification() == options.header_ham_string: # we leave ham alone pass elif msg.GetClassification() == options.header_spam_string: ! msg.MoveTo(options.imap_spam_folder) else: ! 
msg.MoveTo(options.imap_unsure_folder) ! ! def Filter(self, classifier): ! for msg in self: ! (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True) ! # add headers and remember classification ! msg.addSBHeaders(prob, clues) ! self.FilterMessage(msg) ! msg.Save() class IMAPFilter(object): --- 172,190 ---- msg.RememberTrained(isSpam) ! def Filter(self, classifier, spamfolder, unsurefolder): ! for msg in self: ! (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True) ! # add headers and remember classification ! msg.addSBHeaders(prob, clues) ! if msg.GetClassification() == options.header_ham_string: # we leave ham alone pass elif msg.GetClassification() == options.header_spam_string: ! msg.MoveTo(spamfolder) else: ! msg.MoveTo(unsurefolder) + msg.Save() class IMAPFilter(object): *************** *** 187,203 **** imap = imaplib.IMAP4(options.imap_server, options.imap_port) ! if options.verbose: ! print "Loading database...", filename = options.pop3proxy_persistent_storage_file filename = os.path.expanduser(filename) if options.pop3proxy_persistent_use_database: self.classifier = storage.DBDictClassifier(filename) else: self.classifier = storage.PickledClassifier(filename) if options.verbose: print "Done." def Login(self): - '''Log in to the IMAP server''' lgn = imap.login(options.imap_username, options.imap_password) --- 193,214 ---- imap = imaplib.IMAP4(options.imap_server, options.imap_port) ! self.spam_folder = IMAPFolder(options.imap_spam_folder) ! self.unsure_folder = IMAPFolder(options.imap_unsure_folder) ! filename = options.pop3proxy_persistent_storage_file filename = os.path.expanduser(filename) + + if options.verbose: + print "Loading database %s..." % (filename), + if options.pop3proxy_persistent_use_database: self.classifier = storage.DBDictClassifier(filename) else: self.classifier = storage.PickledClassifier(filename) + if options.verbose: print "Done." 
def Login(self): lgn = imap.login(options.imap_username, options.imap_password) *************** *** 205,208 **** --- 216,220 ---- if options.verbose: t = time.time() + if options.imap_ham_train_folders != "": ham_training_folders = options.imap_ham_train_folders.split() *************** *** 210,213 **** --- 222,226 ---- folder = IMAPFolder(fol) folder.Train(self.classifier, False) + if options.imap_spam_train_folders != "": spam_training_folders = options.imap_spam_train_folders.split(' ' ) *************** *** 215,219 **** --- 228,234 ---- folder = IMAPFolder(fol) folder.Train(self.classifier, True) + self.classifier.store() + if options.verbose: print "Training took", time.time() - t, "seconds." *************** *** 222,237 **** if options.verbose: t = time.time() for filter_folder in options.imap_filter_folders.split(): folder = IMAPFolder(filter_folder, False) ! folder.Filter(self.classifier) if options.verbose: print "Filtering took", time.time() - t, "seconds." def Logout(self): ! '''Log out of the IMAP server''' if options.imap_expunge: imap.expunge() imap.logout() if __name__ == '__main__': options.verbose = True --- 237,255 ---- if options.verbose: t = time.time() + for filter_folder in options.imap_filter_folders.split(): folder = IMAPFolder(filter_folder, False) ! folder.Filter(self.classifier, self.spam_folder, self.unsure_folder) ! if options.verbose: print "Filtering took", time.time() - t, "seconds." def Logout(self): ! # sign off if options.imap_expunge: imap.expunge() imap.logout() + if __name__ == '__main__': options.verbose = True *************** *** 239,243 **** # imap_filter.imap.debug = 10 imap_filter.Login() ! #imap_filter.Train() imap_filter.Filter() imap_filter.Logout() --- 257,261 ---- # imap_filter.imap.debug = 10 imap_filter.Login() ! 
imap_filter.Train() imap_filter.Filter() imap_filter.Logout() From timstone4 at users.sourceforge.net Sat Apr 12 20:02:57 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Sat Apr 12 22:02:59 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv25842 Modified Files: message.py Log Message: A few corrections Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** message.py 11 Apr 2003 02:09:09 -0000 1.7 --- message.py 13 Apr 2003 02:02:54 -0000 1.8 *************** *** 141,145 **** def setPayload(self, payload): ! prs = email.Parser.HeaderParser() prs._parseheaders(self, StringIO(payload)) # we may want to do some header parsing error handling here --- 141,147 ---- def setPayload(self, payload): ! prs = email.Parser.Parser() ! # this is kindof a hack, due to the fact that the parser creates a ! # new message object, and we already have the message object prs._parseheaders(self, StringIO(payload)) # we may want to do some header parsing error handling here *************** *** 183,187 **** if self.c == 'u': return options.header_unsure_string ! return None --- 185,189 ---- if self.c == 'u': return options.header_unsure_string ! 
return None *************** *** 266,269 **** --- 268,274 ---- if options.pop3proxy_add_mailid_to.find("header") != -1: self[options.pop3proxy_mailid_header_name] = self.id + + # print self._headers + # print self.as_string() # This won't work for now, because email.Message does not isolate message body From timstone4 at users.sourceforge.net Sat Apr 12 20:04:48 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Sat Apr 12 22:04:52 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv26297 Modified Files: imapfilter.py Log Message: Lots and lots of development work, this is the first basically functional version of the imap filter, and it's actually been 'tested'. Tested is quoted, because IMAP seems to be a really flukey kind of interface, and until it's been used on lots of imap servers, by lots of people, I won't be convinced that it's really correct. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** imapfilter.py 11 Apr 2003 02:11:21 -0000 1.6 --- imapfilter.py 13 Apr 2003 02:04:45 -0000 1.7 *************** *** 13,16 **** --- 13,43 ---- The original filter design owed much to isbg by Roger Binns (http://www.rogerbinns.com/isbg). 
+ + Usage: + imapfilter [options] + + note: option values with spaces in them must be enclosed + in double quotes + + options: + -d dbname : pickled training database filename + -D dbname : dbm training database filename + -t : train contents of spam folder and ham folder + -c : classify inbox + -h : help + + Examples: + + Classify inbox, with dbm database + imapfilter -c -D bayes.db + + Train Spam and Ham, then classify inbox, with dbm database + imapfilter -t -c -D bayes.db + + Train Spam and Ham only, with pickled database + imapfilter -t -d bayes.db + + To Do: + o Suggestions? """ *************** *** 41,44 **** --- 68,73 ---- import time import sys + import getopt + import email.Parser from spambayes.Options import options *************** *** 50,57 **** class IMAPMessage(message.SBHeaderMessage): # response checking is necessary throughout this class ! def __init__(self, folder, message_id): message.Message.__init__(self) ! self.setId(message_id) ! self.folder = folder def _check(self, response, command): --- 79,91 ---- class IMAPMessage(message.SBHeaderMessage): # response checking is necessary throughout this class ! def __init__(self): message.Message.__init__(self) ! #XXX When a message object is created, an id and a folder should ! #XXX immediately be set. These cannot be passed in on the ! #XXX constructor, due to the quirky way that email.Parser.Parser ! #XXX does its thing. ! self.id = None ! self.folder = None ! self.previous_folder = None def _check(self, response, command): *************** *** 65,69 **** # timestamp from the message itself, but for the moment, we # just use the current time. ! return imaplib.Time2Internaldate(time.time()) def MoveTo(self, dest): --- 99,108 ---- # timestamp from the message itself, but for the moment, we # just use the current time. ! #XXX the imaplib time function returns a string like ! #XXX "12-Apr-2003 19:56:28 -0500" This seems like a bad message id. ! #XXX For one thing, it only resolves to one second. 
Even a cheap ! #XXX refractor telescope can resolve better than that ;) ! # return imaplib.Time2Internaldate(time.time()) ! return time.time() def MoveTo(self, dest): *************** *** 71,95 **** # and we do an actual move on save (to avoid doing # this more than once) ! if self.previous_folder is not None: self.previous_folder = self.folder ! self.folder = dest def Save(self): # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one response = imap.append(self.folder.name, None, ! self.extractTime(), self.get_payload()) # we need to update the uid, as it will have changed # XXX there will be problems here if the message *has not* # XXX changed, as the message to be deleted will be found first # XXX (if they are in the same folder) ! response = imap.uid("SEARCH", "(TEXT)", self.get_payload()) old_id = self.id ! self.id = response[1][0] if self.previous_folder is not None: response = imap.select(self.previous_folder.name, False) self.previous_folder = None ! # this line is raising an error, but WHY? ! #response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)") --- 110,143 ---- # and we do an actual move on save (to avoid doing # this more than once) ! if self.previous_folder is None and not self.folder == dest: self.previous_folder = self.folder ! self.folder = dest def Save(self): # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one + new_id = self.extractTime() response = imap.append(self.folder.name, None, ! new_id, self.as_string()) ! self._check(response, 'append') # we need to update the uid, as it will have changed # XXX there will be problems here if the message *has not* # XXX changed, as the message to be deleted will be found first # XXX (if they are in the same folder) ! #response = imap.uid("SEARCH", "(TEXT)", self.as_string()) ! #self._check(response, 'search') ! #self.id = response[1][0] ! old_id = self.id ! 
self.id = new_id if self.previous_folder is not None: response = imap.select(self.previous_folder.name, False) + self._check(response, 'folder select') self.previous_folder = None ! response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)") ! self._check(response, 'store') ! ! #XXX We really should delete the old message from the msgid db. ! #XXX There is currently no interface to do this with. *************** *** 153,162 **** # we return an instance of *our* message class, not the # raw rfc822 message ! msg = IMAPMessage(self, key) ! msg.setPayload(messageText) return msg def Train(self, classifier, isSpam): '''Train folder as spam/ham''' for msg in self: if msg.GetTrained() == isSpam: --- 201,218 ---- # we return an instance of *our* message class, not the # raw rfc822 message ! #XXX I can't get parsing to work correctly if I pull the guts ! #XXX out of Parser.parse() and do that in the setPayload method ! #XXX of the message class. Why? I have **NO** idea. ! #msg = IMAPMessage(self, key) ! #msg.setPayload(messageText) ! msg = email.Parser.Parser(_class=IMAPMessage).parsestr(messageText) ! msg.folder = self ! msg.setId(key) ! return msg def Train(self, classifier, isSpam): '''Train folder as spam/ham''' + num_trained = 0 for msg in self: if msg.GetTrained() == isSpam: *************** *** 170,216 **** if msg.GetTrained() is not None: classifier.learn(msg.asTokens(), isSpam) msg.RememberTrained(isSpam) def Filter(self, classifier, spamfolder, unsurefolder): for msg in self: ! (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True) ! # add headers and remember classification ! msg.addSBHeaders(prob, clues) ! ! if msg.GetClassification() == options.header_ham_string: ! # we leave ham alone ! pass ! elif msg.GetClassification() == options.header_spam_string: ! msg.MoveTo(spamfolder) ! else: ! msg.MoveTo(unsurefolder) ! msg.Save() class IMAPFilter(object): ! 
def __init__(self): global imap imap = imaplib.IMAP4(options.imap_server, options.imap_port) self.spam_folder = IMAPFolder(options.imap_spam_folder) self.unsure_folder = IMAPFolder(options.imap_unsure_folder) - - filename = options.pop3proxy_persistent_storage_file - filename = os.path.expanduser(filename) ! if options.verbose: ! print "Loading database %s..." % (filename), - if options.pop3proxy_persistent_use_database: - self.classifier = storage.DBDictClassifier(filename) - else: - self.classifier = storage.PickledClassifier(filename) - - if options.verbose: - print "Done." - - def Login(self): - lgn = imap.login(options.imap_username, options.imap_password) - def Train(self): if options.verbose: --- 226,263 ---- if msg.GetTrained() is not None: classifier.learn(msg.asTokens(), isSpam) + num_trained += 1 msg.RememberTrained(isSpam) + return num_trained + def Filter(self, classifier, spamfolder, unsurefolder): for msg in self: ! if msg.GetClassification() is None: ! (prob, clues) = classifier.spamprob(msg.asTokens(), evidence=True) ! # add headers and remember classification ! msg.addSBHeaders(prob, clues) ! cls = msg.GetClassification() ! if cls == options.header_ham_string: ! # we leave ham alone ! pass ! elif cls == options.header_spam_string: ! msg.MoveTo(spamfolder) ! else: ! msg.MoveTo(unsurefolder) + msg.Save() + class IMAPFilter(object): ! def __init__(self, classifier): global imap imap = imaplib.IMAP4(options.imap_server, options.imap_port) + lgn = imap.login(options.imap_username, options.imap_password) self.spam_folder = IMAPFolder(options.imap_spam_folder) self.unsure_folder = IMAPFolder(options.imap_unsure_folder) ! self.classifier = classifier def Train(self): if options.verbose: *************** *** 221,225 **** for fol in ham_training_folders: folder = IMAPFolder(fol) ! folder.Train(self.classifier, False) if options.imap_spam_train_folders != "": --- 268,272 ---- for fol in ham_training_folders: folder = IMAPFolder(fol) ! 
num_ham_trained = folder.Train(self.classifier, False) if options.imap_spam_train_folders != "": *************** *** 227,236 **** for fol in spam_training_folders: folder = IMAPFolder(fol) ! folder.Train(self.classifier, True) ! self.classifier.store() if options.verbose: ! print "Training took", time.time() - t, "seconds." def Filter(self): --- 274,285 ---- for fol in spam_training_folders: folder = IMAPFolder(fol) ! num_spam_trained = folder.Train(self.classifier, True) ! if num_ham_trained or num_spam_trained: ! self.classifier.store() if options.verbose: ! print "Training took %s seconds, %s messages were trained" \ ! % (time.time() - t, num_ham_trained + num_spam_trained) def Filter(self): *************** *** 253,261 **** if __name__ == '__main__': ! options.verbose = True ! imap_filter = IMAPFilter() # imap_filter.imap.debug = 10 ! imap_filter.Login() ! imap_filter.Train() ! imap_filter.Filter() imap_filter.Logout() --- 302,355 ---- if __name__ == '__main__': ! ! try: ! opts, args = getopt.getopt(sys.argv[1:], 'htcvd:D:') ! except getopt.error, msg: ! print >>sys.stderr, str(msg) + '\n\n' + __doc__ ! sys.exit() ! ! bdbname = options.pop3proxy_persistent_storage_file ! useDBM = options.pop3proxy_persistent_use_database ! doTrain = False ! doClassify = False ! ! for opt, arg in opts: ! if opt == '-h': ! print >>sys.stderr, __doc__ ! sys.exit() ! elif opt == '-d': ! useDBM = False ! bdbname = arg ! elif opt == '-D': ! useDBM = True ! bdbname = arg ! elif opt == '-t': ! doTrain = True ! elif opt == '-c': ! doClassify = True ! elif opt == '-v': ! options.verbose = True ! ! ! bdbname = os.path.expanduser(bdbname) ! ! if options.verbose: ! print "Loading database %s..." % (bdbname), ! ! if useDBM: ! classifier = storage.DBDictClassifier(bdbname) ! else: ! classifier = storage.PickledClassifier(bdbname) ! ! if options.verbose: ! print "Done." ! ! imap_filter = IMAPFilter(classifier) # imap_filter.imap.debug = 10 ! # imap_filter.Login() ! if doTrain: ! 
imap_filter.Train() ! if doClassify: ! imap_filter.Filter() ! imap_filter.Logout() From timstone4 at users.sourceforge.net Sun Apr 13 06:54:05 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Sun Apr 13 08:54:08 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv29022 Modified Files: message.py Log Message: Raised an error on RememberClassification if the classification to be remembered is not recognizable. Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** message.py 13 Apr 2003 02:02:54 -0000 1.8 --- message.py 13 Apr 2003 12:54:02 -0000 1.9 *************** *** 142,151 **** def setPayload(self, payload): prs = email.Parser.Parser() # this is kindof a hack, due to the fact that the parser creates a # new message object, and we already have the message object ! prs._parseheaders(self, StringIO(payload)) # we may want to do some header parsing error handling here # to try to extract important headers regardless of malformations ! prs._parsebody(self, StringIO(payload)) def setId(self, id): --- 142,152 ---- def setPayload(self, payload): prs = email.Parser.Parser() + fp = StringIO(payload) # this is kindof a hack, due to the fact that the parser creates a # new message object, and we already have the message object ! prs._parseheaders(self, fp) # we may want to do some header parsing error handling here # to try to extract important headers regardless of malformations ! 
prs._parsebody(self, fp) def setId(self, id): *************** *** 153,159 **** raise ValueError, "MsgId has already been set, cannot be changed" - # XXX This isn't really needed since type(None) is not - # XXX in types.StringTypes - do we still want it for the - # XXX more informative error message? if id is None: raise ValueError, "MsgId must not be None" --- 154,157 ---- *************** *** 170,175 **** def asTokens(self): - # use as_string() here because multipart/digest will return - # a list of message objects if get_payload() is used return tokenize(self.as_string()) --- 168,171 ---- *************** *** 199,204 **** elif cls == options.header_ham_string: self.c = 'h' ! else: self.c = 'u' self.modified() --- 195,203 ---- elif cls == options.header_ham_string: self.c = 'h' ! elif cls == options.header_unsure_string: self.c = 'u' + else: + raise ValueError, \ + "Classification must match header strings in options" self.modified() *************** *** 213,217 **** def __repr__(self): ! return "core.Message%r" % repr(self.__getstate__()) def __getstate__(self): --- 212,216 ---- def __repr__(self): ! return "spambayes.message.Message%r" % repr(self.__getstate__()) def __getstate__(self): From timstone4 at users.sourceforge.net Sun Apr 13 06:54:57 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Sun Apr 13 08:55:01 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv29478 Modified Files: imapfilter.py Log Message: Made the message constructor work the way I wanted it to originally. Added a couple of options, refactored a few methods, general code cleanup. 
Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** imapfilter.py 13 Apr 2003 02:04:45 -0000 1.7 --- imapfilter.py 13 Apr 2003 12:54:54 -0000 1.8 *************** *** 26,29 **** --- 26,32 ---- -c : classify inbox -h : help + -v : verbose mode + -e : sets expunge to the *opposite* of options.imap_expunge + -i debuglvl : a somewhat mysterious imaplib debugging level Examples: *************** *** 39,42 **** --- 42,53 ---- To Do: + o Remove old msg from info database when saveing modified messages + o Use DELETE rather than storing //DELETED flag when saving modified messages + o Web UI for configuration and setup. # Tony thinks it would be + nice if there was a web ui to this for the initial setup (i.e. like + pop3proxy), which offered a list of folders to filter/train/etc. It + could then record a uid for the folder rather than a name, and it + avoids the problems with different imap servers having different + naming styles a list is retrieved via imap.list() o Suggestions? """ *************** *** 49,59 **** __credits__ = "Tim Stone, All the Spambayes folk." - # Tony thinks it would be nice if there was a web ui to - # this for the initial setup (i.e. like pop3proxy), which offered - # a list of folders to filter/train/etc. It could then record a - # uid for the folder rather than a name, and it avoids the problems - # with different imap servers having different naming styles - # a list is retrieved via imap.list() - try: True, False --- 60,63 ---- *************** *** 79,90 **** class IMAPMessage(message.SBHeaderMessage): # response checking is necessary throughout this class ! def __init__(self): message.Message.__init__(self) ! #XXX When a message object is created, an id and a folder should ! #XXX immediately be set. These cannot be passed in on the ! 
#XXX constructor, due to the quirky way that email.Parser.Parser ! #XXX does its thing. ! self.id = None ! self.folder = None self.previous_folder = None --- 83,91 ---- class IMAPMessage(message.SBHeaderMessage): # response checking is necessary throughout this class ! def __init__(self, folder, id): message.Message.__init__(self) ! ! self.id = id ! self.folder = folder self.previous_folder = None *************** *** 110,114 **** # and we do an actual move on save (to avoid doing # this more than once) ! if self.previous_folder is None and not self.folder == dest: self.previous_folder = self.folder self.folder = dest --- 111,115 ---- # and we do an actual move on save (to avoid doing # this more than once) ! if self.previous_folder is None: self.previous_folder = self.folder self.folder = dest *************** *** 201,212 **** # we return an instance of *our* message class, not the # raw rfc822 message ! #XXX I can't get parsing to work correctly if I pull the guts ! #XXX out of Parser.parse() and do that in the setPayload method ! #XXX of the message class. Why? I have **NO** idea. ! #msg = IMAPMessage(self, key) ! #msg.setPayload(messageText) ! msg = email.Parser.Parser(_class=IMAPMessage).parsestr(messageText) ! msg.folder = self ! msg.setId(key) return msg --- 202,208 ---- # we return an instance of *our* message class, not the # raw rfc822 message ! ! msg = IMAPMessage(self, key) ! msg.setPayload(messageText) return msg *************** *** 253,257 **** global imap imap = imaplib.IMAP4(options.imap_server, options.imap_port) ! lgn = imap.login(options.imap_username, options.imap_password) self.spam_folder = IMAPFolder(options.imap_spam_folder) --- 249,254 ---- global imap imap = imaplib.IMAP4(options.imap_server, options.imap_port) ! ! self.Login(options.imap_username, options.imap_password) self.spam_folder = IMAPFolder(options.imap_spam_folder) *************** *** 294,300 **** print "Filtering took", time.time() - t, "seconds." ! def Logout(self): # sign off ! 
if options.imap_expunge: imap.expunge() imap.logout() --- 291,308 ---- print "Filtering took", time.time() - t, "seconds." ! def Login(self, uid, pw): ! try: ! lgn = imap.login(uid, pw) ! except imaplib.IMAP4.error, e: ! if str(e) == "permission denied": ! print "There was an error logging in to the IMAP server." ! print "The userid and/or password may be in error." ! sys.exit() ! else: ! raise ! ! def Logout(self, expunge): # sign off ! if expunge: imap.expunge() imap.logout() *************** *** 304,308 **** try: ! opts, args = getopt.getopt(sys.argv[1:], 'htcvd:D:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ --- 312,316 ---- try: ! opts, args = getopt.getopt(sys.argv[1:], 'htcvei:d:D:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ *************** *** 313,316 **** --- 321,326 ---- doTrain = False doClassify = False + doExpunge = options.imap_expunge + imapDebug = 0 for opt, arg in opts: *************** *** 330,351 **** elif opt == '-v': options.verbose = True ! bdbname = os.path.expanduser(bdbname) ! ! if options.verbose: ! print "Loading database %s..." % (bdbname), ! ! if useDBM: ! classifier = storage.DBDictClassifier(bdbname) ! else: ! classifier = storage.PickledClassifier(bdbname) ! if options.verbose: ! print "Done." imap_filter = IMAPFilter(classifier) ! # imap_filter.imap.debug = 10 ! # imap_filter.Login() if doTrain: imap_filter.Train() --- 340,365 ---- elif opt == '-v': options.verbose = True + elif opt == '-e': + doExpunge = not doExpunge + elif opt == '-i:': + imapDebug = int(arg) ! bdbname = os.path.expanduser(bdbname) ! ! if options.verbose: ! print "Loading database %s..." % (bdbname), ! ! if useDBM: ! classifier = storage.DBDictClassifier(bdbname) ! else: ! classifier = storage.PickledClassifier(bdbname) ! if options.verbose: ! print "Done." imap_filter = IMAPFilter(classifier) ! imap.debug = imapDebug ! 
if doTrain: imap_filter.Train() *************** *** 353,355 **** imap_filter.Filter() ! imap_filter.Logout() --- 367,369 ---- imap_filter.Filter() ! imap_filter.Logout(doExpunge) From timstone4 at users.sourceforge.net Sun Apr 13 14:45:33 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Sun Apr 13 16:45:37 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.8,1.9 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv20262 Modified Files: imapfilter.py Log Message: Cobbled together a hack to remove old message ids from the message info db. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** imapfilter.py 13 Apr 2003 12:54:54 -0000 1.8 --- imapfilter.py 13 Apr 2003 20:45:30 -0000 1.9 *************** *** 42,46 **** To Do: ! o Remove old msg from info database when saveing modified messages o Use DELETE rather than storing //DELETED flag when saving modified messages o Web UI for configuration and setup. # Tony thinks it would be --- 42,47 ---- To Do: ! o Find a better way to remove old msg from info database when saving ! modified messages o Use DELETE rather than storing //DELETED flag when saving modified messages o Web UI for configuration and setup. # Tony thinks it would be *************** *** 131,135 **** old_id = self.id - self.id = new_id if self.previous_folder is not None: response = imap.select(self.previous_folder.name, False) --- 132,135 ---- *************** *** 139,144 **** self._check(response, 'store') ! #XXX We really should delete the old message from the msgid db. ! #XXX There is currently no interface to do this with. --- 139,150 ---- self._check(response, 'store') ! #XXX This code to deletd the old message id from the message ! #XXX info db and manipulate the message id, is a *serious* hack. ! 
#XXX There's gotta be a better way to do this. ! ! message.msginfoDB._delState(self) ! ! self.id = str(new_id) ! self.modified() From montanaro at users.sourceforge.net Sun Apr 13 16:24:27 2003 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sun Apr 13 18:24:30 2003 Subject: [Spambayes-checkins] spambayes mboxtrain.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv6250 Modified Files: mboxtrain.py Log Message: train() clearly couldn't be correct the way it was written. Pass in trainnew from main() and test for the existence of path. (Not tested - I don't use this code. Just responding to a note on the spambayes list.) Index: mboxtrain.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mboxtrain.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** mboxtrain.py 21 Mar 2003 01:33:19 -0000 1.6 --- mboxtrain.py 13 Apr 2003 22:24:24 -0000 1.7 *************** *** 208,213 **** (trained, counter)) ! def train(h, path, is_spam, force): ! if os.path.isfile(path): mbox_train(h, path, is_spam, force) elif os.path.isdir(os.path.join(path, "cur")): --- 208,215 ---- (trained, counter)) ! def train(h, path, is_spam, force, trainnew): ! if not os.path.exists(path): ! raise ValueError("Nonexistent path: %s" % path) ! elif os.path.isfile(path): mbox_train(h, path, is_spam, force) elif os.path.isdir(os.path.join(path, "cur")): *************** *** 277,286 **** for g in good: if loud: print "Training ham (%s):" % g ! train(h, g, False, force) save = True for s in spam: if loud: print "Training spam (%s):" % s ! train(h, s, True, force) save = True --- 279,288 ---- for g in good: if loud: print "Training ham (%s):" % g ! train(h, g, False, force, trainnew) save = True for s in spam: if loud: print "Training spam (%s):" % s ! 
train(h, s, True, force, trainnew) save = True From anadelonbrin at users.sourceforge.net Sun Apr 13 16:52:16 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Sun Apr 13 18:52:19 2003 Subject: [Spambayes-checkins] spambayes/windows pop3proxy_service.py, 1.3, 1.4 Message-ID: Update of /cvsroot/spambayes/spambayes/windows In directory sc8-pr-cvs1:/tmp/cvs-serv23596/windows Modified Files: pop3proxy_service.py Log Message: Fix for UI showing incorrect server strings as discovered and solved by Paul Moore. Index: pop3proxy_service.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/windows/pop3proxy_service.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** pop3proxy_service.py 23 Mar 2003 07:21:52 -0000 1.3 --- pop3proxy_service.py 13 Apr 2003 22:52:13 -0000 1.4 *************** *** 102,105 **** --- 102,106 ---- def ServerThread(self): state = pop3proxy.state + state.buildServerStrings() pop3proxy.main(state.servers, state.proxyPorts, state.uiPort, state.launchUI) From anadelonbrin at users.sourceforge.net Sun Apr 13 18:15:53 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Sun Apr 13 20:15:57 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv31908 Modified Files: imapfilter.py Log Message: Various speed improvements. Some comment changes. Various small changes. Tested with two IMAP servers and seems to still work :) Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** imapfilter.py 13 Apr 2003 20:45:30 -0000 1.9 --- imapfilter.py 14 Apr 2003 00:15:50 -0000 1.10 *************** *** 6,14 **** messages are scored and (where necessary) filtered. 
- It is suggested that this filter is set to run at certain intervals. - Note that it is (currently) fairly slow, so this should not be too - often. An alternative to this would be to keep the filter running - and logged in, and periodically check for new mail. - The original filter design owed much to isbg by Roger Binns (http://www.rogerbinns.com/isbg). --- 6,9 ---- *************** *** 45,49 **** modified messages o Use DELETE rather than storing //DELETED flag when saving modified messages ! o Web UI for configuration and setup. # Tony thinks it would be nice if there was a web ui to this for the initial setup (i.e. like pop3proxy), which offered a list of folders to filter/train/etc. It --- 40,44 ---- modified messages o Use DELETE rather than storing //DELETED flag when saving modified messages ! o Web UI for configuration and setup. Tony thinks it would be nice if there was a web ui to this for the initial setup (i.e. like pop3proxy), which offered a list of folders to filter/train/etc. It *************** *** 51,54 **** --- 46,58 ---- avoids the problems with different imap servers having different naming styles a list is retrieved via imap.list() + o IMAPMessage and IMAPFolder currently carry out very simple checks + of responses received from IMAP commands, but if the response is not + "OK", then the filter terminates. Handling of these errors could be + much nicer. + o The filter is currently designed to be periodically run (with cron, + for example). It would probably be nicer if it was continually + running (like pop3proxy, for example) and periodically checked for + any new messages to process (with the RECENT command). The period + could be an option. o Suggestions? 
""" *************** *** 82,87 **** imap = None class IMAPMessage(message.SBHeaderMessage): - # response checking is necessary throughout this class def __init__(self, folder, id): message.Message.__init__(self) --- 86,116 ---- imap = None + # global rfc822 fetch command + rfc822_command = "(RFC822.PEEK)" + + # For efficiency, we remember which folder we are currently + # in, and only send a select command to the IMAP server if + # we want to *change* folders. This function is used by + # both IMAPMessage and IMAPFolder. + # Occaisionally, we need to force a command, because we + # are interested in the response. Things would be much + # nicer if we cached this information somewhere. + # XXX If we wanted to be nice and tidy, this really belongs + # XXX in an IMAPUtilities class, or something like that. + current_folder = None + current_folder_readonly = None + def Select(folder, readOnly=True, force=False): + global current_folder + global current_folder_readonly + if current_folder != folder or current_folder_readonly != readOnly or force: + response = imap.select(folder, readOnly) + if response[0] != "OK": + print "Invalid response to %s:\n%s" % (command, response) + sys.exit(-1) + current_folder = folder + current_folder_readonly = readOnly + return response + class IMAPMessage(message.SBHeaderMessage): def __init__(self, folder, id): message.Message.__init__(self) *************** *** 101,110 **** # timestamp from the message itself, but for the moment, we # just use the current time. ! #XXX the imaplib time function returns a string like ! #XXX "12-Apr-2003 19:56:28 -0500" This seems like a bad message id. ! #XXX For one thing, it only resolves to one second. Even a cheap ! #XXX refractor telescope can resolve better than that ;) ! # return imaplib.Time2Internaldate(time.time()) ! return time.time() def MoveTo(self, dest): --- 130,134 ---- # timestamp from the message itself, but for the moment, we # just use the current time. ! 
return imaplib.Time2Internaldate(time.time()) def MoveTo(self, dest): *************** *** 119,125 **** # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one ! new_id = self.extractTime() response = imap.append(self.folder.name, None, ! new_id, self.as_string()) self._check(response, 'append') # we need to update the uid, as it will have changed --- 143,149 ---- # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one ! time_stamp = self.extractTime() response = imap.append(self.folder.name, None, ! time_stamp, self.as_string()) self._check(response, 'append') # we need to update the uid, as it will have changed *************** *** 127,175 **** # XXX changed, as the message to be deleted will be found first # XXX (if they are in the same folder) ! #response = imap.uid("SEARCH", "(TEXT)", self.as_string()) ! #self._check(response, 'search') ! #self.id = response[1][0] old_id = self.id ! if self.previous_folder is not None: ! response = imap.select(self.previous_folder.name, False) ! self._check(response, 'folder select') self.previous_folder = None ! response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)") ! self._check(response, 'store') ! #XXX This code to deletd the old message id from the message ! #XXX info db and manipulate the message id, is a *serious* hack. ! #XXX There's gotta be a better way to do this. ! message.msginfoDB._delState(self) ! 
self.id = str(new_id) self.modified() class IMAPFolder(object): - # response checking is necessary throughout this class def __init__(self, folder_name, readOnly=True): self.name = folder_name - # Convert folder name to a uid - self.uid = None - response = imap.select(self.name, readOnly) - responses = imap.response("OK")[1] - for response in responses: - if response[:13] == "[UIDVALIDITY ": - r = re.compile(r"(?P\d+)") - self.uid = r.search(response[13:]).group('uid') - # We really want to use RFC822.PEEK here, as that doesn't effect - # the status of the message. Unfortunately, it appears that not - # all IMAP servers support this, even though it is in RFC1730 - self.rfc822_command = "(RFC822.PEEK)" - response = imap.fetch("1:1", self.rfc822_command) - if response[0] != "OK": - self.rfc822_command = "(RFC822)" - - def Select(self): - imap.select(self.name, False) - self._check(folder, 'select') def _check(self, response, command): --- 151,187 ---- # XXX changed, as the message to be deleted will be found first # XXX (if they are in the same folder) ! # response = imap.uid("SEARCH", "(TEXT)", self.as_string()) ! # self._check(response, 'search') ! # new_id = response[1][0] ! # XXX This fails at the moment and needs to be resolved, ! # XXX but it can't be properly checked until the header ! # XXX adding part of the message class works. ! # XXX For the moment, having a new empty-string id just ! # XXX mucks up our message database, not the training or ! # XXX filtering itself ! new_id = "" old_id = self.id ! if self.previous_folder is None: ! self.folder.Select(False) ! else: ! self.previous_folder.Select(False) self.previous_folder = None ! response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)") ! self._check(response, 'store') ! #XXX This code to delete the old message id from the message ! #XXX info db and manipulate the message id, is a *serious* hack. ! #XXX There's gotta be a better way to do this. ! message.msginfoDB._delState(self) ! 
self.id = new_id self.modified() class IMAPFolder(object): def __init__(self, folder_name, readOnly=True): self.name = folder_name def _check(self, response, command): *************** *** 189,193 **** '''Returns uids for all the messages in the folder''' # request message range ! response = imap.select(self.name, True) total_messages = response[1][0] if total_messages == '0': --- 201,205 ---- '''Returns uids for all the messages in the folder''' # request message range ! response = Select(self.name, True, True) total_messages = response[1][0] if total_messages == '0': *************** *** 204,208 **** def __getitem__(self, key): '''Return message matching the given uid''' ! response = imap.uid("FETCH", key, self.rfc822_command) messageText = response[1][0][1] # we return an instance of *our* message class, not the --- 216,229 ---- def __getitem__(self, key): '''Return message matching the given uid''' ! global rfc822_command ! Select(self.name, True) ! # We really want to use RFC822.PEEK here, as that doesn't effect ! # the status of the message. Unfortunately, it appears that not ! # all IMAP servers support this, even though it is in RFC1730 ! response = imap.uid("FETCH", key, rfc822_command) ! if response[0] != "OK": ! rfc822_command = "(RFC822)" ! response = imap.uid("FETCH", key, rfc822_command) ! self._check(response, "uid fetch") messageText = response[1][0][1] # we return an instance of *our* message class, not the *************** *** 214,217 **** --- 235,241 ---- return msg + def Select(self, readOnly): + return Select(self.name, readOnly) + def Train(self, classifier, isSpam): '''Train folder as spam/ham''' *************** *** 252,258 **** class IMAPFilter(object): ! def __init__(self, classifier): global imap imap = imaplib.IMAP4(options.imap_server, options.imap_port) self.Login(options.imap_username, options.imap_password) --- 276,283 ---- class IMAPFilter(object): ! 
def __init__(self, classifier, debug): global imap imap = imaplib.IMAP4(options.imap_server, options.imap_port) + imap.debug = imapDebug self.Login(options.imap_username, options.imap_password) *************** *** 274,278 **** if options.imap_spam_train_folders != "": ! spam_training_folders = options.imap_spam_train_folders.split(' ' ) for fol in spam_training_folders: folder = IMAPFolder(fol) --- 299,303 ---- if options.imap_spam_train_folders != "": ! spam_training_folders = options.imap_spam_train_folders.split() for fol in spam_training_folders: folder = IMAPFolder(fol) *************** *** 303,307 **** if str(e) == "permission denied": print "There was an error logging in to the IMAP server." ! print "The userid and/or password may be in error." sys.exit() else: --- 328,332 ---- if str(e) == "permission denied": print "There was an error logging in to the IMAP server." ! print "The userid and/or password may be incorrect." sys.exit() else: *************** *** 348,355 **** elif opt == '-e': doExpunge = not doExpunge ! elif opt == '-i:': imapDebug = int(arg) - bdbname = os.path.expanduser(bdbname) --- 373,379 ---- elif opt == '-e': doExpunge = not doExpunge ! elif opt == '-i': imapDebug = int(arg) bdbname = os.path.expanduser(bdbname) *************** *** 365,370 **** print "Done." ! imap_filter = IMAPFilter(classifier) ! imap.debug = imapDebug if doTrain: --- 389,393 ---- print "Done." ! 
imap_filter = IMAPFilter(classifier, imapDebug) if doTrain: From timstone4 at users.sourceforge.net Sun Apr 13 23:07:34 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Mon Apr 14 01:07:38 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.10,1.11 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv31805 Modified Files: imapfilter.py Log Message: Fixed the docstring not printing on -h, changed -e operand to have a value argument of y/n to control expunging, and added a -l value operand for looping the filter with a sleep time interval. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** imapfilter.py 14 Apr 2003 00:15:50 -0000 1.10 --- imapfilter.py 14 Apr 2003 05:07:32 -0000 1.11 *************** *** 1,6 **** #!/usr/bin/env python - from __future__ import generators - """An IMAP filter. An IMAP message box is scanned and all non-scored messages are scored and (where necessary) filtered. --- 1,4 ---- *************** *** 22,27 **** -h : help -v : verbose mode ! -e : sets expunge to the *opposite* of options.imap_expunge -i debuglvl : a somewhat mysterious imaplib debugging level Examples: --- 20,26 ---- -h : help -v : verbose mode ! -e y/n : sets expunge to the *opposite* of options.imap_expunge -i debuglvl : a somewhat mysterious imaplib debugging level + -l minutes : period of time between filtering operations Examples: *************** *** 39,43 **** o Find a better way to remove old msg from info database when saving modified messages - o Use DELETE rather than storing //DELETED flag when saving modified messages o Web UI for configuration and setup. Tony thinks it would be nice if there was a web ui to this for the initial setup (i.e. 
like --- 38,41 ---- *************** *** 54,58 **** running (like pop3proxy, for example) and periodically checked for any new messages to process (with the RECENT command). The period ! could be an option. o Suggestions? """ --- 52,56 ---- running (like pop3proxy, for example) and periodically checked for any new messages to process (with the RECENT command). The period ! could be an option. This is partially done with the -l operand. o Suggestions? """ *************** *** 65,68 **** --- 63,68 ---- __credits__ = "Tim Stone, All the Spambayes folk." + from __future__ import generators + try: True, False *************** *** 343,347 **** try: ! opts, args = getopt.getopt(sys.argv[1:], 'htcvei:d:D:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ --- 343,347 ---- try: ! opts, args = getopt.getopt(sys.argv[1:], 'htcvl:e:i:d:D:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ *************** *** 354,357 **** --- 354,358 ---- doExpunge = options.imap_expunge imapDebug = 0 + sleepTime = 0 for opt, arg in opts: *************** *** 372,378 **** options.verbose = True elif opt == '-e': ! doExpunge = not doExpunge elif opt == '-i': imapDebug = int(arg) bdbname = os.path.expanduser(bdbname) --- 373,388 ---- options.verbose = True elif opt == '-e': ! if arg == 'y': ! doExpunge = True ! else: ! doExpunge = False elif opt == '-i': imapDebug = int(arg) + elif opt == '-l': + sleepTime = int(arg) * 60 + + if not (doClassify or doTrain): + print "-c and/or -t operands must be specified" + sys.exit() bdbname = os.path.expanduser(bdbname) *************** *** 390,398 **** imap_filter = IMAPFilter(classifier, imapDebug) ! if doTrain: ! imap_filter.Train() ! if doClassify: ! 
imap_filter.Filter() imap_filter.Logout(doExpunge) --- 400,418 ---- imap_filter = IMAPFilter(classifier, imapDebug) + print sleepTime + while 1: + if doTrain: + if options.verbose: + print "Training" + imap_filter.Train() + if doClassify: + if options.verbose: + print "Classifying" + imap_filter.Filter() ! if sleepTime: ! time.sleep(sleepTime) ! else: ! break imap_filter.Logout(doExpunge) From timstone4 at users.sourceforge.net Sun Apr 13 23:15:32 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Mon Apr 14 01:15:35 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.11,1.12 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv3899 Modified Files: imapfilter.py Log Message: Left a debug message in. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** imapfilter.py 14 Apr 2003 05:07:32 -0000 1.11 --- imapfilter.py 14 Apr 2003 05:15:27 -0000 1.12 *************** *** 400,404 **** imap_filter = IMAPFilter(classifier, imapDebug) ! print sleepTime while 1: if doTrain: --- 400,404 ---- imap_filter = IMAPFilter(classifier, imapDebug) ! 
while 1: if doTrain: From timstone4 at users.sourceforge.net Mon Apr 14 20:22:45 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Mon Apr 14 22:22:48 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.12,1.13 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv22122 Modified Files: imapfilter.py Log Message: Timestamp for new messages extracted correctly Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** imapfilter.py 14 Apr 2003 05:15:27 -0000 1.12 --- imapfilter.py 15 Apr 2003 02:22:42 -0000 1.13 *************** *** 130,134 **** # timestamp from the message itself, but for the moment, we # just use the current time. ! return imaplib.Time2Internaldate(time.time()) def MoveTo(self, dest): --- 130,137 ---- # timestamp from the message itself, but for the moment, we # just use the current time. ! try: ! return self["Date"] ! except KeyError: ! return imaplib.Time2Internaldate(time.time()) def MoveTo(self, dest): From anadelonbrin at users.sourceforge.net Wed Apr 16 18:08:20 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Wed Apr 16 20:08:23 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.13,1.14 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv32173 Modified Files: imapfilter.py Log Message: Fix the invalid date problem reported by Oliver Maunder. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** imapfilter.py 15 Apr 2003 02:22:42 -0000 1.13 --- imapfilter.py 17 Apr 2003 00:08:17 -0000 1.14 *************** *** 20,24 **** -h : help -v : verbose mode ! 
-e y/n : sets expunge to the *opposite* of options.imap_expunge -i debuglvl : a somewhat mysterious imaplib debugging level -l minutes : period of time between filtering operations --- 20,24 ---- -h : help -v : verbose mode ! -e y/n : expunge/purge messages on exit (y) or not (n) -i debuglvl : a somewhat mysterious imaplib debugging level -l minutes : period of time between filtering operations *************** *** 79,82 **** --- 79,83 ---- import getopt import email.Parser + from email.Utils import parsedate from spambayes.Options import options *************** *** 131,135 **** # just use the current time. try: ! return self["Date"] except KeyError: return imaplib.Time2Internaldate(time.time()) --- 132,136 ---- # just use the current time. try: ! return imaplib.Time2Internaldate(time.mktime(parsedate(self["Date"]))) except KeyError: return imaplib.Time2Internaldate(time.time()) *************** *** 150,167 **** time_stamp, self.as_string()) self._check(response, 'append') - # we need to update the uid, as it will have changed - # XXX there will be problems here if the message *has not* - # XXX changed, as the message to be deleted will be found first - # XXX (if they are in the same folder) - # response = imap.uid("SEARCH", "(TEXT)", self.as_string()) - # self._check(response, 'search') - # new_id = response[1][0] - # XXX This fails at the moment and needs to be resolved, - # XXX but it can't be properly checked until the header - # XXX adding part of the message class works. 
- # XXX For the moment, having a new empty-string id just - # XXX mucks up our message database, not the training or - # XXX filtering itself - new_id = "" old_id = self.id --- 151,154 ---- *************** *** 174,177 **** --- 161,174 ---- self._check(response, 'store') + # We need to update the uid, as it will have changed + # XXX There will be problems here if the message *has not* + # XXX changed, as the message to be deleted will be found first + # XXX (if they are in the same folder) + self.folder.Select(True) + #response = imap.uid("SEARCH", "TEXT", self.as_string()) + #self._check(response, 'search') + #new_id = response[1][0] + new_id = "" + #XXX This code to delete the old message id from the message #XXX info db and manipulate the message id, is a *serious* hack. *************** *** 179,183 **** message.msginfoDB._delState(self) - self.id = new_id self.modified() --- 176,179 ---- *************** *** 404,408 **** imap_filter = IMAPFilter(classifier, imapDebug) ! while 1: if doTrain: if options.verbose: --- 400,404 ---- imap_filter = IMAPFilter(classifier, imapDebug) ! while True: if doTrain: if options.verbose: From anadelonbrin at users.sourceforge.net Wed Apr 16 18:41:32 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Wed Apr 16 20:42:02 2003 Subject: [Spambayes-checkins] spambayes INTEGRATION.txt,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv11441 Modified Files: INTEGRATION.txt Log Message: Update to include basic information about the IMAP filter. Also updated SMTP proxy information so that it is (hopefully!) clearer. Updated the POP3 proxy information so that users are directed to configure spambayes via the web user interface rather than mucking about in bayescustomize.ini. 
Index: INTEGRATION.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/INTEGRATION.txt,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** INTEGRATION.txt 11 Mar 2003 01:51:05 -0000 1.7 --- INTEGRATION.txt 17 Apr 2003 00:41:30 -0000 1.8 *************** *** 63,67 **** -------- ! There are six main components to the Spambayes system: o A database. Loosely speaking, this is a collection of words and --- 63,67 ---- -------- ! There are eight main components to the Spambayes system: o A database. Loosely speaking, this is a collection of words and *************** *** 80,86 **** o The POP3 proxy. This sits between your email client (Eudora, Outlook ! Express, etc) and your email server, and adds the classification header ! to emails as you download them. A typical user's email setup looks ! like this: +-----------------+ +-------------+ --- 80,86 ---- o The POP3 proxy. This sits between your email client (Eudora, Outlook ! Express, etc) and your incoming email server, and adds the ! classification header to emails as you download them. A typical ! user's email setup looks like this: +-----------------+ +-------------+ *************** *** 109,116 **** servers, if you have more than one email account. ! o The web interface. This is a server that runs alongside the POP3 proxy ! and lets you control it through the web. You can upload emails to it ! for training or classification, query the probabilities database ("How ! many of my emails really *do* contain the word Viagra"?) and most importantly, train it on the emails you've received. When you start using the system, unless you train it using the Hammie script it will --- 109,148 ---- servers, if you have more than one email account. ! o The SMTP proxy. This sits between your email client (Eudora, Outlook ! Express, etc) and your outgoing email server. Any mail sent to ! spambayes_spam@localhost or spambayes_ham@localhost is intercepted ! 
and trained appropriately. A typical user's email setup looks like ! this: ! ! +-----------------+ +-------------+ ! | Outlook Express | Internet or intranet | | ! | (or similar) | <--------------------------> | SMTP server | ! | | | | ! +-----------------+ +-------------+ ! ! The SMTP server runs either at your ISP for internet mail, or somewhere ! on your internal network for corporate mail. The SMTP proxy sits in the ! middle and checks for mail to train on as you send your email: ! ! +-----------------+ +------------+ +-------------+ ! | Outlook Express | | Spambayes | | | ! | (or similar) | <----> | SMTP proxy | <----> | SMTP server | ! | | | | | | ! +-----------------+ +------------+ +-------------+ ! ! So where you currently have your email client configured to talk to ! say, "smtp.my-isp.com", you instead configure the *proxy* to talk to ! "smtp.my-isp.com" and configure your email client to talk to the proxy. ! The SMTP proxy can live on your PC, or on the same machine as the SMTP ! server, or on a different machine entirely, it really doesn't matter. ! Say it's living on your PC, you'd configure your email client to talk ! to "localhost". You can configure the proxy to talk to multiple SMTP ! servers, if you have more than one email account. ! ! o The web interface. This is a server that runs alongside the POP3 proxy, ! SMTP proxy, and IMAP filter (see below) and lets you control it through ! the web. You can upload emails to it for training or classification, ! query the probabilities database ("How many of my emails really *do* ! contain the word Viagra"?), find particular messages, and most importantly, train it on the emails you've received. When you start using the system, unless you train it using the Hammie script it will *************** *** 124,133 **** to do it correct the odd mistake - it's very quick and easy. ! o The Outlook plug-in. For Outlook 2000 users (not Outlook Express) this ! lets you manage the whole thing from within Outlook. 
You set up a Ham ! folder and a Spam folder, and train it simply by dragging messages into ! those folders. Alternatively there are buttons to do the same thing. ! And it integrates into Outlook's filtering system to make it easy to ! file all the suspected spam into its own folder, for instance. o The Hammie script. This does three jobs: command-line training, --- 156,165 ---- to do it correct the odd mistake - it's very quick and easy. ! o The Outlook plug-in. For Outlook 2000 and Outlook XP users (not Outlook ! Express) this lets you manage the whole thing from within Outlook. You ! set up a Ham folder and a Spam folder, and train it simply by dragging ! messages into those folders. Alternatively there are buttons to do the ! same thing. And it integrates into Outlook's filtering system to make it ! easy to file all the suspected spam into its own folder, for instance. o The Hammie script. This does three jobs: command-line training, *************** *** 138,149 **** hammiesrv.py. Where things live ----------------- ! The Hammie script is called hammie.py. The POP3 proxy and the web ! interface live in pop3proxy.py. The Outlook plug-in lives in the ! Outlook2000 subdirectory - see the README.txt in that directory for more ! information on that. As well as these components, there's also a whole pile of utility scripts, --- 170,189 ---- hammiesrv.py. + o The IMAP filter. This is a cross between the POP3 proxy and the Outlook + plugin. If your mail sits on an IMAP server, you can use the this to + filter your mail. You can designate folders that contain mail to train + as ham and folders that contain mail to train as spam, and the filter + does this for you. You can also designate folders to filter, along with + a folder for messages Spambayes is unsure about, and a folder for + suspected spam. When new mail arrives, the filter will move mail to the + appropriate location (ham is left in the original folder). Where things live ----------------- ! 
The Hammie script is called hammie.py. The POP3 proxy lives in pop3proxy.py, ! and the smtpproxy lives in smtpproxy.py. The IMAP filter lives in ! imapfilter.py. The Outlook plug-in lives in the Outlook2000 subdirectory ! - see the README.txt in that directory for more information on that. As well as these components, there's also a whole pile of utility scripts, *************** *** 163,166 **** --- 203,211 ---- them, all lives in Options.py. To change an option, create a bayescustomize.ini and add the option to that - don't edit Options.py. + If you are using the POP3 proxy, SMTP proxy or IMAP filter, you can also + change most of the options you will need to access via the web user + interface. You will probably find this at . To + configure the Outlook plugin, you should click on the Anti-Spam button on + the toolbar. *************** *** 181,213 **** -------------------------------------------------------- ! The minimum you need to do to get started is create a bayescustomize.ini ! containing the following: ! ! [pop3proxy] ! pop3proxy_servers: pop3.my-isp.com ! ! where "pop3.my-isp.com" is wherever you currently have your email client ! configured to collect mail from. The proxy will run on port 110 - if you're ! already running a real POP3 proxy on that port, or you're running on a ! platform that won't let unprivileged processes use that port (eg. unix), ! you can use a different one by adding a line like this: ! ! pop3proxy_ports: 1110 ! ! to the [pop3proxy] section of bayescustomize.ini. ! ! You can now run the proxy by running "python pop3proxy.py". This will ! print some status messages, which should include: ! ! BayesProxyListener listening on port 110. ! UserInterfaceListener listening on port 8880. ! What that means is that the POP3 proxy is ready for your email client to ! connect to it on port 110 and that the web interface is ready for your ! browser to connect to it. The address of the web interface is ! 
http://localhost:8880/ (or if you're running it on a different machine, ! replace 'localhost' with the name of the machine). You can have a look ! at the web interface now, but it won't be very interesting because the ! system hasn't seen any messages yet. --- 226,238 ---- -------------------------------------------------------- ! To setup the POP3 and SMTP proxies, run ! pop3proxy.py -b ! from the command line. The web interface should open in your default ! browser. You need to click on the "Configuration Link" to go to the setup ! page. The minimum you need to do to get started is enter the servers and ! ports information in the POP3 proxy and SMTP proxy sections. ! The POP3 proxy is then ready for your email client to connect to it on ! port 110 and the SMTP proxy is ready for connections on port 25. *************** *** 215,222 **** ------------------------------------------ ! You now need to configure your email client to talk to the proxy instead of ! the real email server. Change your equivalent of "pop3.my-isp.com" to "localhost" (or to the name of the machine you're running the proxy on) in ! your email client's setup. Hit "Get new email" and look at the headers of the emails (send yourself an email if you don't have any!) - there should be an X-Spambayes-Classification header there. It probably says "unsure", --- 240,248 ---- ------------------------------------------ ! You now need to configure your email client to talk to the proxies instead of ! the real email servers. Change your equivalent of "pop3.my-isp.com" to "localhost" (or to the name of the machine you're running the proxy on) in ! your email client's setup, and do the same with your equivalent of ! "smtp.my-isp.com". Hit "Get new email" and look at the headers of the emails (send yourself an email if you don't have any!) - there should be an X-Spambayes-Classification header there. 
It probably says "unsure", *************** *** 234,237 **** --- 260,277 ---- "Total emails trained" has increased. + Alternatively, when you receive an incorrectly classified message, you can + forward it to the SMTP proxy for training. If the message should have been + classified as spam, forward or bounce the message to spambayes_spam@localhost, + and if the message should have been classified as ham, forward it to + spambayes_ham@localhost. You can still review the training through the web + interface, if you wish to do so. + + Note that some mail clients (particularly Outlook Express) do not forward + all headers when you bounce, forward or redirect mail. For these clients, + you will need to set (via the web interface) the "add mail id to" option + to body, which will add a unique id to the body of each message you + receive. You can also use this id to find a particular message via the + web interface. + Once you've done this on a few spams and a few hams, you'll find that the X-Spambayes-Classification header is getting it right most of the time. The *************** *** 245,268 **** messages to the web interface via the "Train" form on the Home page. You can train on individual messages (which is tedious) or using mbox files. - - An alternative to training via the web interface is to run the SMTP proxy. - Just as the POP3 proxy sits between your mail client and your POP3 server, - the SMTP proxy sits between your mail client and your SMTP server. To run - the server, start pop3proxy with the "-s" switch. You will need to setup - your mail client just as with the POP3 proxy - change the outgoing mail - (SMTP) server to localhost (or if you are running pop3proxy on a different - machine, replace localhost with the name of the machine). In the web - interface, set the SMTP options to the address and port of your SMTP - server. You will also need to set the "add mail id to" option to "header". 
- To train, you can now forward or bounce mail to spambayes_ham@localhost, - or spambayes_spam@localhost (you can change these addresses via the web - interface). - - Note that some mail clients (particularly Outlook Express) do not forward - all headers when you bounce, forward or redirect mail. For these clients, - you will need to set (via the web interface) the "add mail id to" option - to body, which will add a unique id to the body of each message you - receive. You can also use this id to find a particular message via the - web interface. --- 285,288 ---- From anadelonbrin at users.sourceforge.net Wed Apr 16 19:52:20 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Wed Apr 16 21:52:25 2003 Subject: [Spambayes-checkins] website faq.ht, NONE, 1.1 applications.ht, 1.2, 1.3 background.ht, 1.13, 1.14 developer.ht, 1.7, 1.8 docs.ht, 1.8, 1.9 download.ht, 1.2, 1.3 index.ht, 1.11, 1.12 links.h, 1.5, 1.6 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv1396 Modified Files: applications.ht background.ht developer.ht docs.ht download.ht index.ht links.h Added Files: faq.ht Log Message: General update to cover the new applications, the alpha2 release, and an increase in documentation. --- NEW FILE: faq.ht --- Title: SpamBayes: Frequently Asked Questions Author-Email: spambayes@python.org Author: spambayes

Frequently Asked Questions

  1. Development
    1. Hey! Why don't you implement cool tokenizer trick X? I think it would really foil those spammers!
    2. This software is great! I want to implement it for all my users. Are there plans to develop a server-side spambayes solution?
  2. Using Spambayes
    1. I just got a spam, but the system said it was "unsure". Why couldn't it tell that it was spam - it's obvious?
    2. OK, I trained on that message. But I just got *another* one, and the stupid system still thinks it's unsure. Why did it ignore me???
    3. I've mucked up my training and I want to start all over again, but there isn't an option for this anywhere. What do I do?

If you have any suggestions about other questions and answers that should be included here, please mail the list with them.

Hey! Why don't you implement cool tokenizer trick X? I think it would really foil those spammers!

Have you run your tokenizer trick against a set of messages to see if it actually works? Many times what seems like a good idea turns out not to help much, and sometimes even hurts. If you have a good idea, you've run it against a batch of messages and can prove that it helps, paste the code for your technique and the proof to the mailing list. If you're not a coder, but are really keen on your idea, post a feature request on the project page, and wait for someone else to code it for you (but make sure you do some testing when it's done). Otherwise, you will likely get a message from Tim Peters about why you need to test your idea :)

This software is great! I want to implement it for all my users. Are there plans to develop a server-side spambayes solution?

The problem with a server-side solution is that everyone has a different idea of what is spam - that's the whole strength of the Bayesian-style filtering concept. If you are certain that *all* of your users would agree on what is spam and what is not, then this might work for you, but otherwise you really have to have individual databases for each user. Either way, you should be able to modify spambayes easily enough to fit into your setup. Please let the list know if you do have success in this area, and we'll update this answer.

I just got a spam, but the system said it was "unsure". Why couldn't it tell that it was spam - it's obvious?

It may be obvious to you, but the classifier only works on the information it has been given. Maybe this is "new" (you've never seen this particular flavour of spam before), or maybe there aren't enough clues in the message which the system is aware of as strong spam clues.

OK, I trained on that message. But I just got *another* one, and the stupid system still thinks it's unsure. Why did it ignore me???

It didn't, but you may need to train on a few more of this type of message to get it classified as "spam". The classification algorithm weights its results based on the number of times it has seen a particular clue, so that clues unique to this type of message may need a few more instances to become "convincing".

I've mucked up my training and I want to start all over again, but there isn't an option for this anywhere. What do I do?

Because training from scratch is a very rare occurrence, and because deleting all your training information is something you don't want to do by accident, there isn't an option for this. However, you can quite simply do this manually. All the training data is stored in a file, usually called hammie.db, and if you delete (or rename) this, then you will start training from scratch. If you are using the web interface for the POP3 proxy, the configuration page tells you what this file is called (and where it is) down towards the bottom of the page.

Index: applications.ht =================================================================== RCS file: /cvsroot/spambayes/website/applications.ht,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** applications.ht 3 Mar 2003 22:24:39 -0000 1.2 --- applications.ht 17 Apr 2003 01:52:17 -0000 1.3 *************** *** 8,18 **** getting there (and help is always appreciated).

!

Outlook2000

!

Sean True and Mark Hammond have developed an addin for Outlook2000 that adds support for the spambayes classifier.

Requirements

    !
  • Python2.2 or later (2.2.2 recommended) !
  • Outlook 2000 (not Outlook Express)
  • Python's win32com extensions (win32all-149 or later - currently ActivePython is not suitable) --- 8,18 ---- getting there (and help is always appreciated).

    !

    Outlook

    !

    Sean True and Mark Hammond have developed an addin for Outlook (2000 and XP) that adds support for the spambayes classifier.

    Requirements

      !
    • Python 2.2 or later (2.2.2 recommended) !
    • Outlook 2000 or Outlook XP (not Outlook Express)
    • Python's win32com extensions (win32all-149 or later - currently ActivePython is not suitable) *************** *** 22,33 ****

      Availability

      !

      At the moment, you'll need to use CVS to get the code - go to the CVS page on the project's sourceforge site for more.

      hammie.py

      !

      hammie is a command line tool for marking mail as ham or spam. Skip Montanaro has started a guide to integrating hammie with your mailer (Unix-only instructions at the moment - additions welcome!). Currently it focusses on running hammie via procmail.

      Requirements

        !
      • Python2.2 or later (2.2.2 recommended)
      • Currently documentation focusses on Unix.
      --- 22,37 ----

      Availability

      !

      Mark has packaged together an installer for the plugin. ! You can download it from his website. ! This is currently at version 002.

      !

      Alternatively, you can use CVS to get the code - go to the CVS page on the project's sourceforge site for more.

      hammie.py

      !

      hammie is a command line tool for marking mail as ham or spam. Skip Montanaro has started a ! guide to integrating hammie with your mailer (Unix-only instructions at the moment - additions welcome!). Currently it focusses on running hammie via procmail.

      Requirements

        !
      • Python 2.2 or later (2.2.2 recommended)
      • Currently documentation focusses on Unix.
      *************** *** 38,41 **** --- 42,58 ----

      pop3proxy sits between your mail client and your real POP3 server and marks mail as ham or spam as it passes through. See the docstring at the top of pop3proxy.py for more. +

      Requirements

      +
        +
      • Python2.2 or later (2.2.2 recommended) +
      • Should work on windows/unix/whatever... ? +
      +

      +

      Availability

      +

      At the moment, you'll need to use CVS to get the code - go to the CVS page on the project's sourceforge site for more.

      + +

      imapfilter.py

      +

      imap filter connects to your imap server and marks mail as ham or spam, + moving it to appropriate folders as it arrives. + See the docstring at the top of imapfilter.py for more.

      Requirements

        Index: background.ht =================================================================== RCS file: /cvsroot/spambayes/website/background.ht,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** background.ht 22 Jan 2003 08:30:05 -0000 1.13 --- background.ht 17 Apr 2003 01:52:17 -0000 1.14 *************** *** 97,101 **** them wrong, but it's completely confident in its (wrong) score. (Note that the difference isn't as apparent as it could be - it's a logarithmic scale graph!)

        !

        Add more here - cancellation disease, fudge factors, &c

        Gary Robinson's --- 97,101 ---- them wrong, but it's completely confident in its (wrong) score. (Note that the difference isn't as apparent as it could be - it's a logarithmic scale graph!)

        !

        Gary Robinson's *************** *** 230,235 ****

        There's a lot of background on what's been tried available from the mailing list archives. Initially, the discussion started on ! the python-dev list, but then moved to the ! spambayes list.

        Index: download.ht =================================================================== RCS file: /cvsroot/spambayes/website/download.ht,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** download.ht 17 Jan 2003 17:00:52 -0000 1.2 --- download.ht 17 Apr 2003 01:52:18 -0000 1.3 *************** *** 4,9 ****

        Source Releases

        !

        The first pre-release of version 1.0 of the SpamBayes project is available. ! Download version 1.0a1 from the sourceforge Files page as either a gzipped tarball or a zip file of the source files.

        --- 4,9 ----

        Source Releases

        !

        The second pre-release of version 1.0 of the SpamBayes project is available. ! Download version 1.0a2 from the sourceforge Files page as either a gzipped tarball or a zip file of the source files.

        *************** *** 22,25 **** --- 22,30 ----

        Binary Releases

        +

        Outlook Plugin

        +

        Mark has packaged together an installer for the plugin. + You can download it from his website. + This is currently at version 002.

        +

        Other

        None as yet.

        *************** *** 29,45 **** more details.

        -

        Update:

        -

        - (2003-01-14 14:04:19 - Project CVS Services) As of 2003-01-14, - pserver-based CVS repository access and ViewCVS (web-based) CVS repository - access have been taken offline as to stabilize CVS server performance for - developers. These services will be re-enabled as soon as the underlying - scalability issues have been analyzed and resolved (as soon as 2003-01-15, - if possible). Additional updates will be posted to the Site Status page as - they become available. Your patience is appreciated. -

        - -

        Nightly snapshots

        -

        A nightly snapshot is available: spambayes-nightly.tar.gz.

        -

        Note that due to some Sourceforge issues, this is currently being built with a "manual cron" (i.e. when I remember). Once Sourceforge's CVS issues are resolved, this will be available automatically.

        - --- 34,35 ---- Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** index.ht 20 Jan 2003 20:32:52 -0000 1.11 --- index.ht 17 Apr 2003 01:52:18 -0000 1.12 *************** *** 5,10 ****

        News

        !

        First pre-release available. See the download page for more.

        !

        What is SpamBayes?

        The SpamBayes --- 5,10 ----

        News

        !

        Second pre-release available. See the download page for more.

        !

        What is SpamBayes?

        The SpamBayes *************** *** 92,96 ****

        The code is currently available from a variety of methods from the downloads page. The current release is ! 1.0 prerelease 1.

        --- 92,96 ----

        The code is currently available from a variety of methods from the downloads page. The current release is ! 1.0 alpha 2.

        Index: links.h =================================================================== RCS file: /cvsroot/spambayes/website/links.h,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** links.h 17 Jan 2003 07:34:44 -0000 1.5 --- links.h 17 Apr 2003 01:52:18 -0000 1.6 *************** *** 8,11 ****

        Getting the code

      • Releases -
      • Snapshots
      • CVS access --- 8,10 ---- From anthonybaxter at users.sourceforge.net Wed Apr 16 23:14:39 2003 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Thu Apr 17 01:14:43 2003 Subject: [Spambayes-checkins] website faq.ht,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv25804 Modified Files: faq.ht Log Message: note about stupid beats smart in the tokenizer tricks q. Index: faq.ht =================================================================== RCS file: /cvsroot/spambayes/website/faq.ht,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** faq.ht 17 Apr 2003 01:52:18 -0000 1.1 --- faq.ht 17 Apr 2003 05:14:36 -0000 1.2 *************** *** 19,24 ****

        If you have any suggestions about other questions and answers that should be included here, please mail the list with them.

        !

        Hey! Why don't you implement cool tokenizer trick X? I think it ! would really foil those spammers!

        Have you run your tokenizer trick against a set of messages to see if it actually works? Many times what seems like a good idea turns out --- 19,24 ----

        If you have any suggestions about other questions and answers that should be included here, please mail the list with them.

        !

        Hey! Why don't you implement cool tokenizer trick ! X? I think it would really foil those spammers!

        Have you run your tokenizer trick against a set of messages to see if it actually works? Many times what seems like a good idea turns out *************** *** 30,36 **** code it for you (but make sure you do some testing when it's done). Otherwise, you will likely get a message from Tim Peters about ! why you need to test your idea :)

        !

        This software is great! I want to implement it for all my users. Are there plans to develop a server-side spambayes solution?

        The problem with a server-side solution is that everyone has a --- 30,41 ---- code it for you (but make sure you do some testing when it's done). Otherwise, you will likely get a message from Tim Peters about ! why you need to test your idea :) Note that as a general rule, ! we've found that with the tokenizer, "stupid beats smart" -- that is, ! very specialised tokenizer behaviour usually produces worse results than ! a more general approach that just generates tokens and throws them at the ! classifier.

        !

        This software is great! I want to implement it ! for all my users. Are there plans to develop a server-side spambayes solution?

        The problem with a server-side solution is that everyone has a *************** *** 44,49 **** we'll update this answer.

        !

        I just got a spam, but the system said it was "unsure". Why ! couldn't it tell that it was spam - it's obvious?

        It may be obvious to you, but the classifier only works on the information it has been given. Maybe this is "new" (you've --- 49,54 ---- we'll update this answer.

        !

        I just got a spam, but the system said it was "unsure". ! Why couldn't it tell that it was spam - it's obvious?

        It may be obvious to you, but the classifier only works on the information it has been given. Maybe this is "new" (you've *************** *** 52,58 **** aware of as strong spam clues.

        !

        OK, I trained on that message. But I just got *another* one, ! and the stupid system still thinks it's unsure. Why did it ! ignore me???

        It didn't, but you may need to train on a few more of this type of message to get it classified as "spam". The classification --- 57,63 ---- aware of as strong spam clues.

        !

        OK, I trained on that message. But I just got ! another one, and the stupid system still thinks it's unsure. Why ! did it ignore me???

        It didn't, but you may need to train on a few more of this type of message to get it classified as "spam". The classification *************** *** 61,66 **** of message may need a few more instances to become "convincing".

        !

        I've mucked up my training and I want to start all over again, ! but there isn't an option for this anywhere. What do I do?

        Because training from scratch is a very rare occurrence, and because deleting all your training information is something you don't want --- 66,72 ---- of message may need a few more instances to become "convincing".

        !

        I've mucked up my training and I want to ! start all over again, but there isn't an option for this anywhere. ! What do I do?

        Because training from scratch is a very rare occurrence, and because deleting all your training information is something you don't want From timstone4 at users.sourceforge.net Thu Apr 17 09:47:25 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Thu Apr 17 11:47:32 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.14,1.15 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv8030 Modified Files: imapfilter.py Log Message: 1. Corrected newline mangling (I hope), variations of which caused problems for Oliver Maunder and probably for David Abrahams (see mailing list) 2. Corrected folder comparison operation 3. Refactored functionality into an IMAPSession class, which results in a more consistent IMAP semantic 4. Added -p option to prompt for password, negating the necessity of storing the imap password in clear text in Options 5. Corrected error in training, which resulted in no training being performed Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** imapfilter.py 17 Apr 2003 00:08:17 -0000 1.14 --- imapfilter.py 17 Apr 2003 15:47:20 -0000 1.15 *************** *** 20,23 **** --- 20,25 ---- -h : help -v : verbose mode + -p : security option to prompt for imap password, + rather than look in options.imap_password -e y/n : expunge/purge messages on exit (y) or not (n) -i debuglvl : a somewhat mysterious imaplib debugging level *************** *** 78,81 **** --- 80,84 ---- import sys import getopt + from getpass import getpass import email.Parser from email.Utils import parsedate *************** *** 85,88 **** --- 88,92 ---- # global IMAPlib object + global imap imap = None *************** *** 98,118 **** # nicer if we cached this information somewhere. # XXX If we wanted to be nice and tidy, this really belongs !
# XXX in an IMAPUtilities class, or something like that. ! current_folder = None ! current_folder_readonly = None ! def Select(folder, readOnly=True, force=False): ! global current_folder ! global current_folder_readonly ! if current_folder != folder or current_folder_readonly != readOnly or force: ! response = imap.select(folder, readOnly) ! if response[0] != "OK": ! print "Invalid response to %s:\n%s" % (command, response) ! sys.exit(-1) ! current_folder = folder ! current_folder_readonly = readOnly ! return response class IMAPMessage(message.SBHeaderMessage): ! def __init__(self, folder, id): message.Message.__init__(self) --- 102,149 ---- # nicer if we cached this information somewhere. # XXX If we wanted to be nice and tidy, this really belongs ! # XXX in an IMAPUtilities class, or something like that, ! # XXX or something like this: ! ! class IMAPSession(imaplib.IMAP4): ! '''A class extending the IMAP4 class, with a few optimizations''' ! ! def __init__(self, server, port, debug): ! imaplib.Debug = debug # this is a global in the imaplib module ! imaplib.IMAP4.__init__(self, server, port) ! self.current_folder = None ! self.current_folder_readonly = None ! ! def login(self, uid, pw): ! try: ! imaplib.IMAP4.login(self, uid, pw) # superclass login ! except imaplib.IMAP4.error, e: ! if str(e) == "permission denied": ! print "There was an error logging in to the IMAP server." ! print "The userid and/or password may be incorrect." ! sys.exit() ! else: ! raise ! ! def logout(self, expunge): ! # sign off ! if expunge: ! self.expunge() ! imaplib.IMAP4.logout(self) # superclass logout ! ! def SelectFolder(self, folder, readOnly=True, force=False): ! '''A method to point ensuing imap operations at a target folder''' ! ! if self.current_folder != folder or \ ! self.current_folder_readonly != readOnly or force: ! response = self.select(folder, readOnly) ! if response[0] != "OK": ! print "Invalid response to %s:\n%s" % (command, response) ! sys.exit(-1) ! 
self.current_folder = folder ! self.current_folder_readonly = readOnly ! return response class IMAPMessage(message.SBHeaderMessage): ! def __init__(self, folder, id): message.Message.__init__(self) *************** *** 128,141 **** def extractTime(self): # When we create a new copy of a message, we need to specify ! # a timestamp for the message. Ideally, this would be the ! # timestamp from the message itself, but for the moment, we ! # just use the current time. try: ! return imaplib.Time2Internaldate(time.mktime(parsedate(self["Date"]))) except KeyError: return imaplib.Time2Internaldate(time.time()) def MoveTo(self, dest): ! # The move just changes where we think we are, # and we do an actual move on save (to avoid doing # this more than once) --- 159,172 ---- def extractTime(self): # When we create a new copy of a message, we need to specify ! # a timestamp for the message. If the message has a date header ! # we use that. Otherwise, we use the current time. try: ! return imaplib.Time2Internaldate(\ ! time.mktime(parsedate(self["Date"]))) except KeyError: return imaplib.Time2Internaldate(time.time()) def MoveTo(self, dest): ! # This move operation just changes where we think we are, # and we do an actual move on save (to avoid doing # this more than once) *************** *** 148,160 **** # so what we do is create a new message and delete the old one time_stamp = self.extractTime() response = imap.append(self.folder.name, None, ! time_stamp, self.as_string()) self._check(response, 'append') old_id = self.id if self.previous_folder is None: ! self.folder.Select(False) else: ! self.previous_folder.Select(False) self.previous_folder = None response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)") --- 179,193 ---- # so what we do is create a new message and delete the old one time_stamp = self.extractTime() + msgstr = re.sub('([^\r])\n', r'\1\r\n', self.as_string()) + response = imap.append(self.folder.name, None, ! 
time_stamp, msgstr) self._check(response, 'append') old_id = self.id if self.previous_folder is None: ! imap.SelectFolder(self.folder.name, False) else: ! imap.SelectFolder(self.previous_folder.name, False) self.previous_folder = None response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)") *************** *** 165,169 **** # XXX changed, as the message to be deleted will be found first # XXX (if they are in the same folder) ! self.folder.Select(True) #response = imap.uid("SEARCH", "TEXT", self.as_string()) #self._check(response, 'search') --- 198,202 ---- # XXX changed, as the message to be deleted will be found first # XXX (if they are in the same folder) ! imap.SelectFolder(self.folder.name, True) #response = imap.uid("SEARCH", "TEXT", self.as_string()) #self._check(response, 'search') *************** *** 184,187 **** --- 217,224 ---- self.name = folder_name + def __cmp__(self, obj): + '''Two folders are equal if their names are equal''' + return cmp(self.name, obj.name) + def _check(self, response, command): if response[0] != "OK": *************** *** 200,204 **** '''Returns uids for all the messages in the folder''' # request message range ! response = Select(self.name, True, True) total_messages = response[1][0] if total_messages == '0': --- 237,241 ---- '''Returns uids for all the messages in the folder''' # request message range ! response = imap.SelectFolder(self.name, True, True) total_messages = response[1][0] if total_messages == '0': *************** *** 216,220 **** '''Return message matching the given uid''' global rfc822_command ! Select(self.name, True) # We really want to use RFC822.PEEK here, as that doesn't effect # the status of the message. Unfortunately, it appears that not --- 253,257 ---- '''Return message matching the given uid''' global rfc822_command ! imap.SelectFolder(self.name, True) # We really want to use RFC822.PEEK here, as that doesn't effect # the status of the message. 
Unfortunately, it appears that not *************** *** 233,240 **** return msg ! ! def Select(self, readOnly): ! return Select(self.name, readOnly) ! def Train(self, classifier, isSpam): '''Train folder as spam/ham''' --- 270,274 ---- return msg ! def Train(self, classifier, isSpam): '''Train folder as spam/ham''' *************** *** 249,253 **** msg.RememberTrained(None) ! if msg.GetTrained() is not None: classifier.learn(msg.asTokens(), isSpam) num_trained += 1 --- 283,287 ---- msg.RememberTrained(None) ! if msg.GetTrained() is None: classifier.learn(msg.asTokens(), isSpam) num_trained += 1 *************** *** 276,285 **** class IMAPFilter(object): def __init__(self, classifier, debug): - global imap - imap = imaplib.IMAP4(options.imap_server, options.imap_port) - imap.debug = imapDebug - - self.Login(options.imap_username, options.imap_password) - self.spam_folder = IMAPFolder(options.imap_spam_folder) self.unsure_folder = IMAPFolder(options.imap_unsure_folder) --- 310,313 ---- *************** *** 320,340 **** if options.verbose: print "Filtering took", time.time() - t, "seconds." ! ! def Login(self, uid, pw): ! try: ! lgn = imap.login(uid, pw) ! except imaplib.IMAP4.error, e: ! if str(e) == "permission denied": ! print "There was an error logging in to the IMAP server." ! print "The userid and/or password may be incorrect." ! sys.exit() ! else: ! raise ! ! def Logout(self, expunge): ! # sign off ! if expunge: ! imap.expunge() ! imap.logout() --- 348,352 ---- if options.verbose: print "Filtering took", time.time() - t, "seconds." ! *************** *** 342,346 **** try: ! opts, args = getopt.getopt(sys.argv[1:], 'htcvl:e:i:d:D:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ --- 354,358 ---- try: ! 
opts, args = getopt.getopt(sys.argv[1:], 'htcvpl:e:i:d:D:') except getopt.error, msg: print >>sys.stderr, str(msg) + '\n\n' + __doc__ *************** *** 354,357 **** --- 366,370 ---- imapDebug = 0 sleepTime = 0 + promptForPass = 0 for opt, arg in opts: *************** *** 367,370 **** --- 380,385 ---- elif opt == '-t': doTrain = True + elif opt == '-p': + promptForPass = 1 elif opt == '-c': doClassify = True *************** *** 385,388 **** --- 400,413 ---- sys.exit() + imap = IMAPSession(options.imap_server, options.imap_port, \ + imapDebug) + + if promptForPass: + pwd = getpass() + else: + pwd = options.imap_password + + imap.login(options.imap_username, pwd) + bdbname = os.path.expanduser(bdbname) *************** *** 415,417 **** break ! imap_filter.Logout(doExpunge) --- 440,442 ---- break ! imap.logout(doExpunge) From timstone4 at users.sourceforge.net Thu Apr 17 13:54:14 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Thu Apr 17 15:54:18 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.15,1.16 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv30858 Modified Files: imapfilter.py Log Message: 1. Corrected an error in the timed loop, that kept an imap session open while the filter was sleeping Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** imapfilter.py 17 Apr 2003 15:47:20 -0000 1.15 --- imapfilter.py 17 Apr 2003 19:54:10 -0000 1.16 *************** *** 400,406 **** sys.exit() - imap = IMAPSession(options.imap_server, options.imap_port, \ - imapDebug) - if promptForPass: pwd = getpass() --- 400,403 ---- *************** *** 408,413 **** pwd = options.imap_password - imap.login(options.imap_username, pwd) - bdbname = os.path.expanduser(bdbname) --- 405,408 ---- *************** *** 423,429 **** --- 418,429 ---- print "Done." 
+ imap = IMAPSession(options.imap_server, options.imap_port, \ + imapDebug) + imap_filter = IMAPFilter(classifier, imapDebug) while True: + imap.login(options.imap_username, pwd) + if doTrain: if options.verbose: *************** *** 435,442 **** imap_filter.Filter() if sleepTime: time.sleep(sleepTime) else: break - - imap.logout(doExpunge) --- 435,442 ---- imap_filter.Filter() + imap.logout(doExpunge) + if sleepTime: time.sleep(sleepTime) else: break From timstone4 at users.sourceforge.net Thu Apr 17 15:14:57 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Thu Apr 17 17:15:01 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.16,1.17 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv29664 Modified Files: imapfilter.py Log Message: Removed debug from IMAPFilter constructor. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** imapfilter.py 17 Apr 2003 19:54:10 -0000 1.16 --- imapfilter.py 17 Apr 2003 21:14:53 -0000 1.17 *************** *** 309,313 **** class IMAPFilter(object): ! def __init__(self, classifier, debug): self.spam_folder = IMAPFolder(options.imap_spam_folder) self.unsure_folder = IMAPFolder(options.imap_unsure_folder) --- 309,313 ---- class IMAPFilter(object): ! def __init__(self, classifier): self.spam_folder = IMAPFolder(options.imap_spam_folder) self.unsure_folder = IMAPFolder(options.imap_unsure_folder) *************** *** 421,425 **** imapDebug) ! imap_filter = IMAPFilter(classifier, imapDebug) while True: --- 421,425 ---- imapDebug) ! 
imap_filter = IMAPFilter(classifier) while True: From anadelonbrin at users.sourceforge.net Thu Apr 17 17:49:43 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Thu Apr 17 19:49:46 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv23128/spambayes Modified Files: message.py Log Message: Documentation fixes/additions. Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** message.py 13 Apr 2003 12:54:02 -0000 1.9 --- message.py 17 Apr 2003 23:49:41 -0000 1.10 *************** *** 188,192 **** # may change, which would really screw this database up ! # an unrecoginzed string here is interpreted as unsure. Should # that condition actually raise an exception instead? --- 188,192 ---- # may change, which would really screw this database up ! # an unrecognized string here is interpreted as unsure. Should # that condition actually raise an exception instead? From anadelonbrin at users.sourceforge.net Thu Apr 17 17:49:43 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Thu Apr 17 19:49:48 2003 Subject: [Spambayes-checkins] spambayes FAQ.txt, 1.2, 1.3 imapfilter.py, 1.17, 1.18 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv23128 Modified Files: FAQ.txt imapfilter.py Log Message: Documentation fixes/additions. 
Index: FAQ.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/FAQ.txt,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** FAQ.txt 7 Apr 2003 00:53:45 -0000 1.2 --- FAQ.txt 17 Apr 2003 23:49:40 -0000 1.3 *************** *** 40,41 **** --- 40,53 ---- you what this file is called (and where it is) down towards the bottom of the page. + + Q: This software is great! I want to implement it for all my users. + Are there plans to develop a server-side spambayes solution? + A: The problem with a server-side solution is that everyone has a + different idea of what is spam - that's the whole strength of the + bayesian-style filtering concept. If you are certain that *all* + of your users would agree on what is spam and what is not, then + this might work for you, but otherwise you really have to have + individual databases for each user. Either way, you should be + able to modify spambayes easily enough to fit into your setup. + Please let the list know if you do have success in this area, and + we'll update this answer. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** imapfilter.py 17 Apr 2003 21:14:53 -0000 1.17 --- imapfilter.py 17 Apr 2003 23:49:40 -0000 1.18 *************** *** 36,40 **** Train Spam and Ham only, with pickled database imapfilter -t -d bayes.db ! To Do: o Find a better way to remove old msg from info database when saving --- 36,58 ---- Train Spam and Ham only, with pickled database imapfilter -t -d bayes.db ! ! Warnings: ! o This is very alpha. The filter is currently being developed and ! tested. We do *not* recommend using it on a production system unless ! you are confident that you can get your mail back if you lose it. On ! the other hand, we do recommend that you test it for us and let us ! 
know if anything does go wrong. Once this appears in a release, ! rather than just cvs, you can feel a *little* more confident ! about using it. ! o By default, the filter does *not* delete, modify or move any of your ! mail. Due to quirks in how imap works, new versions of your mail are ! modified and placed in new folders, but there originals are still ! available. These are flagged with the /Deleted flag so that you know ! that they can be removed. Your mailer may not show these messages ! by default, but there should be an option to do so. *However*, if ! your mailer automatically purges/expunges (i.e. permantently deletes) ! mail flagged as such, *or* if you set the imap_expunge option to ! True, then this mail will be irretrievably lost. ! To Do: o Find a better way to remove old msg from info database when saving *************** *** 62,67 **** # Foundation license. ! __author__ = "Tony Meyer " ! __credits__ = "Tim Stone, All the Spambayes folk." from __future__ import generators --- 80,85 ---- # Foundation license. ! __author__ = "Tony Meyer , Tim Stone" ! __credits__ = "All the Spambayes folk." from __future__ import generators *************** *** 94,108 **** rfc822_command = "(RFC822.PEEK)" - # For efficiency, we remember which folder we are currently - # in, and only send a select command to the IMAP server if - # we want to *change* folders. This function is used by - # both IMAPMessage and IMAPFolder. - # Occaisionally, we need to force a command, because we - # are interested in the response. Things would be much - # nicer if we cached this information somewhere. 
- # XXX If we wanted to be nice and tidy, this really belongs - # XXX in an IMAPUtilities class, or something like that, - # XXX or something like this: - class IMAPSession(imaplib.IMAP4): '''A class extending the IMAP4 class, with a few optimizations''' --- 112,115 ---- *************** *** 111,114 **** --- 118,125 ---- imaplib.Debug = debug # this is a global in the imaplib module imaplib.IMAP4.__init__(self, server, port) + # For efficiency, we remember which folder we are currently + # in, and only send a select command to the IMAP server if + # we want to *change* folders. This function is used by + # both IMAPMessage and IMAPFolder. self.current_folder = None self.current_folder_readonly = None *************** *** 136,139 **** --- 147,153 ---- if self.current_folder != folder or \ self.current_folder_readonly != readOnly or force: + # Occasionally, we need to force a command, because we + # are interested in the response. Things would be much + # nicer if we cached this information somewhere. response = self.select(folder, readOnly) if response[0] != "OK": *************** *** 348,356 **** if options.verbose: print "Filtering took", time.time() - t, "seconds." - ! if __name__ == '__main__': ! try: opts, args = getopt.getopt(sys.argv[1:], 'htcvpl:e:i:d:D:') --- 362,368 ---- if options.verbose: print "Filtering took", time.time() - t, "seconds." ! def run(): try: opts, args = getopt.getopt(sys.argv[1:], 'htcvpl:e:i:d:D:') *************** *** 381,385 **** doTrain = True elif opt == '-p': ! promptForPass = 1 elif opt == '-c': doClassify = True --- 393,397 ---- doTrain = True elif opt == '-p': ! 
promptForPass = True elif opt == '-c': doClassify = True *************** *** 441,442 **** --- 453,457 ---- else: break + + if __name__ == '__main__': + run() \ No newline at end of file From anadelonbrin at users.sourceforge.net Thu Apr 17 17:55:31 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Thu Apr 17 19:55:34 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.18,1.19 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv25118 Modified Files: imapfilter.py Log Message: Opps. That last commit broke it. Here's the fix. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -d -r1.18 -r1.19 *** imapfilter.py 17 Apr 2003 23:49:40 -0000 1.18 --- imapfilter.py 17 Apr 2003 23:55:28 -0000 1.19 *************** *** 51,55 **** that they can be removed. Your mailer may not show these messages by default, but there should be an option to do so. *However*, if ! your mailer automatically purges/expunges (i.e. permantently deletes) mail flagged as such, *or* if you set the imap_expunge option to True, then this mail will be irretrievably lost. --- 51,55 ---- that they can be removed. Your mailer may not show these messages by default, but there should be an option to do so. *However*, if ! your mailer automatically purges/expunges (i.e. permanently deletes) mail flagged as such, *or* if you set the imap_expunge option to True, then this mail will be irretrievably lost. *************** *** 193,197 **** # so what we do is create a new message and delete the old one time_stamp = self.extractTime() ! msgstr = re.sub('([^\r])\n', r'\1\r\n', self.as_string()) response = imap.append(self.folder.name, None, --- 193,198 ---- # so what we do is create a new message and delete the old one time_stamp = self.extractTime() ! 
#msgstr = re.sub('([^\r])\n', r'\1\r\n', self.as_string()) ! msgstr = self.as_string() response = imap.append(self.folder.name, None, *************** *** 365,368 **** --- 366,370 ---- def run(): + global imap try: opts, args = getopt.getopt(sys.argv[1:], 'htcvpl:e:i:d:D:') From anadelonbrin at users.sourceforge.net Thu Apr 17 18:00:20 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Thu Apr 17 20:00:25 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.19,1.20 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv26622 Modified Files: imapfilter.py Log Message: Last commit included testing code. Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** imapfilter.py 17 Apr 2003 23:55:28 -0000 1.19 --- imapfilter.py 18 Apr 2003 00:00:13 -0000 1.20 *************** *** 193,198 **** # so what we do is create a new message and delete the old one time_stamp = self.extractTime() ! #msgstr = re.sub('([^\r])\n', r'\1\r\n', self.as_string()) ! msgstr = self.as_string() response = imap.append(self.folder.name, None, --- 193,197 ---- # so what we do is create a new message and delete the old one time_stamp = self.extractTime() ! 
msgstr = re.sub('([^\r])\n', r'\1\r\n', self.as_string()) response = imap.append(self.folder.name, None, From timstone4 at users.sourceforge.net Thu Apr 17 18:56:10 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Thu Apr 17 20:56:13 2003 Subject: [Spambayes-checkins] spambayes imapfilter.py,1.20,1.21 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv12325 Modified Files: imapfilter.py Log Message: Moved the header repair regex into the message class Index: imapfilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** imapfilter.py 18 Apr 2003 00:00:13 -0000 1.20 --- imapfilter.py 18 Apr 2003 00:56:06 -0000 1.21 *************** *** 192,200 **** # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one - time_stamp = self.extractTime() - msgstr = re.sub('([^\r])\n', r'\1\r\n', self.as_string()) - response = imap.append(self.folder.name, None, ! time_stamp, msgstr) self._check(response, 'append') --- 192,197 ---- # we can't actually update the message with IMAP # so what we do is create a new message and delete the old one response = imap.append(self.folder.name, None, ! 
self.extractTime(), self.as_string()) self._check(response, 'append') From timstone4 at users.sourceforge.net Thu Apr 17 18:56:22 2003 From: timstone4 at users.sourceforge.net (Tim Stone) Date: Thu Apr 17 20:56:26 2003 Subject: [Spambayes-checkins] spambayes/spambayes message.py,1.10,1.11 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv12424 Modified Files: message.py Log Message: Moved the header repair regex into the message class Index: message.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** message.py 17 Apr 2003 23:49:41 -0000 1.10 --- message.py 18 Apr 2003 00:56:20 -0000 1.11 *************** *** 88,91 **** --- 88,92 ---- import sys import types + import re import email.Message *************** *** 169,172 **** --- 170,180 ---- def asTokens(self): return tokenize(self.as_string()) + + def as_string(self): + # This override is currently needed because of an apparent bug + # in the email package, where header lines are not properly + # terminated with \r\n + return re.sub('([^\r])\n', r'\1\r\n', \ + email.Message.Message.as_string(self)) def modified(self): From anadelonbrin at users.sourceforge.net Thu Apr 17 20:36:13 2003 From: anadelonbrin at users.sourceforge.net (Tony Meyer) Date: Thu Apr 17 22:36:19 2003 Subject: [Spambayes-checkins] spambayes/spambayes CompatConfigParser.py, NONE, 1.1 UpdatableConfigParser.py, 1.1, 1.2 Message-ID: Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv9543/spambayes Modified Files: UpdatableConfigParser.py Added Files: CompatConfigParser.py Log Message: A much improved ConfigParser (from Python 2.3). Updates UpdatableConfigParser to use this (this module is still not used by any spambayes module, but stay tuned!) 
--- NEW FILE: CompatConfigParser.py --- from __future__ import generators # kludge for Python pre 2.3 """Configuration file parser. A setup file consists of sections, led by a "[section]" header, and followed by "name: value" entries, with continuations and such in the style of RFC 822. The option values can contain format strings which refer to other values in the same section, or values in a special [DEFAULT] section. For example: something: %(dir)s/whatever would resolve the "%(dir)s" to the value of dir. All reference expansions are done late, on demand. Intrinsic defaults can be specified by passing them into the ConfigParser constructor as a dictionary. class: ConfigParser -- responsible for parsing a list of configuration files, and managing the parsed database. methods: __init__(defaults=None) create the parser and specify a dictionary of intrinsic defaults. The keys must be strings, the values must be appropriate for %()s string interpolation. Note that `__name__' is always an intrinsic default; its value is the section's name. sections() return all the configuration section names, sans DEFAULT has_section(section) return whether the given section exists has_option(section, option) return whether the given option exists in the given section options(section) return list of configuration options for the named section read(filenames) read and parse the list of named configuration files, given by name. A single filename is also allowed. Non-existing files are ignored. readfp(fp, filename=None) read and parse one configuration file, given as a file object. The filename defaults to fp.name; it is only used in error messages (if fp has no `name' attribute, the string `' is used). get(section, option, raw=False, vars=None) return a string value for the named option. All % interpolations are expanded in the return values, based on the defaults passed into the constructor and the DEFAULT section.
Additional substitutions may be provided using the `vars' argument, which must be a dictionary whose contents override any pre-existing defaults. getint(section, options) like get(), but convert value to an integer getfloat(section, options) like get(), but convert value to a float getboolean(section, options) like get(), but convert value to a boolean (currently case insensitively defined as 0, false, no, off for False, and 1, true, yes, on for True). Returns False or True. items(section, raw=False, vars=None) return a list of tuples with (name, value) for each option in the section. remove_section(section) remove the given file section and all its options remove_option(section, option) remove the given option from the given section set(section, option, value) set the given option write(fp) write the configuration state in .ini format """ import re __all__ = ["NoSectionError", "DuplicateSectionError", "NoOptionError", "InterpolationError", "InterpolationDepthError", "InterpolationSyntaxError", "ParsingError", "MissingSectionHeaderError", "ConfigParser", "DEFAULTSECT", "MAX_INTERPOLATION_DEPTH"] DEFAULTSECT = "DEFAULT" MAX_INTERPOLATION_DEPTH = 10 # exception classes class Error(Exception): """Base class for ConfigParser exceptions.""" def __init__(self, msg=''): self.message = msg Exception.__init__(self, msg) def __repr__(self): return self.message __str__ = __repr__ class NoSectionError(Error): """Raised when no section matches a requested option.""" def __init__(self, section): Error.__init__(self, 'No section: ' + `section`) self.section = section class DuplicateSectionError(Error): """Raised when a section is multiply-created.""" def __init__(self, section): Error.__init__(self, "Section %r already exists" % section) self.section = section class NoOptionError(Error): """A requested option was not found.""" def __init__(self, option, section): Error.__init__(self, "No option %r in section: %r" % (option, section)) self.option = option self.section = section 
class InterpolationError(Error): """Base class for interpolation-related exceptions.""" def __init__(self, option, section, msg): Error.__init__(self, msg) self.option = option self.section = section class InterpolationMissingOptionError(InterpolationError): """A string substitution required a setting which was not available.""" def __init__(self, option, section, rawval, reference): msg = ("Bad value substitution:\n" "\tsection: [%s]\n" "\toption : %s\n" "\tkey : %s\n" "\trawval : %s\n" % (section, option, reference, rawval)) InterpolationError.__init__(self, option, section, msg) self.reference = reference class InterpolationSyntaxError(InterpolationError): """Raised when the source text into which substitutions are made does not conform to the required syntax.""" class InterpolationDepthError(InterpolationError): """Raised when substitutions are nested too deeply.""" def __init__(self, option, section, rawval): msg = ("Value interpolation too deeply recursive:\n" "\tsection: [%s]\n" "\toption : %s\n" "\trawval : %s\n" % (section, option, rawval)) InterpolationError.__init__(self, option, section, msg) class ParsingError(Error): """Raised when a configuration file does not follow legal syntax.""" def __init__(self, filename): Error.__init__(self, 'File contains parsing errors: %s' % filename) self.filename = filename self.errors = [] def append(self, lineno, line): self.errors.append((lineno, line)) self.message += '\n\t[line %2d]: %s' % (lineno, line) class MissingSectionHeaderError(ParsingError): """Raised when a key-value pair is found before any section header.""" def __init__(self, filename, lineno, line): Error.__init__( self, 'File contains no section headers.\nfile: %s, line: %d\n%s' % (filename, lineno, line)) self.filename = filename self.lineno = lineno self.line = line class RawConfigParser: def __init__(self, defaults=None): self._sections = {} if defaults is None: self._defaults = {} else: self._defaults = defaults def defaults(self): return 
self._defaults def sections(self): """Return a list of section names, excluding [DEFAULT]""" # self._sections will never have [DEFAULT] in it return self._sections.keys() def add_section(self, section): """Create a new section in the configuration. Raise DuplicateSectionError if a section by the specified name already exists. """ if section in self._sections: raise DuplicateSectionError(section) self._sections[section] = {} def has_section(self, section): """Indicate whether the named section is present in the configuration. The DEFAULT section is not acknowledged. """ return section in self._sections def options(self, section): """Return a list of option names for the given section name.""" try: opts = self._sections[section].copy() except KeyError: raise NoSectionError(section) opts.update(self._defaults) if '__name__' in opts: del opts['__name__'] return opts.keys() def read(self, filenames): """Read and parse a filename or a list of filenames. Files that cannot be opened are silently ignored; this is designed so that you can specify a list of potential configuration file locations (e.g. current directory, user's home directory, systemwide directory), and all existing configuration files in the list will be read. A single filename may also be given. """ # we kludge this for Python pre 2.3 if isinstance(filenames, type("basestring")): filenames = [filenames] for filename in filenames: try: fp = open(filename) except IOError: continue self._read(fp, filename) fp.close() def readfp(self, fp, filename=None): """Like read() but the argument must be a file-like object. The `fp' argument must have a `readline' method. Optional second argument is the `filename', which if not given, is taken from fp.name. If fp has no `name' attribute, `' is used. 
""" if filename is None: try: filename = fp.name except AttributeError: filename = '' self._read(fp, filename) def get(self, section, option): opt = self.optionxform(option) if section not in self._sections: if section != DEFAULTSECT: raise NoSectionError(section) if opt in self._defaults: return self._defaults[opt] else: raise NoOptionError(option, section) elif opt in self._sections[section]: return self._sections[section][opt] elif opt in self._defaults: return self._defaults[opt] else: raise NoOptionError(option, section) def items(self, section): try: d2 = self._sections[section] except KeyError: if section != DEFAULTSECT: raise NoSectionError(section) d2 = {} d = self._defaults.copy() d.update(d2) if "__name__" in d: del d["__name__"] return d.items() def _get(self, section, conv, option): return conv(self.get(section, option)) def getint(self, section, option): return self._get(section, int, option) def getfloat(self, section, option): return self._get(section, float, option) _boolean_states = {'1': True, 'yes': True, 'true': True, 'on': True, '0': False, 'no': False, 'false': False, 'off': False} def getboolean(self, section, option): v = self.get(section, option) if v.lower() not in self._boolean_states: raise ValueError, 'Not a boolean: %s' % v return self._boolean_states[v.lower()] def optionxform(self, optionstr): return optionstr.lower() def has_option(self, section, option): """Check for the existence of a given option in a given section.""" if not section or section == DEFAULTSECT: option = self.optionxform(option) return option in self._defaults elif section not in self._sections: return False else: option = self.optionxform(option) return (option in self._sections[section] or option in self._defaults) def set(self, section, option, value): """Set an option.""" if not section or section == DEFAULTSECT: sectdict = self._defaults else: try: sectdict = self._sections[section] except KeyError: raise NoSectionError(section) 
sectdict[self.optionxform(option)] = value def write(self, fp): """Write an .ini-format representation of the configuration state.""" if self._defaults: fp.write("[%s]\n" % DEFAULTSECT) for (key, value) in self._defaults.items(): fp.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t'))) fp.write("\n") for section in self._sections: fp.write("[%s]\n" % section) for (key, value) in self._sections[section].items(): if key != "__name__": fp.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t'))) fp.write("\n") def remove_option(self, section, option): """Remove an option.""" if not section or section == DEFAULTSECT: sectdict = self._defaults else: try: sectdict = self._sections[section] except KeyError: raise NoSectionError(section) option = self.optionxform(option) existed = option in sectdict if existed: del sectdict[option] return existed def remove_section(self, section): """Remove a file section.""" existed = section in self._sections if existed: del self._sections[section] return existed # # Regular expressions for parsing section headers and options. # SECTCRE = re.compile( r'\[' # [ r'(?P

        [^]]+)' # very permissive! r'\]' # ] ) OPTCRE = re.compile( r'(?P