[Spambayes-checkins] spambayes Options.py,1.79,1.80
tokenizer.py,1.70,1.71
Skip Montanaro
montanaro at users.sourceforge.net
Tue Dec 10 20:57:26 EST 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv5498
Modified Files:
Options.py tokenizer.py
Log Message:
New option summarize_email_prefixes attempts to capitalize on the fact that
spam is often sent to multiple similar addresses.
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.79
retrieving revision 1.80
diff -C2 -d -r1.79 -r1.80
*** Options.py 3 Dec 2002 20:11:13 -0000 1.79
--- Options.py 11 Dec 2002 04:57:24 -0000 1.80
***************
*** 105,108 ****
--- 105,111 ----
generate_long_skips: True
+ # Try to capitalize on mail sent to multiple similar addresses.
+ summarize_email_prefixes: False
+
#
# Length of words that triggers 'long skips'. Longer than this
***************
*** 390,393 ****
--- 393,397 ----
'record_header_absence': boolean_cracker,
'generate_long_skips': boolean_cracker,
+ 'summarize_email_prefixes': boolean_cracker,
'skip_max_word_size': int_cracker,
'extract_dow': boolean_cracker,
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.70
retrieving revision 1.71
diff -C2 -d -r1.70 -r1.71
*** tokenizer.py 24 Nov 2002 07:41:03 -0000 1.70
--- tokenizer.py 11 Dec 2002 04:57:24 -0000 1.71
***************
*** 12,15 ****
--- 12,16 ----
import math
import time
+ import os
from sets import Set
***************
*** 1136,1139 ****
--- 1137,1175 ----
yield "%s:no real name:2**%d" % (field,
round(log2(noname_count)))
+
+ # Spammers sometimes send out mail alphabetically to fairly large
+ # numbers of addresses. This results in headers like:
+ # To: <itinerart@videotron.ca>
+ # Cc: <itinerant@skyful.com>, <itinerant@netillusions.net>,
+ # <itineraries@musi-cal.com>, <itinerario@rullet.leidenuniv.nl>,
+ # <itinerance@sorengo.com>
+ #
+ # This token attempts to exploit that property. The above would
+ # give a common prefix of "itinera" for 6 addresses, yielding a
+ # gross score of 7 * 6 = 42. We bucket scores by integer-dividing by
+ # 10, giving 4 here; buckets above 3 are collapsed, so the final
+ # token is "pfxlen:big". The len(all_addrs) > 1 test eliminates the
+ # bad case where the message was sent to a single individual.
+ if options.summarize_email_prefixes:
+ all_addrs = []
+ addresses = msg.get_all('to', []) + msg.get_all('cc', [])
+ for name, addr in email.Utils.getaddresses(addresses):
+ all_addrs.append(addr.lower())
+
+ if len(all_addrs) > 1:
+ # don't be fooled by "os.path." - commonprefix
+ # operates char-by-char!
+ pfx = os.path.commonprefix(all_addrs)
+ if pfx:
+ score = (len(pfx) * len(all_addrs)) // 10
+ # After staring at pfxlen:* values generated from a large
+ # number of ham & spam I saw that any scores greater
+ # than 3 were always associated with spam. Collapsing
+ # all such scores into a single token avoids a bunch of
+ # hapaxes like "pfxlen:28".
+ if score > 3:
+ yield "pfxlen:big"
+ else:
+ yield "pfxlen:%d" % score
# To:
More information about the Spambayes-checkins
mailing list