[Spambayes-checkins] spambayes/contrib pycksum.py,1.1,1.2
Skip Montanaro
montanaro at users.sourceforge.net
Fri Aug 18 04:29:05 CEST 2006
Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv16513
Modified Files:
pycksum.py
Log Message:
* Try to improve the duplicate detection capability. Lots of spam nowadays
has random text junk, so be more lenient about how many chunks have to
match. Also do a little more filtering on the source:
- Compress multiple spaces and tabs to a single space
- Compress multiple contiguous newlines into one
- Map all strings of digits to a single "#" character
- Map some common html entities to their plain text equivalents.
* Use md5 checksum hexdigests instead of binascii.b2a_hex.
* Correct line breaking of filtered body.
* Use email.generator to flatten body instead of the broken flatten()
function.
Index: pycksum.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/pycksum.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** pycksum.py 25 May 2004 14:58:39 -0000 1.1
--- pycksum.py 18 Aug 2006 02:29:02 -0000 1.2
***************
*** 39,60 ****
import sys
import email.Parser
import md5
import anydbm
import re
import time
! import binascii
!
! def flatten(body):
! # three types are possible: list, string, Message
! if isinstance(body, str):
! return body
! if hasattr(body, "get_payload"):
! payload = body.get_payload()
! if payload is None:
! return ""
! return flatten(payload)
! if isinstance(body, list):
! return "\n".join([flatten(b) for b in body])
! raise TypeError, ("unrecognized body type: %s" % type(body))
def clean(data):
--- 39,51 ----
import sys
import email.Parser
+ import email.generator
import md5
import anydbm
import re
import time
! try:
! import cStringIO as StringIO
! except ImportError:
! import StringIO
def clean(data):
***************
*** 67,74 ****
data = re.sub(r"<[^>]*>", "", data).lower()
# delete anything which looks like a url or email address
# not sure what a pmguid: url is but it seems to occur frequently in spam
# also convert all runs of whitespace into a single space
! return " ".join([w for w in data.split()
if ('@' not in w and
(':' not in w or
--- 58,78 ----
data = re.sub(r"<[^>]*>", "", data).lower()
+ # Map all digits to '#'
+ data = re.sub(r"[0-9]+", "#", data)
+
+ # Map a few common html entities
+ data = re.sub(r"(&nbsp;)+", " ", data)
+ data = re.sub(r"&lt;", "<", data)
+ data = re.sub(r"&gt;", ">", data)
+ data = re.sub(r"&amp;", "&", data)
+
+ # Elide blank lines and multiple horizontal whitespace
+ data = re.sub(r"\n+", "\n", data)
+ data = re.sub(r"[ \t]+", " ", data)
+
# delete anything which looks like a url or email address
# not sure what a pmguid: url is but it seems to occur frequently in spam
# also convert all runs of whitespace into a single space
! return " ".join([w for w in data.split(" ")
if ('@' not in w and
(':' not in w or
***************
*** 87,97 ****
# separately or in various combinations if desired.
! body = flatten(msg)
! lines = clean(body)
chunksize = len(lines)//4+1
sum = []
for i in range(4):
chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
! sum.append(binascii.b2a_hex(md5.new(chunk).digest()))
return ".".join(sum)
--- 91,105 ----
# separately or in various combinations if desired.
! fp = StringIO.StringIO()
! g = email.generator.Generator(fp, mangle_from_=False, maxheaderlen=60)
! g.flatten(msg)
! text = fp.getvalue()
! body = text.split("\n\n", 1)[1]
! lines = clean(body).split("\n")
chunksize = len(lines)//4+1
sum = []
for i in range(4):
chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
! sum.append(md5.new(chunk).hexdigest())
return ".".join(sum)
***************
*** 102,111 ****
db = anydbm.open(f, "c")
maxdblen = 2**14
! # consider the first three pieces, the last three pieces and the middle
! # two pieces - one or more will likely eliminate attempts at disrupting
! # the checksum - if any are found in the db file, call it a match
! for subsum in (".".join(pieces[:-1]),
".".join(pieces[1:-1]),
! ".".join(pieces[1:])):
if not db.has_key(subsum):
db[subsum] = str(time.time())
--- 110,119 ----
db = anydbm.open(f, "c")
maxdblen = 2**14
! # consider the first two pieces, the middle two pieces and the last two
! # pieces - one or more will likely eliminate attempts at disrupting the
! # checksum - if any are found in the db file, call it a match
! for subsum in (".".join(pieces[:-2]),
".".join(pieces[1:-1]),
! ".".join(pieces[2:])):
if not db.has_key(subsum):
db[subsum] = str(time.time())
***************
*** 155,157 ****
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
-
--- 163,164 ----
More information about the Spambayes-checkins
mailing list