[Spambayes-checkins] spambayes tokenizer.py,1.67,1.68
Tim Peters
tim_one@users.sourceforge.net
Wed Nov 13 06:25:10 2002
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv2039a
Modified Files:
tokenizer.py
Log Message:
More refinements of address-header tokenization. In particular, it
now generates "no real name" log-count tokens, which are strong
spam clues in my data.
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.67
retrieving revision 1.68
diff -C2 -d -r1.67 -r1.68
*** tokenizer.py 12 Nov 2002 23:16:04 -0000 1.67
--- tokenizer.py 13 Nov 2002 06:25:08 -0000 1.68
***************
*** 1081,1097 ****
if not addrlist:
yield field + ":none"
! for addrs in addrlist:
! for rname,ename in email.Utils.getaddresses([addrs]):
! if rname:
! for rname,rcharset in email.Header.decode_header(rname):
! for w in rname.lower().split():
! for t in tokenize_word(w):
! yield field+'realname:'+t
! if rcharset is not None:
! yield field+'charset:'+rcharset
! if ename:
! for w in ename.lower().split('@'):
! for t in tokenize_word(w):
! yield field+'email:'+t
# To:
# Cc:
--- 1081,1105 ----
if not addrlist:
yield field + ":none"
! continue
!
! noname_count = 0
! for name, addr in email.Utils.getaddresses(addrlist):
! if name:
! for name, charset in email.Header.decode_header(name):
! yield "%s:name:%s" % (field, name.lower())
! if charset is not None:
! yield "%s:charset:%s" % (field, charset)
! else:
! noname_count += 1
! if addr:
! for w in addr.lower().split('@'):
! yield "%s:addr:%s" % (field, w)
! else:
! yield field + ":addr:none"
!
! if noname_count:
! yield "%s:no real name:2**%d" % (field,
! round(log2(noname_count)))
!
# To:
# Cc:
More information about the Spambayes-checkins
mailing list