[Python-checkins] python/nondist/sandbox/spambayes timtest.py,1.8,1.9
tim_one@users.sourceforge.net
Mon, 02 Sep 2002 00:55:27 -0700
Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv17809
Modified Files:
timtest.py
Log Message:
Some comment changes and nesting reduction.
Index: timtest.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/timtest.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** timtest.py 2 Sep 2002 01:18:17 -0000 1.8
--- timtest.py 2 Sep 2002 07:55:25 -0000 1.9
***************
*** 191,194 ****
--- 191,197 ----
# A long word. If there are any high-bit chars,
# tokenize it as byte 5-grams.
+ # XXX This really won't work for high-bit languages -- the scoring
+ # XXX scheme throws almost everything away, and one bad phrase can
+ # XXX generate enough bad 5-grams to dominate the final score.
if has_highbit_char(word):
prefix += "5g:"
***************
*** 229,245 ****
# Subject:
# Don't ignore case in Subject lines; e.g., 'free' versus 'FREE' is
! # especially significant in this context.
! subj = msg.get('Subject', None)
! if subj:
! for w in subject_word_re.findall(subj):
! for t in tokenize_word(w, 'subject:'):
! yield t
# From:
! subj = msg.get('From', None)
! if subj:
! for w in subj.lower().split():
! for t in tokenize_word(w, 'from:'):
! yield t
# Find, decode (base64, qp), and tokenize the textual parts of the body.
--- 232,247 ----
# Subject:
# Don't ignore case in Subject lines; e.g., 'free' versus 'FREE' is
! # especially significant in this context. Experiment showed a small
! # but real benefit to keeping case intact in this specific context.
! subj = msg.get('Subject', '')
! for w in subject_word_re.findall(subj):
! for t in tokenize_word(w, 'subject:'):
! yield t
# From:
! subj = msg.get('From', '')
! for w in subj.lower().split():
! for t in tokenize_word(w, 'from:'):
! yield t
# Find, decode (base64, qp), and tokenize the textual parts of the body.
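
For readers of the archive: "byte 5-grams" in the first hunk means
overlapping 5-byte windows slid across the word. A minimal sketch of
that idea follows; the function name and the short-word handling are
illustrative assumptions, not the exact timtest.py code:

    def five_grams(word, prefix="5g:"):
        # Slide a 5-byte window across the word. An illustrative
        # choice here: yield words shorter than 5 bytes whole.
        if len(word) < 5:
            yield prefix + word
            return
        for i in range(len(word) - 4):
            yield prefix + word[i:i + 5]

For example, a 6-byte word yields two overlapping 5-grams, so even a
single bad phrase in a high-bit language fans out into many correlated
tokens -- which is exactly the scoring concern the new XXX comments
raise.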
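The nesting reduction in the second hunk rests on a small idiom:
defaulting the missing header to '' instead of None makes the for loop
a no-op, so the "if subj:" guard (and one indentation level) can go
away. A hedged before/after sketch of the shape, using a hypothetical
header name rather than one from the patch:

    # Before: guard plus an extra level of nesting.
    value = msg.get('X-Example', None)
    if value:
        for w in value.split():
            yield w

    # After: the empty-string default means the loop body simply
    # never executes when the header is absent.
    for w in msg.get('X-Example', '').split():
        yield w

This works because ''.split() returns [], and likewise
subject_word_re.findall('') returns [], so both loops run zero times
for a missing header.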