[Spambayes-checkins] spambayes/spambayes classifier.py, 1.20, 1.21 tokenizer.py, 1.28, 1.29
Tony Meyer
anadelonbrin at users.sourceforge.net
Mon Jan 12 03:38:25 EST 2004
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv21783/spambayes
Modified Files:
classifier.py tokenizer.py
Log Message:
Fix the experimental slurping option so that it only retrieves the
text from the URL if necessary, as was the original intent.
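
In outline, the change defers the expensive work: the tokenizer now only records the URL it finds, and the classifier retrieves and tokenizes the page only when the first-pass score falls in the unsure range (between the ham and spam cutoffs). Below is a minimal, self-contained sketch of that deferred pattern; the names (pending_url, fetch_tokens, _score) are invented for illustration and are not the actual SpamBayes API.

    import re

    pending_url = None   # set by the tokenizer, consumed later by the classifier

    def tokenize(text):
        # No network I/O here any more; just remember the first http URL.
        global pending_url
        match = re.search(r"http://(\S+)", text)
        if match:
            pending_url = match.group(1)
        else:
            pending_url = None
        return text.lower().split()

    def fetch_tokens(url):
        # Stand-in for the real slurp(): pretend we fetched and tokenized the page.
        return ["web:" + piece for piece in url.split("/") if piece]

    def _score(tokens):
        # Toy scorer: the fraction of tokens that mention "spam".
        return sum(["spam" in tok for tok in tokens]) / float(len(tokens) or 1)

    def spamprob(tokens, h_cut=0.20, s_cut=0.90):
        prob = _score(tokens)
        # The retrieval happens here, and only when the first pass is unsure.
        if pending_url and h_cut < prob < s_cut:
            prob = _score(tokens + fetch_tokens(pending_url))
        return prob
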
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/classifier.py,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -d -r1.20 -r1.21
*** classifier.py 30 Dec 2003 16:26:33 -0000 1.20
--- classifier.py 12 Jan 2004 08:38:23 -0000 1.21
***************
*** 45,48 ****
--- 45,65 ----
from spambayes.compatsets import Set
+ # XXX At time of writing, these are only necessary for the
+ # XXX experimental url retrieving/slurping code. If that
+ # XXX gets ripped out, either rip these out, or run
+ # XXX PyChecker over the code.
+ import re
+ import os
+ import sys
+ import socket
+ import pickle
+ import urllib2
+ from email import message_from_string
+
+ DOMAIN_AND_PORT_RE = re.compile(r"([^:/\\]+)(:([\d]+))?")
+ HTTP_ERROR_RE = re.compile(r"HTTP Error ([\d]+)")
+ URL_KEY_RE = re.compile(r"[\W]")
+ # XXX ---- ends ----
+
from spambayes.Options import options
from spambayes.chi2 import chi2Q
***************
*** 57,61 ****
LN2 = math.log(2) # used frequently by chi-combining
! slurp_wordstream = []
PICKLE_VERSION = 5
--- 74,78 ----
LN2 = math.log(2) # used frequently by chi-combining
! slurp_wordstream = None
PICKLE_VERSION = 5
***************
*** 217,222 ****
# at the URL's destination.
if len(clues) < options["Classifier", "max_discriminators"] and \
! prob > h_cut and prob < s_cut:
! sprob, sclues = self.chi2_spamprob(slurp_wordstream, True)
if sprob < h_cut or sprob > s_cut:
prob = sprob
--- 234,241 ----
# at the URL's destination.
if len(clues) < options["Classifier", "max_discriminators"] and \
! prob > h_cut and prob < s_cut and slurp_wordstream:
! slurp_tokens = list(self._generate_slurp())
! slurp_tokens.extend([w for (w,p) in clues])
! sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
if sprob < h_cut or sprob > s_cut:
prob = sprob
***************
*** 516,519 ****
--- 535,748 ----
last = token
+ def _generate_slurp(self):
+ # We don't want to do this recursively and check URLs
+ # on webpages, so we have this little cheat.
+ if not hasattr(self, "setup_done"):
+ self.setup()
+ self.setup_done = True
+ if not hasattr(self, "do_slurp") or self.do_slurp:
+ if slurp_wordstream:
+ self.do_slurp = False
+
+ tokens = self.slurp(*slurp_wordstream)
+ self.do_slurp = True
+ self._save_caches()
+ return tokens
+ return []
+
+ def setup(self):
+ # Can't import this at the top because it's circular.
+ # XXX Someone smarter than me, please figure out the right
+ # XXX way to do this.
+ from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory
+
+ username = options["globals", "proxy_username"]
+ password = options["globals", "proxy_password"]
+ server = options["globals", "proxy_server"]
+ if server.find(":") != -1:
+ server, port = server.split(':', 1)
+ else:
+ port = 8080
+ if server:
+ # Build a new opener that uses a proxy requiring authorization
+ proxy_support = urllib2.ProxyHandler({"http" : \
+ "http://%s:%s@%s:%d" % \
+ (username, password,
+ server, port)})
+ opener = urllib2.build_opener(proxy_support,
+ urllib2.HTTPHandler)
+ else:
+ # Build a new opener without any proxy information.
+ opener = urllib2.build_opener(urllib2.HTTPHandler)
+
+ # Install it
+ urllib2.install_opener(opener)
+
+ # Setup the cache for retrieved urls
+ age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
+ dir = options["URLRetriever", "x-cache_directory"]
+ if not os.path.exists(dir):
+ # Create the directory.
+ if options["globals", "verbose"]:
+ print >>sys.stderr, "Creating URL cache directory"
+ os.makedirs(dir)
+
+ self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
+ dir, cacheSize=20)
+ # Kill any old information in the cache
+ self.urlCorpus.removeExpiredMessages()
+
+ # Setup caches for unretrievable urls
+ self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
+ self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
+ if os.path.exists(self.bad_url_cache_name):
+ b_file = file(self.bad_url_cache_name, "r")
+ self.bad_urls = pickle.load(b_file)
+ b_file.close()
+ else:
+ if options["globals", "verbose"]:
+ print "URL caches don't exist: creating"
+ self.bad_urls = {"url:non_resolving": (),
+ "url:non_html": (),
+ "url:unknown_error": ()}
+ if os.path.exists(self.http_error_cache_name):
+ h_file = file(self.http_error_cache_name, "r")
+ self.http_error_urls = pickle.load(h_file)
+ h_file.close()
+ else:
+ self.http_error_urls = {}
+
+ def _save_caches(self):
+ # XXX Note that these caches are never refreshed, which might not
+ # XXX be a good thing long-term (if a previously invalid URL
+ # XXX becomes valid, for example).
+ b_file = file(self.bad_url_cache_name, "w")
+ pickle.dump(self.bad_urls, b_file)
+ b_file.close()
+ h_file = file(self.http_error_cache_name, "w")
+ pickle.dump(self.http_error_urls, h_file)
+ h_file.close()
+
+ def slurp(self, proto, url):
+ # We generate these tokens:
+ # url:non_resolving
+ # url:non_html
+ # url:http_XXX (for each type of http error encountered,
+ # for example 404, 403, ...)
+ # And tokenise the received page (but we do not slurp this).
+ # Actually, the special url: tokens barely showed up in my testing,
+ # although I would have thought that they would appear more - this might
+ # be due to an error, although they do turn up on occasion. In
+ # any case, we have to do the test, so generating an extra token
+ # doesn't cost us anything apart from another entry in the db, and
+ # it's only two entries, plus one for each type of http error
+ # encountered, so it's pretty negligible.
+ from spambayes.tokenizer import Tokenizer
+
+ if options["URLRetriever", "x-only_slurp_base"]:
+ url = self._base_url(url)
+
+ # Check the unretrievable caches
+ for err in self.bad_urls.keys():
+ if url in self.bad_urls[err]:
+ return [err]
+ if self.http_error_urls.has_key(url):
+ return self.http_error_urls[url]
+
+ # We check if the url will resolve first
+ mo = DOMAIN_AND_PORT_RE.match(url)
+ domain = mo.group(1)
+ if mo.group(3) is None:
+ port = 80
+ else:
+ port = mo.group(3)
+ try:
+ not_used = socket.getaddrinfo(domain, port)
+ except socket.error:
+ self.bad_urls["url:non_resolving"] += (url,)
+ return ["url:non_resolving"]
+
+ # If the message is in our cache, then we can just skip over
+ # retrieving it from the network, and get it from there, instead.
+ url_key = URL_KEY_RE.sub('_', url)
+ cached_message = self.urlCorpus.get(url_key)
+
+ if cached_message is None:
+ # We're going to ignore everything that isn't text/html,
+ # so we might as well not bother retrieving anything with
+ # these extensions.
+ parts = url.split('.')
+ if parts[-1] in ('jpg', 'gif', 'png', 'css', 'js'):
+ self.bad_urls["url:non_html"] += (url,)
+ return ["url:non_html"]
+
+ try:
+ if options["globals", "verbose"]:
+ print >>sys.stderr, "Slurping", url
+ f = urllib2.urlopen("%s://%s" % (proto, url))
+ except (urllib2.URLError, socket.error), details:
+ mo = HTTP_ERROR_RE.match(str(details))
+ if mo:
+ self.http_error_urls[url] = "url:http_" + mo.group(1)
+ return ["url:http_" + mo.group(1)]
+ self.bad_urls["url:unknown_error"] += (url,)
+ return ["url:unknown_error"]
+
+ # Anything that isn't text/html is ignored
+ content_type = f.info().get('content-type')
+ if content_type is None or \
+ not content_type.startswith("text/html"):
+ self.bad_urls["url:non_html"] += (url,)
+ return ["url:non_html"]
+
+ page = f.read()
+ headers = str(f.info())
+ f.close()
+ fake_message_string = headers + "\r\n" + page
+
+ # Retrieving the same messages over and over again will tire
+ # us out, so we store them in our own wee cache.
+ message = self.urlCorpus.makeMessage(url_key)
+ message.setPayload(fake_message_string)
+ self.urlCorpus.addMessage(message)
+ else:
+ fake_message_string = cached_message.as_string()
+
+ msg = message_from_string(fake_message_string)
+
+ # We don't want to do full header tokenising, as this is
+ # optimised for messages, not webpages, so we just do the
+ # basic stuff.
+ bht = options["Tokenizer", "basic_header_tokenize"]
+ bhto = options["Tokenizer", "basic_header_tokenize_only"]
+ options["Tokenizer", "basic_header_tokenize"] = True
+ options["Tokenizer", "basic_header_tokenize_only"] = True
+
+ tokens = Tokenizer().tokenize(msg)
+ pf = options["URLRetriever", "x-web_prefix"]
+ tokens = ["%s%s" % (pf, tok) for tok in tokens]
+
+ # Undo the changes
+ options["Tokenizer", "basic_header_tokenize"] = bht
+ options["Tokenizer", "basic_header_tokenize_only"] = bhto
+ return tokens
+
+ def _base_url(self, url):
+ # To try and speed things up, and to avoid following
+ # unique URLs, we convert the URL to as basic a form
+ # as we can - so http://www.massey.ac.nz/~tameyer/index.html?you=me
+ # would become http://massey.ac.nz and http://id.example.com
+ # would become http://example.com
+ url += '/'
+ domain, garbage = url.split('/', 1)
+ parts = domain.split('.')
+ if len(parts) > 2:
+ base_domain = parts[-2] + '.' + parts[-1]
+ if len(parts[-1]) < 3:
+ base_domain = parts[-3] + '.' + base_domain
+ else:
+ base_domain = domain
+ return base_domain
+
def _add_slurped(self, wordstream):
"""Add tokens generated by 'slurping' (i.e. tokenizing
***************
*** 522,526 ****
for token in wordstream:
yield token
! for token in slurp_wordstream:
yield token
--- 751,756 ----
for token in wordstream:
yield token
! slurped_tokens = self._generate_slurp()
! for token in slurped_tokens:
yield token
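
One detail worth noting in _generate_slurp() above is the do_slurp flag: while the retrieved page is itself being processed, the flag is dropped so that URLs found on that page do not trigger further retrievals. The following is a tiny, hypothetical sketch of that guard pattern, with invented names rather than the SpamBayes API.

    class Slurper(object):
        def __init__(self, fetch):
            self.fetch = fetch      # callable taking a URL, returning tokens
            self.busy = False       # True while a retrieval is already running

        def generate(self, url):
            if self.busy or not url:
                return []           # nested call, or nothing to fetch: do nothing
            self.busy = True
            try:
                # Processing the fetched page may find more URLs; because
                # self.busy is set, any nested call returns [] immediately.
                return self.fetch(url)
            finally:
                self.busy = False

For example, Slurper(lambda url: ["web:" + url]).generate("example.com") returns ["web:example.com"], while any call made from inside fetch would get back an empty list.
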
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.28
retrieving revision 1.29
diff -C2 -d -r1.28 -r1.29
*** tokenizer.py 5 Jan 2004 17:40:30 -0000 1.28
--- tokenizer.py 12 Jan 2004 08:38:23 -0000 1.29
***************
*** 21,36 ****
from compatsets import Set
- # XXX At time of writing, these are only necessary for the
- # XXX experimental url retrieving/slurping code. If that
- # XXX gets ripped out, either rip these out, or run
- # XXX PyChecker over the code.
- import sys
- import socket
- import pickle
- import urllib2
from spambayes import classifier
- from email import message_from_string
- # XXX ---- ends ----
-
from spambayes.Options import options
--- 21,25 ----
***************
*** 1076,1198 ****
return tokens
- DOMAIN_AND_PORT_RE = re.compile(r"([^:/\\]+)(:([\d]+))?")
- HTTP_ERROR_RE = re.compile(r"HTTP Error ([\d]+)")
- URL_KEY_RE = re.compile(r"[\W]")
-
class SlurpingURLStripper(URLStripper):
def __init__(self):
URLStripper.__init__(self)
- self.setup_done = False
- self.do_slurp = True
-
- def setup(self):
- # Can't import this at the top because it's circular.
- # XXX Someone smarter than me, please figure out the right
- # XXX way to do this.
- from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory
- username = options["globals", "proxy_username"]
- password = options["globals", "proxy_password"]
- server = options["globals", "proxy_server"]
- if server.find(":") != -1:
- server, port = server.split(':', 1)
- else:
- port = 8080
- if server:
- # Build a new opener that uses a proxy requiring authorization
- proxy_support = urllib2.ProxyHandler({"http" : \
- "http://%s:%s@%s:%d" % \
- (username, password,
- server, port)})
- opener = urllib2.build_opener(proxy_support,
- urllib2.HTTPHandler)
- else:
- # Build a new opener without any proxy information.
- opener = urllib2.build_opener(urllib2.HTTPHandler)
-
- # Install it
- urllib2.install_opener(opener)
-
- # Setup the cache for retrieved urls
- age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
- dir = options["URLRetriever", "x-cache_directory"]
- if not os.path.exists(dir):
- # Create the directory.
- if options["globals", "verbose"]:
- print >>sys.stderr, "Creating URL cache directory"
- os.makedirs(dir)
-
- self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
- dir, cacheSize=20)
- # Kill any old information in the cache
- self.urlCorpus.removeExpiredMessages()
-
- # Setup caches for unretrievable urls
- self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
- self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
- if os.path.exists(self.bad_url_cache_name):
- b_file = file(self.bad_url_cache_name, "r")
- self.bad_urls = pickle.load(b_file)
- b_file.close()
- else:
- self.bad_urls = {"url:non_resolving": (),
- "url:non_html": (),
- "url:unknown_error": ()}
- if os.path.exists(self.http_error_cache_name):
- h_file = file(self.http_error_cache_name, "r")
- self.http_error_urls = pickle.load(h_file)
- h_file.close()
- else:
- self.http_error_urls = {}
! def _save_caches(self):
! # XXX Note that these caches are never refreshed, which might not
! # XXX be a good thing long-term (if a previously invalid URL
! # XXX becomes valid, for example).
! b_file = file(self.bad_url_cache_name, "w")
! pickle.dump(self.bad_urls, b_file)
! b_file.close()
! h_file = file(self.http_error_cache_name, "w")
! pickle.dump(self.http_error_urls, h_file)
! h_file.close()
def tokenize(self, m):
! # XXX A weakness of this is that the text from URLs is
! # XXX always retrieved, even if it won't be used (if the
! # XXX raw score is outside unsure, for example). The
! # XXX problem is that when tokenizing, we have no idea
! # XXX what the score of the message should be, and so
! # XXX if we need the tokens or not. But when calculating
! # XXX the spamprob, we have no idea what the content of
! # XXX the message is - just the tokens we generated from it
! # XXX (and we can't reverse-engineer the URLs from that).
! # XXX I've (Tony) played around with various ways to get
! # XXX around this, but can't really come up with anything
! # XXX good, apart from moving the decision whether to
! # XXX recalculate the score 'higher' up (out of classifier's
! # XXX spamprob()), but then it seems that code in a *lot*
! # XXX of places will need to be changed to call the new
! # XXX function; not nice given that this is experimental.
! # XXX Either someone else will point out a good way to do this
! # XXX or it can be moved higher up if this ever makes it out
! # XXX of experimental status.
! # XXX This might not matter so much because of the local
! # XXX cache of the 'slurped' content, especially if the cache
! # XXX isn't set to expire content regularly, and if your ham
! # XXX (likely) and spam (unlikely) messages tend to have the
! # XXX same URLs in them, and only unsure change.
! # XXX Also note that the 'slurped' tokens are *always* trained
# XXX on; it would be simple to change/parameterize this.
- if not self.setup_done:
- self.setup()
- self.setup_done = True
tokens = URLStripper.tokenize(self, m)
! if not (options["URLRetriever", "x-slurp_urls"] and \
! self.do_slurp):
return tokens
- # We don't want to do this recursively and check URLs
- # on webpages, so we have this little cheat.
- self.do_slurp = False
-
proto, guts = m.groups()
if proto != "http":
--- 1065,1087 ----
return tokens
class SlurpingURLStripper(URLStripper):
def __init__(self):
URLStripper.__init__(self)
! def analyze(self, text):
! # If there are no URLs, then we need to clear the
! # wordstream, or whatever was there from the last message
! # will be used.
! classifier.slurp_wordstream = None
! # Continue as normal.
! return URLStripper.analyze(self, text)
def tokenize(self, m):
! # XXX Note that the 'slurped' tokens are *always* trained
# XXX on; it would be simple to change/parameterize this.
tokens = URLStripper.tokenize(self, m)
! if not options["URLRetriever", "x-slurp_urls"]:
return tokens
proto, guts = m.groups()
if proto != "http":
***************
*** 1203,1329 ****
guts = guts[:-1]
! classifier.slurp_wordstream = self.slurp(proto, guts)
! self.do_slurp = True
! self._save_caches()
! return tokens
!
! def slurp(self, proto, url):
! # We generate these tokens:
! # url:non_resolving
! # url:non_html
! # url:http_XXX (for each type of http error encountered,
! # for example 404, 403, ...)
! # And tokenise the received page (but we do not slurp this).
! # Actually, the special url: tokens barely showed up in my testing,
! # although I would have thought that they would more - this might
! # although I would have thought that they would appear more - this might
! # any case, we have to do the test, so generating an extra token
! # doesn't cost us anything apart from another entry in the db, and
! # it's only two entries, plus one for each type of http error
! # encountered, so it's pretty negligible.
! if options["URLRetriever", "x-only_slurp_base"]:
! url = self._base_url(url)
!
! # Check the unretrievable caches
! for err in self.bad_urls.keys():
! if url in self.bad_urls[err]:
! return [err]
! if self.http_error_urls.has_key(url):
! return self.http_error_urls[url]
!
! # We check if the url will resolve first
! mo = DOMAIN_AND_PORT_RE.match(url)
! domain = mo.group(1)
! if mo.group(3) is None:
! port = 80
! else:
! port = mo.group(3)
! try:
! not_used = socket.getaddrinfo(domain, port)
! except socket.error:
! self.bad_urls["url:non_resolving"] += (url,)
! return ["url:non_resolving"]
!
! # If the message is in our cache, then we can just skip over
! # retrieving it from the network, and get it from there, instead.
! url_key = URL_KEY_RE.sub('_', url)
! cached_message = self.urlCorpus.get(url_key)
!
! if cached_message is None:
! # We're going to ignore everything that isn't text/html,
! # so we might as well not bother retrieving anything with
! # these extensions.
! parts = url.split('.')
! if parts[-1] in ('jpg', 'gif', 'png', 'css', 'js'):
! self.bad_urls["url:non_html"] += (url,)
! return ["url:non_html"]
!
! try:
! if options["globals", "verbose"]:
! print >>sys.stderr, "Slurping", url
! f = urllib2.urlopen("%s://%s" % (proto, url))
! except (urllib2.URLError, socket.error), details:
! mo = HTTP_ERROR_RE.match(str(details))
! if mo:
! self.http_error_urls[url] = "url:http_" + mo.group(1)
! return ["url:http_" + mo.group(1)]
! self.bad_urls["url:unknown_error"] += (url,)
! return ["url:unknown_error"]
!
! # Anything that isn't text/html is ignored
! content_type = f.info().get('content-type')
! if content_type is None or \
! not content_type.startswith("text/html"):
! self.bad_urls["url:non_html"] += (url,)
! return ["url:non_html"]
!
! page = f.read()
! headers = str(f.info())
! f.close()
! fake_message_string = headers + "\r\n" + page
!
! # Retrieving the same messages over and over again will tire
! # us out, so we store them in our own wee cache.
! message = self.urlCorpus.makeMessage(url_key)
! message.setPayload(fake_message_string)
! self.urlCorpus.addMessage(message)
! else:
! fake_message_string = cached_message.as_string()
!
! msg = message_from_string(fake_message_string)
!
! # We don't want to do full header tokenising, as this is
! # optimised for messages, not webpages, so we just do the
! # basic stuff.
! bht = options["Tokenizer", "basic_header_tokenize"]
! bhto = options["Tokenizer", "basic_header_tokenize_only"]
! options["Tokenizer", "basic_header_tokenize"] = True
! options["Tokenizer", "basic_header_tokenize_only"] = True
!
! tokens = Tokenizer().tokenize(msg)
! pf = options["URLRetriever", "x-web_prefix"]
! tokens = ["%s%s" % (pf, tok) for tok in tokens]
!
! # Undo the changes
! options["Tokenizer", "basic_header_tokenize"] = bht
! options["Tokenizer", "basic_header_tokenize_only"] = bhto
return tokens
-
- def _base_url(self, url):
- # To try and speed things up, and to avoid following
- # unique URLs, we convert the URL to as basic a form
- # as we can - so http://www.massey.ac.nz/~tameyer/index.html?you=me
- # would become http://massey.ac.nz and http://id.example.com
- # would become http://example.com
- url += '/'
- domain, garbage = url.split('/', 1)
- parts = domain.split('.')
- if len(parts) > 2:
- base_domain = parts[-2] + '.' + parts[-1]
- if len(parts[-1]) < 3:
- base_domain = parts[-3] + '.' + base_domain
- else:
- base_domain = domain
- return base_domain
if options["URLRetriever", "x-slurp_urls"]:
--- 1092,1097 ----
guts = guts[:-1]
! classifier.slurp_wordstream = (proto, guts)
return tokens
if options["URLRetriever", "x-slurp_urls"]:
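
On the tokenizer side, the important subtlety is that slurp_wordstream is module-level shared state: analyze() has to reset it at the start of each message, otherwise a message containing no URLs would be scored against whatever page the previous message pointed at. A simplified, hypothetical illustration of that reset follows (not the actual SpamBayes classes).

    import re

    class URLRecorder(object):
        URL_RE = re.compile(r"(https?)://(\S+)")

        def __init__(self):
            self.pending = None            # plays the role of slurp_wordstream

        def analyze(self, text):
            # Clear whatever the previous message left behind, then record
            # the (proto, guts) pair of the first URL, if any, for later use.
            self.pending = None
            match = self.URL_RE.search(text)
            if match:
                self.pending = match.groups()
            return text.split()

After analyze("see http://example.com/x") the recorder holds ("http", "example.com/x"); after a following analyze("no links here") it is back to None, so the earlier page cannot leak into the new message's score.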