[Spambayes-checkins] spambayes/spambayes classifier.py, 1.23,
1.23.4.1
Tony Meyer
anadelonbrin at users.sourceforge.net
Tue Nov 9 23:03:31 CET 2004
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv19075/spambayes
Modified Files:
Tag: release_1_0-branch
classifier.py
Log Message:
Backport:
Fix [ 922063 ] Intermittent sb_filter.py failure with URL pickle
Fix [ 1051081 ] uncaught socket timeout exception slurping URLs
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/classifier.py,v
retrieving revision 1.23
retrieving revision 1.23.4.1
diff -C2 -d -r1.23 -r1.23.4.1
*** classifier.py 6 Feb 2004 21:43:00 -0000 1.23
--- classifier.py 9 Nov 2004 22:03:27 -0000 1.23.4.1
***************
*** 527,533 ****
'synthetic' tokens get bigram'ed, too.
! The bigram token is simply "unigram1 unigram2" - a space should
be sufficient as a separator, since spaces aren't in any other
! tokens, apart from 'synthetic' ones.
If the experimental "Classifier":"x-use_bigrams" option is
--- 527,536 ----
'synthetic' tokens get bigram'ed, too.
! The bigram token is simply "bi:unigram1 unigram2" - a space should
be sufficient as a separator, since spaces aren't in any other
! tokens, apart from 'synthetic' ones. The "bi:" prefix is added
! to avoid conflict with tokens we generate (like "subject: word",
! which could be "word" in a subject, or a bigram of "subject:" and
! "word").
If the experimental "Classifier":"x-use_bigrams" option is
***************
*** 607,611 ****
if os.path.exists(self.bad_url_cache_name):
b_file = file(self.bad_url_cache_name, "r")
! self.bad_urls = pickle.load(b_file)
b_file.close()
else:
--- 610,623 ----
if os.path.exists(self.bad_url_cache_name):
b_file = file(self.bad_url_cache_name, "r")
! try:
! self.bad_urls = pickle.load(b_file)
! except IOError, ValueError:
! # Something went wrong loading it (bad pickle,
! # probably). Start afresh.
! if options["globals", "verbose"]:
! print >>sys.stderr, "Bad URL pickle, using new."
! self.bad_urls = {"url:non_resolving": (),
! "url:non_html": (),
! "url:unknown_error": ()}
b_file.close()
else:
***************
*** 617,621 ****
if os.path.exists(self.http_error_cache_name):
h_file = file(self.http_error_cache_name, "r")
! self.http_error_urls = pickle.load(h_file)
h_file.close()
else:
--- 629,640 ----
if os.path.exists(self.http_error_cache_name):
h_file = file(self.http_error_cache_name, "r")
! try:
! self.http_error_urls = pickle.load(h_file)
! except IOError, ValueError:
! # Something went wrong loading it (bad pickle,
! # probably). Start afresh.
! if options["globals", "verbose"]:
! print >>sys.stderr, "Bad HHTP error pickle, using new."
! self.http_error_urls = {}
h_file.close()
else:
***************
*** 626,635 ****
# XXX be a good thing long-term (if a previously invalid URL
# XXX becomes valid, for example).
! b_file = file(self.bad_url_cache_name, "w")
! pickle.dump(self.bad_urls, b_file)
! b_file.close()
! h_file = file(self.http_error_cache_name, "w")
! pickle.dump(self.http_error_urls, h_file)
! h_file.close()
def slurp(self, proto, url):
--- 645,661 ----
# XXX be a good thing long-term (if a previously invalid URL
# XXX becomes valid, for example).
! for name, data in [(self.bad_url_cache_name, self.bad_urls),
! (self.http_error_cache_name, self.http_error_urls),]:
! # Save to a temp file first, in case something goes wrong.
! cache = open(name + ".tmp", "w")
! pickle.dump(data, cache)
! cache.close()
! try:
! os.rename(name + ".tmp", name)
! except OSError:
! # Atomic replace isn't possible with win32, so just
! # remove and rename.
! os.remove(name)
! os.rename(name + ".tmp", name)
def slurp(self, proto, url):
***************
*** 698,711 ****
return ["url:unknown_error"]
! # Anything that isn't text/html is ignored
! content_type = f.info().get('content-type')
! if content_type is None or \
! not content_type.startswith("text/html"):
! self.bad_urls["url:non_html"] += (url,)
! return ["url:non_html"]
! page = f.read()
! headers = str(f.info())
! f.close()
fake_message_string = headers + "\r\n" + page
--- 724,743 ----
return ["url:unknown_error"]
! try:
! # Anything that isn't text/html is ignored
! content_type = f.info().get('content-type')
! if content_type is None or \
! not content_type.startswith("text/html"):
! self.bad_urls["url:non_html"] += (url,)
! return ["url:non_html"]
! page = f.read()
! headers = str(f.info())
! f.close()
! except socket.error:
! # This is probably a temporary error, like a timeout.
! # For now, just bail out.
! return []
!
fake_message_string = headers + "\r\n" + page
More information about the Spambayes-checkins
mailing list