[Spambayes-checkins] spambayes/spambayes classifier.py, 1.23, 1.23.4.1

Tony Meyer anadelonbrin at users.sourceforge.net
Tue Nov 9 23:03:31 CET 2004


Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv19075/spambayes

Modified Files:
      Tag: release_1_0-branch
	classifier.py 
Log Message:
Backport:

Fix [ 922063 ] Intermittent sb_filter.py failure with URL pickle
Fix [ 1051081 ] uncaught socket timeout exception slurping URLs
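
For context, a minimal standalone sketch of the caching pattern the first fix
adopts: tolerate a corrupt or truncated URL-cache pickle by starting afresh,
and write the cache to a temporary file before renaming it into place.  The
file name, default contents and helper names below are illustrative
placeholders, not the actual classifier.py code:

    import os
    import sys
    import cPickle as pickle

    DEFAULT_BAD_URLS = {"url:non_resolving": (),
                        "url:non_html": (),
                        "url:unknown_error": ()}

    def load_cache(name, default):
        # Fall back to a fresh cache if the file is missing or the
        # pickle is corrupt, instead of letting the exception escape.
        if not os.path.exists(name):
            return dict(default)
        f = open(name, "rb")
        try:
            try:
                return pickle.load(f)
            except (IOError, ValueError, EOFError, pickle.UnpicklingError):
                print >> sys.stderr, "Bad URL pickle, using new."
                return dict(default)
        finally:
            f.close()

    def save_cache(name, data):
        # Dump to a temp file first, then rename into place, so an
        # interrupted write never leaves a truncated cache behind.
        tmp = name + ".tmp"
        f = open(tmp, "wb")
        pickle.dump(data, f)
        f.close()
        try:
            os.rename(tmp, name)
        except OSError:
            # os.rename won't replace an existing file on win32.
            os.remove(name)
            os.rename(tmp, name)

A half-written or otherwise unreadable cache file then costs only the cached
URL data, rather than an unhandled exception - presumably the intermittent
failure reported in 922063.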

Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/classifier.py,v
retrieving revision 1.23
retrieving revision 1.23.4.1
diff -C2 -d -r1.23 -r1.23.4.1
*** classifier.py	6 Feb 2004 21:43:00 -0000	1.23
--- classifier.py	9 Nov 2004 22:03:27 -0000	1.23.4.1
***************
*** 527,533 ****
          'synthetic' tokens get bigram'ed, too.
  
!         The bigram token is simply "unigram1 unigram2" - a space should
          be sufficient as a separator, since spaces aren't in any other
!         tokens, apart from 'synthetic' ones.
  
          If the experimental "Classifier":"x-use_bigrams" option is
--- 527,536 ----
          'synthetic' tokens get bigram'ed, too.
  
!         The bigram token is simply "bi:unigram1 unigram2" - a space should
          be sufficient as a separator, since spaces aren't in any other
!         tokens, apart from 'synthetic' ones.  The "bi:" prefix is added
!         to avoid conflict with tokens we generate (like "subject: word",
!         which could be "word" in a subject, or a bigram of "subject:" and
!         "word").
  
          If the experimental "Classifier":"x-use_bigrams" option is
***************
*** 607,611 ****
          if os.path.exists(self.bad_url_cache_name):
              b_file = file(self.bad_url_cache_name, "r")
!             self.bad_urls = pickle.load(b_file)
              b_file.close()
          else:
--- 610,623 ----
          if os.path.exists(self.bad_url_cache_name):
              b_file = file(self.bad_url_cache_name, "r")
!             try:
!                 self.bad_urls = pickle.load(b_file)
!             except (IOError, ValueError):
!                 # Something went wrong loading it (bad pickle,
!                 # probably).  Start afresh.
!                 if options["globals", "verbose"]:
!                     print >>sys.stderr, "Bad URL pickle, using new."
!                 self.bad_urls = {"url:non_resolving": (),
!                                  "url:non_html": (),
!                                  "url:unknown_error": ()}
              b_file.close()
          else:
***************
*** 617,621 ****
          if os.path.exists(self.http_error_cache_name):
              h_file = file(self.http_error_cache_name, "r")
!             self.http_error_urls = pickle.load(h_file)
              h_file.close()
          else:
--- 629,640 ----
          if os.path.exists(self.http_error_cache_name):
              h_file = file(self.http_error_cache_name, "r")
!             try:
!                 self.http_error_urls = pickle.load(h_file)
!             except (IOError, ValueError):
!                 # Something went wrong loading it (bad pickle,
!                 # probably).  Start afresh.
!                 if options["globals", "verbose"]:
!                     print >>sys.stderr, "Bad HTTP error pickle, using new."
!                 self.http_error_urls = {}
              h_file.close()
          else:
***************
*** 626,635 ****
          # XXX be a good thing long-term (if a previously invalid URL
          # XXX becomes valid, for example).
!         b_file = file(self.bad_url_cache_name, "w")
!         pickle.dump(self.bad_urls, b_file)
!         b_file.close()
!         h_file = file(self.http_error_cache_name, "w")
!         pickle.dump(self.http_error_urls, h_file)
!         h_file.close()
  
      def slurp(self, proto, url):
--- 645,661 ----
          # XXX be a good thing long-term (if a previously invalid URL
          # XXX becomes valid, for example).
!         for name, data in [(self.bad_url_cache_name, self.bad_urls),
!                            (self.http_error_cache_name, self.http_error_urls),]:
!             # Save to a temp file first, in case something goes wrong.
!             cache = open(name + ".tmp", "w")
!             pickle.dump(data, cache)
!             cache.close()
!             try:
!                 os.rename(name + ".tmp", name)
!             except OSError:
!                 # Atomic replace isn't possible with win32, so just
!                 # remove and rename.
!                 os.remove(name)
!                 os.rename(name + ".tmp", name)
  
      def slurp(self, proto, url):
***************
*** 698,711 ****
                  return ["url:unknown_error"]
  
!             # Anything that isn't text/html is ignored
!             content_type = f.info().get('content-type')
!             if content_type is None or \
!                not content_type.startswith("text/html"):
!                 self.bad_urls["url:non_html"] += (url,)
!                 return ["url:non_html"]
  
!             page = f.read()
!             headers = str(f.info())
!             f.close()
              fake_message_string = headers + "\r\n" + page
  
--- 724,743 ----
                  return ["url:unknown_error"]
  
!             try:
!                 # Anything that isn't text/html is ignored
!                 content_type = f.info().get('content-type')
!                 if content_type is None or \
!                    not content_type.startswith("text/html"):
!                     self.bad_urls["url:non_html"] += (url,)
!                     return ["url:non_html"]
  
!                 page = f.read()
!                 headers = str(f.info())
!                 f.close()
!             except socket.error:
!                 # This is probably a temporary error, like a timeout.
!                 # For now, just bail out.
!                 return []
!             
              fake_message_string = headers + "\r\n" + page
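
To illustrate the second fix in isolation, a rough sketch of fetching a URL
while treating socket-level failures (including timeouts) as a soft error
rather than an uncaught exception.  fetch_page, the urllib2 call and the
five-second default timeout are assumptions for the example, not the actual
slurp() code:

    import socket
    import urllib2

    def fetch_page(url, timeout=5):
        # Hypothetical helper: return the headers and body of a text/html
        # page, or None if the URL cannot be fetched cleanly.
        socket.setdefaulttimeout(timeout)
        try:
            f = urllib2.urlopen(url)
        except (urllib2.URLError, socket.error):
            return None
        try:
            try:
                content_type = f.info().get("content-type")
                if content_type is None or \
                   not content_type.startswith("text/html"):
                    return None      # anything that isn't text/html is ignored
                return str(f.info()) + "\r\n" + f.read()
            except socket.error:
                # Probably a transient problem such as a timeout; bail out.
                return None
        finally:
            f.close()

In Python 2, socket.timeout derives from socket.error, so catching
socket.error around the read also covers read timeouts - which is why the
patch above wraps f.info() and f.read() the same way.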
  


