[Spambayes-checkins] spambayes/spambayes/test test_sb_dbexpimp.py, 1.1, 1.2

Tony Meyer anadelonbrin at users.sourceforge.net
Mon Nov 15 07:19:16 CET 2004


Update of /cvsroot/spambayes/spambayes/spambayes/test
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13734/spambayes/test

Modified Files:
	test_sb_dbexpimp.py 
Log Message:
Add tests for merging.

Rather than just a comment in the script, ensure that the temp testing files don't
 exist before running the test script.

Index: test_sb_dbexpimp.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/test/test_sb_dbexpimp.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** test_sb_dbexpimp.py	12 Nov 2004 02:48:27 -0000	1.1
--- test_sb_dbexpimp.py	15 Nov 2004 06:19:14 -0000	1.2
***************
*** 15,25 ****
  
  # We borrow the test messages that test_sb_server uses.
  from test_sb_server import good1, spam1
  
- # WARNING!
- # If these files exist when running this test, they will be deleted.
  TEMP_PICKLE_NAME = os.path.join(os.path.dirname(__file__), "temp.pik")
  TEMP_CSV_NAME = os.path.join(os.path.dirname(__file__), "temp.csv")
  TEMP_DBM_NAME = os.path.join(os.path.dirname(__file__), "temp.dbm")
  
  class dbexpimpTest(unittest.TestCase):
--- 15,38 ----
  
  # We borrow the test messages that test_sb_server uses.
+ # I doubt it really makes much difference, but if we wanted more than
+ # one message of each type (the tests should all handle this ok) then
+ # Richie's hammer.py script has code for generating any number of
+ # randomly composed email messages.
  from test_sb_server import good1, spam1
  
  TEMP_PICKLE_NAME = os.path.join(os.path.dirname(__file__), "temp.pik")
  TEMP_CSV_NAME = os.path.join(os.path.dirname(__file__), "temp.csv")
  TEMP_DBM_NAME = os.path.join(os.path.dirname(__file__), "temp.dbm")
+ # The chances of anyone having files with these names in the test
+ # directory are minute, but we don't want to wipe anything, so make
+ # sure that they don't already exist.  Our tearDown code gets rid
+ # of our copies (whether the tests pass or fail) so they shouldn't
+ # be ours.
+ for fn in [TEMP_PICKLE_NAME, TEMP_CSV_NAME, TEMP_DBM_NAME]:
+     if os.path.exists(fn):
+         print fn, "already exists.  Please remove this file before " \
+               "running these tests (a file by that name will be " \
+               "created and destroyed as part of the tests)."
+         sys.exit(1)
  
  class dbexpimpTest(unittest.TestCase):
***************
*** 32,36 ****
              pass
          
!     def test_csv_import(self):
          """Check that we don't import the old object craft csv module."""
          self.assert_(hasattr(sb_dbexpimp.csv, "reader"))
--- 45,49 ----
              pass
          
!     def test_csv_module_import(self):
          """Check that we don't import the old object craft csv module."""
          self.assert_(hasattr(sb_dbexpimp.csv, "reader"))
***************
*** 132,135 ****
--- 145,232 ----
              self.assertEqual(wi.spamcount, spam)
  
+     def test_merge_to_pickle(self):
+         # Create a pickled classifier to merge with.
+         bayes = PickledClassifier(TEMP_PICKLE_NAME)
+         # Stuff some messages in it so it's not empty.
+         bayes.learn(tokenize(spam1), True)
+         bayes.learn(tokenize(good1), False)
+         # Save.
+         bayes.store()
+         # Create a CSV file to import.
+         nham, nspam = 3,4
+         temp = open(TEMP_CSV_NAME, "wb")
+         temp.write("%d,%d\n" % (nham, nspam))
+         csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
+                     "of":(1,0), "the":(1,2), "import":(3,1)}
+         for word, (ham, spam) in csv_data.items():
+             temp.write("%s,%s,%s\n" % (word, ham, spam))
+         temp.close()
+         sb_dbexpimp.runImport(TEMP_PICKLE_NAME, "pickle", False,
+                               TEMP_CSV_NAME)
+         # Open the converted file and verify that it has all the data from
+         # the CSV file (and by opening it, that it is a valid pickle),
+         # and the data from the original pickle.
+         bayes2 = open_storage(TEMP_PICKLE_NAME, "pickle")
+         self.assertEqual(bayes2.nham, nham + bayes.nham)
+         self.assertEqual(bayes2.nspam, nspam + bayes.nspam)
+         words = bayes._wordinfokeys()
+         words.extend(csv_data.keys())
+         for word in words:
+             word = sb_dbexpimp.uquote(word)
+             self.assert_(word in bayes2._wordinfokeys())
+             h, s = csv_data.get(word, (0,0))
+             wi = bayes._wordinfoget(word)
+             if wi:
+                 h += wi.hamcount
+                 s += wi.spamcount
+             wi2 = bayes2._wordinfoget(word)
+             self.assertEqual(h, wi2.hamcount)
+             self.assertEqual(s, wi2.spamcount)
+ 
+     def test_merge_to_dbm(self):
+         # Create a dbm classifier to merge with.
+         bayes = DBDictClassifier(TEMP_DBM_NAME)
+         # Stuff some messages in it so it's not empty.
+         bayes.learn(tokenize(spam1), True)
+         bayes.learn(tokenize(good1), False)
+         # Save data to check against.
+         original_nham = bayes.nham
+         original_nspam = bayes.nspam
+         original_data = {}
+         for key in bayes._wordinfokeys():
+             original_data[key] = bayes._wordinfoget(key)
+         # Save & Close.
+         bayes.store()
+         bayes.close()
+         # Create a CSV file to import.
+         nham, nspam = 3,4
+         temp = open(TEMP_CSV_NAME, "wb")
+         temp.write("%d,%d\n" % (nham, nspam))
+         csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
+                     "of":(1,0), "the":(1,2), "import":(3,1)}
+         for word, (ham, spam) in csv_data.items():
+             temp.write("%s,%s,%s\n" % (word, ham, spam))
+         temp.close()
+         sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)
+         # Open the converted file and verify that it has all the data from
+         # the CSV file (and by opening it, that it is a valid dbm file),
+         # and the data from the original dbm database.
+         bayes2 = open_storage(TEMP_DBM_NAME, "dbm")
+         self.assertEqual(bayes2.nham, nham + original_nham)
+         self.assertEqual(bayes2.nspam, nspam + original_nspam)
+         words = original_data.keys()[:]
+         words.extend(csv_data.keys())
+         for word in words:
+             word = sb_dbexpimp.uquote(word)
+             self.assert_(word in bayes2._wordinfokeys())
+             h, s = csv_data.get(word, (0,0))
+             wi = original_data.get(word, None)
+             if wi:
+                 h += wi.hamcount
+                 s += wi.spamcount
+             wi2 = bayes2._wordinfoget(word)
+             self.assertEqual(h, wi2.hamcount)
+             self.assertEqual(s, wi2.spamcount)
+ 
  
  def suite():



More information about the Spambayes-checkins mailing list