[Spambayes-checkins] spambayes Histogram.py,NONE,1.1 TestDriver.py,1.18,1.19

Thu, 03 Oct 2002 19:29:22 -0700

Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv17494

Modified Files:
	TestDriver.py 
Added Files:
	Histogram.py 
Log Message:
Split the histogram class into its own file, greatly robustified the
numerics, and made it a lot more flexible and memory-consuming <sigh>.
This is to help make progress on the central-limit scheme, where we
still have a poor idea of what the zscore distributions look like.
The histogram class is flexible enough to give us nice pictures of
that now.

Note that new min, max, and median statistics are displayed for all
histograms now (and computing percentile cutoffs would be easy to add).

Note that if you have a histogram object, you can now pass the # of
buckets desired to the display() method (no code exploits that yet,
but it means, e.g., that if you discover you really wanted more buckets,
you potentially don't have to rerun the test).  HistToGNU.py in
particular may be able to exploit that immediately.

--- NEW FILE: Histogram.py ---
import math

from Options import options

class Hist:
    """Simple histograms of float values."""

    # Pass None for lo and hi and it will automatically adjust to the min
    # and max values seen.
    # Note:  nbuckets can be passed for backward compatibility.  The
    # display() method can be passed a different nbuckets value.
    def __init__(self, nbuckets=options.nbuckets,  lo=0.0, hi=100.0):
        self.lo, self.hi = lo, hi
        self.nbuckets = nbuckets
        self.buckets = [0] * nbuckets
        self.data = []  # the raw data points
        self.stats_uptodate = False

    # Add a value to the collection.
    def add(self, x):
        self.data.append(x)
        self.stats_uptodate = False

    # Compute, and set as instance attrs:
    #     n         # of data points
    # The rest are set iff n>0:
    #     min       smallest value in collection
    #     max       largest value in collection
    #     median    midpoint
    #     mean
    #     var       variance
    #     sdev      population standard deviation (sqrt(variance))
    # self.data is also sorted.
    def compute_stats(self):
        if self.stats_uptodate:
            return
        stats_uptodate = True
        data = self.data
        n = self.n = len(data)
        if n == 0:
            return
        data.sort()
        self.min = data[0]
        self.max = data[-1]
        if n & 1:
            self.median = data[n // 2]
        else:
            self.median = (data[n // 2] + data[(n-1) // 2]) / 2.0
        # Compute mean.
        # Add in increasing order of magnitude, to minimize roundoff error.
        if data[0] < 0.0:
            temp = [(abs(x), x) for x in data]
            temp.sort()
            data = [x[1] for x in temp]
            del temp
        sum = 0.0
        for x in data:
            sum += x
        mean = self.mean = sum / n
        # Compute variance.
        var = 0.0
        for x in data:
            d = x - mean
            var += d*d
        self.var = var / n
        self.sdev = math.sqrt(self.var)

    # Merge other into self.
    def __iadd__(self, other):
        self.data.extend(other.data)
        self.stats_uptodate = False
        return self

    # Print a histogram to stdout.
    # Also sets instance var nbuckets to the # of buckets, and
    # buckts to a list of nbuckets counts, but only if at least one
    # data point is in the collection.
    def display(self, nbuckets=None, WIDTH=61):
        if nbuckets <= 0:
            raise ValueError("nbuckets %g > 0 required" % nbuckets)
        self.compute_stats()
        n = self.n
        if n == 0:
            return
        print "%d items; mean %.2f; sdev %.2f" % (n, self.mean, self.sdev)
        print "-> <stat> min %g; median %g; max %g" % (self.min,
                                                       self.median,
                                                       self.max)
        if nbuckets is None:
            nbuckets = self.nbuckets
        self.nbuckets = nbuckets
        self.buckets = buckets = [0] * nbuckets

        lo, hi = self.lo, self.hi
        if lo is None:
            lo = self.min
        if hi is None:
            hi = self.max
        if lo > hi:
            return

        # Compute bucket counts.
        span = float(hi - lo)
        bucketwidth = span / nbuckets
        for x in self.data:
            i = int((x - lo) / bucketwidth)
            if i >= nbuckets:
                i = nbuckets - 1
            elif i < 0:
                i = 0
            buckets[i] += 1

        # hunit is how many items a * represents.  A * is printed for
        # each hunit items, plus any non-zero fraction thereof.
        biggest = max(self.buckets)
        hunit, r = divmod(biggest, WIDTH)
        if r:
            hunit += 1
        print "* =", hunit, "items"

        # We need ndigits decimal digits to display the largest bucket count.
        ndigits = len(str(biggest))

        # Displaying the bucket boundaries is more troublesome.  For now,
        # just print one digit after the decimal point, regardless of what
        # the boundaries look like.
        boundary_digits = max(len(str(int(lo))), len(str(int(hi))))
        format = "%" + str(boundary_digits + 2) + '.1f %' + str(ndigits) + "d"

        for i in range(nbuckets):
            n = self.buckets[i]
            print format % (lo + i * bucketwidth, n),
            print '*' * ((n + hunit - 1) // hunit)

Index: TestDriver.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -C2 -d -r1.18 -r1.19
*** TestDriver.py	28 Sep 2002 03:44:15 -0000	1.18
--- TestDriver.py	4 Oct 2002 02:29:20 -0000	1.19
***************
*** 29,102 ****
  import Tester
  import classifier

! class Hist:
!     """Simple histograms of float values in [0.0, 1.0]."""
! 
!     def __init__(self, nbuckets=20):
!         self.buckets = [0] * nbuckets
!         self.nbuckets = nbuckets
!         self.n = 0          # number of data points
!         self.sum = 0.0      # sum of their values
!         self.sumsq = 0.0    # sum of their squares
! 
!     def add(self, x):
!         n = self.nbuckets
!         i = int(n * x)
!         if i >= n:
!             i = n-1
!         self.buckets[i] += 1
! 
!         self.n += 1
!         x *= 100.0
!         self.sum += x
!         self.sumsq += x*x
! 
!     def __iadd__(self, other):
!         if self.nbuckets != other.nbuckets:
!             raise ValueError('bucket size mismatch')
!         for i in range(self.nbuckets):
!             self.buckets[i] += other.buckets[i]
!         self.n += other.n
!         self.sum += other.sum
!         self.sumsq += other.sumsq
!         return self
! 
!     def display(self, WIDTH=61):
!         from math import sqrt
!         if self.n > 0:
!             mean = self.sum / self.n
!             var = self.sumsq / self.n - mean**2
!             # The vagaries of f.p. rounding can make var come out negative.
!             # There are ways to fix that, but they're too painful for this
!             # part of the code to endure.
!             if var < 0.0:
!                 var = 0.0
!             print "%d items; mean %.2f; sdev %.2f" % (self.n, mean, sqrt(var))
! 
!         biggest = max(self.buckets)
!         hunit, r = divmod(biggest, WIDTH)
!         if r:
!             hunit += 1
!         print "* =", hunit, "items"
! 
!         ndigits = len(str(biggest))
!         format = "%5.1f %" + str(ndigits) + "d"
! 
!         for i in range(len(self.buckets)):
!             n = self.buckets[i]
!             print format % (100.0 * i / self.nbuckets, n),
!             print '*' * ((n + hunit - 1) // hunit)
! 
! def printhist(tag, ham, spam):
      print
      print "-> <stat> Ham scores for", tag,
!     ham.display()

      print
      print "-> <stat> Spam scores for", tag,
!     spam.display()

      if not options.compute_best_cutoffs_from_histograms:
          return

      # Figure out "the best" spam cutoff point, meaning the one that minimizes
--- 29,47 ----
  import Tester
  import classifier
+ from Histogram import Hist

! def printhist(tag, ham, spam, nbuckets=options.nbuckets):
      print
      print "-> <stat> Ham scores for", tag,
!     ham.display(nbuckets)

      print
      print "-> <stat> Spam scores for", tag,
!     spam.display(nbuckets)

      if not options.compute_best_cutoffs_from_histograms:
          return
+     if ham.n == 0 or spam.n == 0:
+         return

      # Figure out "the best" spam cutoff point, meaning the one that minimizes
***************
*** 112,116 ****
      best_total = fpw * fp + fn
      bests = [(0, fp, fn)]
!     for i in range(ham.nbuckets):
          # When moving the cutoff beyond bucket i, the ham in bucket i
          # are redeemed, and the spam in bucket i become false negatives.
--- 57,61 ----
      best_total = fpw * fp + fn
      bests = [(0, fp, fn)]
!     for i in range(nbuckets):
          # When moving the cutoff beyond bucket i, the ham in bucket i
          # are redeemed, and the spam in bucket i become false negatives.
***************
*** 127,131 ****

      i, fp, fn = bests.pop(0)
!     print '-> best cutoff for', tag, float(i) / ham.nbuckets
      print '->     with weighted total %g*%d fp + %d fn = %g' % (
            fpw, fp, fn, best_total)
--- 72,76 ----

      i, fp, fn = bests.pop(0)
!     print '-> best cutoff for', tag, float(i) / nbuckets
      print '->     with weighted total %g*%d fp + %d fn = %g' % (
            fpw, fp, fn, best_total)
***************
*** 155,160 ****
          self.falsepos = Set()
          self.falseneg = Set()
!         self.global_ham_hist = Hist(options.nbuckets)
!         self.global_spam_hist = Hist(options.nbuckets)
          self.ntimes_finishtest_called = 0
          self.new_classifier()
--- 100,105 ----
          self.falsepos = Set()
          self.falseneg = Set()
!         self.global_ham_hist = Hist()
!         self.global_spam_hist = Hist()
          self.ntimes_finishtest_called = 0
          self.new_classifier()
***************
*** 163,168 ****
          c = self.classifier = classifier.Bayes()
          self.tester = Tester.Test(c)
!         self.trained_ham_hist = Hist(options.nbuckets)
!         self.trained_spam_hist = Hist(options.nbuckets)

      # CAUTION:  this just doesn't work for incrememental training when
--- 108,113 ----
          c = self.classifier = classifier.Bayes()
          self.tester = Tester.Test(c)
!         self.trained_ham_hist = Hist()
!         self.trained_spam_hist = Hist()

      # CAUTION:  this just doesn't work for incrememental training when
***************
*** 192,197 ****
          self.global_ham_hist += self.trained_ham_hist
          self.global_spam_hist += self.trained_spam_hist
!         self.trained_ham_hist = Hist(options.nbuckets)
!         self.trained_spam_hist = Hist(options.nbuckets)

          self.ntimes_finishtest_called += 1
--- 137,142 ----
          self.global_ham_hist += self.trained_ham_hist
          self.global_spam_hist += self.trained_spam_hist
!         self.trained_ham_hist = Hist()
!         self.trained_spam_hist = Hist()

          self.ntimes_finishtest_called += 1
***************
*** 220,229 ****
          c = self.classifier
          t = self.tester
!         local_ham_hist = Hist(options.nbuckets)
!         local_spam_hist = Hist(options.nbuckets)

          def new_ham(msg, prob, lo=options.show_ham_lo,
                                 hi=options.show_ham_hi):
!             local_ham_hist.add(prob)
              if lo <= prob <= hi:
                  print
--- 165,174 ----
          c = self.classifier
          t = self.tester
!         local_ham_hist = Hist()
!         local_spam_hist = Hist()

          def new_ham(msg, prob, lo=options.show_ham_lo,
                                 hi=options.show_ham_hi):
!             local_ham_hist.add(prob * 100.0)
              if lo <= prob <= hi:
                  print
***************
*** 234,238 ****
          def new_spam(msg, prob, lo=options.show_spam_lo,
                                  hi=options.show_spam_hi):
!             local_spam_hist.add(prob)
              if lo <= prob <= hi:
                  print
--- 179,183 ----
          def new_spam(msg, prob, lo=options.show_spam_lo,
                                  hi=options.show_spam_hi):
!             local_spam_hist.add(prob * 100.0)
              if lo <= prob <= hi:
                  print