[SciPy-dev] denoising spatial point process data

Fri Jan 9 11:37:31 EST 2009

If it is of interest I'll donate a useful algorithm of denoising spatial
point process data to scipy.spatial. It works by fitting a mixture of two
Poisson processes (signal + noise) using an EM algorithm. It was
originally developed for US DoD to detect minefields using reconnaissance
aircraft images. I use it on neurophysiological spike data to remove
artifact waveforms. It seems to be very efficient.

Regards,
Sturla Molden

# Written by Sturla Molden

import numpy
from scipy.special import gamma

def nnclean(K, knn, ndim, convergence=0.01):

    """

    K-th nearest neighbour clutter removal

    Algorithm from: Byers, S & Raftery, A (1998). Nearest-Neighbor
    Clutter Removal for estimating Features in Spatial Point
    Processes. J. Am. Stat. Assoc., 93: 577-585.

    Input:

      K: neighbour to use (K=5 often works well)

      Dk: distance to K-th nearest neighbour.
          Obtain by searching a scipy.spatial.cKDTree with p=2

      ndim: number of dimensions in the data set from which Dk was computed.

    Output:

      signal: indices of signal points
      noise: indices of clutter points
      delta: likelihood that a point is signal
      loglike: total log-likelihood of the fit

    """

    # size of data
    d, n = ndim, len(Dk)
    # calculate volume of N-D hypersphere
    ad = (2*numpy.pi**(d/2.0))/(d*gamma(d/2.0))
    #select a random starting point
    delta = numpy.random.rand(n)
    l1 = K * delta.sum()/(ad*numpy.sum((Dk**d)*delta))
    l2 = K*numpy.sum(1.0-delta)/(ad*numpy.sum((Dk**d)*(1.0-delta)))
    #l1[numpy.where(l1<1E-12)[0]] = 1E-12
    #l2[numpy.where(l2<1E-12)[0]] = 1E-12
    p = numpy.sum(delta)/n
    old_loglike = numpy.sum( -p*l1*ad*((Dk**d)*delta) -
(1.0-p)*l2*ad*((Dk**d)*(1.0-delta))
       + delta*K*numpy.log(l1*ad) + (1.0-delta)*K*numpy.log(l2*ad) )
    # estimate Poisson mixture by EM updates
    term = 100
    loglike = [old_loglike]
    counter = 0
    run = 1
    old_classification = numpy.round(delta)

    while run:
        new_delta, l1, l2, p = EM_update(Dk,K,n,d,ad,delta,l1,l2,p)
        #l1[numpy.where(l1<1E-12)[0]] = 1E-12
        #l2[numpy.where(l2<1E-12)[0]] = 1E-12

        new_loglike = numpy.sum( -p*l1*ad*((Dk**d)*new_delta) -
(1-p)*l2*ad*((Dk**d)*(1.0-new_delta))
            + new_delta*K*numpy.log(l1*ad) +
(1.0-new_delta)*K*numpy.log(l2*ad) )

        update = 100*(new_loglike - old_loglike)/new_loglike
        # Force at least 3 EM updates, quit after 50.
        # Quit if EM starts to oscillate.
        counter = counter + 1
        if counter < 3:
            run = 1
        elif counter > 50:
            run = 0
        elif old_loglike > new_loglike:
            run = 0
            classification = old_classification
            new_loglike = old_loglike;
            new_delta = delta
        elif update < convergence:
            run = 0
        old_loglike = new_loglike
        loglike.append(new_loglike)
        delta = new_delta
    if l2 > l1:
        delta = 1.0 - delta
    idx, = numpy.where(delta < 1E-6)
    delta[idx] = 1E-6
    signal = numpy.where(delta > .5)[0]
    noise  = numpy.where(delta <= .5)[0]
    # done
    return signal, noise, delta, loglike

#Single EM update
def EM_update(Dk,K,n,d,ad,delta,l1,l2,p):
    factorial = lambda K : numpy.arange(1.0,K+1.0).prod() if K > 1 else 1.0
    pdf = lambda x,K,l,d: numpy.exp(-l*(x**d)) * 2 * (l**K) * x**(d*K - 1)
/ factorial(K-1)
    # E-step
    P1 = pdf(Dk,K,l1*ad,d)
    P2 = pdf(Dk,K,l2*ad,d)
    index, = numpy.where((P1 == 0.0) * (P2 == 0.0))
    delta[index] = .5
    index, = numpy.where((P1 != 0.0) + (P2 != 0.0))
    delta[index] = p*P1[index]/(p*P1[index] + (1.0-p)*P2[index])
    # M-step
    l1 = K*numpy.sum(delta)/(ad*numpy.sum((Dk**d)*delta))
    l2 = K*numpy.sum(1.0-delta)/(ad*numpy.sum((Dk**d)*(1.0-delta)))
    p = numpy.sum(delta)/n
    return delta, l1, l2, p