In [3]:
import numpy as np
from numpy.linalg import inv

class Kalman():
    '''Implements a Bayesian Kalman filter with a 1D observation dimension, following the
       probabilistic approach of Murphy, Sec. 18.3.1 (p. ~641).
       dim_mu is the state dimension, dim_z the observation dimension, and dim_u the control dimension.
    
    Attributes
    ----------
    mu : numpy.array(dim_mu, 1)
        State estimate vector
        
    sigma : numpy.array(dim_mu, dim_mu)
        State covariance matrix
    
    A : numpy.array(dim_mu, dim_mu)
        State transition matrix
    
    B : numpy.array(dim_mu, dim_u)
        Control transition matrix
    
    C : numpy.array(dim_z, dim_mu)
        Measurement (observation) matrix
    
    D : numpy.array(dim_z, dim_u)
        Control observation matrix
    
    R : numpy.array(dim_z, dim_z)
        Measurement noise covariance matrix
        
    Q : numpy.array(dim_mu, dim_mu)
        Process noise covariance matrix
        
    S : numpy.array(dim_z, dim_z)
        Innovation covariance (C sigma C^T + R). Initialized to R.
        
    '''
    
    def __init__(self, mu_0, sigma_0, A, B, C, D, Q, R):
        '''
        Parameters
        ----------
        mu_0 : numpy.array(dim_mu, 1)
            Initial state estimate vector

        sigma_0 : numpy.array(dim_mu, dim_mu)
            Initial covariance matrix

        A : numpy.array(dim_mu, dim_mu)
            State transition matrix

        B : numpy.array(dim_mu, dim_u)
            Control transition matrix

        C : numpy.array(dim_z, dim_mu)
            Measurement (observation) matrix

        D : numpy.array(dim_z, dim_u)
            Control observation matrix

        R : numpy.array(dim_z, dim_z)
            Measurement noise covariance matrix

        Q : numpy.array(dim_mu, dim_mu)
            Process noise covariance matrix
        '''
        self.A = A   # Parameter matrix A 
        self.B = B   # Parameter matrix B 
        self.C = C   # Parameter matrix C 
        self.D = D   # Parameter matrix D
        self.Q = Q   # Process noise covariance matrix 
        self.R = R   # Observation noise covariance matrix
        self.S = self.R # Innovation covariance estimate; initialized to R 
        self.mu = mu_0 # Initial state estimate 
        self.sigma = sigma_0 # Initial state covariance 
        
        
    def predict(self, u=None): 
        '''Prediction step (Murphy Sec. 18.3.1.1).'''
        
        # Predicted state covariance 
        self.sigma = np.dot(np.dot(self.A, self.sigma), self.A.T) + self.Q
        
        # If there is no control input, do not include it 
        if u is None:
            self.mu = np.dot(self.A, self.mu)  # Predict state mean 
        else:
            self.mu = np.dot(self.A, self.mu) + np.dot(self.B, u)

    
        
    def update(self, Y):
        '''
        Add a new measurement Y to the Kalman filter (Murphy Sec. 18.3.1.2).
        
        Parameters
        ----------
        Y : np.array(dim_z)
            Measurement for this update. Y can be a scalar if dim_z is 1,
            otherwise it must be a column vector.

        '''
        
        self.y = np.dot(self.C, self.mu) # Posterior predictive mean 
        
        r = Y - self.y # Residual (innovation) 
        S = np.dot(np.dot(self.C, self.sigma), self.C.T) + self.R # Innovation covariance 
        K = np.dot(np.dot(self.sigma, self.C.T), inv(S)) # Kalman gain 
        
        # Correct the state mean and covariance (Joseph-form covariance update) 
        self.mu = self.mu + np.dot(K, r)
        I_KC = np.identity(len(self.mu)) - np.dot(K, self.C)
        self.sigma = np.dot(np.dot(I_KC, self.sigma), I_KC.T) + np.dot(np.dot(K, self.R), K.T)
        
        # Update the class attribute values 
        self.K = K 
        self.S = S
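
As a quick sanity check, the class can be exercised on synthetic data before touching the sensor time series. Below is a minimal sketch with made-up matrices (a 1D state with identity dynamics, observed directly), not the snowdepth model used later; the posterior mean should settle near the true value.


In [ ]:
np.random.seed(0)

# Synthetic scalar observations: a constant signal plus unit Gaussian noise
true_value = 5.0
Z = true_value + np.random.normal(0., 1., size=200)

# 1D state: x_t = x_{t-1} + w_t, observed as y_t = x_t + v_t
kf = Kalman(mu_0=np.array([0.]), sigma_0=np.array([[10.]]),
            A=np.array([[1.]]), B=np.zeros((1, 1)),
            C=np.array([[1.]]), D=None,
            Q=np.array([[1e-4]]), R=np.array([[1.]]))

for z in Z:
    kf.predict()
    kf.update(z)

print kf.mu  # should be close to true_value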

Estimate the observation noise for each sensor. Noise is Gaussian, but definitely not the same for all sensors. The bi-modal characteristics of sensor 9 are not an obvious shift in the baseline, but the time series does not look like pure white noise.


In [4]:
def EstimateObservationNoise(sensor_number, start_obs, end_obs, plot=False):
    '''
    Estimate the Gaussian observation noise over an observation window: subtract the mean and estimate the variance. NaNs are omitted.
    
    Parameters
    ----------
    sensor_number : integer
        integer index of sensor in pandas dataframe.  
        
    start_obs : integer
        integer index of starting observation
        
    end_obs : integer
        integer index of ending observation
        
    plot : bool (default False)
        If True, plot the histogram and Gaussian estimator. 
    
    Returns
    ----------
    variance : float
        Variance of the observation set. 
    '''
    
    df = pd.read_pickle('../output/cluster_0_cleaned.pickle')
    # Observations 
    Y = df['snowdepth_%i'%sensor_number].values
    obs = Y[start_obs:end_obs] 
    obs = obs[~np.isnan(obs)]
    obs -= np.mean(obs)
    var = np.std(obs)**2
    bins = np.linspace(-100,100,50)
    
    if plot: 
        plt.hist(obs, bins=bins, histtype='step', normed=True, label='Sensor %i'%sensor_number)
        gaus = np.exp(-bins**2/(2*var))
        gaus = gaus / (np.sum(gaus)*(bins[1]-bins[0])) # Normalize by bin width to match the density histogram
        #plt.plot(bins, gaus)
        plt.legend(frameon=False)
        plt.xlabel('Scatter [mm]')
    
    return var



# What range of observations are we using to estimate the noise? 
start_obs=2000
stop_obs=3000

for i_sensor in range(1,11):
    plt.subplot(211)
    if i_sensor > 5: 
        plt.subplot(212)
    EstimateObservationNoise(i_sensor, start_obs, stop_obs, plot=True)
plt.text(.05, .82, 'Obs:%i-%i'%(start_obs, stop_obs), transform=plt.gca().transAxes)
plt.show()

# Uncomment to plot sensor 9 over the time period of interest. 
# df = pd.read_pickle('../output/cluster_0_cleaned.pickle')
# Y = df['snowdepth_%i'%9].values
# plt.plot(range(start_obs, stop_obs),Y[start_obs:stop_obs])



In [ ]:


In [38]:
import pandas as pd
import time 

def FilterSnowdepth(sensor_number, obs_noise, system_noise, outlier_threshold=2e3):
    '''
    Run a forward Kalman filter through a single snowdepth sensor time series.
    
    Parameters
    ----------
    sensor_number : integer
        integer index of sensor in pandas dataframe.  
        
    obs_noise : np.array(dim_z, dim_z)
        Observation noise covariance matrix; a scalar if dim_z=1 (the observation dimension).
    
    system_noise : np.array(dim_mu, dim_mu)
        System (process) noise covariance matrix; dim_mu is the state dimension.
        
    outlier_threshold : float
        Distance threshold in mm before a point is considered an outlier and excluded from
        the measurement update. This needs to be large enough that real baseline shifts are not omitted.
    
    Returns 
    ----------
    mu_list : np.array(N_obs, dim_mu)
        Updated state vector at each observation.
    
    sigma_list : np.array(N_obs)
        Innovation (predicted observation) covariance at each observation.
    
    '''
    
    df = pd.read_pickle('../output/cluster_0_cleaned.pickle')
    # Observations 
    Y = df['snowdepth_%i'%sensor_number].values
    
    # Estimate the initial baseline using the median of observations 2000-3000, excluding NaNs
    baseline = np.median(Y[2000:3000][~np.isnan(Y[2000:3000])])

    # Label the state parameters. 
    state_params=['depth_1', 'velocity_1', 'baseline_1']
    
    # First observation that is not NaN 
    Y0 = Y[~np.isnan(Y)][0]
    sigma_0 = np.diag((50, 10, 10))
    mu_0 = np.array([-Y0+baseline, 0., baseline]) # Initial depth from the first observation, zero velocity, median baseline 
    dt = .25 # 15 minute intervals.  Velocity in mm/hr
    
    # Transition Matrix 
    A = np.array([[1, dt, 0],
                  [0,  1, 0],
                  [0,  0, 1]])

    # Control Model 
    B = np.zeros((len(mu_0),len(mu_0)))

    # Observation Matrix: predicted measurement = -depth + baseline
    # (the sensor reports the distance down to the snow surface)
    C = np.array([[-1, 0, +1],]) 

    # Process noise.
    Q = system_noise
    
    # Observation Noise
    R = obs_noise

    # For now, no control input 
    u = None
    D = None 

    kf = Kalman(mu_0, sigma_0, A, B, C, D, Q, R)

    sigma_list  = np.zeros(len(Y)) 
    mu_list = np.zeros((len(Y),len(mu_0)))
    #diffs = np.zeros(len(Y)) # Debugging 
    
    # Initial values
    mu_list[0,:] = mu_0
    sigma_list[0] = obs_noise
    
    print 'Processing sensor %i'%sensor_number
    start = time.time()
    for i_obs in range(1, len(Y)):
        kf.predict()
        
        # Only update the state if we have a valid measurement 
        # and it is not an obvious outlier (the default threshold is a change of >2 meters)
        
        difference = np.abs(Y[i_obs] - np.dot(kf.C, kf.mu))
        # diffs[i_obs] = difference # Debugging 
        if not np.isnan(Y[i_obs]) and (difference < outlier_threshold):
            kf.update(Y[i_obs])
        
        mu_list[i_obs] = kf.mu
        sigma_list[i_obs] = kf.S
                
        if (i_obs%500)==0:
            print '\r Forward pass on observation %i of %i'%(i_obs,len(Y)), 
    
    print '\n Completed Forward Pass in %1.2f s'%(time.time()-start)
    return mu_list, sigma_list
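
The model encoded by A and C is worth spelling out: the state is (depth, velocity, baseline); depth integrates velocity, the baseline is (nearly) constant, and the predicted measurement is baseline minus depth, since the ultrasonic sensor reports the distance down to the snow surface. As a sanity check, here is a minimal sketch on simulated data (assumed snowfall rate and noise levels, not real sensor values):


In [ ]:
np.random.seed(1)

dt = .25 # 15 minute intervals
A = np.array([[1, dt, 0],
              [0,  1, 0],
              [0,  0, 1]])
C = np.array([[-1, 0, +1],])

# Simulate a constant 2 mm/hr snowfall on a 3000 mm baseline
N = 400
truth = np.zeros((N, 3))
truth[0] = [0., 2., 3000.]
for t in range(1, N):
    truth[t] = np.dot(A, truth[t-1])
Y_sim = np.dot(truth, C.T).ravel() + np.random.normal(0., 10., N)

kf = Kalman(mu_0=truth[0].copy(), sigma_0=np.diag((50., 10., 10.)),
            A=A, B=np.zeros((3, 3)), C=C, D=None,
            Q=np.diag((1e0, 1e-2, 1e-3)), R=np.array([[100.]]))
for y in Y_sim[1:]:
    kf.predict()
    kf.update(y)

print kf.mu  # should be near (final depth ~200 mm, 2 mm/hr, 3000 mm)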

In [47]:
results = []
for i_sensor in range(1,11): 
    # Estimate the observation variance initially. 
    start_obs=2000
    stop_obs=3000
    obs_noise = EstimateObservationNoise(i_sensor, start_obs, stop_obs)
    # Run the Kalman Filter
    results.append(FilterSnowdepth(i_sensor, obs_noise=obs_noise, system_noise=np.diag((1e0,1e-2,1e-3))))


Processing sensor 1
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 2.32 s
Processing sensor 2
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 1.38 s
Processing sensor 3
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 2.22 s
Processing sensor 4
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 1.81 s
Processing sensor 5
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 2.07 s
Processing sensor 6
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 1.96 s
Processing sensor 7
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 2.15 s
Processing sensor 8
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 1.98 s
Processing sensor 9
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 1.95 s
Processing sensor 10
 Forward pass on observation 17500 of 17569 
 Completed Forward Pass in 1.22 s

In [48]:
plt.figure(figsize=(6,8))
for i_sensor, (mu, sigma,) in enumerate(results):
    # This sensor went out after not very long
#     if i_sensor==1: 
#         continue
    
    
    plt.subplot(311)
    # Plot the snowdepth 
    plt.plot(mu[:,0], label='Sensor %i'%(i_sensor+1))
    #plt.plot(mu[:,0]+sigma, label='Sensor %i'%(i_sensor+1), color='firebrick')
    #plt.plot(mu[:,0]-sigma, label='Sensor %i'%(i_sensor+1), color='firebrick')
    plt.xlabel('Observation')
    plt.ylabel('Snowdepth $d$ [mm]')
    plt.grid(linestyle='-', alpha=.15)
    #plt.legend(loc=2, ncol=2, frameon=False, columnspacing=.2, labelspacing=.2)
    # Zoom in on a short window for closer inspection (full range would be ylim(-100, 3000))
    plt.ylim(600, 750)
    plt.xlim(14400, 14800)
    
    df = pd.read_pickle('../output/cluster_0_cleaned.pickle')
    # Raw observations from sensor 4, for comparison against the filtered tracks
    Y = df['snowdepth_4'].values
    
    # Estimate the baseline the same way the filter does: median of observations 2000-3000, excluding NaNs
    baseline = np.median(Y[2000:3000][~np.isnan(Y[2000:3000])])
    plt.scatter(range(len(Y)), -Y+baseline, s=.5, color='k', marker='o')
    # Can also plot uncertainty bands, though these are not great yet.
    plt.fill_between(range(len(mu[:,0])), mu[:,0]-np.sqrt(sigma), mu[:,0]+np.sqrt(sigma), alpha=.1, color='m')
    
    
    # -------------------------------
    # Plot the velocity parameter
    plt.subplot(312)
    plt.plot(mu[:,1], alpha=.3)
    plt.xlabel('Observation')
    plt.ylabel(r'$\dot{d}$ [mm/hr]')
    plt.ylim(-10,25)
    plt.grid(linestyle='-', alpha=.15)
        
    # -------------------------------
    # Plot the baseline
    plt.subplot(313)
    plt.plot(mu[:,2], alpha=.7)
    plt.xlabel('Observation')
    plt.ylabel('$b(t)$ [mm]')
    plt.ylim(3.5e3,4.5e3)
    plt.grid(linestyle='-', alpha=.15)


Step Function Control Input


In [259]:
def GenerateStepControl(Y, window_size):
    '''
    Return the convolution of a step kernel (window_size samples of -1 followed by
    window_size samples of +1, normalized by the kernel length) with the observation array Y.
    '''
    step = np.zeros(2*window_size)
    step[:window_size] = -1
    step[window_size:] = 1
    return np.convolve(step, Y, mode='full')/(2*window_size)

window_size = 50


plt.figure(figsize=(10,8))
steps = []
for i_sensor in range(1,11):
    
    
    # Note this is the cleaned and filled dataset.
    df = pd.read_pickle('../output/cluster_0_cleaned_filled.pickle')
    Y = df['snowdepth_%i'%(i_sensor)].values
    plt.subplot(311)
    plt.plot(Y)
    #plt.ylim(2500,3500)
    
    conv = GenerateStepControl(Y, window_size=window_size)
    steps.append(conv)
    plt.subplot(312)
    plt.plot(conv, label='Sensor %i'%i_sensor)
    plt.ylabel('Step Response')
    plt.ylim(-200,200)

plt.subplot(313)
# Calculate the median and median absolute deviation (MAD) 
steps_med = np.median(steps, axis=0)
steps_MAD = np.median(np.abs((steps-steps_med)), axis=0)

# steps_med = np.mean(steps, axis=0)
# steps_MAD = np.std(steps, axis=0)


for i_sensor in range(1,11):
    z_score = (steps[i_sensor-1]-steps_med) / steps_MAD
    # Shift by window_size so the response lines up with the observation index where the sensor was moved.
    plt.plot(np.arange(len(z_score))+window_size, z_score, alpha=1, label='Sensor %i'%i_sensor)
plt.ylabel('Modified Z-Score')
plt.legend(loc=9, ncol=3, columnspacing=.2, labelspacing=.2, frameon=False)
plt.xlim(0,18000)
#plt.ylim(0,50)


Out[259]:
(0, 18000)
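
The convolution above acts as a matched filter for a step: window_size samples of -1 followed by window_size samples of +1, normalized by the kernel length. Against an idealized step of height h, the normalized response peaks at magnitude h/2 about window_size samples after the step (hence the shift applied when plotting the z-scores above); an upward step produces a negative excursion, a downward step a positive one, and the mode='full' zero-padding produces spurious responses at the array edges. A quick check with a hypothetical step:


In [ ]:
w = 50
h = 100.
sig = np.concatenate([np.zeros(500), h*np.ones(500)]) # upward step of height h at index 500
resp = GenerateStepControl(sig, window_size=w)
print resp.min()       # -h/2 = -50: the response extreme is half the step height
print np.argmin(resp)  # 549 = step index + w - 1, hence the shift when plotting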

Deeper into velocities: estimating relative snowfall rates for each sensor.


In [ ]:
bins=np.linspace(0,3,31) # Velocity bins
v_threshold = 3. # Only include velocity ratios when both velocities are above this 


bin_centers = 0.5*(bins[:-1] + bins[1:])
histograms = np.zeros((len(results), len(results), len(bins)-1))
median_ratios = np.zeros((len(results), len(results)))

for i_sensor, (mu_i, sigma_i,) in enumerate(results):
    # Get the velocities of the first set.
    v_i = mu_i[:,1]
    
    for j_sensor, (mu_j, sigma_j,) in enumerate(results):
        if i_sensor == j_sensor: 
            continue
        # Get the velocities of the second set.
        v_j = mu_j[:,1] 
        
        valid = np.where((v_i > v_threshold) & (v_j > v_threshold))[0] # Only positive velocities, since we are looking at snowfall rather than melt
        
        ratios = (v_i[valid]/v_j[valid])
        
        median_ratios[i_sensor, j_sensor] = np.median(ratios[~np.isnan(ratios)]) 
        
        hist, bin_edges = np.histogram(ratios, bins=bins, normed=True)
        histograms[i_sensor, j_sensor] = hist
    
    median_ratios[i_sensor, np.isnan(median_ratios[i_sensor])] = 1.
np.set_printoptions(precision=3)
print median_ratios

In [348]:
plt.figure(figsize=(10,20))

for i_sensor in range(len(results)):
    for j_sensor, hist in enumerate(histograms[i_sensor]):
        #print len(hist)
        if j_sensor < 5:
            plt.subplot(len(results),2,2*i_sensor+1)
        else:
            plt.subplot(len(results),2,2*i_sensor+2)
        plt.step(bin_centers, hist, alpha=.75, label='i,j = %i, %i'%(i_sensor+1, j_sensor+1))
        
        plt.legend(frameon=False,fontsize=7, labelspacing=.2)
        plt.xlabel('$v_i/v_j$')


Same as above, but for snowmelt.


In [358]:
#bins=np.linspace(0,3,50) # Velocity bins
v_threshold = 2. # Only include velocity ratios when both velocities are below this 


bin_centers = 0.5*(bins[:-1] + bins[1:])
histograms = np.zeros((len(results), len(results), len(bins)-1))
median_ratios = np.zeros((len(results), len(results)))

for i_sensor, (mu_i, sigma_i,) in enumerate(results):
    # Get the velocities of the first set.
    v_i = mu_i[:,1]
    
    for j_sensor, (mu_j, sigma_j,) in enumerate(results):
        if i_sensor == j_sensor: 
            continue
        # Get the velocities of the second set.
        v_j = mu_j[:,1] 
        
        valid = np.where((v_i < v_threshold) & (v_j < v_threshold))[0] # Negative velocities correspond to melt
        
        ratios = (v_i[valid]/v_j[valid])
        median_ratios[i_sensor, j_sensor] = np.median(ratios[~np.isnan(ratios)]) 
        
        hist, bin_edges = np.histogram(ratios, bins=bins, normed=True)
        histograms[i_sensor, j_sensor] = hist

np.set_printoptions(precision=3)
print median_ratios


[[ 0.    -0.78   0.759  0.589  0.849  0.644  0.924  0.78   0.688  0.749]
 [-0.552  0.    -0.38  -0.372 -0.491 -0.301 -0.471 -0.402 -0.422 -0.983]
 [ 0.894 -0.735  0.     0.606  0.827  0.694  0.977  0.83   0.711  0.753]
 [ 0.52  -0.492  0.496  0.     0.457  0.435  0.565  0.35   0.313  0.445]
 [ 0.889 -0.636  0.833  0.584  0.     0.707  0.972  0.791  0.652  0.826]
 [ 1.012 -0.811  0.872  0.815  0.95   0.     1.016  0.916  0.791  0.866]
 [ 0.772 -0.683  0.716  0.559  0.745  0.659  0.     0.749  0.652  0.72 ]
 [ 0.797 -0.463  0.725  0.47   0.842  0.724  0.831  0.     0.785  0.669]
 [ 0.96  -0.712  0.828  0.35   0.964  0.759  0.911  1.083  0.     0.863]
 [ 0.783 -1.017  0.6    0.464  0.725  0.551  0.792  0.745  0.625  0.   ]]

In [347]:
plt.figure(figsize=(10,20))

for i_sensor in range(len(results)):
    for j_sensor, hist in enumerate(histograms[i_sensor]):
        #print len(hist)
        if j_sensor < 5:
            plt.subplot(len(results),2,2*i_sensor+1)
        else:
            plt.subplot(len(results),2,2*i_sensor+2)
        plt.step(bin_centers, hist, alpha=.75, label='i,j = %i, %i'%(i_sensor+1, j_sensor+1))
        
        plt.legend(frameon=False,fontsize=7, labelspacing=.2)
        plt.xlabel('$v_i/v_j$')



In [ ]: