In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the CSV
df = pd.read_csv('../raw_data/example_data.csv')
print(df.columns)
print()
print(df.head(10))
In [2]:
# Plot all the time series.
plt.figure(figsize=(10, 4))
for i in range(1, 11):
    plt.plot(df['snowdepth_%i' % i].values, linewidth=.5)
plt.ylim(0, 5000)
plt.ylabel('Sensor Dist [mm]')
plt.xlabel('Observation')
Out[2]:
[Figure: raw sensor-distance time series for all ten sensors]
This data is plagued by outliers. Fortunately, the outlier deviations lie well above the noise level. We construct a simple outlier-detection filter based on a moving median and the median absolute deviation (MAD), which is much more robust than a filter based on the mean squared error and variance. For a window size $n$, an observation's outlier statistic is the modified z-score,
$$ z(x_i) = \frac{|x_i - \mathrm{med}_j(x_j)|}{\mathrm{med}_j\left(|x_j - \mathrm{med}_j(x_j)|\right)} $$
for $j \in (i - n/2), \ldots, (i + n/2)$, where $\mathrm{med}_j$ denotes the median of its argument over the window index $j$.
Our outlier detection algorithm then has two parameters: the window size $n$ and the threshold on the modified z-score. Observations whose score exceeds the threshold are set to NaN.
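For intuition, here is a minimal vectorized sketch of the same statistic using pandas rolling windows; this is an illustrative alternative, not the implementation used below, and the toy series and window size are made up.

import numpy as np
import pandas as pd

def rolling_modified_z(series, window):
    # Moving median, centered on each observation
    med = series.rolling(window, center=True, min_periods=1).median()
    # Median absolute deviation within each window
    mad = series.rolling(window, center=True, min_periods=1).apply(
        lambda w: np.median(np.abs(w - np.median(w))), raw=True)
    return (series - med).abs() / mad

# A lone spike at index 3 is flagged at a threshold of 3 MADs
s = pd.Series([1.0, 1.1, 0.9, 50.0, 1.0, 1.2, 0.8])
z = rolling_modified_z(s, window=5)
print(s.where(z <= 3))  # index 3 becomes NaN

The rolling `.apply` is slow for long series, which is one reason to parallelize the explicit per-observation version below.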
In [3]:
def GetMAD_Obs(i, x, window_size):
    '''
    Return the modified z-score of observation i over a moving window.

    Parameters
    ----------
    i : int
        observation index
    x : 1D array
        observation list
    window_size : int
        how many observations are included on each side of the target observation.

    Returns
    -------
    mod_z_score : float
        Modified z-score in median absolute deviations.
    '''
    x = np.asarray(x)
    # If our observation is a NaN, just return NaN
    if np.isnan(x[i]):
        return x[i]
    n_obs = x.shape[0]
    start_i = np.max((0, i - window_size))    # Don't let start_i go negative
    end_i = np.min((i + window_size, n_obs))  # i < n_obs
    # We also need to remove NaN values to compute the median
    obs_set = x[start_i:end_i][np.logical_not(np.isnan(x[start_i:end_i]))]
    median = np.median(obs_set)
    MAD = np.median(np.abs(obs_set - median))
    mod_z_score = np.abs(x[i] - median) / MAD
    return mod_z_score
from multiprocessing import Pool
from functools import partial

def GetMad(x, window_size, threshold):
    '''
    Return a copy of the observation list with outliers (by moving-window
    modified z-score) set to NaN.

    Parameters
    ----------
    x : 1D array
        observation list
    window_size : int
        how many observations are included on each side of the target observation.
    threshold : float
        modified z-score threshold for outliers

    Returns
    -------
    new_obs : float array
        Observations with outliers replaced by NaN.
    '''
    # Multiprocess because, why not?
    p = Pool()
    func = partial(GetMAD_Obs, x=x, window_size=window_size)
    mod_z = np.array(p.map(func, range(len(x))))
    p.close()
    new_obs = x.copy()
    new_obs[mod_z > threshold] = np.nan  # set outliers to NaN
    return new_obs
In [4]:
plt.figure(figsize=(10, 10))
df_clean = df.copy()
for i in range(1, 11):
    plt.subplot(5, 2, i)
    cleaned = GetMad(df['snowdepth_%i' % i].values[:], window_size=600, threshold=3)
    plt.plot(df['snowdepth_%i' % i].values, linewidth=.5, alpha=.4)
    df_clean['snowdepth_%i' % i] = cleaned
    plt.plot(df_clean['snowdepth_%i' % i], linewidth=.5)
    plt.ylim(0, 5000)
    plt.ylabel('Sensor Distance [mm]')
    if i > 8:
        plt.xlabel('Observation')
In [6]:
def nan_helper(y):
    """Helper to handle indices and logical indices of NaNs.

    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices = index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x = nan_helper(y)
        >>> y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    """
    return np.isnan(y), lambda z: z.nonzero()[0]

def InterpolateNaNs(y):
    """Fill missing values with interpolation.

    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - new, interpolated array
    """
    nans, x = nan_helper(y)
    new = y.copy()
    new[nans] = np.interp(x(nans), x(~nans), y[~nans])
    return new

df_clean_filled = df_clean.copy()
for i in range(1, 11):
    df_clean_filled['snowdepth_%i' % i] = InterpolateNaNs(df_clean_filled['snowdepth_%i' % i].values)

plt.plot(df_clean_filled['snowdepth_1'], label='Clean and Interpolated', alpha=1, color='firebrick')
plt.plot(df_clean['snowdepth_1'], label='Clean', color='steelblue')
df_clean.to_pickle('../output/cluster_0_cleaned.pickle')
df_clean_filled.to_pickle('../output/cluster_0_cleaned_filled.pickle')
plt.legend(frameon=False)
Out[6]:
[Figure: sensor 1, cleaned series (steelblue) overlaid with cleaned and interpolated series (firebrick)]
In [11]:
from scipy import signal

plt.figure(figsize=(10, 10))
for i in range(6, 11):
    for j in range(i, 11):
        set1 = df_clean_filled['snowdepth_%i' % i].values
        set2 = df_clean_filled['snowdepth_%i' % j].values
        set1_no_clean = np.nan_to_num(df['snowdepth_%i' % i].values)
        set2_no_clean = np.nan_to_num(df['snowdepth_%i' % j].values)
        # Normalize the input vectors
        a = (set1 - np.mean(set1)) / (np.std(set1) * len(set1))
        v = (set2 - np.mean(set2)) / np.std(set2)
        corr = signal.correlate(a, v, mode='same')
        max_corr = np.max(corr)                            # Find maximal correlation
        max_corr_position = np.argmax(corr) - len(a) // 2  # Find time lag for maximal cross-correlation
        a_no_clean = (set1_no_clean - np.mean(set1_no_clean)) / (np.std(set1_no_clean) * len(set1_no_clean))
        v_no_clean = (set2_no_clean - np.mean(set2_no_clean)) / np.std(set2_no_clean)
        corr_no_clean = signal.correlate(a_no_clean, v_no_clean, mode='same')
        max_corr_no_clean = np.max(corr_no_clean)          # Find maximal correlation
        # Plotting
        plt.subplot2grid((5, 5), (i - 6, j - 6))
        plt.plot((np.arange(len(corr)) - len(a) / 2.) / 4., corr)
        plt.plot((np.arange(len(corr_no_clean)) - len(a_no_clean) / 2.) / 4., corr_no_clean,
                 color='firebrick', linestyle='-')
        plt.xlim(-500, 500)
        plt.vlines(0, 0, 1, linestyle='--', alpha=.4)  # mark zero lag
        plt.text(.05, .1, 'max=%1.3f' % max_corr, transform=plt.gca().transAxes, fontsize=20)
        #plt.text(.05, .1, 'lag=%1.2f hr' % (max_corr_position / 4.), transform=plt.gca().transAxes, fontsize=20)
        plt.ylim(0, 1)
We see from the above that the sensors are very highly correlated at zero time lag. Using the correlation and the lag (the time shift that maximizes the cross-correlation), we have (i) a powerful way to find faulty sensors, and (ii) a correlation coefficient that may be useful as input to our Kalman filter process model.
It is obvious here, for example, that sensor 10 is failing or improperly placed.
We are also not limited to this sensor cluster alone; we can use similar methods to detect correlated sensors from other nearby clusters.
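For reference, the lag itself can be read off the cross-correlation the same way. A minimal sketch with synthetic data; the helper name is hypothetical, and the 0.25 hr sample spacing is an assumption matching the fftfreq calls used later:

import numpy as np
from scipy import signal

def max_corr_and_lag(sig1, sig2, dt_hours=0.25):
    # Normalize so a perfectly correlated pair peaks near 1
    a = (sig1 - np.mean(sig1)) / (np.std(sig1) * len(sig1))
    v = (sig2 - np.mean(sig2)) / np.std(sig2)
    corr = signal.correlate(a, v, mode='same')
    # Offset of the peak from the zero-lag sample, converted to hours
    lag_hours = (np.argmax(corr) - len(a) // 2) * dt_hours
    return np.max(corr), lag_hours

# Example: a noisy signal and a copy of it delayed by 8 samples (2 hr)
t = np.arange(2000)
base = np.sin(2 * np.pi * t / 400.) + 0.1 * np.random.randn(len(t))
delayed = np.roll(base, 8)
print(max_corr_and_lag(delayed, base))  # approximately (1.0, 2.0)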
In order to get a handle on the noise characteristics, let's decompose the time series and look at correlations across frequency bands. Hopefully, there is a clear frequency above which the correlation coefficient drops to approximately zero, indicating white noise.
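The intuition behind this check: the physical snow signal is shared between sensors, while instrument noise is not, so band-limited correlations should drop once we enter noise-dominated bands. A quick illustration with synthetic data (purely illustrative, not sensor data):

import numpy as np
from scipy import signal

rng = np.random.default_rng(0)
shared = np.cumsum(rng.standard_normal(4000))   # slowly varying shared "signal"
s1 = shared + rng.standard_normal(4000)         # independent white noise per sensor
s2 = shared + rng.standard_normal(4000)

def zero_lag_corr(x, y):
    a = (x - np.mean(x)) / (np.std(x) * len(x))
    v = (y - np.mean(y)) / np.std(y)
    return signal.correlate(a, v, mode='same')[len(x) // 2]

print(zero_lag_corr(s1, s2))                    # high: the shared signal dominates
print(zero_lag_corr(s1 - shared, s2 - shared))  # ~0: independent noise decorrelates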
In [62]:
def FFT_High_Pass(x, low_freq_cutoff, plot=False, low_pass=False):
    """Run a signal through a high-pass filter. Timestep is assumed to be 15 min.

    Input:
        - x: 1d numpy array with input signal
        - low_freq_cutoff: Frequency cut in inverse hours
        - plot: If True, plot the power spectrum
        - low_pass: If True, run a low-pass filter instead of high-pass.
    Output:
        - signal: The processed signal
    """
    fft_coeff = np.fft.fft(x)
    freq = np.fft.fftfreq(len(x), .25)
    power = (fft_coeff * np.conjugate(fft_coeff)).real
    cutoff_index = np.argmin(np.abs(low_freq_cutoff - freq))
    if not low_pass:
        fft_coeff[:cutoff_index] = 0
    else:
        fft_coeff[cutoff_index:] = 0
    bandpassed_signal = np.fft.ifft(fft_coeff).real
    if plot:
        power_filtered = (fft_coeff * np.conjugate(fft_coeff)).real
        plt.loglog(freq, power)
        plt.loglog(freq, power_filtered)
        plt.xlabel('Frequency [Hr$^{-1}$]')
        plt.ylabel(r'Power Spectrum $\langle a_i|a_i \rangle$')
        plt.show()
    return bandpassed_signal

def FFT_Band_Pass(x, freq_range, plot=False):
    """Run a signal through a band-pass filter. Timestep is assumed to be 15 min.

    Input:
        - x: 1d numpy array with input signal
        - freq_range: tuple
            Frequency range in inverse hours
        - plot: If True, plot the power spectrum
    Output:
        - signal: The processed signal
    """
    fft_coeff = np.fft.fft(x)
    freq = np.fft.fftfreq(len(x), .25)
    power = (fft_coeff * np.conjugate(fft_coeff)).real
    cutoff_index_low = np.argmin(np.abs(freq_range[0] - freq))
    cutoff_index_high = np.argmin(np.abs(freq_range[1] - freq))
    fft_coeff[:cutoff_index_low] = 0
    fft_coeff[cutoff_index_high:] = 0
    bandpassed_signal = np.fft.ifft(fft_coeff).real
    if plot:
        power_filtered = (fft_coeff * np.conjugate(fft_coeff)).real
        plt.loglog(freq, power)
        plt.loglog(freq, power_filtered)
        plt.xlabel('Frequency [Hr$^{-1}$]')
        plt.ylabel(r'Power Spectrum $\langle a_i|a_i \rangle$')
        plt.show()
    return bandpassed_signal

def High_Pass_Cross_Correlation(x, y):
    """Run each of two input signals through a band-pass filter and calculate
    the cross-correlation, scanning over frequency bands.

    Input:
        - x: 1d numpy array with input signal 1
        - y: 1d numpy array with input signal 2
    Output:
        - scan_frequencies: 1d array with band-edge frequencies in Hr^-1
        - xcorr: normalized maximal cross-correlation coefficient per band
    """
    scan_frequencies = np.logspace(-2, np.log10(2), 10)
    xcorr = []
    for i_freq in range(len(scan_frequencies) - 1):
        freq_range = [scan_frequencies[i_freq], scan_frequencies[i_freq + 1]]
        sig1 = FFT_Band_Pass(x, freq_range, plot=False)
        sig2 = FFT_Band_Pass(y, freq_range, plot=False)
        # Normalize the input vectors
        a = (sig1 - np.mean(sig1)) / (np.std(sig1) * len(sig1))
        v = (sig2 - np.mean(sig2)) / np.std(sig2)
        corr = signal.correlate(a, v, mode='same')
        max_corr = np.max(corr)  # Find maximal correlation
        xcorr.append(max_corr)
    return scan_frequencies, xcorr

corr_list = []
for i in range(1, 11):
    corr_list.append([])
    for j in range(i + 1, 11):
        print(i, j)
        sig1 = df_clean_filled['snowdepth_%i' % i].values
        sig2 = df_clean_filled['snowdepth_%i' % j].values
        scan_frequencies, xcorr = High_Pass_Cross_Correlation(sig1, sig2)
        corr_list[i - 1].append(xcorr)

import pickle
pickle.dump(corr_list, open('../output/bandpassed_cross_correlations.pickle', 'wb'))
In [18]:
import pickle
corr_list = pickle.load(open('../output/bandpassed_cross_correlations.pickle', 'rb'))
scan_frequencies = np.logspace(-2, np.log10(2), 10)
scan_frequencies_centers = [np.exp(0.5 * (np.log(scan_frequencies[i]) + np.log(scan_frequencies[i + 1])))
                            for i in range(len(scan_frequencies) - 1)]
plt.figure(figsize=(10, 12))
for i in range(1, 11):
    plt.subplot(5, 2, i)
    for j, xcorr in enumerate(corr_list[i - 1]):
        plt.plot(scan_frequencies_centers, xcorr, label=r'$%i \times %i$' % (i, j + i + 1),
                 marker='o', markersize=2)
    plt.xscale('log')
    plt.xlabel('Frequency Band [Hr$^{-1}$]')
    plt.ylabel('Cross-Correlation')
    plt.legend(loc=1, frameon=False, labelspacing=.2, fontsize=8)
plt.subplots_adjust(hspace=.3)
In [95]:
def Power_Spectrum(x, plot=False):
    """Return the power spectrum of the input signal.

    Input:
        - x: 1d numpy array with input signal
        - plot: If True, plot the power spectrum
    Output:
        - freq: 1D array with frequencies in inverse hours
        - power: Power spectrum
    """
    fft_coeff = np.fft.fft(x)
    freq = np.fft.fftfreq(len(x), .25)
    power = (fft_coeff * np.conjugate(fft_coeff)).real
    if plot:
        plt.loglog(freq, power)
        plt.xlabel('Frequency [Hr$^{-1}$]')
        plt.ylabel(r'Power Spectrum $\langle a_i|a_i \rangle$')
        plt.show()
    return freq, power

def Power_Spectral_Ratio(x, split_freq=.2):
    """Return the ratio of spectral power above/below a split frequency.

    Input:
        - x: 1d numpy array with input signal
        - split_freq: Frequency in hr^-1 that defines the split between high/low
    Output:
        - ratio: Power spectral ratio above/below the split frequency.
    """
    freq, power = Power_Spectrum(x)
    split = np.argmin(np.abs(freq - split_freq))
    low_power, high_power = np.sum(power[:split]), np.sum(power[split:])
    return high_power / low_power

def Cross_Correlation(sig1, sig2):
    """Return the normalized zero-lag cross-correlation.

    Input:
        - sig1: 1d array with input signal 1
        - sig2: 1d array with input signal 2
    Output:
        - xcorr: Cross-correlation at zero lag
    """
    # Normalize the input vectors
    a = (sig1 - np.mean(sig1)) / (np.std(sig1) * len(sig1))
    v = (sig2 - np.mean(sig2)) / np.std(sig2)
    corr = signal.correlate(a, v, mode='same')
    return corr[len(a) // 2]

def DetectAnomalousSensors(df, plot=True, threshold=30):
    """Calculate the median cross-correlation with other sensors, as well as
    the noise characteristics of each sensor, and score each sensor.

    Input:
        - df: pandas dataframe containing sensor data
        - plot: If True, generate summary plots
        - threshold: modified z-score threshold for flagging an outlier (plotting only)
    Output:
        - sensor_z_scores: dict containing each sensor's modified z-score.
    """
    keys = [column for column in df.columns if 'snowdepth_' in column]
    sensor_ratios = {}
    sensor_xcorrs = {}
    sensor_z_scores = {}
    for key in keys:
        # Ratio of high- vs low-frequency spectral power
        ratio = np.log(Power_Spectral_Ratio(df[key].values))
        # Median cross-correlation with all other sensors, excluding the autocorrelation
        median_xcorr = np.median([Cross_Correlation(df[key].values, df[key2].values)
                                  for key2 in keys if key2 != key])
        sensor_ratios[key] = ratio
        sensor_xcorrs[key] = median_xcorr
    # Calculate some statistics useful for outliers
    xcorr_values = np.array(list(sensor_xcorrs.values()))
    ratio_values = np.array(list(sensor_ratios.values()))
    median_xcorr_all = np.median(xcorr_values)
    median_ratios_all = np.median(ratio_values)
    mad_xcorr_all = np.median(np.abs(xcorr_values - median_xcorr_all))
    mad_ratio_all = np.median(np.abs(ratio_values - median_ratios_all))

    def _score_point(ratio, xcorr):
        z1 = (ratio - median_ratios_all) / mad_ratio_all
        z2 = (xcorr - median_xcorr_all) / mad_xcorr_all
        return np.sqrt(z1**2 + z2**2)

    # Score each point by summing the modified z-scores in each dimension in quadrature
    for key in keys:
        sensor_z_scores[key] = _score_point(sensor_ratios[key], sensor_xcorrs[key])
    if plot:
        for key in keys:
            if sensor_z_scores[key] > threshold:
                plt.scatter(sensor_ratios[key], sensor_xcorrs[key],
                            marker='o', s=30, color='firebrick', label=key.replace('_', " "))
            else:
                plt.scatter(sensor_ratios[key], sensor_xcorrs[key], marker='o', s=6, color='k')
        # Map the decision boundary
        grid = np.zeros((100, 100))
        ratios = np.linspace(-10, 0, 100)
        for i, xcorr in enumerate(np.linspace(0, 1, 100)):
            grid[i] = _score_point(ratios, xcorr)
        CS = plt.contour(ratios, np.linspace(0, 1, 100), grid, levels=np.linspace(0, 100, 11),
                         colors='k', alpha=.3)
        plt.clabel(CS, inline=1, fontsize=8, fmt='%i')
        CS = plt.contour(ratios, np.linspace(0, 1, 100), grid, levels=[threshold, ], colors='firebrick')
        plt.xlabel('Log(Spectral Power Ratio)')
        plt.ylabel('Median Cross-Correlation')
        plt.ylim(0, 1.2)
        plt.xlim(-10, 0)
        plt.legend(frameon=False, loc=2)
    return sensor_z_scores

DetectAnomalousSensors(df_clean_filled, threshold=30)
Out[95]:
[Figure: log spectral power ratio vs. median cross-correlation per sensor, with modified z-score contours and the threshold boundary; the per-sensor z-score dict is returned]