In [1]:
import numpy as np
import pandas as pd

In [174]:
def DifferentialFilter(df, max_diff=2, max_dev=10, max_T=40):
    '''
    Filters the temperature data from the input dataframe in three steps:
      (i) Delete points which change by more than max_diff deg C over 1 timestep
      (ii) Delete points where the sensor data is more than max_dev deg C from
           the median of all sensors at the same timestep
      (iii) Delete points where the temperature is above max_T.

    "Deleted" points are set to NaN; no rows are dropped. The input dataframe
    is not modified.

    Parameters
    ----------
    df : pandas dataframe
        The dataframe containing the temperature data. Sensor columns must be
        named 'temperature_1' ... 'temperature_N', where N is the number of
        columns whose name contains 'temperature'.
    max_diff : float
        Outliers are any points with more than this absolute differential
        over a single timestep. Both points on either side of the jump are
        removed, since either one could be the bad reading.
    max_dev : float
        Outliers are any points lying more than max_dev degrees C from the
        median across sensors at that timestep.
    max_T : float
        Temperatures above this value are assumed invalid.

    Returns
    -------
    df_cleaned : pandas dataframe
        Copy of df in which the filtered temperature readings are replaced
        by NaN. All other columns are unchanged.

    '''
    # Built-in sum keeps the count an int (np.sum of an empty list is 0.0).
    n_nodes = sum(1 for col in df.columns if 'temperature' in str(col))
    df_cleaned = df.copy()
    if n_nodes == 0:
        return df_cleaned

    columns = ['temperature_%i' % i for i in range(1, n_nodes + 1)]
    # Shape (n_nodes, n_timesteps). Force float so NaN can be stored even when
    # the source columns are integer typed; np.array also copies, so df is safe.
    temps = np.array([df[col].values for col in columns], dtype=float)

    # Comparisons against NaN are False (the point is simply left alone);
    # errstate silences the warning older numpy versions emit for them.
    with np.errstate(invalid='ignore'):
        # (i) Rate-of-change filter. jumps[:, j] compares timestep j with j+1.
        # Anything above max_diff is way above the sensor noise level.
        jumps = np.abs(np.diff(temps, axis=1)) > max_diff
        too_fast = np.zeros(temps.shape, dtype=bool)
        too_fast[:, 1:] |= jumps    # the point after the jump
        too_fast[:, :-1] |= jumps   # the point before the jump
        temps[too_fast] = np.nan

        # (ii) Deviation from the inter-sensor median. The median is computed
        # once, after step (i) has run on every sensor, so the result does not
        # depend on the order in which sensors are processed.
        median = np.full(temps.shape[1], np.nan)
        has_data = ~np.all(np.isnan(temps), axis=0)
        if has_data.any():
            # Skip all-NaN timesteps to avoid nanmedian's "All-NaN slice" warning.
            median[has_data] = np.nanmedian(temps[:, has_data], axis=0)
        temps[np.abs(temps - median) > max_dev] = np.nan

        # (iii) Temperatures above max_T are not physical.
        temps[temps > max_T] = np.nan

    # Write to the cleaned dataframe
    for col, cleaned in zip(columns, temps):
        df_cleaned[col] = cleaned

    return df_cleaned


#------------------------------------------------
# Clean and plot each dataset
#
# For every site: read the raw CSV, filter the temperature columns with
# DifferentialFilter, save the *cleaned* frame, and plot every sensor trace.
# NOTE(review): relies on `plt` (matplotlib.pyplot) having been imported in an
# earlier cell -- confirm it is imported before a fresh Restart & Run All.
SITE_IDS = range(1, 11)
SKIPPED_SITES = {4}       # site 4 is excluded (reason not recorded here -- TODO document)
X_LIMITS = (0, 18000)     # observation range shown on every plot

for i_site in SITE_IDS:
    if i_site in SKIPPED_SITES:
        continue

    df = pd.read_csv('../raw_data/site_%i.csv'%i_site)
    df_cleaned = DifferentialFilter(df)
    # Save the filtered data (previously the raw frame was written by mistake).
    df_cleaned.to_csv('../output/site_%i_cleaned_T.csv'%i_site)

    # Same sensor-count convention as DifferentialFilter: columns are
    # temperature_1 ... temperature_N.
    n_sensors = sum(1 for col in df_cleaned.columns if 'temperature' in str(col))

    plt.figure(figsize=(10,3))
    for i_sensor in range(1, n_sensors+1):
        plt.plot(df_cleaned['temperature_%i'%i_sensor].values, alpha=.6)
    plt.xlim(*X_LIMITS)
    plt.ylabel('Temperature [C]')
    plt.xlabel('Observation')
    plt.text(.05, .05, 'Site %i'%i_site, transform=plt.gca().transAxes)



In [ ]:


In [24]:


In [ ]:


In [ ]:


In [ ]: