In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
In [2]:
# Load the dataset produced by the Data_Preprocessing notebook.
# Rows where cum_stand is NaN are discarded up front.
data = pd.read_csv('data/raw_data_unscaled.csv', index_col=0, parse_dates=True)
data = data.dropna()
data.head()
Out[2]:
In [3]:
# Find the longest run of consecutive hourly timestamps (no gaps) to use for
# the correlation analysis.
#
# gaps[i] is True when the step from index i to i+1 is not exactly one hour,
# i.e. there is a gap AFTER row i. Continuous segment k therefore spans
# rows [seg_starts[k], seg_ends[k]).
#
# NOTE: the previous version computed segment sizes as
#   np.append(gap_indices, n) - np.insert(gap_indices, 0, 0)
# which under-counts the first segment and over-counts the last by one, and
# located the start with gap_indices[argmax - 1] + 1, which wraps around to
# gap_indices[-1] + 1 when the FIRST segment is the longest (argmax == 0),
# silently selecting the wrong slice. Both are fixed below; the code now also
# works when the data has no gaps at all.
gaps = (data.index.values[1:] - data.index.values[:-1]) != np.timedelta64(1, 'h')
gap_indices = np.where(gaps)[0]
seg_starts = np.insert(gap_indices + 1, 0, 0)          # first row of each segment
seg_ends = np.append(gap_indices + 1, data.shape[0])   # one past the last row
gap_size = seg_ends - seg_starts                       # exact segment lengths
max_gap_size = int(np.max(gap_size))
print('largest time interval with continuous data is of size', max_gap_size)
max_gap_index = int(seg_starts[np.argmax(gap_size)])
print('it is found at index ', max_gap_index)
correlation_data = data.iloc[max_gap_index:max_gap_index + max_gap_size]
# final check: the selected slice must contain no internal gaps
gaps = correlation_data.index.values[1:] - correlation_data.index.values[:-1] != np.timedelta64(1, 'h')
if (np.sum(gaps) != 0):
    print('ERROR: something weird happened, there is still a gap in the continuous data found!')
    print(np.sum(gaps))
    print(np.where(gaps == True))
In [4]:
# define two functions to quickly calculate and visualize the correlation between two variables
def print_correlation(correlation, var1, var2):
    '''Plot a correlation-vs-lag vector and report where it peaks.

    Parameters:
        correlation : 1-D array of correlation values, indexed by lag in hours
        var1, var2  : variable names (strings), used only for the plot title
    '''
    plt.title(f'Correlation of {var1} and {var2} over time')
    plt.plot(correlation);
    lag_max = np.argmax(correlation)
    lag_min = np.argmin(correlation)
    print('maximum correlation found at', lag_max, 'hours')
    print('this is at', lag_max / 24, 'days')
    print('minimum correlation (negative correlation) found at', lag_min, 'hours')
    print('this is at', lag_min / 24, 'days')
def get_correlation(data, var1, var2, num_steps):
    '''Correlation between var1 shifted by each lag in [0, num_steps) and var2.

    For lag i, correlation[i] = corrcoef(var1(t + i), var2(t)) computed over the
    first data.shape[0] - num_steps rows, so every lag uses a window of the same
    length. The result is plotted and summarized via print_correlation.

    Parameters:
        data      : DataFrame containing columns var1 and var2 (hourly index)
        var1, var2: column names (strings)
        num_steps : number of hourly lags to evaluate

    Returns:
        1-D numpy array of length num_steps with the correlation per lag.
    '''
    window = data.shape[0] - num_steps
    correlation = np.zeros(num_steps)
    # var2's window does not depend on the lag: hoist it out of the loop.
    var2_data = data[var2][:window]
    # Shift only the var1 column (equivalent to data.shift(-i)[var1], but
    # avoids shifting every column of the frame on each iteration).
    var1_series = data[var1]
    for i in range(num_steps):
        var1_shifted = var1_series.shift(-i)[:window]
        correlation[i] = np.corrcoef(var1_shifted, var2_data)[0, 1]
    print_correlation(correlation, var1, var2)
    return correlation
In [5]:
# Correlation of cl_kadij(t + i) with afv_lobith(t) for lags i = 0..999 hours
# (get_correlation shifts its first variable forward by the lag).
correlation_afv = get_correlation(correlation_data, 'cl_kadij', 'afv_lobith', 1000)
In [6]:
# Correlation of cl_kadij(t + i) with cl_lobith(t) for lags i = 0..999 hours
# (get_correlation shifts its first variable forward by the lag).
correlation_cl = get_correlation(correlation_data, 'cl_kadij', 'cl_lobith', 1000)
In [7]:
# Correlation of cl_kadij(t + i) with cum_stand(t) for lags i = 0..999 hours
# (get_correlation shifts its first variable forward by the lag).
correlation_stand = get_correlation(correlation_data, 'cl_kadij', 'cum_stand', 1000)
In [8]:
# Conclusion based on correlation: the "best" lag windows (in days) per input.
afv_lobith_firstday = 6
afv_lobith_lastday = 9
cl_lobith_firstday = 3
cl_lobith_lastday = 6
stand_firstday = 1  # would ideally be 0 but then we need to use prediction for prediction so let's avoid that for the moment
stand_lastday = 2
# One entry per panel: (correlation vector, first day, last day, title).
# The three panels were previously three copy-pasted blocks; a single loop
# keeps them guaranteed-consistent.
panels = [
    (correlation_afv, afv_lobith_firstday, afv_lobith_lastday, 'correlation afv_lobith'),
    (correlation_cl, cl_lobith_firstday, cl_lobith_lastday, 'correlation cl_lobith'),
    (correlation_stand, stand_firstday, stand_lastday, 'correlation cum_stand'),
]
f, axarr = plt.subplots(3, sharex=True)
f.tight_layout()
for ax, (corr, firstday, lastday, title) in zip(axarr, panels):
    ax.plot(corr)
    # chosen interval highlighted in red (days converted to hourly indices)
    ax.plot(np.arange(firstday * 24, lastday * 24), corr[firstday * 24:lastday * 24], 'r')
    ax.set_title(title)
In [ ]: