In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
In [2]:
# Load the dataset produced by the Data_Preprocessing notebook.
# Rows where cum_stand is NaN are discarded up front.
data = pd.read_csv('data/raw_data_unscaled.csv', index_col=0, parse_dates=True)
data = data.dropna()
data.head()
Out[2]:
In [3]:
# Find the longest run of consecutive hourly timestamps (no gaps) to use for
# the correlation analysis.
#
# gaps[i] is True when the step from index i to i+1 is not exactly one hour,
# i.e. there is a gap AFTER row i. Continuous segment k therefore spans
# rows [seg_starts[k], seg_ends[k]).
#
# NOTE: the previous version computed segment sizes as
#   np.append(gap_indices, n) - np.insert(gap_indices, 0, 0)
# which under-counts the first segment and over-counts the last by one, and
# located the start with gap_indices[argmax - 1] + 1, which wraps around to
# gap_indices[-1] + 1 when the FIRST segment is the longest (argmax == 0),
# silently selecting the wrong slice. Both are fixed below; the code now also
# works when the data has no gaps at all.
gaps = (data.index.values[1:] - data.index.values[:-1]) != np.timedelta64(1, 'h')
gap_indices = np.where(gaps)[0]
seg_starts = np.insert(gap_indices + 1, 0, 0)          # first row of each segment
seg_ends = np.append(gap_indices + 1, data.shape[0])   # one past the last row
gap_size = seg_ends - seg_starts                       # exact segment lengths
max_gap_size = int(np.max(gap_size))
print('largest time interval with continuous data is of size', max_gap_size)
max_gap_index = int(seg_starts[np.argmax(gap_size)])
print('it is found at index ', max_gap_index)
correlation_data = data.iloc[max_gap_index:max_gap_index + max_gap_size]
# final check: the selected slice must contain no internal gaps
gaps = correlation_data.index.values[1:] - correlation_data.index.values[:-1] != np.timedelta64(1, 'h')
if (np.sum(gaps) != 0):
    print('ERROR: something weird happened, there is still a gap in the continuous data found!')
    print(np.sum(gaps))
    print(np.where(gaps == True))
In [4]:
# define two functions to quickly calculate and visualize the correlation between two variables
def print_correlation(correlation, var1, var2):
    '''Plot a correlation-vs-lag vector and report where it peaks.

    Parameters:
        correlation : 1-D array of correlation values, indexed by lag in hours
        var1, var2  : variable names (strings), used only for the plot title
    '''
    plt.title(f'Correlation of {var1} and {var2} over time')
    plt.plot(correlation);
    lag_max = np.argmax(correlation)
    lag_min = np.argmin(correlation)
    print('maximum correlation found at', lag_max, 'hours')
    print('this is at', lag_max / 24, 'days')
    print('minimum correlation (negative correlation) found at', lag_min, 'hours')
    print('this is at', lag_min / 24, 'days')
def get_correlation(data, var1, var2, num_steps):
    '''Correlation between var1 shifted by each lag in [0, num_steps) and var2.

    For lag i, correlation[i] = corrcoef(var1(t + i), var2(t)) computed over the
    first data.shape[0] - num_steps rows, so every lag uses a window of the same
    length. The result is plotted and summarized via print_correlation.

    Parameters:
        data      : DataFrame containing columns var1 and var2 (hourly index)
        var1, var2: column names (strings)
        num_steps : number of hourly lags to evaluate

    Returns:
        1-D numpy array of length num_steps with the correlation per lag.
    '''
    window = data.shape[0] - num_steps
    correlation = np.zeros(num_steps)
    # var2's window does not depend on the lag: hoist it out of the loop.
    var2_data = data[var2][:window]
    # Shift only the var1 column (equivalent to data.shift(-i)[var1], but
    # avoids shifting every column of the frame on each iteration).
    var1_series = data[var1]
    for i in range(num_steps):
        var1_shifted = var1_series.shift(-i)[:window]
        correlation[i] = np.corrcoef(var1_shifted, var2_data)[0, 1]
    print_correlation(correlation, var1, var2)
    return correlation
In [5]:
# Correlation of cl_kadij(t + i) with afv_lobith(t) for lags i = 0..999 hours
# (get_correlation shifts its first variable forward by the lag).
correlation_afv = get_correlation(correlation_data, 'cl_kadij', 'afv_lobith', 1000)
In [6]:
# Correlation of cl_kadij(t + i) with cl_lobith(t) for lags i = 0..999 hours
# (get_correlation shifts its first variable forward by the lag).
correlation_cl = get_correlation(correlation_data, 'cl_kadij', 'cl_lobith', 1000)
In [7]:
# Correlation of cl_kadij(t + i) with cum_stand(t) for lags i = 0..999 hours
# (get_correlation shifts its first variable forward by the lag).
correlation_stand = get_correlation(correlation_data, 'cl_kadij', 'cum_stand', 1000)
In [8]:
# Conclusion based on correlation: the "best" lag windows (in days) per input.
afv_lobith_firstday = 6
afv_lobith_lastday = 9
cl_lobith_firstday = 3
cl_lobith_lastday = 6
stand_firstday = 1  # would ideally be 0 but then we need to use prediction for prediction so let's avoid that for the moment
stand_lastday = 2
# One entry per panel: (correlation vector, first day, last day, title).
# The three panels were previously three copy-pasted blocks; a single loop
# keeps them guaranteed-consistent.
panels = [
    (correlation_afv, afv_lobith_firstday, afv_lobith_lastday, 'correlation afv_lobith'),
    (correlation_cl, cl_lobith_firstday, cl_lobith_lastday, 'correlation cl_lobith'),
    (correlation_stand, stand_firstday, stand_lastday, 'correlation cum_stand'),
]
f, axarr = plt.subplots(3, sharex=True)
f.tight_layout()
for ax, (corr, firstday, lastday, title) in zip(axarr, panels):
    ax.plot(corr)
    # chosen interval highlighted in red (days converted to hourly indices)
    ax.plot(np.arange(firstday * 24, lastday * 24), corr[firstday * 24:lastday * 24], 'r')
    ax.set_title(title)
In [ ]: