In [62]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import r2_score
%matplotlib inline
# Notebook-wide plotting defaults: a large 20x10 canvas, then the ggplot theme.
plt.rcParams.update({'figure.figsize': (20, 10)})
plt.style.use('ggplot')
In [3]:
# Load the hourly readings, parse the 'Datetime' column and use it as the index.
# The frame is then sorted by timestamp: the cells below slice it by position
# (train/test split, tail(12)), which only makes sense if rows are in time
# order. mergesort is stable, so rows sharing a timestamp keep their file
# order, and the sort is a no-op if the CSV is already chronological.
data = (
    pd.read_csv('DEOK_hourly.csv', parse_dates=['Datetime'], index_col='Datetime')
    .sort_index(kind='mergesort')
)
In [4]:
# Preview the first rows of the loaded frame as a quick sanity check.
data.head()
Out[4]:
In [5]:
# Line chart of every column in the frame against its Datetime index.
data.plot(kind='line')
Out[5]:
In [9]:
# Distribution of the DEOK_MW readings, bucketed into 20 bins.
data.DEOK_MW.hist(bins=20)
Out[9]:
In [39]:
# Shuffle the DEOK_MW series and cut it into three chunks at the 25% and 75%
# marks (so 25% / 50% / 25% of the rows). The chunks feed the mean/variance
# comparison in the following cells.
#
# The shuffle is seeded so that Restart & Run All reproduces the same chunks
# (and therefore the same printed statistics) on every run.
#
# NOTE(review): because the rows are shuffled before splitting, each chunk is a
# random sample of the whole series, so their means/variances will be close by
# construction. If the goal is a stationarity check, consider splitting in time
# order instead -- confirm intent.
load_series = data['DEOK_MW']
n_obs = len(load_series)
shuffled = load_series.sample(frac=1, random_state=42)
one, two, three = np.split(shuffled, [int(.25 * n_obs), int(.75 * n_obs)])
In [34]:
# Mean and variance of each of the three chunks produced by the split cell.
# Generator expressions are used so no loop variable is left in the namespace.
mean1, mean2, mean3 = (chunk.mean() for chunk in (one, two, three))
var1, var2, var3 = (chunk.var() for chunk in (one, two, three))
In [40]:
# Show the three chunk means on one line and the three variances on the next.
# Written as a single pre-formatted string so the cell runs, and prints the
# same space-separated text, under both Python 2 and Python 3 (the bare
# `print a, b, c` statement is a SyntaxError on Python 3).
print('%s %s %s' % (mean1, mean2, mean3))
print('%s %s %s' % (var1, var2, var3))
In [41]:
from statsmodels.tsa.stattools import adfuller
In [44]:
# Augmented Dickey-Fuller test on the load series. Missing readings are dropped
# first, matching what the modelling cell does before fitting, so the test and
# the model see the same observations.
adf_test = adfuller(data['DEOK_MW'].dropna())
In [45]:
# Display the raw result object returned by adfuller().
adf_test
Out[45]:
In [48]:
# Report the first two entries of the adfuller() result, labelled as the test
# statistic and its p-value. Parenthesised single-argument print() behaves the
# same on Python 2 and Python 3.
print("ADF = " + str(adf_test[0]))
print("p-value = " + str(adf_test[1]))
In [50]:
# Lag plot of the load series (each reading against its lagged neighbour) as a
# visual check for autocorrelation.
# `pd.tools.plotting` is gone from current pandas; prefer `pandas.plotting`
# and fall back to the old location only on releases that predate it.
try:
    from pandas.plotting import lag_plot
except ImportError:
    from pandas.tools.plotting import lag_plot
lag_plot(data['DEOK_MW'])
Out[50]:
In [149]:
#create train/test datasets
X = data['DEOK_MW'].dropna()
train_data = X[1:len(X)-12]
test_data = X[len(X)-12:]
In [150]:
#train the autoregression model
model = AR(train_data)
model_fitted = model.fit()
In [151]:
# Report the lag order of the fitted AR model.
print('The lag value chose is: %s' % (model_fitted.k_ar,))
In [152]:
# make predictions
predictions = model_fitted.predict(
start=len(train_data),
end=len(train_data) + len(test_data)-1,
dynamic=False)
# create a comparison dataframe
compare_df = pd.concat(
[data['DEOK_MW'].reset_index().tail(12),
predictions], axis=1).rename(
columns={'DEOK_MW': 'actual', 0:'predicted'})
compare_df=compare_df[['actual', 'predicted']].dropna()
In [153]:
# Display the forecasts returned by model_fitted.predict().
predictions
Out[153]:
In [ ]:
In [154]:
# Display the 'actual' and 'predicted' columns side by side.
compare_df
Out[154]:
In [155]:
# Overlay the actual and predicted series on one line chart.
compare_df.plot(kind='line')
Out[155]:
In [158]:
# Coefficient of determination of the forecasts against the held-out actuals.
r2 = r2_score(y_true=compare_df['actual'], y_pred=compare_df['predicted'])
In [160]:
# Display the R^2 score computed in the previous cell.
r2
Out[160]:
In [ ]: