notebook.community

Edit and run



In [62]:

    
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import r2_score
%matplotlib inline
 
plt.rcParams['figure.figsize']=(20,10)
plt.style.use('ggplot')



In [3]:

    
data = pd.read_csv('DEOK_hourly.csv')
data['Datetime']=pd.to_datetime(data['Datetime'])
data.set_index('Datetime', inplace=True)



In [4]:

    
data.head()









    Out[4]:







  
    
      
      DEOK_MW
    
    
      Datetime
      
    
  
  
    
      2012-12-31 01:00:00
      2945.0
    
    
      2012-12-31 02:00:00
      2868.0
    
    
      2012-12-31 03:00:00
      2812.0
    
    
      2012-12-31 04:00:00
      2812.0
    
    
      2012-12-31 05:00:00
      2860.0



In [5]:

    
data.plot()









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x7efbc832db50>



In [9]:

    
data['DEOK_MW'].hist(bins=20)









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x7efbc41bfcd0>



In [39]:

    
one, two, three = np.split(
        data['DEOK_MW'].sample(
        frac=1), [int(.25*len(data['DEOK_MW'])),
        int(.75*len(data['DEOK_MW']))])



In [34]:

    
mean1, mean2, mean3 = one.mean(), two.mean(), three.mean()
var1, var2, var3 = one.var(), two.var(), three.var()



In [40]:

    
print mean1, mean2, mean3
print var1, var2, var3









    



3093.27497575 3107.45445099 3112.20124697
353154.655416 363558.421407 358899.692558



In [41]:

    
from statsmodels.tsa.stattools import adfuller



In [44]:

    
adf_test = adfuller(data['DEOK_MW'])



In [45]:

    
adf_test









    Out[45]:





(-14.913267801069782,
 1.4477674072055658e-27,
 57,
 57681,
 {'1%': -3.4304633751328555,
  '10%': -2.5667966716717614,
  '5%': -2.8615901096273602},
 669611.23911962728)



In [48]:

    
print "ADF = " + str(adf_test[0])
print "p-value = " +str(adf_test[1])









    



ADF = -14.9132678011
p-value = 1.44776740721e-27



In [50]:

    
pd.tools.plotting.lag_plot(data['DEOK_MW'])









    



/vagrant/pythondata/env/lib/python2.7/site-packages/ipykernel_launcher.py:1: FutureWarning: 'pandas.tools.plotting.lag_plot' is deprecated, import 'pandas.plotting.lag_plot' instead.
  """Entry point for launching an IPython kernel.






    Out[50]:





<matplotlib.axes._subplots.AxesSubplot at 0x7efbcc9196d0>



In [149]:

    
#create train/test datasets
X = data['DEOK_MW'].dropna()

train_data = X[1:len(X)-12]
test_data = X[len(X)-12:]



In [150]:

    
#train the autoregression model
model = AR(train_data)
model_fitted = model.fit()









    



/vagrant/pythondata/env/local/lib/python2.7/site-packages/statsmodels/tsa/base/tsa_model.py:225: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
  ' ignored when e.g. forecasting.', ValueWarning)



In [151]:

    
print('The lag value chose is: %s' % model_fitted.k_ar)









    



The lag value chose is: 59



In [152]:

    
# make predictions 
predictions = model_fitted.predict(
    start=len(train_data), 
    end=len(train_data) + len(test_data)-1, 
    dynamic=False)


# create a comparison dataframe
compare_df = pd.concat(
    [data['DEOK_MW'].reset_index().tail(12),
    predictions], axis=1).rename(
    columns={'DEOK_MW': 'actual', 0:'predicted'})
compare_df=compare_df[['actual', 'predicted']].dropna()









    



/vagrant/pythondata/env/local/lib/python2.7/site-packages/statsmodels/tsa/base/tsa_model.py:531: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  ValueWarning)



In [153]:

    
predictions









    Out[153]:





57726    3869.472137
57727    3805.973745
57728    3796.575978
57729    3797.108515
57730    3890.598661
57731    4097.286867
57732    4260.151854
57733    4310.203028
57734    4307.501185
57735    4252.261197
57736    4146.984356
57737    4053.067806
dtype: float64



In [ ]:



In [154]:

    
compare_df









    Out[154]:







  
    
      
      actual
      predicted
    
  
  
    
      57727
      3865.0
      3805.973745
    
    
      57728
      3824.0
      3796.575978
    
    
      57729
      3766.0
      3797.108515
    
    
      57730
      3776.0
      3890.598661
    
    
      57731
      3885.0
      4097.286867
    
    
      57732
      4200.0
      4260.151854
    
    
      57733
      4393.0
      4310.203028
    
    
      57734
      4426.0
      4307.501185
    
    
      57735
      4419.0
      4252.261197
    
    
      57736
      4355.0
      4146.984356
    
    
      57737
      4224.0
      4053.067806



In [155]:

    
compare_df.plot()









    Out[155]:





<matplotlib.axes._subplots.AxesSubplot at 0x7efbbd7af0d0>



In [158]:

    
r2 = r2_score(compare_df.actual, compare_df.predicted)



In [160]:

    
r2









    Out[160]:





0.75819470059241667



In [ ]:

	DEOK_MW
Datetime
2012-12-31 01:00:00	2945.0
2012-12-31 02:00:00	2868.0
2012-12-31 03:00:00	2812.0
2012-12-31 04:00:00	2812.0
2012-12-31 05:00:00	2860.0

	actual	predicted
57727	3865.0	3805.973745
57728	3824.0	3796.575978
57729	3766.0	3797.108515
57730	3776.0	3890.598661
57731	3885.0	4097.286867
57732	4200.0	4260.151854
57733	4393.0	4310.203028
57734	4426.0	4307.501185
57735	4419.0	4252.261197
57736	4355.0	4146.984356
57737	4224.0	4053.067806