In [1]:
from datetime import datetime
import json
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter, DayLocator, HourLocator
%matplotlib inline
import urllib2
from lxml import etree
from bs4 import BeautifulSoup
import time
from calendar import monthrange
import numpy as np
import copy
import statsmodels.api as sm
import pickle
import seaborn as sns
sns.set(context="paper", font="monospace")
In [2]:
years = [2012,2013,2014]
months = [1,2,3,4,5,6,7,8,9,10,11,12]
dfs = []
for year in years:
for month in months:
dfs.append(pd.read_csv('../data/generation/'+str(month)+'_'+str(year)+'.csv'))
df_orig = pd.concat(dfs)
df_orig = df_orig.reset_index(drop=True)
In [3]:
df_orig['Demanda'][df_orig['Date']=='4/2/2013']
Out[3]:
In [4]:
df_orig['Demanda'][df_orig['Date']=='28/4/2013']
df_orig['Demanda'][df_orig.index==11609]
Out[4]:
In [5]:
df_gen = copy.deepcopy(df_orig)
datetime_array = []
date_array = []
month_array = []
weekday_array = []
day_array = []
demand_array = []
tcpi_array = []
date_split = ''
for i,val in enumerate(df_gen.iterrows()):
row = val[1]
if(row['Demanda']=='4/2/2013'):
row['Date'] = '4/2/2013'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['SND-L9090'] = None
if(row['Demanda']=='28/4/2013'):
row['Date'] = '28/4/2013'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['SND-L9090'] = None
if(row['Demanda']=='25/7/2012'):
row['Date'] = '25/7/2012'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = row['PCHN']
row['PCHN'] = row['EEC20']
row['EEC20'] = row['EEC']
row['EEC'] = row['CEN']
row['CEN'] = row['PBP']
row['PBP'] = None
if(row['Demanda']=='27/12/2012'):
row['Date'] = '27/12/2012'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['SND-L9090'] = None
if(row['Demanda']=='3/6/2014'):
row['Date'] = '3/6/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = None
if(row['Demanda']=='4/6/2014'):
row['Date'] = '4/6/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = None
if(row['Demanda']=='5/6/2014'):
row['Date'] = '5/6/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = row['PCHN']
row['PCHN'] = row['EEC20']
row['EEC20'] = row['EEC']
row['EEC'] = row['CEN']
row['CEN'] = row['PBP']
row['PBP'] = row['AMY2']
row['AMY2'] = row['AMY1']
row['AMY1'] = row['PHC2']
row['PHC2'] = row['PHC1']
row['PHC1'] = row['PCG9']
row['PCG9'] = row['PCG8']
row['PCG8'] = row['PCG7']
row['PCG7'] = row['PCG6']
row['PCG6'] = row['PCG5']
row['PCG5'] = None
if(row['TCPI-L9150']=='9/6/2014'):
row['Date'] = '9/6/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = None
row['PLB1'] = None
if(row['Demanda']=='10/6/2014'):
row['Date'] = '10/6/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = None
if(row['Demanda']=='12/6/2014'):
row['Date'] = '12/6/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = None
if(row['Demanda']=='15/6/2014'):
row['Date'] = '15/6/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = None
if(row['Demanda']=='24/6/2014'):
row['Date'] = '24/6/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = None
if(row['Demanda']=='27/6/2014'):
row['Date'] = '27/6/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = row['PCHN']
row['PCHN'] = row['EEC20']
row['EEC20'] = row['EEC']
row['EEC'] = row['CEN']
row['CEN'] = row['PBP']
row['PBP'] = row['AMY2']
row['AMY2'] = row['AMY1']
row['AMY1'] = row['PHC2']
row['PHC2'] = row['PHC1']
row['PHC1'] = row['PCG9']
row['PCG9'] = row['PCG8']
row['PCG8'] = row['PCG7']
row['PCG7'] = row['PCG6']
row['PCG6'] = row['PCG5']
row['PCG5'] = None
if(row['Demanda']=='1/7/2014'):
row['Date'] = '1/7/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = None
if(row['Demanda']=='7/7/2014'):
row['Date'] = '7/7/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = row['PCHN']
row['PCHN'] = row['EEC20']
row['EEC20'] = row['EEC']
row['EEC'] = row['CEN']
row['CEN'] = row['PBP']
row['PBP'] = row['AMY2']
row['AMY2'] = row['AMY1']
row['AMY1'] = row['PHC2']
row['PHC2'] = None
if(row['Demanda']=='11/7/2014'):
row['Date'] = '11/7/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = None
if(row['Demanda']=='12/7/2014'):
row['Date'] = '12/7/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = None
if(row['Demanda']=='19/7/2014'):
row['Date'] = '19/7/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = row['PCHN']
row['PCHN'] = row['EEC20']
row['EEC20'] = row['EEC']
row['EEC'] = row['CEN']
row['CEN'] = row['PBP']
row['PBP'] = row['AMY2']
row['AMY2'] = row['AMY1']
row['AMY1'] = row['PHC2']
row['PHC2'] = row['PHC1']
row['PHC1'] = row['PCG9']
row['PCG9'] = row['PCG8']
row['PCG8'] = row['PCG7']
row['PCG7'] = row['PCG6']
row['PCG6'] = row['PCG5']
row['PCG5'] = row['PCG4']
row['PCG4'] = row['PCG3']
row['PCG3'] = row['PCG2']
row['PCG2'] = row['PCG1']
row['PCG1'] = None
if(row['Demanda']=='20/7/2014'):
row['Date'] = '20/7/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = row['PCHN']
row['PCHN'] = row['EEC20']
row['EEC20'] = row['EEC']
row['EEC'] = row['CEN']
row['CEN'] = row['PBP']
row['PBP'] = row['AMY2']
row['AMY2'] = row['AMY1']
row['AMY1'] = row['PHC2']
row['PHC2'] = None
if(row['TCPI-L9150']=='21/7/2014'):
row['Date'] = '21/7/2014'
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = row['PNI1']
row['PNI1'] = row['PCHN']
row['PCHN'] = row['EEC20']
row['EEC20'] = row['EEC']
row['EEC'] = row['CEN']
row['CEN'] = row['PBP']
row['PBP'] = row['AMY2']
row['AMY2'] = row['AMY1']
row['AMY1'] = row['PHC2']
row['PHC2'] = row['PHC1']
row['PHC1'] = row['PCG9']
row['PCG9'] = row['PCG8']
row['PCG8'] = row['PCG7']
row['PCG7'] = row['PCG6']
row['PCG6'] = row['PCG5']
row['PCG5'] = None
row['Demanda'] = row['TCPI-L9150']
row['TCPI-L9150'] = row['AMY-L9030']
row['AMY-L9030'] = row['SND-L9090']
row['LNI-L9040'] = row['AMY-L9030']
row['TPC'] = row['LNI-L9040']
row['LNI-9040'] = row['TPC']
row['TPC'] = row['PMT3']
row['PMT3'] = row['PMT2']
row['PMT2'] = row['PMT1']
row['PMT1'] = row['PMG5']
row['PMG5'] = row['PMG4']
row['PMG4'] = row['PMG3']
row['PMG3'] = row['PLB2']
row['PLB2'] = row['PLB1']
row['PLB1'] = row['PEN3']
row['PEN3'] = row['PEN1 y 2']
row['PEN1 y 2'] = row['PCF2']
row['PCF2'] = row['PCF1']
row['PCF1'] = row['PCA2']
row['PCA2'] = row['PCA1']
row['PCA1'] = row['NSL']
row['NSL'] = row['MTR']
row['MTR'] = row['GSR']
row['GSR'] = row['PNI2']
row['PNI2'] = None
try:
date_split = row['Date'].split('/')
except:
print row
hour = row['HORA']
date = datetime(int(float(date_split[2])),int(float(date_split[1])),int(float(date_split[0])))
date_time = datetime(int(float(date_split[2])),int(float(date_split[1])),int(float(date_split[0])),int(float(hour)))
month = int(float(date_split[0]))
weekday = int(float(date_time.weekday()))
day = int(float(date_time.day))
datetime_array.append(date_time)
date_array.append(date)
month_array.append(month)
weekday_array.append(weekday)
day_array.append(day)
demand_array.append(row['Demanda'])
tcpi_array.append(row['TCPI-L9150'])
df_gen['TCPI-L9150'] = tcpi_array
df_gen['TCPI-L9150'] = df_gen['TCPI-L9150'].astype(np.float64)
df_gen['Demanda'] = demand_array
df_gen['Demanda'] = df_gen['Demanda'].astype(np.float64)
df_gen['datetime']=datetime_array
df_gen['date']=date_array
df_gen['month']=month_array
df_gen['weekday']=weekday_array
df_gen['day']=day_array
df_gen = df_gen.set_index('datetime').drop('Date',1).drop('Unnamed: 0',1)
df_gen['HORA'] = df_gen[['HORA']].astype(int)
In [6]:
df_gen['Demanda'][df_gen['Demanda'].isnull()]
Out[6]:
In [4]:
def plot_with_time_axis(index,value,xaxis='year',symbol='-'):
fig, ax = plt.subplots()
ax.plot_date(index, value, symbol)
# format the ticks
years_plot = YearLocator()
months_plot = MonthLocator() # every month
date_plot = DayLocator()
if xaxis == 'year':
fmt = DateFormatter('%Y')
ax.xaxis.set_major_locator(years_plot)
ax.xaxis.set_major_formatter(fmt)
ax.xaxis.set_minor_locator(months_plot)
elif xaxis == 'month':
fmt = DateFormatter('%m\n%Y')
ax.xaxis.set_major_locator(months_plot)
ax.xaxis.set_major_formatter(fmt)
ax.xaxis.set_minor_locator(date_plot)
elif xaxis == 'day':
fmt = DateFormatter('%d\n%m')
ax.xaxis.set_major_formatter(fmt)
elif xaxis == 'hour':
fmt = DateFormatter('%H')
ax.xaxis.set_major_formatter(fmt)
ax.autoscale_view()
In [5]:
#changed EOLO to EOL, EEC50 to EEC, and TPT to TPC
#changed PMGA to PMG, PCH1 to PCHN
df_gen['Albanisa_Bunker'] = df_gen[['PCG1','PCG2','PCG3','PCG4','PCG5','PCG6','PCG7','PCG8','PCG9','PHC1','PHC2']].sum(axis=1)
df_gen['AEI_Wind'] = df_gen[['AMY1','AMY2']].sum(axis=1)
df_gen['Gruo_Terra_Wind'] = df_gen[['PBP']].sum(axis=1)
df_gen['Other_Wind'] = df_gen[['ABR','EOL']].sum(axis=1)
df_gen['AEI_Bunker'] = df_gen[['CEN','EEC20','EEC','TPC']].sum(axis=1)
df_gen['Enel_Hydro'] = df_gen[['PCA1','PCA2','PCF1','PCF2']].sum(axis=1)
df_gen['Enel_Bunker'] = df_gen[['PLB1','PLB2','PMG3','PMG4','PMG5']].sum(axis=1)
df_gen['Other_Bunker'] = df_gen[['PCHN','PNI1','PNI2','GSR']].sum(axis=1)
df_gen['Hidropanasma_Hydro'] = df_gen[['HPA1','HPA2']].sum(axis=1)
bunker_fuel = ['PCG1','PCG2','PCG3','PCG4','PCG5','PCG6','PCG7','PCG8','PCG9','PHC1','PHC2','CEN','EEC20','EEC','TPC','PLB1','PLB2','PMG3','PMG4','PMG5','PCHN','PNI1','PNI2','GSR']
interconnect = ['LNI-L9040','SND-L9090','AMY-L9030','TCPI-L9150']
hydro = ['PCA1','PCA2','PCF1','PCF2','HPA1','HPA2']
geothermal = ['PEN1 y 2','PEN3','PEN4','PMT1','PMT2']
biomass = ['MTR','NSL']
wind = ['AMY1','AMY2','ABR','EOL','PBP']
df_gen['bunker'] = df_gen[bunker_fuel].sum(axis=1)
df_gen['interconnect'] = df_gen[interconnect].sum(axis=1)
df_gen['hydro'] = df_gen[hydro].sum(axis=1)
df_gen['geothermal'] = df_gen[geothermal].sum(axis=1)
df_gen['biomass'] = df_gen[biomass].sum(axis=1)
df_gen['wind'] = df_gen[wind].sum(axis=1)
df_big_gen = pd.DataFrame([df_gen['wind'],df_gen['biomass'],df_gen['interconnect'],df_gen['geothermal'],df_gen['bunker'],df_gen['hydro']]).T
In [6]:
df_gen['all_power'] = df_gen['bunker']+df_gen['hydro']+df_gen['geothermal']+df_gen['biomass']+df_gen['wind']
In [7]:
df_big_gen['Demanda'] = df_gen['Demanda']
df_pickle = copy.deepcopy(df_big_gen)
df_pickle['Demanda'] = df_gen['Demanda']
In [9]:
error = (df_gen['all_power'].astype(float)+df_gen['interconnect']-df_gen['Demanda'].astype(float))/df_gen['Demanda'].astype(float)
plt.hist(error,300,range=[-1,1])
print 'error: ' + str(np.mean(error)) + '+/- (' + str(3*np.std(error))
In [10]:
start_date = datetime(2013,1,1)
end_date = datetime(2013,1,2)
df_plot = df_gen[start_date:end_date]
plot_with_time_axis(df_plot.index,df_plot['all_power'],'hour',)
In [11]:
fuel_types = ['bunker','interconnect','hydro','geothermal','biomass','wind']
fuel_types_array = []
start_date = datetime(2013,1,1)
end_date = datetime(2013,1,5)
df_plot = df_gen[start_date:end_date]
plt.plot(df_plot['all_power'],label='all')
for key in fuel_types:
fuel_types_array.append(df_plot[key])
plt.plot(df_plot[key],label=key)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[11]:
In [12]:
plt.plot(df_plot['geothermal'],label='geothermal')
plt.plot(df_plot['biomass']+df_plot['geothermal'],label='biomass')
plt.plot(df_plot['wind']+df_plot['biomass']+df_plot['geothermal'],label='wind')
plt.plot(df_plot['hydro']+df_plot['wind']+df_plot['biomass']+df_plot['geothermal'],label='hydro')
plt.plot(df_plot['bunker']+df_plot['hydro']+df_plot['wind']+df_plot['biomass']+df_plot['geothermal'],label='bunker')
plt.plot(df_plot['interconnect']+df_plot['bunker']+df_plot['hydro']+df_plot['wind']+df_plot['biomass']+df_plot['geothermal'],label='interconnect')
plt.plot(df_plot['Demanda'],label='Demand')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#sample correlation coeffcients
#hydro 2011-2013 vs. 2014
Out[12]:
In [13]:
plt.plot(df_plot['all_power']-df_plot['Demanda'].astype(float))
plt.plot(df_plot['interconnect'])
plt.plot(df_plot['all_power']-df_plot['Demanda'].astype(float)+df_plot['interconnect'])
Out[13]:
In [67]:
df_big_gen.keys()
df_corr = copy.deepcopy(df_big_gen.dropna())
In [68]:
#The Quantities Correlated
corr_df = pd.DataFrame(np.corrcoef(df_corr.T),columns=df_big_gen.keys()).T
corr_df.columns = df_big_gen.keys()
print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())
Out[68]:
In [65]:
df_norm = copy.deepcopy(df_corr.dropna())
for key in df_norm:
df_norm[key] = df_norm[key]/np.max(df_norm[key])
In [66]:
#The Quantities Correlated Normalized
corr_df = pd.DataFrame(np.corrcoef(df_norm.T),columns=df_norm.keys()).T
corr_df.columns = df_norm.keys()
print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())
Out[66]:
In [43]:
df_corr = copy.deepcopy(df_big_gen.diff(1).dropna())
corr_df = pd.DataFrame(np.corrcoef(df_corr.T),columns=df_big_gen.keys()).T
corr_df.columns = df_big_gen.keys()
corr_df
print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())
Out[43]:
In [64]:
df_corr = copy.deepcopy(df_big_gen.diff(2).dropna())
corr_df = pd.DataFrame(np.corrcoef(df_corr.T),columns=df_big_gen.keys()).T
corr_df.columns = df_big_gen.keys()
print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())
Out[64]:
In [49]:
df_corr = copy.deepcopy(df_big_gen.drop('Demanda',1).diff(24).dropna())
corr_df = pd.DataFrame(np.corrcoef(df_corr.T),columns=df_corr.keys()).T
corr_df.columns = df_corr.keys()
print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())
Out[49]:
In [50]:
def vcorrcoef(X,y):
#Xm = np.reshape(np.mean(X,axis=1),(X.shape[0],1))
Xm = np.mean(X)
ym = np.mean(y)
r_num = np.sum((X-Xm)*(y-ym))
r_den = np.sqrt(np.sum((X-Xm)**2)*np.sum((y-ym)**2))
r = r_num/r_den
return r
'''
def vcorrcoef(X,y):
Xm = np.reshape(np.mean(X,axis=1),(X.shape[0],1))
ym = np.mean(y)
r_num = np.sum((X-Xm)*(y-ym),axis=1)
r_den = np.sqrt(np.sum((X-Xm)**2,axis=1)*np.sum((y-ym)**2))
r = r_num/r_den
return r
'''
pass
In [51]:
start_date = datetime(2013,1,1)
end_date = datetime(2013,1,2)
df_play = copy.deepcopy(df_big_gen[start_date:end_date])
In [69]:
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_big_gen['wind'],df_big_gen['bunker']))
for i in range(1,350):
sample_corr_coeff.append(vcorrcoef(df_big_gen['wind'].diff(i),df_big_gen['bunker'].diff(i)))
plt.plot(sample_corr_coeff)
Out[69]:
In [75]:
start_date = datetime(2012,1,1)
end_date = datetime(2013,12,31)
df_play = copy.deepcopy(df_big_gen[start_date:end_date])
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['bunker']))
print '2012-2013: Wind - Bunker (Quantity): ' + str(vcorrcoef(df_play['wind'],df_play['bunker']))
print '2012-2013: Wind - Bunker (12-Hour): ' + str(vcorrcoef(df_play['wind'].diff(12),df_play['bunker'].diff(12)))
for i in range(1,350):
sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['bunker'].diff(i)))
plt.plot(sample_corr_coeff)
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2012-2013: Wind - Hydro (Quantity): ' + str(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2012-2013: Wind - Hydro (12-Hour): ' + str(vcorrcoef(df_play['wind'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,350):
sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff)
hydro_2012_2013 = df_play['hydro']
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['Demanda'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['Demanda'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['Demanda'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,350):
sample_corr_coeff.append(vcorrcoef(df_play['Demanda'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff)
plt.legend()
hydro_2012_2013 = df_play['hydro']
In [132]:
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['Demanda'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['Demanda'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['Demanda'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,25):
sample_corr_coeff.append(vcorrcoef(df_play['Demanda'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,label='Demand - Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')
sample_corr_coeff = []
df_play['Demandaminuswind']=df_play['Demanda'] - df_play['wind']
sample_corr_coeff.append(vcorrcoef(df_play['Demandaminuswind'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['Demandaminuswind'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['Demandaminuswind'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,25):
sample_corr_coeff.append(vcorrcoef(df_play['Demandaminuswind'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,'k',label='Demand(-wind) - Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')
sample_corr_coeff = []
df_play['Demandaminuswindinterconnect']=df_play['Demanda'] - df_play['wind'] + df_play['interconnect']
sample_corr_coeff.append(vcorrcoef(df_play['Demandaminuswindinterconnect'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['Demandaminuswindinterconnect'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['Demandaminuswindinterconnect'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,25):
sample_corr_coeff.append(vcorrcoef(df_play['Demandaminuswindinterconnect'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,'g',label='Demand(-wind +interconnect) - Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')
plt.legend(title='Correlations',bbox_to_anchor=(1.25, .9),
bbox_transform=plt.gcf().transFigure)
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['wind'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,25):
sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,'r',label='Wind-Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')
plt.legend(title='Correlations',bbox_to_anchor=(1.35, .9),
bbox_transform=plt.gcf().transFigure)
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['interconnect'],df_play['hydro']))
for i in range(1,25):
sample_corr_coeff.append(vcorrcoef(df_play['interconnect'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,'c',label='Interconnect-Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')
plt.legend(title='Correlations',bbox_to_anchor=(1.35, .9),
bbox_transform=plt.gcf().transFigure)
Out[132]:
In [54]:
start_date = datetime(2014,1,1)
end_date = datetime(2014,12,31)
df_play = copy.deepcopy(df_big_gen[start_date:end_date])
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['bunker']))
print '2014: Wind - Bunker (Quantity): ' + str(vcorrcoef(df_play['wind'],df_play['bunker']))
for i in range(1,350):
sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['bunker'].diff(i)))
plt.plot(sample_corr_coeff)
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2014: Wind - Hydro (1-Hour Ramp): ' + str(vcorrcoef(df_play['wind'].diff(1),df_play['hydro'].diff(1)))
for i in range(1,350):
sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff)
hydro_2014 = df_play['hydro']
In [76]:
start_date = datetime(2012,1,1)
end_date = datetime(2014,12,31)
df_play = copy.deepcopy(df_big_gen[start_date:end_date])
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind']+df_play['hydro'],df_play['bunker']))
print '2014: Wind+Hydro - Bunker (Quantity): ' + str(vcorrcoef(df_play['wind']+df_play['hydro'],df_play['bunker']))
for i in range(1,350):
sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i)+df_play['hydro'].diff(i),df_play['bunker'].diff(i)))
plt.plot(sample_corr_coeff)
Out[76]:
In [27]:
plt.plot(hydro_2012_2013.index,hydro_2012_2013)
plt.plot(hydro_2014.index,hydro_2014)
print np.mean(hydro_2012_2013)
print np.mean(hydro_2014)
In [28]:
train_start_date = datetime(2013,1,1)
train_end_date = datetime(2013,6,30)
test_start_date = datetime(2013,7,1)
test_end_date = datetime(2013,12,31)
def mls(df):
df_train = df[train_start_date:train_end_date]
df_test = df[test_start_date:test_end_date]
x_train = df_train.drop('wind_shift_1',1)
x_test = df_test.drop('wind_shift_1',1)
y_train = df['wind_shift_1'][train_start_date:train_end_date]
y_test = df['wind_shift_1'][test_start_date:test_end_date]
x_train = sm.add_constant(x_train, prepend=True) #add a constant
x_test = sm.add_constant(x_test, prepend=True) #add a constant
print df_train.drop('wind_shift_1',1).keys()
ols = sm.OLS(y_train,x_train)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test)
error = (y_test - pred_y)
mape = np.mean(abs(error))
print "MAE ERROR: " + str(mape)
print "MAE ERROR/MEAN: " + str(mape/np.mean(y_test))
print "MAX ERROR: " + str(max(error))
print res.summary()
return pred_y
#This shifts the demand data, so that each row has the values and then the next hours demand.
df_mls = copy.deepcopy(df_big_gen)
df_mls['wind_shift_1'] = df_mls['wind'].shift(-1)
df_mls['wind_back_1'] = df_mls['wind'].diff(1)
df_mls = df_mls.dropna()
pred_y = mls(df_mls)
df_test = copy.deepcopy(df_mls[test_start_date:test_end_date])
df_test['pred_wind'] = pred_y
In [29]:
def get_PCA_Matrix(df_imp_stand):
all_samples = df_imp_stand.as_matrix().T
num_var = len(all_samples)
mean_vector = all_samples.mean(1)
#print('Mean Vector:\n', mean_vector)
cov_mat = np.cov(all_samples)
#print('Covariance Matrix:\n', cov_mat)
eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
for i in range(len(eig_val_cov)):
eigvec_cov = eig_vec_cov[:,i].reshape(1,num_var).T
#print('Eigenvalue {} from covariance matrix: {}'.format(i+1, eig_val_cov[i]))
# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_val_cov[i]), eig_vec_cov[:,i]) for i in range(len(eig_val_cov))]
# Sort the (eigenvalue, eigenvector) tuples from high to low
def getKey(item):
return item[0]
eig_pairs = sorted(eig_pairs,key=getKey)
eig_pairs.reverse()
# Visually confirm that the list is correctly sorted by decreasing eigenvalues
for i in eig_pairs:
#print(i[0])
pass
matrix_w = np.hstack((eig_pairs[0][1].reshape(num_var,1), eig_pairs[1][1].reshape(num_var,1),eig_pairs[2][1].reshape(num_var,1),eig_pairs[3][1].reshape(num_var,1),eig_pairs[4][1].reshape(num_var,1)))
#print matrix_w
transformed = matrix_w.T.dot(all_samples)
return [transformed, matrix_w]
In [30]:
df_pca = copy.deepcopy(df_mls.drop('wind_shift_1',1))
[transformed,matrix_w] = get_PCA_Matrix(df_pca)
In [31]:
table = []
#row_format ="{:>20}" * 5
#print row_format.format("", *['PCA0','PCA1','PCA2','Demand'])
for key in df_pca:
if key != 'Demanda_shift_1':
corr = [np.corrcoef(df_pca[key],transformed[0])[0][1],np.corrcoef(df_pca[key],transformed[1])[0][1],np.corrcoef(df_pca[key],transformed[2])[0][1],np.corrcoef(df_pca[key],df_mls['wind_shift_1'])[0][1]]
table.append(corr)
#print row_format.format(key, *corr)
In [32]:
corr_df = pd.DataFrame(np.array(table).T,columns=df_pca.keys()).T
corr_df.columns = ['PCA0','PCA1','PCA2','Wind']
corr_df.sort('PCA0',ascending=False)
Out[32]:
In [33]:
def mls_vars_and_pca(df):
y_train = df['wind_shift_1'][train_start_date:train_end_date]
y_test = df['wind_shift_1'][test_start_date:test_end_date]
df_train = df.drop('wind_shift_1',1)[train_start_date:train_end_date]
df_test = df.drop('wind_shift_1',1)[test_start_date:test_end_date]
[transformed, matrix_w] = get_PCA_Matrix(df_train)
x_train_pls = transformed.T
x_test_pls = matrix_w.T.dot(df_test.T).T
x_train_pls = sm.add_constant(x_train_pls, prepend=True) #add a constant
x_test_pls = sm.add_constant(x_test_pls, prepend=True) #add a constant
df = df.dropna()
df = copy.deepcopy(df)
df_train = df[train_start_date:train_end_date]
df_test = df[test_start_date:test_end_date]
x_train = df_train.drop('wind_shift_1',1)
x_test = df_test.drop('wind_shift_1',1)
x_train = sm.add_constant(x_train, prepend=True) #add a constant
x_test = sm.add_constant(x_test, prepend=True) #add a constant
x_train_all = np.vstack((x_train_pls.T,x_train.T)).T
x_test_all = np.vstack((x_test_pls.T,x_test.T)).T
ols = sm.OLS(y_train,x_train_all)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test_all)
error = (y_test - pred_y)
mape = np.mean(abs(error))
print "MAE ERROR: " + str(mape)
print "MAE ERROR/MEAN: " + str(mape/np.mean(y_test))
print "MAX ERROR: " + str(max(error))
print res.summary()
return [mape,error]
In [34]:
[mape,error] = mls_vars_and_pca(df_mls)
In [35]:
def mls_pca(df):
#df_stand = copy.deepcopy((df - df.mean()) / df.std())
y_train = df['wind_shift_1'][train_start_date:train_end_date]
y_test = df['wind_shift_1'][test_start_date:test_end_date]
df_train = df.drop('wind_shift_1',1)[train_start_date:train_end_date]
df_test = df.drop('wind_shift_1',1)[test_start_date:test_end_date]
[transformed, matrix_w] = get_PCA_Matrix(df_train)
x_train = transformed.T
x_test = matrix_w.T.dot(df_test.T).T
x_train = sm.add_constant(x_train, prepend=True) #add a constant
x_test = sm.add_constant(x_test, prepend=True) #add a constant
ols = sm.OLS(y_train,x_train)
res = ols.fit() #create a model and fit it
pred_y = res.predict(x_test)
error = (y_test - pred_y)
mape = np.mean(abs(error))
print "MAE ERROR: " + str(mape)
print "MAE ERROR/MEAN: " + str(mape/np.mean(y_test))
print "MAX ERROR: " + str(max(error))
print res.summary()
table = []
for key in df:
if key != 'wind_shift_1':
corr = [np.corrcoef(df_train[key],transformed[0])[0][1],np.corrcoef(df_train[key],transformed[1])[0][1],np.corrcoef(df_train[key],transformed[2])[0][1],np.corrcoef(df_train[key],transformed[3])[0][1],np.corrcoef(df_train[key],df['wind_shift_1'][train_start_date:train_end_date])[0][1]]
table.append(corr)
corr_df = pd.DataFrame(np.array(table).T,columns=df_train.keys()).T
corr_df.columns = ['PCA0','PCA1','PCA2','PCA3','Wind']
corr_df.sort('PCA0',ascending=False)
mls_pca(df_mls)
In [36]:
#normalize values for quantity
#follow Diego algorithm
In [ ]: