In [1]:
from datetime import datetime
import json
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter, DayLocator, HourLocator
%matplotlib inline

import urllib2
from lxml import etree
from bs4 import BeautifulSoup
import time

from calendar import monthrange

import numpy as np
import copy

import statsmodels.api as sm
import pickle

import seaborn as sns
sns.set(context="paper", font="monospace")

In [2]:
years = [2012,2013,2014]
months = [1,2,3,4,5,6,7,8,9,10,11,12]
dfs = []
for year in years:
    for month in months:  
        dfs.append(pd.read_csv('../data/generation/'+str(month)+'_'+str(year)+'.csv'))
df_orig = pd.concat(dfs)
df_orig = df_orig.reset_index(drop=True)

In [3]:
df_orig['Demanda'][df_orig['Date']=='4/2/2013']


Out[3]:
9600    348.328
9601    347.578
9602    334.565
9603    329.508
9604    331.746
9605    349.541
9606    372.759
9607     390.05
9608    446.225
9609    498.499
9610    542.353
9611    530.698
9612    527.171
9613     515.32
9614    545.604
9615    542.141
9616    513.279
9617    499.916
9618    520.823
9619    576.835
9620    553.737
9621    519.532
9622    435.978
Name: Demanda, dtype: object

In [4]:
df_orig['Demanda'][df_orig['Date']=='28/4/2013']
df_orig['Demanda'][df_orig.index==11609]


Out[4]:
11609    28/4/2013
Name: Demanda, dtype: object

In [5]:
df_gen = copy.deepcopy(df_orig)
datetime_array = []
date_array = []
month_array = []
weekday_array = []
day_array = []
demand_array = []
tcpi_array = []
date_split = ''
for i,val in enumerate(df_gen.iterrows()):
    row = val[1]
    if(row['Demanda']=='4/2/2013'):
        row['Date'] = '4/2/2013'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['SND-L9090'] = None
    if(row['Demanda']=='28/4/2013'):
        row['Date'] = '28/4/2013'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['SND-L9090'] = None
    if(row['Demanda']=='25/7/2012'):
        row['Date'] = '25/7/2012'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = row['PCHN']
        row['PCHN'] = row['EEC20']
        row['EEC20'] = row['EEC']
        row['EEC'] = row['CEN']
        row['CEN'] = row['PBP']
        row['PBP'] = None
    if(row['Demanda']=='27/12/2012'):
        row['Date'] = '27/12/2012'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['SND-L9090'] = None
    if(row['Demanda']=='3/6/2014'):
        row['Date'] = '3/6/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = None
    if(row['Demanda']=='4/6/2014'):
        row['Date'] = '4/6/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = None
    if(row['Demanda']=='5/6/2014'):
        row['Date'] = '5/6/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = row['PCHN']
        row['PCHN'] = row['EEC20']
        row['EEC20'] = row['EEC']
        row['EEC'] = row['CEN']
        row['CEN'] = row['PBP']
        row['PBP'] = row['AMY2']
        row['AMY2'] = row['AMY1']
        row['AMY1'] = row['PHC2']
        row['PHC2'] = row['PHC1']
        row['PHC1'] = row['PCG9']
        row['PCG9'] = row['PCG8']
        row['PCG8'] = row['PCG7']
        row['PCG7'] = row['PCG6']
        row['PCG6'] = row['PCG5']
        row['PCG5'] = None
    if(row['TCPI-L9150']=='9/6/2014'):
        row['Date'] = '9/6/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = None
        row['PLB1'] = None
    if(row['Demanda']=='10/6/2014'):
        row['Date'] = '10/6/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = None
    if(row['Demanda']=='12/6/2014'):
        row['Date'] = '12/6/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = None
    if(row['Demanda']=='15/6/2014'):
        row['Date'] = '15/6/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = None
    if(row['Demanda']=='24/6/2014'):
        row['Date'] = '24/6/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = None
    if(row['Demanda']=='27/6/2014'):
        row['Date'] = '27/6/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = row['PCHN']
        row['PCHN'] = row['EEC20']
        row['EEC20'] = row['EEC']
        row['EEC'] = row['CEN']
        row['CEN'] = row['PBP']
        row['PBP'] = row['AMY2']
        row['AMY2'] = row['AMY1']
        row['AMY1'] = row['PHC2']
        row['PHC2'] = row['PHC1']
        row['PHC1'] = row['PCG9']
        row['PCG9'] = row['PCG8']
        row['PCG8'] = row['PCG7']
        row['PCG7'] = row['PCG6']
        row['PCG6'] = row['PCG5']
        row['PCG5'] = None
    if(row['Demanda']=='1/7/2014'):
        row['Date'] = '1/7/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = None
    if(row['Demanda']=='7/7/2014'):
        row['Date'] = '7/7/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = row['PCHN']
        row['PCHN'] = row['EEC20']
        row['EEC20'] = row['EEC']
        row['EEC'] = row['CEN']
        row['CEN'] = row['PBP']
        row['PBP'] = row['AMY2']
        row['AMY2'] = row['AMY1']
        row['AMY1'] = row['PHC2']
        row['PHC2'] = None
    if(row['Demanda']=='11/7/2014'):
        row['Date'] = '11/7/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = None
    if(row['Demanda']=='12/7/2014'):
        row['Date'] = '12/7/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = None
    if(row['Demanda']=='19/7/2014'):
        row['Date'] = '19/7/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = row['PCHN']
        row['PCHN'] = row['EEC20']
        row['EEC20'] = row['EEC']
        row['EEC'] = row['CEN']
        row['CEN'] = row['PBP']
        row['PBP'] = row['AMY2']
        row['AMY2'] = row['AMY1']
        row['AMY1'] = row['PHC2']
        row['PHC2'] = row['PHC1']
        row['PHC1'] = row['PCG9']
        row['PCG9'] = row['PCG8']
        row['PCG8'] = row['PCG7']
        row['PCG7'] = row['PCG6']
        row['PCG6'] = row['PCG5']
        row['PCG5'] = row['PCG4']
        row['PCG4'] = row['PCG3']
        row['PCG3'] = row['PCG2']
        row['PCG2'] = row['PCG1']
        row['PCG1'] = None
    if(row['Demanda']=='20/7/2014'):
        row['Date'] = '20/7/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = row['PCHN']
        row['PCHN'] = row['EEC20']
        row['EEC20'] = row['EEC']
        row['EEC'] = row['CEN']
        row['CEN'] = row['PBP']
        row['PBP'] = row['AMY2']
        row['AMY2'] = row['AMY1']
        row['AMY1'] = row['PHC2']
        row['PHC2'] = None
    if(row['TCPI-L9150']=='21/7/2014'):
        row['Date'] = '21/7/2014'
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = row['PNI1']
        row['PNI1'] = row['PCHN']
        row['PCHN'] = row['EEC20']
        row['EEC20'] = row['EEC']
        row['EEC'] = row['CEN']
        row['CEN'] = row['PBP']
        row['PBP'] = row['AMY2']
        row['AMY2'] = row['AMY1']
        row['AMY1'] = row['PHC2']
        row['PHC2'] = row['PHC1']
        row['PHC1'] = row['PCG9']
        row['PCG9'] = row['PCG8']
        row['PCG8'] = row['PCG7']
        row['PCG7'] = row['PCG6']
        row['PCG6'] = row['PCG5']
        row['PCG5'] = None
        row['Demanda'] = row['TCPI-L9150']
        row['TCPI-L9150'] = row['AMY-L9030']
        row['AMY-L9030'] = row['SND-L9090']
        row['LNI-L9040'] = row['AMY-L9030']
        row['TPC'] = row['LNI-L9040']
        row['LNI-9040'] = row['TPC']
        row['TPC'] = row['PMT3']
        row['PMT3'] = row['PMT2']
        row['PMT2'] = row['PMT1']
        row['PMT1'] = row['PMG5']
        row['PMG5'] = row['PMG4']
        row['PMG4'] = row['PMG3']
        row['PMG3'] = row['PLB2']
        row['PLB2'] = row['PLB1']
        row['PLB1'] = row['PEN3']
        row['PEN3'] = row['PEN1 y 2']
        row['PEN1 y 2'] = row['PCF2']
        row['PCF2'] = row['PCF1']
        row['PCF1'] = row['PCA2']
        row['PCA2'] = row['PCA1']
        row['PCA1'] = row['NSL']
        row['NSL'] = row['MTR']
        row['MTR'] = row['GSR']
        row['GSR'] = row['PNI2']
        row['PNI2'] = None
    try:
        date_split = row['Date'].split('/')
    except:
        print row
    hour = row['HORA']
    date = datetime(int(float(date_split[2])),int(float(date_split[1])),int(float(date_split[0])))
    date_time = datetime(int(float(date_split[2])),int(float(date_split[1])),int(float(date_split[0])),int(float(hour)))
    month = int(float(date_split[0]))  
    weekday = int(float(date_time.weekday()))
    day = int(float(date_time.day))
    datetime_array.append(date_time)
    date_array.append(date)
    month_array.append(month)
    weekday_array.append(weekday)
    day_array.append(day)
    demand_array.append(row['Demanda'])
    tcpi_array.append(row['TCPI-L9150'])
df_gen['TCPI-L9150'] = tcpi_array
df_gen['TCPI-L9150'] = df_gen['TCPI-L9150'].astype(np.float64)
df_gen['Demanda'] = demand_array
df_gen['Demanda'] = df_gen['Demanda'].astype(np.float64)

df_gen['datetime']=datetime_array
df_gen['date']=date_array
df_gen['month']=month_array
df_gen['weekday']=weekday_array
df_gen['day']=day_array
df_gen = df_gen.set_index('datetime').drop('Date',1).drop('Unnamed: 0',1)
df_gen['HORA'] = df_gen[['HORA']].astype(int)

In [6]:
df_gen['Demanda'][df_gen['Demanda'].isnull()]


Out[6]:
Series([], name: Demanda, dtype: float64)

In [4]:
def plot_with_time_axis(index,value,xaxis='year',symbol='-'):
    fig, ax = plt.subplots()
    ax.plot_date(index, value, symbol)
    # format the ticks
    years_plot = YearLocator()
    months_plot = MonthLocator()  # every month
    date_plot = DayLocator()
    if xaxis == 'year':
        fmt = DateFormatter('%Y')
        ax.xaxis.set_major_locator(years_plot)
        ax.xaxis.set_major_formatter(fmt)
        ax.xaxis.set_minor_locator(months_plot)
    elif xaxis == 'month':
        fmt = DateFormatter('%m\n%Y')
        ax.xaxis.set_major_locator(months_plot)
        ax.xaxis.set_major_formatter(fmt)
        ax.xaxis.set_minor_locator(date_plot)
    elif xaxis == 'day':
        fmt = DateFormatter('%d\n%m')
        ax.xaxis.set_major_formatter(fmt)
    elif xaxis == 'hour':
        fmt = DateFormatter('%H')
        ax.xaxis.set_major_formatter(fmt)
    ax.autoscale_view()

In [5]:
#changed EOLO to EOL, EEC50 to EEC, and TPT to TPC
#changed PMGA to PMG, PCH1 to PCHN
df_gen['Albanisa_Bunker'] = df_gen[['PCG1','PCG2','PCG3','PCG4','PCG5','PCG6','PCG7','PCG8','PCG9','PHC1','PHC2']].sum(axis=1)
df_gen['AEI_Wind'] = df_gen[['AMY1','AMY2']].sum(axis=1)
df_gen['Gruo_Terra_Wind'] = df_gen[['PBP']].sum(axis=1)
df_gen['Other_Wind'] = df_gen[['ABR','EOL']].sum(axis=1)
df_gen['AEI_Bunker'] = df_gen[['CEN','EEC20','EEC','TPC']].sum(axis=1)
df_gen['Enel_Hydro'] = df_gen[['PCA1','PCA2','PCF1','PCF2']].sum(axis=1)
df_gen['Enel_Bunker'] = df_gen[['PLB1','PLB2','PMG3','PMG4','PMG5']].sum(axis=1)
df_gen['Other_Bunker'] = df_gen[['PCHN','PNI1','PNI2','GSR']].sum(axis=1)
df_gen['Hidropanasma_Hydro'] = df_gen[['HPA1','HPA2']].sum(axis=1)

bunker_fuel = ['PCG1','PCG2','PCG3','PCG4','PCG5','PCG6','PCG7','PCG8','PCG9','PHC1','PHC2','CEN','EEC20','EEC','TPC','PLB1','PLB2','PMG3','PMG4','PMG5','PCHN','PNI1','PNI2','GSR']
interconnect = ['LNI-L9040','SND-L9090','AMY-L9030','TCPI-L9150']
hydro = ['PCA1','PCA2','PCF1','PCF2','HPA1','HPA2']
geothermal = ['PEN1 y 2','PEN3','PEN4','PMT1','PMT2']
biomass = ['MTR','NSL'] 
wind = ['AMY1','AMY2','ABR','EOL','PBP']
df_gen['bunker'] = df_gen[bunker_fuel].sum(axis=1)
df_gen['interconnect'] = df_gen[interconnect].sum(axis=1)
df_gen['hydro'] = df_gen[hydro].sum(axis=1)
df_gen['geothermal'] = df_gen[geothermal].sum(axis=1)
df_gen['biomass'] = df_gen[biomass].sum(axis=1)
df_gen['wind'] = df_gen[wind].sum(axis=1)

df_big_gen = pd.DataFrame([df_gen['wind'],df_gen['biomass'],df_gen['interconnect'],df_gen['geothermal'],df_gen['bunker'],df_gen['hydro']]).T

In [6]:
df_gen['all_power'] = df_gen['bunker']+df_gen['hydro']+df_gen['geothermal']+df_gen['biomass']+df_gen['wind']

In [7]:
df_big_gen['Demanda'] = df_gen['Demanda']
df_pickle = copy.deepcopy(df_big_gen)
df_pickle['Demanda'] = df_gen['Demanda']

In [9]:
error = (df_gen['all_power'].astype(float)+df_gen['interconnect']-df_gen['Demanda'].astype(float))/df_gen['Demanda'].astype(float)
plt.hist(error,300,range=[-1,1])
print 'error: ' + str(np.mean(error)) + '+/- (' + str(3*np.std(error))


error: -0.00862335469024+/- (0.254362108183

In [10]:
start_date = datetime(2013,1,1)
end_date = datetime(2013,1,2)
df_plot = df_gen[start_date:end_date]
plot_with_time_axis(df_plot.index,df_plot['all_power'],'hour',)



In [11]:
fuel_types = ['bunker','interconnect','hydro','geothermal','biomass','wind']
fuel_types_array = []
start_date = datetime(2013,1,1)
end_date = datetime(2013,1,5)
df_plot = df_gen[start_date:end_date]
plt.plot(df_plot['all_power'],label='all')

for key in fuel_types:
    fuel_types_array.append(df_plot[key])
    plt.plot(df_plot[key],label=key)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)


Out[11]:
<matplotlib.legend.Legend at 0x7fb49079f2d0>

In [12]:
plt.plot(df_plot['geothermal'],label='geothermal')
plt.plot(df_plot['biomass']+df_plot['geothermal'],label='biomass')
plt.plot(df_plot['wind']+df_plot['biomass']+df_plot['geothermal'],label='wind')
plt.plot(df_plot['hydro']+df_plot['wind']+df_plot['biomass']+df_plot['geothermal'],label='hydro')
plt.plot(df_plot['bunker']+df_plot['hydro']+df_plot['wind']+df_plot['biomass']+df_plot['geothermal'],label='bunker')
plt.plot(df_plot['interconnect']+df_plot['bunker']+df_plot['hydro']+df_plot['wind']+df_plot['biomass']+df_plot['geothermal'],label='interconnect')
plt.plot(df_plot['Demanda'],label='Demand')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#sample correlation coeffcients
#hydro 2011-2013 vs. 2014


Out[12]:
<matplotlib.legend.Legend at 0x7fb490870d50>

In [13]:
plt.plot(df_plot['all_power']-df_plot['Demanda'].astype(float))
plt.plot(df_plot['interconnect'])
plt.plot(df_plot['all_power']-df_plot['Demanda'].astype(float)+df_plot['interconnect'])


Out[13]:
[<matplotlib.lines.Line2D at 0x7fb490dfebd0>]

In [67]:
df_big_gen.keys()
df_corr = copy.deepcopy(df_big_gen.dropna())

In [68]:
#The Quantities Correlated
corr_df = pd.DataFrame(np.corrcoef(df_corr.T),columns=df_big_gen.keys()).T
corr_df.columns = df_big_gen.keys()
print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())


                  wind   biomass  interconnect  geothermal    bunker  \
wind          1.000000  0.469583      0.213128    0.179373 -0.481952   
biomass       0.469583  1.000000      0.220028   -0.018901 -0.379613   
interconnect  0.213128  0.220028      1.000000    0.149827  0.093449   
geothermal    0.179373 -0.018901      0.149827    1.000000 -0.110458   
bunker       -0.481952 -0.379613      0.093449   -0.110458  1.000000   
hydro        -0.289219 -0.149855     -0.076092   -0.013980  0.084089   
Demanda       0.080801  0.030313      0.136742    0.116499  0.732294   

                 hydro   Demanda  
wind         -0.289219  0.080801  
biomass      -0.149855  0.030313  
interconnect -0.076092  0.136742  
geothermal   -0.013980  0.116499  
bunker        0.084089  0.732294  
hydro         1.000000  0.267482  
Demanda       0.267482  1.000000  
Out[68]:
<matplotlib.axes.AxesSubplot at 0x7fb48cd048d0>
<matplotlib.figure.Figure at 0x7fb48ea18f90>

In [65]:
df_norm = copy.deepcopy(df_corr.dropna())
for key in df_norm:
    df_norm[key] = df_norm[key]/np.max(df_norm[key])

In [66]:
#The Quantities Correlated Normalized
corr_df = pd.DataFrame(np.corrcoef(df_norm.T),columns=df_norm.keys()).T
corr_df.columns = df_norm.keys()
print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())


                  wind   biomass  interconnect  geothermal    bunker  \
wind          1.000000  0.025030      0.073297    0.011595 -0.239908   
biomass       0.025030  1.000000      0.061267    0.059907 -0.064135   
interconnect  0.073297  0.061267      1.000000    0.011928  0.171112   
geothermal    0.011595  0.059907      0.011928    1.000000 -0.013232   
bunker       -0.239908 -0.064135      0.171112   -0.013232  1.000000   
hydro        -0.250886 -0.100507      0.023586   -0.062228  0.268320   
Demanda       0.071399 -0.009434     -0.017230    0.043991  0.864270   

                 hydro   Demanda  
wind         -0.250886  0.071399  
biomass      -0.100507 -0.009434  
interconnect  0.023586 -0.017230  
geothermal   -0.062228  0.043991  
bunker        0.268320  0.864270  
hydro         1.000000  0.477478  
Demanda       0.477478  1.000000  
Out[66]:
<matplotlib.axes.AxesSubplot at 0x7fb48ce9e090>
<matplotlib.figure.Figure at 0x7fb48d2e5e10>

In [43]:
df_corr = copy.deepcopy(df_big_gen.diff(1).dropna())
corr_df = pd.DataFrame(np.corrcoef(df_corr.T),columns=df_big_gen.keys()).T
corr_df.columns = df_big_gen.keys()
corr_df

print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())


                  wind   biomass  interconnect  geothermal    bunker  \
wind          1.000000  0.011297      0.088610    0.009113 -0.261979   
biomass       0.011297  1.000000      0.072385    0.062183 -0.042096   
interconnect  0.088610  0.072385      1.000000    0.023355  0.167318   
geothermal    0.009113  0.062183      0.023355    1.000000 -0.000024   
bunker       -0.261979 -0.042096      0.167318   -0.000024  1.000000   
hydro        -0.309384 -0.109119      0.024376   -0.073003  0.106177   
Demanda       0.054164  0.012932     -0.092797    0.060233  0.793795   

                 hydro   Demanda  
wind         -0.309384  0.054164  
biomass      -0.109119  0.012932  
interconnect  0.024376 -0.092797  
geothermal   -0.073003  0.060233  
bunker        0.106177  0.793795  
hydro         1.000000  0.385308  
Demanda       0.385308  1.000000  
Out[43]:
<matplotlib.axes.AxesSubplot at 0x7fb48daf6e90>
<matplotlib.figure.Figure at 0x7fb48dad6810>

In [64]:
df_corr = copy.deepcopy(df_big_gen.diff(2).dropna())
corr_df = pd.DataFrame(np.corrcoef(df_corr.T),columns=df_big_gen.keys()).T
corr_df.columns = df_big_gen.keys()

print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())


                  wind   biomass  interconnect  geothermal    bunker  \
wind          1.000000  0.025030      0.073297    0.011595 -0.239908   
biomass       0.025030  1.000000      0.061267    0.059907 -0.064135   
interconnect  0.073297  0.061267      1.000000    0.011928  0.171112   
geothermal    0.011595  0.059907      0.011928    1.000000 -0.013232   
bunker       -0.239908 -0.064135      0.171112   -0.013232  1.000000   
hydro        -0.250886 -0.100507      0.023586   -0.062228  0.268320   
Demanda       0.071399 -0.009434     -0.017230    0.043991  0.864270   

                 hydro   Demanda  
wind         -0.250886  0.071399  
biomass      -0.100507 -0.009434  
interconnect  0.023586 -0.017230  
geothermal   -0.062228  0.043991  
bunker        0.268320  0.864270  
hydro         1.000000  0.477478  
Demanda       0.477478  1.000000  
Out[64]:
<matplotlib.axes.AxesSubplot at 0x7fb48d05cb50>
<matplotlib.figure.Figure at 0x7fb48d23aad0>

In [49]:
df_corr = copy.deepcopy(df_big_gen.drop('Demanda',1).diff(24).dropna())
corr_df = pd.DataFrame(np.corrcoef(df_corr.T),columns=df_corr.keys()).T
corr_df.columns = df_corr.keys()

print corr_df
plt.figure()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_df.as_matrix(), linewidths=0, robust=True, square=False,xticklabels=df_big_gen.keys(),yticklabels=df_big_gen.keys())


                  wind   biomass  interconnect  geothermal    bunker     hydro
wind          1.000000  0.014273      0.029517    0.022856 -0.540879 -0.255181
biomass       0.014273  1.000000      0.036967    0.044795 -0.068725 -0.089805
interconnect  0.029517  0.036967      1.000000   -0.043033  0.036743 -0.001829
geothermal    0.022856  0.044795     -0.043033    1.000000 -0.049323 -0.058565
bunker       -0.540879 -0.068725      0.036743   -0.049323  1.000000  0.022092
hydro        -0.255181 -0.089805     -0.001829   -0.058565  0.022092  1.000000
Out[49]:
<matplotlib.axes.AxesSubplot at 0x7fb48d686ed0>
<matplotlib.figure.Figure at 0x7fb48da5ce10>

In [50]:
def vcorrcoef(X,y):
    #Xm = np.reshape(np.mean(X,axis=1),(X.shape[0],1))
    Xm = np.mean(X)
    ym = np.mean(y)
    r_num = np.sum((X-Xm)*(y-ym))
    r_den = np.sqrt(np.sum((X-Xm)**2)*np.sum((y-ym)**2))
    r = r_num/r_den
    return r

'''
def vcorrcoef(X,y):
    Xm = np.reshape(np.mean(X,axis=1),(X.shape[0],1))
    ym = np.mean(y)
    r_num = np.sum((X-Xm)*(y-ym),axis=1)
    r_den = np.sqrt(np.sum((X-Xm)**2,axis=1)*np.sum((y-ym)**2))
    r = r_num/r_den
    return r
'''
pass

In [51]:
start_date = datetime(2013,1,1)
end_date = datetime(2013,1,2)
df_play = copy.deepcopy(df_big_gen[start_date:end_date])

In [69]:
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_big_gen['wind'],df_big_gen['bunker']))
for i in range(1,350):
    sample_corr_coeff.append(vcorrcoef(df_big_gen['wind'].diff(i),df_big_gen['bunker'].diff(i)))
plt.plot(sample_corr_coeff)


Out[69]:
[<matplotlib.lines.Line2D at 0x7fb48cad2390>]

In [75]:
start_date = datetime(2012,1,1)
end_date = datetime(2013,12,31)
df_play = copy.deepcopy(df_big_gen[start_date:end_date])
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['bunker']))
print '2012-2013: Wind - Bunker (Quantity): ' + str(vcorrcoef(df_play['wind'],df_play['bunker']))
print '2012-2013: Wind - Bunker (12-Hour): ' + str(vcorrcoef(df_play['wind'].diff(12),df_play['bunker'].diff(12)))
for i in range(1,350):
    sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['bunker'].diff(i)))
plt.plot(sample_corr_coeff)

sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2012-2013: Wind - Hydro (Quantity): ' + str(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2012-2013: Wind - Hydro (12-Hour): ' + str(vcorrcoef(df_play['wind'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,350):
    sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff)
hydro_2012_2013 = df_play['hydro']

sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['Demanda'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['Demanda'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['Demanda'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,350):
    sample_corr_coeff.append(vcorrcoef(df_play['Demanda'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff)
plt.legend()
hydro_2012_2013 = df_play['hydro']


2012-2013: Wind - Bunker (Quantity): -0.471457981253
2012-2013: Wind - Bunker (12-Hour): -0.382294550094
2012-2013: Wind - Hydro (Quantity): -0.387193988464
2012-2013: Wind - Hydro (12-Hour): -0.360481734696
2012-2013: Demand - Hydro (Quantity): 0.321797474512
2012-2013: Demand - Hydro (12-Hour): 0.656086310756
/usr/local/lib/python2.7/dist-packages/matplotlib/axes.py:4747: UserWarning: No labeled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labeled objects found. "

In [132]:
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['Demanda'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['Demanda'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['Demanda'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,25):
    sample_corr_coeff.append(vcorrcoef(df_play['Demanda'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,label='Demand - Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')

sample_corr_coeff = []
df_play['Demandaminuswind']=df_play['Demanda'] - df_play['wind']
sample_corr_coeff.append(vcorrcoef(df_play['Demandaminuswind'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['Demandaminuswind'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['Demandaminuswind'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,25):
    sample_corr_coeff.append(vcorrcoef(df_play['Demandaminuswind'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,'k',label='Demand(-wind) - Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')

sample_corr_coeff = []
df_play['Demandaminuswindinterconnect']=df_play['Demanda'] - df_play['wind'] + df_play['interconnect']
sample_corr_coeff.append(vcorrcoef(df_play['Demandaminuswindinterconnect'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['Demandaminuswindinterconnect'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['Demandaminuswindinterconnect'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,25):
    sample_corr_coeff.append(vcorrcoef(df_play['Demandaminuswindinterconnect'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,'g',label='Demand(-wind +interconnect) - Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')
plt.legend(title='Correlations',bbox_to_anchor=(1.25, .9),
           bbox_transform=plt.gcf().transFigure)

sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2012-2013: Demand - Hydro (Quantity): ' + str(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2012-2013: Demand - Hydro (12-Hour): ' + str(vcorrcoef(df_play['wind'].diff(12),df_play['hydro'].diff(12)))
for i in range(1,25):
    sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,'r',label='Wind-Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')
plt.legend(title='Correlations',bbox_to_anchor=(1.35, .9),
           bbox_transform=plt.gcf().transFigure)

sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['interconnect'],df_play['hydro']))
for i in range(1,25):
    sample_corr_coeff.append(vcorrcoef(df_play['interconnect'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff,'c',label='Interconnect-Hydro')
plt.xlabel('Length of Ramp (Hours)')
plt.ylabel('Sample Correlation Coefficient')
plt.title('Demand - Hydro Correlation')
plt.legend(title='Correlations',bbox_to_anchor=(1.35, .9),
           bbox_transform=plt.gcf().transFigure)


2012-2013: Demand - Hydro (Quantity): 0.266913599191
2012-2013: Demand - Hydro (12-Hour): 0.669368072851
2012-2013: Demand - Hydro (Quantity): 0.3963154341
2012-2013: Demand - Hydro (12-Hour): 0.699856522557
2012-2013: Demand - Hydro (Quantity): 0.373205006308
2012-2013: Demand - Hydro (12-Hour): 0.695068776
2012-2013: Demand - Hydro (Quantity): -0.289532311523
2012-2013: Demand - Hydro (12-Hour): -0.363596976791
Out[132]:
<matplotlib.legend.Legend at 0x7fb4840ea890>

In [54]:
start_date = datetime(2014,1,1)
end_date = datetime(2014,12,31)
df_play = copy.deepcopy(df_big_gen[start_date:end_date])
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['bunker']))
print '2014: Wind - Bunker (Quantity): ' + str(vcorrcoef(df_play['wind'],df_play['bunker']))
for i in range(1,350):
    sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['bunker'].diff(i)))
plt.plot(sample_corr_coeff)

sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind'],df_play['hydro']))
print '2014: Wind - Hydro (1-Hour Ramp): ' + str(vcorrcoef(df_play['wind'].diff(1),df_play['hydro'].diff(1)))
for i in range(1,350):
    sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i),df_play['hydro'].diff(i)))
plt.plot(sample_corr_coeff)
hydro_2014 = df_play['hydro']


2014: Wind - Bunker (Quantity): -0.544520570825
2014: Wind - Hydro (1-Hour Ramp): -0.317485123136

In [76]:
start_date = datetime(2012,1,1)
end_date = datetime(2014,12,31)
df_play = copy.deepcopy(df_big_gen[start_date:end_date])
sample_corr_coeff = []
sample_corr_coeff.append(vcorrcoef(df_play['wind']+df_play['hydro'],df_play['bunker']))
print '2014: Wind+Hydro - Bunker (Quantity): ' + str(vcorrcoef(df_play['wind']+df_play['hydro'],df_play['bunker']))
for i in range(1,350):
    sample_corr_coeff.append(vcorrcoef(df_play['wind'].diff(i)+df_play['hydro'].diff(i),df_play['bunker'].diff(i)))
plt.plot(sample_corr_coeff)


2014: Wind+Hydro - Bunker (Quantity): -0.426657309669
Out[76]:
[<matplotlib.lines.Line2D at 0x7fb48c907a90>]

In [27]:
plt.plot(hydro_2012_2013.index,hydro_2012_2013)
plt.plot(hydro_2014.index,hydro_2014)
print np.mean(hydro_2012_2013)
print np.mean(hydro_2014)


49.3143684721
43.9040096143

In [28]:
train_start_date = datetime(2013,1,1)
train_end_date = datetime(2013,6,30)

test_start_date = datetime(2013,7,1)
test_end_date = datetime(2013,12,31)


def mls(df):
    df_train = df[train_start_date:train_end_date]
    df_test = df[test_start_date:test_end_date]
    x_train = df_train.drop('wind_shift_1',1)
    x_test = df_test.drop('wind_shift_1',1)

    y_train = df['wind_shift_1'][train_start_date:train_end_date]
    y_test = df['wind_shift_1'][test_start_date:test_end_date]
   
    x_train = sm.add_constant(x_train, prepend=True) #add a constant
    x_test = sm.add_constant(x_test, prepend=True) #add a constant
    print df_train.drop('wind_shift_1',1).keys()
    ols = sm.OLS(y_train,x_train)
    res = ols.fit() #create a model and fit it
    pred_y = res.predict(x_test)

    error = (y_test - pred_y)
    mape = np.mean(abs(error))

    print "MAE ERROR: " + str(mape)
    print "MAE ERROR/MEAN: " + str(mape/np.mean(y_test))
    print "MAX ERROR: " + str(max(error))
    print res.summary()
    return pred_y
#This shifts the demand data, so that each row has the values and then the next hours demand.

df_mls = copy.deepcopy(df_big_gen)
df_mls['wind_shift_1'] = df_mls['wind'].shift(-1)
df_mls['wind_back_1'] = df_mls['wind'].diff(1)
df_mls = df_mls.dropna()
pred_y = mls(df_mls)
df_test = copy.deepcopy(df_mls[test_start_date:test_end_date])
   
df_test['pred_wind'] = pred_y


Index([u'wind', u'biomass', u'interconnect', u'geothermal', u'bunker', u'hydro', u'Demanda', u'wind_back_1'], dtype='object')
MAE ERROR: 10.8248895622
MAE ERROR/MEAN: 0.249905548369
MAX ERROR: 133.674010262
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           wind_shift_1   R-squared:                       0.881
Model:                            OLS   Adj. R-squared:                  0.881
Method:                 Least Squares   F-statistic:                     4000.
Date:                Mon, 13 Apr 2015   Prob (F-statistic):               0.00
Time:                        10:18:14   Log-Likelihood:                -18058.
No. Observations:                4312   AIC:                         3.613e+04
Df Residuals:                    4303   BIC:                         3.619e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
const           -2.4314      2.347     -1.036      0.300        -7.032     2.169
wind             0.9452      0.028     33.977      0.000         0.891     1.000
biomass          0.1947      0.037      5.280      0.000         0.122     0.267
interconnect    -0.0426      0.022     -1.942      0.052        -0.086     0.000
geothermal       0.0404      0.036      1.112      0.266        -0.031     0.112
bunker           0.0302      0.028      1.084      0.279        -0.024     0.085
hydro            0.0272      0.030      0.920      0.358        -0.031     0.085
Demanda         -0.0251      0.028     -0.902      0.367        -0.080     0.029
wind_back_1      0.0280      0.016      1.785      0.074        -0.003     0.059
==============================================================================
Omnibus:                      958.285   Durbin-Watson:                   2.005
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             9704.558
Skew:                           0.765   Prob(JB):                         0.00
Kurtosis:                      10.188   Cond. No.                     5.08e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.08e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

In [29]:
def get_PCA_Matrix(df_imp_stand):
    all_samples = df_imp_stand.as_matrix().T
    num_var = len(all_samples)
    mean_vector = all_samples.mean(1)
    #print('Mean Vector:\n', mean_vector)

    cov_mat = np.cov(all_samples)
    #print('Covariance Matrix:\n', cov_mat)
    eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
    for i in range(len(eig_val_cov)):
        eigvec_cov = eig_vec_cov[:,i].reshape(1,num_var).T
        #print('Eigenvalue {} from covariance matrix: {}'.format(i+1, eig_val_cov[i]))

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_val_cov[i]), eig_vec_cov[:,i]) for i in range(len(eig_val_cov))]

    # Sort the (eigenvalue, eigenvector) tuples from high to low
    def getKey(item):
        return item[0]
    eig_pairs = sorted(eig_pairs,key=getKey)

    eig_pairs.reverse()

    # Visually confirm that the list is correctly sorted by decreasing eigenvalues
    for i in eig_pairs:
        #print(i[0])
        pass
    matrix_w = np.hstack((eig_pairs[0][1].reshape(num_var,1), eig_pairs[1][1].reshape(num_var,1),eig_pairs[2][1].reshape(num_var,1),eig_pairs[3][1].reshape(num_var,1),eig_pairs[4][1].reshape(num_var,1)))
    #print matrix_w
    transformed = matrix_w.T.dot(all_samples)
    return [transformed, matrix_w]

In [30]:
df_pca = copy.deepcopy(df_mls.drop('wind_shift_1',1))
[transformed,matrix_w] = get_PCA_Matrix(df_pca)

In [31]:
table = []
#row_format ="{:>20}" * 5
#print row_format.format("", *['PCA0','PCA1','PCA2','Demand'])
for key in df_pca:
    if key != 'Demanda_shift_1':
        corr = [np.corrcoef(df_pca[key],transformed[0])[0][1],np.corrcoef(df_pca[key],transformed[1])[0][1],np.corrcoef(df_pca[key],transformed[2])[0][1],np.corrcoef(df_pca[key],df_mls['wind_shift_1'])[0][1]]
        table.append(corr)
        #print row_format.format(key, *corr)

In [32]:
corr_df = pd.DataFrame(np.array(table).T,columns=df_pca.keys()).T
corr_df.columns = ['PCA0','PCA1','PCA2','Wind']
corr_df.sort('PCA0',ascending=False)


Out[32]:
PCA0 PCA1 PCA2 Wind
bunker 0.969359 0.203836 -0.130980 -0.457575
Demanda 0.873145 -0.467423 0.126002 0.080673
hydro 0.199982 0.030418 0.951669 -0.278527
interconnect 0.099959 -0.220644 -0.209040 0.208788
wind_back_1 -0.014118 -0.129939 -0.176144 0.140369
geothermal -0.033063 -0.250138 0.071619 0.179490
biomass -0.268699 -0.531132 0.005514 0.469167
wind -0.345469 -0.895851 -0.233112 0.948542

In [33]:
def mls_vars_and_pca(df):

    y_train = df['wind_shift_1'][train_start_date:train_end_date]
    y_test = df['wind_shift_1'][test_start_date:test_end_date]

    df_train = df.drop('wind_shift_1',1)[train_start_date:train_end_date]
    df_test = df.drop('wind_shift_1',1)[test_start_date:test_end_date]

    [transformed, matrix_w] = get_PCA_Matrix(df_train)

    x_train_pls = transformed.T
    x_test_pls =  matrix_w.T.dot(df_test.T).T

    x_train_pls = sm.add_constant(x_train_pls, prepend=True) #add a constant
    x_test_pls = sm.add_constant(x_test_pls, prepend=True) #add a constant

    df = df.dropna()
    df = copy.deepcopy(df)

    df_train = df[train_start_date:train_end_date]
    df_test = df[test_start_date:test_end_date]

    x_train = df_train.drop('wind_shift_1',1)
    x_test = df_test.drop('wind_shift_1',1)

    x_train = sm.add_constant(x_train, prepend=True) #add a constant
    x_test = sm.add_constant(x_test, prepend=True) #add a constant

    x_train_all = np.vstack((x_train_pls.T,x_train.T)).T
    x_test_all = np.vstack((x_test_pls.T,x_test.T)).T

    ols = sm.OLS(y_train,x_train_all)
    res = ols.fit() #create a model and fit it
    pred_y = res.predict(x_test_all)

    error = (y_test - pred_y)
    mape = np.mean(abs(error))

    print "MAE ERROR: " + str(mape)
    print "MAE ERROR/MEAN: " + str(mape/np.mean(y_test))
    print "MAX ERROR: " + str(max(error))
    print res.summary()
    return [mape,error]

In [34]:
[mape,error] = mls_vars_and_pca(df_mls)


MAE ERROR: 10.8248895622
MAE ERROR/MEAN: 0.249905548369
MAX ERROR: 133.674010262
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           wind_shift_1   R-squared:                       0.881
Model:                            OLS   Adj. R-squared:                  0.881
Method:                 Least Squares   F-statistic:                     4000.
Date:                Mon, 13 Apr 2015   Prob (F-statistic):               0.00
Time:                        10:18:17   Log-Likelihood:                -18058.
No. Observations:                4312   AIC:                         3.613e+04
Df Residuals:                    4303   BIC:                         3.619e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         -1.2157      1.173     -1.036      0.300        -3.516     1.085
x1            -0.0850      0.001    -80.047      0.000        -0.087    -0.083
x2             0.3488      0.002    154.744      0.000         0.344     0.353
x3            -0.1669      0.005    -36.959      0.000        -0.176    -0.158
x4             0.0039      0.007      0.564      0.572        -0.010     0.017
x5            -0.0276      0.008     -3.490      0.000        -0.043    -0.012
x6            -1.2157      1.173     -1.036      0.300        -3.516     1.085
x7             0.6102      0.028     22.185      0.000         0.556     0.664
x8             0.1524      0.037      4.138      0.000         0.080     0.225
x9            -0.0626      0.019     -3.305      0.001        -0.100    -0.025
x10            0.0410      0.036      1.134      0.257        -0.030     0.112
x11            0.1390      0.028      5.036      0.000         0.085     0.193
x12            0.1585      0.029      5.525      0.000         0.102     0.215
x13           -0.1403      0.028     -5.068      0.000        -0.195    -0.086
x14            0.0007      0.008      0.079      0.937        -0.016     0.017
==============================================================================
Omnibus:                      958.285   Durbin-Watson:                   2.005
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             9704.558
Skew:                           0.765   Prob(JB):                         0.00
Kurtosis:                      10.188   Cond. No.                     3.34e+18
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.11e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [35]:
def mls_pca(df):


    #df_stand = copy.deepcopy((df - df.mean()) / df.std())


    y_train = df['wind_shift_1'][train_start_date:train_end_date]
    y_test = df['wind_shift_1'][test_start_date:test_end_date]

    df_train = df.drop('wind_shift_1',1)[train_start_date:train_end_date]
    df_test = df.drop('wind_shift_1',1)[test_start_date:test_end_date]

    [transformed, matrix_w] = get_PCA_Matrix(df_train)

    x_train = transformed.T
    x_test =  matrix_w.T.dot(df_test.T).T


    x_train = sm.add_constant(x_train, prepend=True) #add a constant
    x_test = sm.add_constant(x_test, prepend=True) #add a constant

    ols = sm.OLS(y_train,x_train)
    res = ols.fit() #create a model and fit it
    pred_y = res.predict(x_test)

    error = (y_test - pred_y)
    mape = np.mean(abs(error))
    
    print "MAE ERROR: " + str(mape)
    print "MAE ERROR/MEAN: " + str(mape/np.mean(y_test))
    print "MAX ERROR: " + str(max(error))
    print res.summary()

    table = []
    for key in df:
        if key != 'wind_shift_1':
            corr = [np.corrcoef(df_train[key],transformed[0])[0][1],np.corrcoef(df_train[key],transformed[1])[0][1],np.corrcoef(df_train[key],transformed[2])[0][1],np.corrcoef(df_train[key],transformed[3])[0][1],np.corrcoef(df_train[key],df['wind_shift_1'][train_start_date:train_end_date])[0][1]]
            table.append(corr)
    corr_df = pd.DataFrame(np.array(table).T,columns=df_train.keys()).T
    corr_df.columns = ['PCA0','PCA1','PCA2','PCA3','Wind']
    corr_df.sort('PCA0',ascending=False)

mls_pca(df_mls)


MAE ERROR: 12.5252267429
MAE ERROR/MEAN: 0.289159869914
MAX ERROR: 142.692601102
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           wind_shift_1   R-squared:                       0.874
Model:                            OLS   Adj. R-squared:                  0.874
Method:                 Least Squares   F-statistic:                     5979.
Date:                Mon, 13 Apr 2015   Prob (F-statistic):               0.00
Time:                        10:18:17   Log-Likelihood:                -18189.
No. Observations:                4312   AIC:                         3.639e+04
Df Residuals:                    4306   BIC:                         3.643e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const        -26.2685      1.662    -15.810      0.000       -29.526   -23.011
x1            -0.1699      0.002    -77.694      0.000        -0.174    -0.166
x2             0.6977      0.005    150.194      0.000         0.689     0.707
x3            -0.3338      0.009    -35.872      0.000        -0.352    -0.316
x4             0.0078      0.014      0.548      0.584        -0.020     0.036
x5            -0.0552      0.016     -3.387      0.001        -0.087    -0.023
==============================================================================
Omnibus:                      849.595   Durbin-Watson:                   1.925
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             7385.138
Skew:                           0.690   Prob(JB):                         0.00
Kurtosis:                       9.261   Cond. No.                     3.44e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.44e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

In [36]:
#normalize values for quantity
#follow Diego algorithm

In [ ]: