In [1]:
from __future__ import division
import os
import numpy as np 
import pandas as pd
from helpers import data_provider
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# figure adjustments for compatibility with latex
%matplotlib inline
plt.style.use('classic')
plt.rc("figure", facecolor="white")


fig_width_pt = 469.755  # Get this from LaTeX using \showthe\columnwidth
inches_per_pt = 1.0/72.27               # Convert pt to inch
golden_mean = (np.sqrt(5)-1.0)/2.0         # Aesthetic ratio
fig_width = fig_width_pt*inches_per_pt  # width in inches
fig_height = fig_width*golden_mean      # height in inches
fig_size =  [fig_width,fig_height]
params = {'backend': 'ps',
          'axes.labelsize': 10,
          'text.fontsize': 10,
          'legend.fontsize': 10,
          'xtick.labelsize': 8,
          'ytick.labelsize': 8,
          'text.usetex': True,
          'figure.figsize': fig_size}
plt.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
plt.rcParams.update(params)


C:\Users\chris\Anaconda3\envs\mlp\lib\site-packages\matplotlib\__init__.py:913: UserWarning: text.fontsize is deprecated and replaced with font.size; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Linear Regression Results


In [3]:
# load the results csv
data = pd.read_csv('results_linear_regression.csv',index_col='House')
data = data.drop(['Unnamed: 0'],axis=1)

In [4]:
#sns.set(style="whitegrid", color_codes=True)
y=data.train_r_squared_adj.values
x=data.index
pal = sns.color_palette("GnBu_d", len(data))
rank = data.train_r_squared_adj.argsort().argsort()

plt.figure(1)
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank])
plt.ylabel('Adjusted R Squared')
plt.title('Baseline Model - Goodness of Fit')
plt.savefig('figures/baseline_model/adj_rsquared.eps')
plt.savefig('figures/baseline_model/adj_rsquared.pdf')
plt.show()



In [8]:
#sns.set(style="whitegrid", color_codes=True)
y=data.test_r_squared_adj.values
x=data.index
pal = sns.color_palette("GnBu_r", len(data))
rank = data.test_r_squared_adj.argsort().argsort()

plt.figure(1)
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank])
plt.ylabel('Adjusted R Squared')
plt.title('Adjusted R Squared on Test Set')
plt.savefig('figures/baseline_model/adj_rsquared_test.eps')
plt.savefig('figures/baseline_model/adj_rsquared_test.pdf')
plt.show()



In [20]:
# Setting the positions and width for the bars
pos = list(range(len(data['train_F_DayOfMonth'])))
width = 0.2

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['train_F_DayOfMonth'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FFC222')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['train_F_Hour'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#F78F1E')
        # with label the second value in first_name
        #label=df['first_name'][1]

# Create a bar with post_score data,
# in position pos + some width buffer,
plt.bar([p + width*2 for p in pos],
        #using df['post_score'] data,
        data['train_F_Month'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#EE3224')
        # with label the third value in first_name
        #label=df['first_name'][2]
        
        
plt.bar([p + width*3 for p in pos],
        #using df['post_score'] data,
        data['train_F_WeekDays'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#B9770E')
        # with label the third value in first_name
        #label=df['first_name'][2]


# Set the y axis label
ax.set_ylabel('F-statistic')

# Set the chart's title
ax.set_title('Baseline Model - Importance of Predictors per House')

# Set the position of the x ticks
ax.set_xticks([p + 1.6 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, max(data['train_F_Hour'] + data['train_F_Month'])] )

# Adding the legend and showing the plot
plt.legend(['Day of Month', 'Hour', 'Month', 'Day of Week'], loc='best')
plt.grid()
plt.savefig('figures/baseline_model/anova.eps')
plt.savefig('figures/baseline_model/anova.pdf')
plt.show()



In [18]:
# Setting the positions and width for the bars
pos = list(range(len(data['test_mae'])))
width = 0.25

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['test_mae'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FE2E2E')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['test_mde'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#0000FF')
        # with label the second value in first_name
        #label=df['first_name'][1]



# Set the y axis label
ax.set_ylabel('Absolute Error')

# Set the chart's title
ax.set_title('Baseline Model - Error Metrics')

# Set the position of the x ticks
ax.set_xticks([p + 1.0 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, max(data['test_mae'] + 0.1)] )

# Adding the legend and showing the plot
plt.legend(['Mean Absolute Error', 'Median Absolute Error'], loc='best')
plt.grid()
plt.savefig('figures/baseline_model/mae_mde.eps')
plt.savefig('figures/baseline_model/mae_mde.pdf')
plt.show()



In [21]:
# Setting the positions and width for the bars
pos = list(range(len(data['test_mape'])))
width = 0.25

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['test_mape'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#2ECCFA')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['test_stdape'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FFBF00')
        # with label the second value in first_name
        #label=df['first_name'][1]



# Set the y axis label
ax.set_ylabel('Percentage Error')

# Set the chart's title
ax.set_title('Baseline Model - Percentage Error Metrics')

# Set the position of the x ticks
ax.set_xticks([p + 1.0 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, max(data['test_stdape'] + 10)] )

# Adding the legend and showing the plot
plt.legend(['Mean Absolute Percentage Error', 'Standard Deviation Percentage Error'], loc='best')
plt.grid()
plt.savefig('figures/baseline_model/mape_stdape.eps')
plt.savefig('figures/baseline_model/mape_stdape.pdf')
plt.show()


Linear Regression with weather variables


In [4]:
# load the results csv
data = pd.read_csv('results_linear_weather.csv',index_col='House')
data = data.drop(['Unnamed: 0'],axis=1)

In [8]:
# Setting the positions and width for the bars
pos = list(range(len(data['train_F_AirTemperature'])))
width = 0.2

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['train_F_AirTemperature'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FFC222')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['train_F_Relative_humitidity'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#F78F1E')
        # with label the second value in first_name
        #label=df['first_name'][1]

# Create a bar with post_score data,
# in position pos + some width buffer,
plt.bar([p + width*2 for p in pos],
        #using df['post_score'] data,
        data['train_F_Total_horizontal_solar_irradiation'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#EE3224')
        # with label the third value in first_name
        #label=df['first_name'][2]
        
        
plt.bar([p + width*3 for p in pos],
        #using df['post_score'] data,
        data['train_F_Total_rainfall'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#B9770E')
        # with label the third value in first_name
        #label=df['first_name'][2]


# Set the y axis label
ax.set_ylabel('F-statistic')

# Set the chart's title
ax.set_title('Linear Model - Importance of Weather Predictors per House')

# Set the position of the x ticks
ax.set_xticks([p + 1.6 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, max(data['train_F_Relative_humitidity'] + 30)] )

# Adding the legend and showing the plot
plt.legend(['Air Temperature','Relative Humidity', 'Total Solar Irradiation','Total Rainfall'], loc='best')
plt.grid()
plt.savefig('figures/baseline_model/anova_weather.eps')
plt.savefig('figures/baseline_model/anova_weather.pdf')
plt.show()


Dynamic Regression Results


In [3]:
# load the results csv
data = pd.read_csv('results_dynamic_regression.csv',index_col='House')
data = data.drop(['Unnamed: 0'],axis=1)

In [6]:
#sns.set(style="whitegrid", color_codes=True)
y=data.train_r_squared_adj.values
x=data.index
pal = sns.color_palette("GnBu_d", len(data))
rank = data.train_r_squared_adj.argsort().argsort()

plt.figure(1)
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank])
plt.ylabel('Adjusted R Squared')
plt.title('Dynamic Model - Goodness of Fit')
plt.savefig('figures/dynamic_model/adj_rsquared.eps')
plt.savefig('figures/dynamic_model/adj_rsquared.pdf')
plt.show()



In [5]:
#sns.set(style="whitegrid", color_codes=True)
y=data.test_r_squared_adj.values
x=data.index
pal = sns.color_palette("GnBu_r", len(data))
rank = data.test_r_squared_adj.argsort().argsort()

plt.figure(1)
sns.barplot(x=x,y=y,palette=np.array(pal[::-1])[rank])
plt.ylabel('Adjusted R Squared')
plt.title('Adjusted R Squared on Test Set')
plt.savefig('figures/dynamic_model/adj_rsquared_test.eps')
plt.savefig('figures/dynamic_model/adj_rsquared_test.pdf')
plt.show()



In [14]:
# Setting the positions and width for the bars
pos = list(range(len(data['train_F_DayOfMonth'])))
width = 0.3

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['train_F_t1'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FFC222')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['train_F_t24'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#EE3224')
        # with label the second value in first_name
        #label=df['first_name'][1]


# Set the y axis label
ax.set_ylabel('F-statistic')

# Set the chart's title
ax.set_title('Dynamic Model - Importance of Lag Predictors per House')

# Set the position of the x ticks
ax.set_xticks([p + 1 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, max(data['train_F_Hour'] + data['train_F_t1'])] )

# Adding the legend and showing the plot
plt.legend(['Lag 1','Lag 24'], loc='best')
plt.grid()
plt.savefig('figures/dynamic_model/anova.eps')
plt.savefig('figures/dynamic_model/anova.pdf')
plt.show()



In [15]:
# Setting the positions and width for the bars
pos = list(range(len(data['test_mae'])))
width = 0.25

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['test_mae'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FE2E2E')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['test_mde'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#0000FF')
        # with label the second value in first_name
        #label=df['first_name'][1]



# Set the y axis label
ax.set_ylabel('Absolute Error')

# Set the chart's title
ax.set_title('Dynamic Model - Error Metrics')

# Set the position of the x ticks
ax.set_xticks([p + 1.0 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, max(data['test_mae'] + 0.1)] )

# Adding the legend and showing the plot
plt.legend(['Mean Absolute Error', 'Median Absolute Error'], loc='best')
plt.grid()
plt.savefig('figures/dynamic_model/mae_mde.eps')
plt.savefig('figures/dynamic_model/mae_mde.pdf')
plt.show()



In [16]:
# Setting the positions and width for the bars
pos = list(range(len(data['test_mape'])))
width = 0.25

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['test_mape'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#2ECCFA')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['test_stdape'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FFBF00')
        # with label the second value in first_name
        #label=df['first_name'][1]



# Set the y axis label
ax.set_ylabel('Percentage Error')

# Set the chart's title
ax.set_title('Dynamic Model - Percentage Error Metrics')

# Set the position of the x ticks
ax.set_xticks([p + 1.0 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, max(data['test_stdape'] + 10)] )

# Adding the legend and showing the plot
plt.legend(['Mean Absolute Percentage Error', 'Standard Deviation Percentage Error'], loc='best')
plt.grid()
plt.savefig('figures/dynamic_model/mape_stdape.eps')
plt.savefig('figures/dynamic_model/mape_stdape.pdf')
plt.show()


Random Forests Regression Results


In [4]:
# load the results csv
data = pd.read_csv('results_forest_regression.csv',index_col='House')
data = data.drop(['Unnamed: 0'],axis=1)

In [11]:
# Setting the positions and width for the bars
pos = list(range(len(data['train_r_squared_adj'])))
width = 0.25

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['train_r_squared_adj'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#086A87')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['test_r_squared_adj'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#04B486')
        # with label the second value in first_name
        #label=df['first_name'][1]



# Set the y axis label
ax.set_ylabel(r'Adjusted $R^2$')

# Set the chart's title
ax.set_title('Dynamic Model - Percentage Error Metrics')

# Set the position of the x ticks
ax.set_xticks([p + 1.0 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, 1.2] )

# Adding the legend and showing the plot
plt.legend([r'Adjusted $R^2$ on Training set', r'Adjusted $R^2$ on Test set'], loc='best')

plt.ylabel(r'Adjusted $R^2$')
plt.title('Random Forest Model - Goodness of Fit')
plt.savefig('figures/forest_model/adj_rsquared.eps')
plt.savefig('figures/forest_model/adj_rsquared.pdf')
plt.show()



In [12]:
# Setting the positions and width for the bars
pos = list(range(len(data['test_mae'])))
width = 0.25

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['test_mae'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FE2E2E')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['test_mde'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#0000FF')
        # with label the second value in first_name
        #label=df['first_name'][1]



# Set the y axis label
ax.set_ylabel('Absolute Error')

# Set the chart's title
ax.set_title('Random Forest Model - Error Metrics')

# Set the position of the x ticks
ax.set_xticks([p + 1.0 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, max(data['test_mae'] + 0.1)] )

# Adding the legend and showing the plot
plt.legend(['Mean Absolute Error', 'Median Absolute Error'], loc='best')
plt.grid()
plt.savefig('figures/forest_model/mae_mde.eps')
plt.savefig('figures/forest_model/mae_mde.pdf')
plt.show()



In [13]:
# Setting the positions and width for the bars
pos = list(range(len(data['test_mape'])))
width = 0.25

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['test_mape'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#2ECCFA')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['test_stdape'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FFBF00')
        # with label the second value in first_name
        #label=df['first_name'][1]



# Set the y axis label
ax.set_ylabel('Percentage Error')

# Set the chart's title
ax.set_title('Random Forest Model - Percentage Error Metrics')

# Set the position of the x ticks
ax.set_xticks([p + 1.0 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, max(data['test_stdape'] + 10)] )

# Adding the legend and showing the plot
plt.legend(['Mean Absolute Percentage Error', 'Standard Deviation Percentage Error'], loc='best')
plt.grid()
plt.savefig('figures/forest_model/mape_stdape.eps')
plt.savefig('figures/forest_model/mape_stdape.pdf')
plt.show()



In [17]:
# Setting the positions and width for the bars
pos = list(range(len(data['importance_DayOfMonth'])))
width = 0.15

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['importance_DayOfMonth'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FFC222')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['importance_Hour'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#F78F1E')
        # with label the second value in first_name
        #label=df['first_name'][1]

# Create a bar with post_score data,
# in position pos + some width buffer,
plt.bar([p + width*2 for p in pos],
        #using df['post_score'] data,
        data['importance_Month'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#EE3224')
        # with label the third value in first_name
        #label=df['first_name'][2]
        
        
plt.bar([p + width*3 for p in pos],
        #using df['post_score'] data,
        data['importance_WeekDays'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#B9770E')
        # with label the third value in first_name
        #label=df['first_name'][2]
        
        
plt.bar([p + width*4 for p in pos],
        #using df['post_score'] data,
        data['importance_t1'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#00BFFF')

plt.bar([p + width*5 for p in pos],
        #using df['post_score'] data,
        data['importance_t24'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#5FB404')



# Set the y axis label
ax.set_ylabel('Feature Importance')

# Set the chart's title
ax.set_title('Random Forest Model - Importance of Predictors per House')

# Set the position of the x ticks
ax.set_xticks([p + 1.6 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, 1] )

# Adding the legend and showing the plot
plt.legend(['Day Of Month','Hour', 'Month', 'Day of Week', 'Lag 1', 'Lag 24'], loc='best')
plt.grid()
plt.savefig('figures/forest_model/anova.eps')
plt.savefig('figures/forest_model/anova.pdf')
plt.show()


XgBoost results


In [3]:
# load the results csv
data = pd.read_csv('results_xgboost_regression.csv',index_col='House')
data = data.drop(['Unnamed: 0'],axis=1)

In [4]:
data.describe()


Out[4]:
importance_DayOfMonth importance_Hour importance_Month importance_WeekDays importance_air_temperature importance_humidity importance_irradiation importance_rainfall importance_t1 importance_t24 ... test_r_squared test_r_squared_adj test_stdape train_mae train_mape train_mde train_mse train_r_squared train_r_squared_adj train_stdape
count 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 ... 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000 17.000000
mean 695.588235 1515.411765 451.235294 368.764706 1132.941176 814.882353 1009.647059 137.352941 1562.117647 1178.000000 ... 0.453006 0.431935 34.412841 0.103256 27.134902 0.061143 0.029828 0.604109 0.599156 29.272979
std 243.507202 371.993793 169.232285 121.673502 327.103476 280.138556 224.261271 69.997269 395.093167 284.608064 ... 0.098873 0.102635 15.871454 0.045212 9.743833 0.027939 0.021743 0.072087 0.072965 12.778051
min 314.000000 849.000000 186.000000 207.000000 746.000000 414.000000 751.000000 31.000000 1092.000000 702.000000 ... 0.297597 0.271493 13.770930 0.029371 12.459685 0.016998 0.002055 0.500907 0.494199 11.929515
25% 536.000000 1240.000000 331.000000 295.000000 911.000000 637.000000 821.000000 94.000000 1246.000000 989.000000 ... 0.398298 0.376387 20.773136 0.064233 20.507424 0.043221 0.007879 0.542962 0.537374 17.541865
50% 612.000000 1484.000000 439.000000 355.000000 1030.000000 786.000000 987.000000 124.000000 1421.000000 1063.000000 ... 0.451986 0.431421 33.862009 0.107020 27.043943 0.058112 0.028112 0.614601 0.609853 27.351185
75% 855.000000 1735.000000 554.000000 447.000000 1308.000000 832.000000 1100.000000 155.000000 1812.000000 1374.000000 ... 0.502909 0.484066 39.810147 0.143029 33.388687 0.073841 0.047752 0.655609 0.651428 33.960186
max 1277.000000 2225.000000 840.000000 662.000000 2029.000000 1612.000000 1611.000000 309.000000 2489.000000 1864.000000 ... 0.663962 0.650832 69.660151 0.173045 46.690851 0.109628 0.068066 0.721636 0.718101 58.329075

8 rows × 24 columns


In [5]:
# Setting the positions and width for the bars
pos = list(range(len(data['importance_DayOfMonth'])))
width = 0.15

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['importance_Hour'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FFC222')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['importance_air_temperature'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#F78F1E')
        # with label the second value in first_name
        #label=df['first_name'][1]

# Create a bar with post_score data,
# in position pos + some width buffer,
plt.bar([p + width*2 for p in pos],
        #using df['post_score'] data,
        data['importance_irradiation'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#EE3224')
        # with label the third value in first_name
        #label=df['first_name'][2]
        
        
plt.bar([p + width*3 for p in pos],
        #using df['post_score'] data,
        data['importance_t1'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#B9770E')
        # with label the third value in first_name
        #label=df['first_name'][2]
        
        
plt.bar([p + width*4 for p in pos],
        #using df['post_score'] data,
        data['importance_t24'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#00BFFF')



# Set the y axis label
ax.set_ylabel('Feature Importance')

# Set the chart's title
ax.set_title('Extreme Gradient Boosting - Highly Important Predictors')

# Set the position of the x ticks
ax.set_xticks([p + 1.6 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, 3500] )

# Adding the legend and showing the plot
plt.legend(['Hour','Air temperature', 'Average solar irradiation', 'Lag 1', 'Lag 24'], loc='best')
plt.grid()
plt.savefig('figures/xgboost/high_importance.eps')
plt.savefig('figures/xgboost/high_importance.pdf')
plt.show()



In [14]:
# Setting the positions and width for the bars
pos = list(range(len(data['importance_DayOfMonth'])))
width = 0.15

# Plotting the bars
fig, ax = plt.subplots(1)

# Create a bar with pre_score data,
# in position pos,
plt.bar(pos,
        #using df['pre_score'] data,
        data['importance_Month'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#FFC222')
        # with label the first value in first_name
        #label=df['first_name'][0]

# Create a bar with mid_score data,
# in position pos + some width buffer,
plt.bar([p + width for p in pos],
        #using df['mid_score'] data,
        data['importance_DayOfMonth'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#F78F1E')
        # with label the second value in first_name
        #label=df['first_name'][1]

# Create a bar with post_score data,
# in position pos + some width buffer,
plt.bar([p + width*2 for p in pos],
        #using df['post_score'] data,
        data['importance_WeekDays'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#EE3224')
        # with label the third value in first_name
        #label=df['first_name'][2]
        
        
plt.bar([p + width*3 for p in pos],
        #using df['post_score'] data,
        data['importance_humidity'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#B9770E')
        # with label the third value in first_name
        #label=df['first_name'][2]
        
        
plt.bar([p + width*4 for p in pos],
        #using df['post_score'] data,
        data['importance_rainfall'],
        # of width
        width,
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#00BFFF')



# Set the y axis label
ax.set_ylabel('Feature Importance')

# Set the chart's title
ax.set_title('Ensemble of Decision Trees - Less Important Predictors')

# Set the position of the x ticks
ax.set_xticks([p + 1.6 * width for p in pos])

# Set the labels for the x ticks
ax.set_xticklabels(data.index)
ax.set_xlabel('House')

# Setting the x-axis and y-axis limits
plt.xlim(min(pos)-width, max(pos)+width*5)
plt.ylim([0, 2000] )

# Adding the legend and showing the plot
plt.legend(['Month','Day of month', 'Day of week', 'Total rainfall', 'Average humidity'], loc='best')
plt.grid()
plt.savefig('figures/xgboost/low_importance.eps')
plt.savefig('figures/xgboost/low_importance.pdf')
plt.show()



In [ ]: