Predicting Energy Efficiency


In [ ]:
%matplotlib inline

import os
import requests
import pandas as pd 
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

In [ ]:
# Location of the ENB2012 Excel workbook on the UCI archive server.
ENERGY = (
    'http://archive.ics.uci.edu/ml/machine-learning-databases/'
    '00242/ENB2012_data.xlsx'
)

In [ ]:
def download_data(url, path='data'):
    """Download ``url`` and save it inside ``path`` under the URL's base name.

    Parameters
    ----------
    url : str
        Address of the file to fetch with an HTTP GET.
    path : str, optional
        Directory the file is written into. It is created, together with
        any missing parent directories, when it does not exist yet.
        Defaults to ``'data'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Checking the status
        prevents an error page from being saved in place of the data file.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # makedirs(exist_ok=True) copes with nested paths and removes the
    # check-then-create race of the exists()/mkdir() pair.
    os.makedirs(path, exist_ok=True)

    # requests waits indefinitely unless a timeout is given; bound the wait
    # so a stalled server cannot hang the notebook.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    name = os.path.basename(url)
    with open(os.path.join(path, name), 'wb') as f:
        f.write(response.content)

In [ ]:
# Fetch the workbook at ENERGY into the default 'data' directory.
download_data(ENERGY)

In [ ]:
# Load the downloaded workbook from the 'data' directory with pandas.
energy = pd.read_excel('data/ENB2012_data.xlsx')

In [ ]:
# (rows, columns) of the loaded table.
energy.shape

In [ ]:
# Give the columns descriptive names. Further down, the first eight are used
# as features and 'heating_load' as the label.
energy.columns = [
    'compactness',
    'surface_area',
    'wall_area',
    'roof_area',
    'height',
    'orientation',
    'glazing_area',
    'distribution',
    'heating_load',
    'cooling_load',
]

In [ ]:
# Preview the first few rows of the table.
energy.head()

In [ ]:
# Summary statistics (count, mean, spread, quartiles) per column.
energy.describe()

Are the features predictive?


In [ ]:
# Pairwise scatter plots of every column against every other, with a kernel
# density estimate of each column on the diagonal.
scatter_matrix(
    energy,
    alpha=0.2,
    figsize=(18, 18),
    diagonal='kde',
)
plt.show()

Let's focus on predicting heating load


In [ ]:
# Select by column label rather than by position, so the feature/label split
# is self-describing and cannot silently shift if the column order changes.
# The label slice is inclusive on both ends: the eight columns from
# 'compactness' through 'distribution' are the features.
energy_features = energy.loc[:, 'compactness':'distribution']
heat_labels = energy['heating_load']

In [ ]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split, and every score computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    energy_features, heat_labels, test_size=0.2, random_state=42
)

In [ ]:
# Ordinary least squares: fit on the training split, score on the held-out
# split. fit() returns the estimator itself, so the calls can be chained.
model = LinearRegression().fit(X_train, y_train)

expected, predicted = y_test, model.predict(X_test)

print('Linear Regression model')
print('Mean Squared Error: %0.3f' % mse(expected, predicted))
print('Coefficient of Determination: %0.3f' % r2_score(expected, predicted))

In [ ]:
# Ridge regression (regularization strength alpha=0.1): fit on the training
# split, score on the held-out split. fit() returns the estimator itself.
model = Ridge(alpha=0.1).fit(X_train, y_train)

expected, predicted = y_test, model.predict(X_test)

print('Ridge model')
print('Mean Squared Error: %0.3f' % mse(expected, predicted))
print('Coefficient of Determination: %0.3f' % r2_score(expected, predicted))

In [ ]:
# A random forest draws random bootstrap samples while fitting, so an
# unseeded model reports different scores on every run. Fixing random_state
# makes the metrics below reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Score on the held-out split only.
expected = y_test
predicted = model.predict(X_test)

print('Random Forest model')
print('Mean squared error = %0.3f' % mse(expected, predicted))
print('R2 score = %0.3f' % r2_score(expected, predicted))

Which one did best?
Which model should we try next?

Ready for a bigger challenge? Try this one!

Build a command line application to compute the energy efficiency of a house:
https://github.com/georgetown-analytics/machine-learning/blob/master/code/energy.py


In [ ]: