Predicting Energy Efficiency


In [1]:
%matplotlib inline

import os
import requests
import pandas as pd 
import matplotlib.pyplot as plt

from pandas.tools.plotting import scatter_matrix

from sklearn import cross_validation as cv
from sklearn.cross_validation import train_test_split as tts

from sklearn.linear_model import Ridge
from sklearn.linear_model import RandomizedLasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

In [ ]:
# Source URL of the energy-efficiency spreadsheet (ENB2012) in the UCI archive.
ENERGY = (
    "http://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00242/ENB2012_data.xlsx"
)

In [ ]:
def download_data(url, path='data'):
    """
    Download the file at ``url`` and save it inside the directory ``path``.

    The saved file keeps the base name of the URL (everything after the
    last ``/``). The target directory is created when it does not exist.

    Parameters
    ----------
    url : str
        Location of the file to fetch.
    path : str, optional
        Directory the file is written to (default ``'data'``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    if not os.path.exists(path):
        # makedirs rather than mkdir so nested target directories work too.
        os.makedirs(path)

    response = requests.get(url)
    # Fail loudly on 4xx/5xx instead of saving an error page as the dataset.
    response.raise_for_status()

    name = os.path.basename(url)
    # response.content is raw bytes: write in binary mode so the spreadsheet
    # is stored unmodified (text mode mangles it on Windows and raises a
    # TypeError on Python 3).
    with open(os.path.join(path, name), 'wb') as f:
        f.write(response.content)

In [ ]:
# Fetch the spreadsheet at ENERGY; with no explicit path it is saved under
# download_data's default 'data' directory.
download_data(ENERGY)

In [ ]:
# Load the downloaded spreadsheet into a DataFrame. No `sep` argument is
# passed: delimiters only apply to text formats such as CSV, and read_excel
# does not accept one.
energy = pd.read_excel('data/ENB2012_data.xlsx')

In [ ]:
# Preview the first rows of the loaded frame.
energy.head()

In [ ]:
# Replace the spreadsheet's column headers with descriptive names:
# eight building features followed by the two target loads.
energy.columns = [
    'compactness',
    'surface_area',
    'wall_area',
    'roof_area',
    'height',
    'orientation',
    'glazing_area',
    'distribution',
    'heating_load',
    'cooling_load',
]

In [ ]:
# Summary statistics of the renamed columns.
energy.describe()

Are the features predictive?


In [ ]:
# Pairwise scatter plots of every column against every other one, with a
# kernel density estimate of each column on the diagonal.
scatter_matrix(
    energy,
    alpha=0.2,
    figsize=(18, 18),
    diagonal='kde'
)
plt.show()

In [ ]:
# Split the frame by column position: the first eight columns are the
# features, the remaining ones are the targets. `.iloc` is the explicit
# positional indexer and replaces the deprecated `.ix`.
energy_features = energy.iloc[:, 0:8]
energy_labels = energy.iloc[:, 8:]

In [ ]:
# Score each feature's relevance to the heating load with randomized Lasso.
# The estimator resamples the data randomly, so the seed is fixed to make the
# scores reproducible across runs.
model = RandomizedLasso(alpha=0.1, random_state=42)
model.fit(energy_features, energy_labels["heating_load"])
names = list(energy_features)

# Single-argument print calls behave identically on Python 2 and Python 3.
print("Features sorted by their score:")
print(sorted(zip([round(score, 4) for score in model.scores_],
                 names), reverse=True))

In [ ]:
# Score each feature's relevance to the cooling load with randomized Lasso.
# The estimator resamples the data randomly, so the seed is fixed to make the
# scores reproducible across runs.
model = RandomizedLasso(alpha=0.1, random_state=42)
model.fit(energy_features, energy_labels["cooling_load"])
names = list(energy_features)

# Single-argument print calls behave identically on Python 2 and Python 3.
print("Features sorted by their score:")
print(sorted(zip([round(score, 4) for score in model.scores_],
                 names), reverse=True))

Let's focus on predicting heating load


In [ ]:
# Target for the regression models below: the column at position 8,
# selected positionally with `.iloc` (replaces the deprecated `.ix`).
heat_labels = energy.iloc[:, 8]

In [ ]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that every
# model below is scored on the same split from run to run.
splits = cv.train_test_split(energy_features, heat_labels,
                             test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [ ]:
# Fit a ridge regression on the training split and score it on the held-out
# rows (fit returns the estimator itself, so the calls can be chained).
model = Ridge(alpha=0.1).fit(X_train, y_train)

expected, predicted = y_test, model.predict(X_test)

print("Ridge Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))

In [ ]:
# Fit an ordinary least-squares regression on the training split and score it
# on the held-out rows (fit returns the estimator itself, so the calls can be
# chained).
model = LinearRegression().fit(X_train, y_train)

expected, predicted = y_test, model.predict(X_test)

print("Linear Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))

In [ ]:
# Fit a random forest on the training split and score it on the held-out
# rows. The forest is built from random bootstrap samples and feature
# subsets, so the seed is fixed to keep the reported metrics reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

# Single-argument print calls behave identically on Python 2 and Python 3.
print("Random Forest model")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print("R2 score = %0.3f" % r2_score(expected, predicted))

Which model performed best?

What should we try next?

Ready for a bigger challenge? Try this one!

Build a command-line application to compute the energy efficiency of a house
https://github.com/georgetown-analytics/machine-learning/blob/master/code/energy.py