Predicting Energy Efficiency



In [ ]:

    
%matplotlib inline

import os
import requests
import pandas as pd 
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split



In [ ]:

    
# Location of the UCI "Energy efficiency" spreadsheet (ENB2012). Fetched over
# HTTPS so the download cannot be tampered with in transit.
ENERGY = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx'



In [ ]:

    
def download_data(url, path='data', timeout=30, overwrite=True):
    """
    Download the file at ``url`` into the directory ``path``.

    The file is saved under the last path component of the URL.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    path : str
        Directory to save into; created (including parents) if missing.
    timeout : float or tuple
        Seconds to wait for the server, passed straight to ``requests.get``
        so a stalled connection cannot hang the notebook forever.
    overwrite : bool
        When False and the target file already exists, skip the network
        request and return the existing file's location.

    Returns
    -------
    str
        Location of the downloaded (or already present) file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    name = os.path.basename(url)
    target = os.path.join(path, name)

    if not overwrite and os.path.exists(target):
        return target

    # makedirs copes with nested directories, and exist_ok avoids the
    # check-then-create race of an exists()/mkdir() pair.
    os.makedirs(path, exist_ok=True)

    response = requests.get(url, timeout=timeout)
    # Fail loudly here rather than writing an error page to disk and letting
    # the spreadsheet parser choke on it later.
    response.raise_for_status()

    with open(target, 'wb') as f:
        f.write(response.content)

    return target



In [ ]:

    
# Fetch the spreadsheet into the default 'data' directory. This performs a
# network request every time the cell is run.
download_data(ENERGY)



In [ ]:

    
# Build the path from the same pieces download_data uses (its default 'data'
# directory plus the URL's basename) so the two cannot drift apart.
energy = pd.read_excel(os.path.join('data', os.path.basename(ENERGY)))



In [ ]:

    
# Dimensions of the loaded table as (rows, columns).
energy.shape



In [ ]:

    
# Give the ten columns descriptive names: eight building descriptors first,
# followed by the two load measurements.
descriptor_names = [
    'compactness', 'surface_area', 'wall_area', 'roof_area',
    'height', 'orientation', 'glazing_area', 'distribution',
]
load_names = ['heating_load', 'cooling_load']
energy.columns = descriptor_names + load_names



In [ ]:

    
# Preview the first rows, now shown under the renamed columns.
energy.head()



In [ ]:

    
# Per-column summary statistics (count, mean, std, min, quartiles, max).
energy.describe()

Are the features predictive?



In [ ]:

    
# Pairwise scatter plots of every column against every other, with a kernel
# density estimate of each column on the diagonal.
matrix_options = dict(figsize=(18, 18), diagonal='kde', alpha=0.2)
scatter_matrix(energy, **matrix_options)
plt.show()

Let's focus on predicting heating load



In [ ]:

    
# Inputs are the eight building descriptors ('compactness' through
# 'distribution'; label slices include both endpoints). The target is the
# heating load column.
energy_features = energy.loc[:, 'compactness':'distribution']
heat_labels = energy.loc[:, 'heating_load']



In [ ]:

    
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split, and therefore every score below, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    energy_features, heat_labels, test_size=0.2, random_state=42
)



In [ ]:

    
# Ordinary least squares: fit on the training split, score on the held-out split.
model = LinearRegression().fit(X_train, y_train)

predicted = model.predict(X_test)
expected = y_test

linear_mse = mse(expected, predicted)
linear_r2 = r2_score(expected, predicted)

print('Linear Regression model')
print('Mean Squared Error: %0.3f' % linear_mse)
print('Coefficient of Determination: %0.3f' % linear_r2)



In [ ]:

    
# Ridge regression (alpha=0.1): fit on the training split, score on the
# held-out split.
model = Ridge(alpha=0.1).fit(X_train, y_train)

predicted = model.predict(X_test)
expected = y_test

ridge_mse = mse(expected, predicted)
ridge_r2 = r2_score(expected, predicted)

print('Ridge model')
print('Mean Squared Error: %0.3f' % ridge_mse)
print('Coefficient of Determination: %0.3f' % ridge_r2)



In [ ]:

    
# Random forest: fit on the training split, score on the held-out split.
# random_state pins the forest's internal randomness so the reported scores
# are the same on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print('Random Forest model')
print('Mean squared error = %0.3f' % mse(expected, predicted))
print('R2 score = %0.3f' % r2_score(expected, predicted))

Which model performed best on the test set?
Which model should we try next?

Ready for a bigger challenge? Try this one!

Build a command line application to compute the energy efficiency of a house:
https://github.com/georgetown-analytics/machine-learning/blob/master/code/energy.py



In [ ]: