In [ ]:
%matplotlib inline
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
In [ ]:
# URL of the UCI-hosted ENB2012_data.xlsx spreadsheet; passed to
# download_data() to fetch a local copy.
ENERGY = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx'
In [ ]:
def download_data(url, path='data'):
    """Download the file at *url* into the directory *path*.

    The saved file is named after the last component of the URL, and an
    existing file of that name is overwritten.

    Args:
        url: Address of the file to fetch.
        path: Destination directory; created (including missing parents)
            when it does not exist yet.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    # and also copes with nested destination paths.
    os.makedirs(path, exist_ok=True)

    # A timeout keeps a stalled server from hanging the caller indefinitely.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of saving an error page under the data file's name.
    response.raise_for_status()

    name = os.path.basename(url)
    with open(os.path.join(path, name), 'wb') as f:
        f.write(response.content)
In [ ]:
# Fetch the spreadsheet into the default 'data' directory.
download_data(ENERGY)
In [ ]:
# Load the downloaded workbook into a DataFrame. The path must match the
# directory and file name that download_data(ENERGY) wrote to.
energy = pd.read_excel('data/ENB2012_data.xlsx')
In [ ]:
# (rows, columns) of the loaded frame.
energy.shape
In [ ]:
# Overwrite the column labels with descriptive names. Order matters: the
# positional slices used later take columns 0-7 as model inputs and column 8
# as the regression target.
energy.columns = [
    # columns 0-7
    'compactness',
    'surface_area',
    'wall_area',
    'roof_area',
    'height',
    'orientation',
    'glazing_area',
    'distribution',
    # columns 8-9
    'heating_load',
    'cooling_load',
]
In [ ]:
# Peek at the first rows to sanity-check the renamed columns.
energy.head()
In [ ]:
# Per-column summary statistics.
energy.describe()
In [ ]:
# Pairwise scatter plots of every column against every other, with a kernel
# density estimate on the diagonal; low alpha keeps overlapping points legible.
scatter_matrix(
    energy,
    figsize=(18, 18),
    diagonal='kde',
    alpha=0.2,
)
plt.show()
In [ ]:
# Positional split of the frame: column 8 is the regression target, and the
# eight columns before it are the model inputs.
heat_labels = energy.iloc[:, 8]
energy_features = energy.iloc[:, :8]
In [ ]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split --
# and therefore every score computed from it -- reproducible across runs, so
# the models are compared on the same partition each time.
X_train, X_test, y_train, y_test = train_test_split(
    energy_features, heat_labels, test_size=0.2, random_state=42
)
In [ ]:
# Baseline: ordinary least-squares regression fitted on the training split.
model = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out rows only.
expected, predicted = y_test, model.predict(X_test)

print('Linear Regression model')
error = mse(expected, predicted)
print('Mean Squared Error: %0.3f' % error)
score = r2_score(expected, predicted)
print('Coefficient of Determination: %0.3f' % score)
In [ ]:
# Ridge regression (regularisation strength alpha=0.1) fitted on the same
# training split.
model = Ridge(alpha=0.1).fit(X_train, y_train)

# Evaluate on the held-out rows only.
expected, predicted = y_test, model.predict(X_test)

print('Ridge model')
error = mse(expected, predicted)
print('Mean Squared Error: %0.3f' % error)
score = r2_score(expected, predicted)
print('Coefficient of Determination: %0.3f' % score)
In [ ]:
# Random forest fitted on the same training split. The forest is randomised
# internally, so a fixed seed is needed for the reported metrics to be
# repeatable from one run to the next.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows only.
expected = y_test
predicted = model.predict(X_test)

print('Random Forest model')
print('Mean squared error = %0.3f' % mse(expected, predicted))
print('R2 score = %0.3f' % r2_score(expected, predicted))
Which model did best on the held-out test set (lowest mean squared error, highest R2 score)?
Which model should we try next?
Build a command-line application to compute the energy efficiency of a house:
https://github.com/georgetown-analytics/machine-learning/blob/master/code/energy.py
In [ ]: