In [1]:
%matplotlib inline
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn import cross_validation as cv
from sklearn.cross_validation import train_test_split as tts
from sklearn.linear_model import Ridge
from sklearn.linear_model import RandomizedLasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
In [ ]:
# URL of the ENB2012_data.xlsx workbook hosted on the UCI machine-learning archive.
ENERGY = "http://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
In [ ]:
def download_data(url, path='data'):
    """
    Download the file at ``url`` and save it inside the directory ``path``.

    The saved file is named after the last path component of the URL.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    path : str, optional
        Directory to store the file in; created if it does not exist.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    if not os.path.exists(path):
        os.mkdir(path)

    response = requests.get(url)
    # Fail loudly instead of saving an HTTP error page as if it were the data.
    response.raise_for_status()

    name = os.path.basename(url)
    # response.content is raw bytes, so the file must be opened in binary mode:
    # text mode raises TypeError on Python 3 and mangles newlines on Windows.
    with open(os.path.join(path, name), 'wb') as f:
        f.write(response.content)
In [ ]:
# Fetch the workbook into the default 'data' directory.
# NOTE: this issues a network request every time the cell is run.
download_data(ENERGY)
In [ ]:
# Load the downloaded workbook into a DataFrame.
# The former sep="," argument is dropped: it is a read_csv option, not a
# read_excel one, and recent pandas rejects it as an unexpected keyword.
energy = pd.read_excel('data/ENB2012_data.xlsx')
In [ ]:
# Preview the first rows to check that the workbook loaded.
energy.head()
In [ ]:
# Assign a descriptive name to each of the ten columns.
energy.columns = [
    'compactness',
    'surface_area',
    'wall_area',
    'roof_area',
    'height',
    'orientation',
    'glazing_area',
    'distribution',
    'heating_load',
    'cooling_load',
]
In [ ]:
# Summary statistics of the frame.
energy.describe()
In [ ]:
# Pairwise scatter plots of all columns, with kernel-density curves on the diagonal.
scatter_matrix(
    energy,
    alpha=0.2,
    figsize=(18, 18),
    diagonal='kde'
)
plt.show()
In [ ]:
# The first eight columns are the model inputs; the remaining columns are the targets.
# .iloc replaces .ix (deprecated in pandas 0.20, removed in 1.0). The slices
# are positional, which is what .ix resolved them to for string column labels.
energy_features = energy.iloc[:, 0:8]
energy_labels = energy.iloc[:, 8:]
In [ ]:
# Score each feature for predicting heating load.
# RandomizedLasso is stochastic, so the seed is pinned to make the scores
# repeatable across runs.
model = RandomizedLasso(alpha=0.1, random_state=42)
model.fit(energy_features, energy_labels["heating_load"])
names = list(energy_features)
scores = [round(score, 4) for score in model.scores_]
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Features sorted by their score:")
print(sorted(zip(scores, names), reverse=True))
In [ ]:
# Score each feature for predicting cooling load.
# RandomizedLasso is stochastic, so the seed is pinned to make the scores
# repeatable across runs.
model = RandomizedLasso(alpha=0.1, random_state=42)
model.fit(energy_features, energy_labels["cooling_load"])
names = list(energy_features)
scores = [round(score, 4) for score in model.scores_]
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Features sorted by their score:")
print(sorted(zip(scores, names), reverse=True))
In [ ]:
# Target for the regression models: the column at position 8 (heating_load).
# .iloc replaces the removed .ix indexer; the lookup is positional.
heat_labels = energy.iloc[:, 8]
In [ ]:
# Hold out 20% of the rows for testing. The fixed seed keeps the split, and
# therefore every score computed from it, identical from run to run.
splits = cv.train_test_split(
    energy_features, heat_labels, test_size=0.2, random_state=42
)
X_train, X_test, y_train, y_test = splits
In [ ]:
# Fit a ridge regression on the training split and score it on the held-out rows.
model = Ridge(alpha=0.1).fit(X_train, y_train)
expected, predicted = y_test, model.predict(X_test)
print("Ridge Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))
In [ ]:
# Fit an ordinary least-squares model on the training split and score it on
# the held-out rows.
model = LinearRegression().fit(X_train, y_train)
expected, predicted = y_test, model.predict(X_test)
print("Linear Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))
In [ ]:
# Fit a random forest on the training split and score it on the held-out rows.
# The forest is built with randomness, so the seed is pinned to make the
# reported metrics repeatable.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
expected = y_test
predicted = model.predict(X_test)
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Random Forest model")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print("R2 score = %0.3f" % r2_score(expected, predicted))
Build a command-line application to compute the energy efficiency of a house:
https://github.com/georgetown-analytics/machine-learning/blob/master/code/energy.py