In [1]:
%matplotlib inline
import os
import json
import time
import pickle
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yellowbrick as yb
sns.set_palette('RdBu', 10)
In [2]:
URL = 'https://raw.githubusercontent.com/georgetown-analytics/classroom-occupancy/master/models/sensor_data_ml.csv'
def fetch_data(fname='sensor_data_ml.csv'):
    response = requests.get(URL)
    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)
    return outpath
# Fetch the sensor data from the URL and save it locally
DATA = fetch_data()
In [3]:
# Import sensor data
df = pd.read_csv(DATA, index_col='datetime', parse_dates=True)
# Rename columns
df.columns = ['temp', 'humidity', 'co2', 'light', 'light_st',
              'noise', 'bluetooth', 'images', 'door',
              'occupancy_count', 'occupancy_level']
In [4]:
df.info()
df.head()
Out[4]:
In [5]:
# Drop 'occupancy_level' column
df.drop('occupancy_level', axis=1, inplace=True)
In [6]:
# Create feature and target arrays
X = df.drop('occupancy_count', axis=1).values
y = df['occupancy_count'].values
In [7]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=12)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
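The loop above only keeps the final split, so the hold-out sets used below come from the last and largest fold. A quick illustrative check of how TimeSeriesSplit grows the training window:
In [ ]:
# Illustrative check: TimeSeriesSplit yields chronologically ordered folds,
# and the loop above retains only the final (largest) train/test split.
for i, (tr_idx, te_idx) in enumerate(tscv.split(X)):
    print('Fold {:2d}: train size = {:5d}, test size = {:5d}'.format(
        i, len(tr_idx), len(te_idx)))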
In [8]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse, r2_score
# Create a ridge regression object: ridge
ridge = Ridge().fit(X_train, y_train)
# Predict on the test data: y_pred
y_pred = ridge.predict(X_test)
print('Ridge Model')
print('Root Mean Squared Error: {:.3f}'.format(np.sqrt(mse(y_test, y_pred))))
print('Mean Squared Error: {:.3f}'.format(mse(y_test, y_pred)))
print('Coefficient of Determination: {:.3f}'.format(r2_score(y_test, y_pred)))
In [9]:
print('ridge.coef_: {}'.format(ridge.coef_))
print('ridge.intercept_: {:.3f}'.format(ridge.intercept_))
In [10]:
from sklearn.model_selection import cross_val_score
# Compute 12-split time-series cross-validation scores: cv_scores
cv_scores = cross_val_score(ridge, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error')
# scikit-learn negates MSE under this scoring, so flip the sign when reporting
print('Mean Cross-Validated MSE: {:.4f}'.format(-np.mean(cv_scores)))
print('Training set score: {:.3f}'.format(ridge.score(X_train, y_train)))
print('Test set score: {:.3f}'.format(ridge.score(X_test, y_test)))
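The negated MSE fold scores read more naturally as RMSE. A minimal sketch, reusing the cv_scores array from above:
In [ ]:
# Illustrative: per-fold RMSE from the negated MSE scores above
rmse_scores = np.sqrt(-cv_scores)
print('Per-fold RMSE: {}'.format(np.round(rmse_scores, 3)))
print('Mean RMSE: {:.3f} (+/- {:.3f})'.format(rmse_scores.mean(), rmse_scores.std()))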
In [11]:
from yellowbrick.regressor import ResidualsPlot
fig, ax = plt.subplots()
visualizer = ResidualsPlot(ridge, ax=ax)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof()
fig.savefig('ml_graphs/ridge_residuals_plot.png')
In [12]:
from yellowbrick.regressor import PredictionError
fig, ax = plt.subplots()
visualizer = PredictionError(ridge, ax=ax)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof()
fig.savefig('ml_graphs/ridge_prediction_error.png')
In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Set up the pipeline steps with StandardScaler: steps
steps = [('scaler', StandardScaler()),
         ('ridge', Ridge())]
# Create the pipeline: pipeline
pipeline = Pipeline(steps)
# Fit the pipeline to the training set: ridge_scaled
ridge_scaled = pipeline.fit(X_train, y_train)
# Fit a ridge regression model to the unscaled data: ridge_unscaled
ridge_unscaled = Ridge().fit(X_train, y_train)
# Compute and print R^2 scores (Ridge.score returns R^2, not accuracy)
print('R^2 with Scaling: {:.4f}'.format(ridge_scaled.score(X_test, y_test)))
print('R^2 without Scaling: {:.4f}'.format(ridge_unscaled.score(X_test, y_test)))
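To see what standardization changed, the fitted Ridge step can be pulled back out of the pipeline. A minimal sketch using named_steps (the 'ridge' key matches the step name defined above):
In [ ]:
# Illustrative: coefficients of the Ridge model fit on standardized features
scaled_ridge = pipeline.named_steps['ridge']
print('Scaled coefficients: {}'.format(np.round(scaled_ridge.coef_, 3)))
print('Scaled intercept: {:.3f}'.format(scaled_ridge.intercept_))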
In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
pipe = make_pipeline(StandardScaler(),
                     PolynomialFeatures(),
                     Ridge())
param_grid = {'polynomialfeatures__degree': [0, 1, 2, 3],
              'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=tscv, n_jobs=-1)
ridgecv = grid.fit(X_train, y_train)
In [17]:
ridgecv
Out[17]:
In [18]:
print('Best score: {:.4f}'.format(ridgecv.best_score_))
print('Best parameters: {}'.format(ridgecv.best_params_))
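Beyond the single best combination, the full search can be inspected through cv_results_. A minimal sketch rendering it as a DataFrame (columns follow scikit-learn's param_<step>__<name> convention):
In [ ]:
# Illustrative: top grid-search results ranked by mean test score
results = pd.DataFrame(ridgecv.cv_results_)
cols = ['param_polynomialfeatures__degree', 'param_ridge__alpha',
        'mean_test_score', 'rank_test_score']
results[cols].sort_values('rank_test_score').head()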
In [19]:
y_pred = ridgecv.predict(X_test)
print('Ridge Model')
print('Root Mean Squared Error: {:.3f}'.format(np.sqrt(mse(y_test, y_pred))))
print('Mean Squared Error: {:.3f}'.format(mse(y_test, y_pred)))
print('Coefficient of Determination: {:.3f}'.format(r2_score(y_test, y_pred)))
In [20]:
# Compute 12-split time-series cross-validation scores for the tuned model: cv_scores
cv_scores = cross_val_score(ridgecv, X, y, cv=tscv, scoring='neg_mean_squared_error')
# Print the mean cross-validated MSE (sign flipped from scikit-learn's negated scores)
print('Mean Cross-Validated MSE: {:.4f}'.format(-np.mean(cv_scores)))
In [19]:
ridge_model = 'ridge_regression_model.sav'
# Save the fitted grid-search model to disk
with open(ridge_model, 'wb') as f:
    pickle.dump(ridgecv, f)
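As a sanity check, the pickled model can be reloaded and scored again. A minimal sketch, assuming the file written above:
In [ ]:
# Illustrative: reload the pickled model and confirm it scores as before
with open(ridge_model, 'rb') as f:
    loaded_model = pickle.load(f)
print('Reloaded test R^2: {:.3f}'.format(loaded_model.score(X_test, y_test)))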
In [22]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as mse, r2_score
# Create a LASSO regression object: lasso
lasso = Lasso().fit(X_train, y_train)
# Predict on the test data: y_pred
y_pred = lasso.predict(X_test)
print('LASSO Model')
print('Root Mean Squared Error: {:.3f}'.format(np.sqrt(mse(y_test, y_pred))))
print('Mean Squared Error: {:.3f}'.format(mse(y_test, y_pred)))
print('Coefficient of Determination: {:.3f}'.format(r2_score(y_test, y_pred)))
In [25]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error as mse, r2_score
# Create an elastic net regression object: elastic
elastic = ElasticNet().fit(X_train, y_train)
# Predict on the test data: y_pred
y_pred = elastic.predict(X_test)
print('ElasticNet Model')
print('Root Mean Squared Error: {:.3f}'.format(np.sqrt(mse(y_test, y_pred))))
print('Mean Squared Error: {:.3f}'.format(mse(y_test, y_pred)))
print('Coefficient of Determination: {:.3f}'.format(r2_score(y_test, y_pred)))
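With ridge, LASSO, and elastic net all fit on the same split, a short side-by-side comparison (illustrative, reusing the model objects from the cells above):
In [ ]:
# Illustrative: compare the three regularized linear models on the hold-out split
for name, model in [('Ridge', ridge), ('LASSO', lasso), ('ElasticNet', elastic)]:
    pred = model.predict(X_test)
    print('{:12s} RMSE = {:.3f}  R^2 = {:.3f}'.format(
        name, np.sqrt(mse(y_test, pred)), r2_score(y_test, pred)))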