In [1]:
%matplotlib inline
import os
import json
import time
import pickle
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yellowbrick as yb
sns.set_palette('RdBu', 10)
In [2]:
URL = 'https://raw.githubusercontent.com/georgetown-analytics/classroom-occupancy/master/models/sensor_data_ml.csv'

def fetch_data(fname='sensor_data_ml.csv', url=URL):
    """Download the sensor dataset and write it to the working directory.

    Parameters
    ----------
    fname : str
        Local filename to write the CSV to.
    url : str
        Source URL; defaults to the module-level ``URL`` constant
        (parameterized so other datasets can be fetched the same way).

    Returns
    -------
    str
        Absolute path of the downloaded file.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors (404/500) instead of silently writing
    # an error page to disk and corrupting the downstream read_csv.
    response.raise_for_status()
    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)
    return outpath

# Fetch the data and keep the local path for later cells
DATA = fetch_data()
In [3]:
# Import sensor data.  Read from DATA — the absolute path returned by
# fetch_data() — instead of a hardcoded filename, so the read stays
# consistent with wherever the file was actually written.
df = pd.read_csv(DATA, index_col='datetime', parse_dates=True)
In [4]:
# Replace the raw sensor column names with short snake_case ones,
# mapped positionally onto the existing eleven columns.
df = df.rename(columns=dict(zip(df.columns, [
    'temp', 'humidity', 'co2', 'light', 'light_st', 'noise',
    'bluetooth', 'images', 'door', 'occupancy_count', 'occupancy_level',
])))
In [5]:
# Summarize dtypes and non-null counts, then preview the first five rows
# (df.head() is the cell's last expression, so it renders as Out[5]).
df.info()
df.head()
Out[5]:
In [6]:
# How many observations fall into each classroom occupancy level?
df['occupancy_level'].value_counts()
Out[6]:
In [7]:
# Encode the multiclass target variable as integer codes.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# BUG FIX: the encoded labels were previously computed and discarded
# (the cell only displayed them).  Bind them so downstream cells can
# use the numeric encoding if needed; the array still renders as Out[7].
y_encoded = encoder.fit_transform(df['occupancy_level'])
y_encoded
Out[7]:
In [8]:
# Feature matrix: every sensor column except the target, as a numpy array.
X = df.drop(columns='occupancy_level').values
# Target: the categorical occupancy labels, kept as a pandas Series.
y = df['occupancy_level']
In [35]:
# Use TimeSeriesSplit to create training/test split indices.
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=12)
# BUG FIX (clarity): the original loop overwrote X_train/X_test on every
# iteration, so only the final (largest) split survived.  Take that last
# split explicitly instead of looping — same result, honest intent.
train_index, test_index = list(tscv.split(X))[-1]
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
In [36]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Fit a baseline (unscaled) support-vector classifier on the training split.
svc = SVC().fit(X_train, y_train)

# Score it across the same time-series-aware CV folds.
print('SVC Cross-Validation Scores')
cv_scores = cross_val_score(svc, X_train, y_train, cv=tscv)
print(cv_scores)
print(f'Average CV Score: {np.mean(cv_scores):.4f}')
In [37]:
from sklearn.metrics import classification_report

# Predict test-set labels and report per-class precision/recall/F1.
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Training set score: {svc.score(X_train, y_train):.4f}')
print(f'Test set score: {svc.score(X_test, y_test):.4f}')
In [38]:
# Scale features and tune SVM hyperparameters inside a single pipeline,
# so the scaler is re-fit on each CV fold (no leakage).
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('clf', SVC()),
])

# Hyperparameter space: log-spaced C values (plus gamma for the RBF kernel).
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.00]
param_grid = [
    {'clf__kernel': ['linear'],
     'clf__C': param_range,
     'clf__class_weight': [None, 'balanced']},
    {'clf__kernel': ['rbf'],
     'clf__C': param_range,
     'clf__class_weight': [None, 'balanced'],
     'clf__gamma': param_range},
]

# Exhaustive search over the grid, using the time-series-aware folds.
grid = GridSearchCV(pipe, param_grid, cv=tscv)

# Fitting rebinds `svc` to the fitted GridSearchCV object; later cells
# call .predict/.score on it (delegated to the best estimator).
svc = grid.fit(X_train, y_train)
print(f'Best Estimator:\n{svc.best_estimator_}')
In [39]:
# Report the best cross-validated configuration found by the grid search.
print('SVM Model')
print(f'Best Score: {svc.best_score_:.4f}')
print(f'Best Parameters: {svc.best_params_}')
In [40]:
from sklearn.metrics import classification_report

# Re-evaluate the tuned (scaled) model on the held-out test split.
y_pred = svc.predict(X_test)
report = classification_report(y_test, y_pred)
print('SVC Classification Report: \n{}'.format(report))
print(f'Training set score: {svc.score(X_train, y_train):.4f}')
print(f'Test set score: {svc.score(X_test, y_test):.4f}')
In [42]:
from sklearn.metrics import f1_score

# Summarize multiclass performance under three F1 averaging schemes.
print('SVC Multiclass Scores')
for label, scheme in [('Micro', 'micro'), ('Weighted', 'weighted'),
                      ('Macro', 'macro')]:
    score = f1_score(y_test, y_pred, average=scheme)
    print(f'{label} average f1 score: {score:.4f}')
In [43]:
from yellowbrick.classifier import ClassificationReport

classes = ['Empty', 'High', 'Low', 'Mid-Level']
# ROBUSTNESS: savefig does not create intermediate directories and raises
# FileNotFoundError if 'ml_graphs/' is missing — create it up front.
os.makedirs('ml_graphs', exist_ok=True)
fig = plt.figure()
# Visual classification report (per-class precision/recall/F1 heatmap).
visualizer = ClassificationReport(svc, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
fig.savefig('ml_graphs/svc_classification_report.png')
In [44]:
from sklearn.metrics import confusion_matrix

# Raw confusion matrix for the tuned SVC on the test split.
# FIX: the original printed two redundant headers back to back
# ('SVC Confusion Matrix' then 'Confusion Matrix'); keep only one.
print('SVC Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
In [47]:
from yellowbrick.classifier import ClassBalance

classes = ['Empty', 'High', 'Low', 'Mid-Level']
# ROBUSTNESS: ensure the output directory exists before saving the figure
# (savefig raises FileNotFoundError on a missing directory).
os.makedirs('ml_graphs', exist_ok=True)
fig = plt.figure()
# Class-balance plot: how evenly the occupancy levels are represented.
visualizer = ClassBalance(svc, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
fig.savefig('ml_graphs/svc_class_balance.png')
In [48]:
# `pickle` is already imported at the top of the notebook; the re-import
# is kept so this cell remains self-contained.
import pickle

svc_model = 'svc_model.sav'
# FIX: the original passed an open() call directly to pickle.dump, leaking
# the file handle.  A context manager guarantees it is flushed and closed
# even if dump() raises.
with open(svc_model, 'wb') as f:
    pickle.dump(svc, f)
In [49]:
# Reload the persisted model and confirm it scores the test set as before.
# FIX: close the file handle deterministically via a context manager.
# NOTE(security): only unpickle files you created yourself —
# pickle.load on untrusted data can execute arbitrary code.
with open(svc_model, 'rb') as f:
    loaded_model = pickle.load(f)
result = loaded_model.score(X_test, y_test)
print(result)