In [1]:
%matplotlib inline
import os
import json
import time
import pickle
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yellowbrick as yb
# Make a 10-colour 'RdBu' palette the seaborn default for the plots below
sns.set_palette('RdBu', 10)
In [2]:
URL = 'https://raw.githubusercontent.com/georgetown-analytics/classroom-occupancy/master/models/sensor_data_ml.csv'


def fetch_data(fname='sensor_data_ml.csv', url=URL, timeout=30):
    """Download the sensor dataset and write it to disk.

    Parameters
    ----------
    fname : str
        File name (or path) the downloaded bytes are written to.
    url : str
        Address to download from; defaults to the notebook-level ``URL``.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    str
        Absolute path of the file that was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of saving an error page as the CSV.
    response.raise_for_status()
    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)
    return outpath
# Download the dataset and keep the absolute path of the local copy
DATA = fetch_data()
In [2]:
# Import as pandas dataframe with DateTimeIndex: df
# Read from DATA (the path returned by fetch_data) instead of re-typing the
# file name, so the load always targets the file that was just downloaded.
df = pd.read_csv(DATA, index_col='datetime', parse_dates=True)
In [3]:
# Replace the CSV headers with short names.
# The assignment is positional, so the order here must match the file's column order.
new_column_names = [
    'temp', 'humidity', 'co2', 'light', 'light_st', 'noise',
    'bluetooth', 'images', 'door', 'occupancy_count', 'occupancy_level',
]
df.columns = new_column_names
In [5]:
# Print the frame summary, then display the first rows as the cell output
df.info()
df.head()
Out[5]:
In [6]:
# Number of rows in each classroom occupancy level
df['occupancy_level'].value_counts()
Out[6]:
In [4]:
from sklearn.preprocessing import LabelEncoder

# Encode the multiclass target variable as integer codes.
# NOTE(review): the encoded array is only displayed as this cell's output; it is
# not assigned to a variable here, so confirm whether it was meant to be used.
encoder = LabelEncoder()
encoder.fit_transform(df.occupancy_level)
Out[4]:
In [5]:
# Feature matrix and target.
# 'occupancy_count' is dropped together with the target: occupancy_level is
# presumably derived from that count (TODO confirm), so leaving it in X would
# leak the answer into the features and inflate every score below.
X = df.drop(columns=['occupancy_level', 'occupancy_count']).values
y = df['occupancy_level']
In [7]:
# Use TimeSeriesSplit to create training and test set split indices
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=12)

# Keep only the last of the 12 splits as the train/test partition.
train_index, test_index = list(tscv.split(X))[-1]
X_train, X_test = X[train_index], X[test_index]
# The split indices are integer positions, not index labels, so select with
# .iloc; plain y[...] on a non-integer-indexed Series relies on a deprecated
# positional fallback.
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
In [8]:
# Baseline logistic regression: fit on the training split, then cross-validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Fit logistic regression classifier onto the training data: logreg
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# One score per TimeSeriesSplit fold of the training data
cv_scores = cross_val_score(logreg, X_train, y_train, cv=tscv)

print('Logistic Regression Cross-Validation Scores')
print(cv_scores)
print('Average 12-Fold CV Score: {:.4f}'.format(cv_scores.mean()))
In [9]:
# Baseline classification report plus scores on both splits
from sklearn.metrics import classification_report

# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)

print('Logistic Regression Model')
print(classification_report(y_test, y_pred))

# Score on the data the model was fitted on vs. the held-out test split
train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print('Training set score: {:.4f}'.format(train_score))
print('Test set score: {:.4f}'.format(test_score))
In [11]:
# Grid-search C and class_weight, cross-validating with the tscv splitter
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 110, 120],
    'class_weight': [None, 'balanced'],
}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=tscv)
grid.fit(X_train, y_train)
# fit() returns the search object itself, so logreg_clf is the fitted grid
logreg_clf = grid
print('Best estimator:\n{}'.format(logreg_clf.best_estimator_))
In [12]:
# Best cross-validation score found by the search and the parameters behind it
print(
    'Logistic Regression Model',
    'Best Score: {:.4f}'.format(logreg_clf.best_score_),
    'Best parameters: {}'.format(logreg_clf.best_params_),
    sep='\n',
)
In [13]:
# Scores after the grid search
# Predict the labels of the test set: y_pred (overwrites the earlier y_pred)
y_pred = logreg_clf.predict(X_test)

tuned_train_score = logreg_clf.score(X_train, y_train)
tuned_test_score = logreg_clf.score(X_test, y_test)
print('Training set score: {:.4f}'.format(tuned_train_score))
print('Test set score: {:.4f}'.format(tuned_test_score))
In [22]:
# Print the classification report for the current y_pred against y_test
print('Logistic Regression Model', classification_report(y_test, y_pred), sep='\n')
In [16]:
from sklearn.metrics import f1_score, precision_score, recall_score

# One output line per averaging mode, printed in this order
f1_templates = [
    ('micro', 'F1 Score - micro: {:.4f}'),
    ('weighted', 'F1 Score - weighted: {:.4f}'),
    ('macro', 'F1 Score - macro: {:.4f}'),
]

print('Logistic Regression F1 Scores')
for average, template in f1_templates:
    print(template.format(f1_score(y_test, y_pred, average=average)))
In [17]:
# Yellowbrick classification-report visualisation, saved as a PNG
from yellowbrick.classifier import ClassificationReport

classes = ['Empty', 'High', 'Low', 'Mid-Level']

# The figure is created first so it can be saved after the visualizer has drawn
# (assumes yellowbrick draws on the current figure -- TODO confirm).
fig = plt.figure()
visualizer = ClassificationReport(logreg_clf, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('ml_graphs', exist_ok=True)
fig.savefig('ml_graphs/logreg_classification_report.png')
In [18]:
# Confusion matrix for the current y_pred against y_test
from sklearn.metrics import confusion_matrix

print('Logistic Regression Confusion Matrix', confusion_matrix(y_test, y_pred), sep='\n')
In [19]:
# Yellowbrick class-balance visualisation, saved as a PNG
from yellowbrick.classifier import ClassBalance

classes = ['Empty', 'High', 'Low', 'Mid-Level']
visualizer = ClassBalance(logreg_clf, classes=classes)

# The figure is created before fitting so it can be saved after the visualizer
# has drawn (assumes yellowbrick draws on the current figure -- TODO confirm).
fig = plt.figure()
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('ml_graphs', exist_ok=True)
fig.savefig('ml_graphs/logreg_class_balance.png')
In [20]:
import pickle

logreg_model = 'logreg_model.sav'

# Save fitted model to disk; the with-block closes the file handle even if
# pickling raises.
with open(logreg_model, 'wb') as model_file:
    pickle.dump(logreg_clf, model_file)
In [21]:
# Test model: reload the pickled object and score it on the test split.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open(logreg_model, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)