In [1]:
%matplotlib inline
import os
import json
import time
import pickle
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yellowbrick as yb
import seaborn as sns
sns.set_palette('RdBu', 10)
In [2]:
URL = 'https://raw.githubusercontent.com/georgetown-analytics/classroom-occupancy/master/models/sensor_data_ml.csv'
def fetch_data(fname='sensor_data_ml.csv'):
    response = requests.get(URL)
    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)
    return outpath
# Fetch the sensor data from the URL and save it locally
DATA = fetch_data()
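The download above is assumed to succeed silently; a hedged variant of fetch_data (not part of the original pipeline, reusing the same URL constant) could fail loudly on HTTP errors before writing the file:
# Sketch: defensive download, assuming the URL constant defined above
def fetch_data_checked(fname='sensor_data_ml.csv'):
    response = requests.get(URL)
    response.raise_for_status()  # raise immediately if the request did not return 200 OK
    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)
    return outpath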
In [3]:
# Import as pandas dataframe with DateTimeIndex: df
df = pd.read_csv(DATA, index_col='datetime', parse_dates=True)
In [4]:
# Rename columns
df.columns = ['temp', 'humidity', 'co2', 'light', 'light_st', 'noise',
              'bluetooth', 'images', 'door', 'occupancy_count', 'occupancy_level']
In [5]:
df.info()
df.head()
Out[5]:
In [6]:
# Breakdown of classroom occupancy levels
df.occupancy_level.value_counts()
Out[6]:
In [7]:
# Preview integer encoding of the multiclass target variable
# (the string labels are kept for modeling below, since scikit-learn handles them directly)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit_transform(df['occupancy_level'])
Out[7]:
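Because the encoded labels are only previewed above, a small sketch (reusing the encoder object from the previous cell) shows how the integer codes map back to the class names if a numeric target were ever needed; y_encoded is illustrative and unused below:
# Sketch: inspect the LabelEncoder mapping
y_encoded = encoder.fit_transform(df['occupancy_level'])
print(encoder.classes_)                          # class names in encoded order
print(encoder.inverse_transform(y_encoded[:5]))  # first five codes mapped back to labels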
In [8]:
X = df.drop('occupancy_level', axis=1).values
y = df['occupancy_level'].values  # use an array so positional indexing in the splits below is unambiguous
In [9]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=12)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
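Note that the loop overwrites X_train/X_test on every iteration, so only the final (largest-train) chronological split is kept for fitting below. A purely diagnostic sketch can show how the time-series folds grow:
# Sketch: print the size of each chronological train/test split
for i, (train_index, test_index) in enumerate(tscv.split(X)):
    print('Fold {}: train={} rows, test={} rows'.format(i + 1, len(train_index), len(test_index)))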
In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
# Fit a GaussianNB classifier to the training data: bayes
bayes = GaussianNB().fit(X_train, y_train)
cv_scores = cross_val_score(bayes, X_train, y_train, cv=tscv)
print('GaussianNB Cross-Validation Scores')
print(cv_scores)
print('Average 12-Fold CV Score: {:.4f}'.format(np.mean(cv_scores)))
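The mean alone can hide fold-to-fold variability; a one-line sketch reports the spread of the same cv_scores as well:
print('12-Fold CV Score Std Dev: {:.4f}'.format(np.std(cv_scores)))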
In [11]:
from sklearn.metrics import classification_report
# Predict test set labels: y_pred
y_pred = bayes.predict(X_test)
print('Naive Bayes Classification Report')
print(classification_report(y_test, y_pred))
print('Training set score: {:.4f}'.format(bayes.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(bayes.score(X_test, y_test)))
In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score
print('Micro F1 Score: {:.4f}'.format(f1_score(y_test, y_pred, average='micro')))
print('Weighted F1 Score: {:.4f}'.format(f1_score(y_test, y_pred, average='weighted')))
print('Macro F1 Score: {:.4f}'.format(f1_score(y_test, y_pred, average='macro')))
In [13]:
print('Micro')
print('F1 Score: {:.4f}'.format(f1_score(y_test, y_pred, average='micro')))
print('Precision Score: {:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('Recall Score: {:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
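Micro averaging pools every prediction, so with imbalanced occupancy levels it can look stronger than per-class performance; a hedged comparison using macro averages (each class weighted equally, same metrics already imported) makes that gap visible:
print('Macro')
print('Precision Score: {:.4f}'.format(precision_score(y_test, y_pred, average='macro')))
print('Recall Score: {:.4f}'.format(recall_score(y_test, y_pred, average='macro')))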
In [14]:
from yellowbrick.classifier import ClassificationReport
classes = ['Empty', 'High', 'Low', 'Mid']
fig = plt.figure()
visualizer = ClassificationReport(bayes, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
fig.savefig('ml_graphs/bayes_classification_report.png')
In [15]:
from sklearn.metrics import confusion_matrix
print('Naive Bayes Confusion Matrix')
print(confusion_matrix(y_test, y_pred))
In [16]:
from yellowbrick.classifier import ConfusionMatrix
fig = plt.figure()
cm = ConfusionMatrix(bayes, classes=classes)
cm.fit(X_train, y_train)
cm.score(X_test, y_test)
cm.poof()
fig.savefig('ml_graphs/bayes_confusion_matrix.png')
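The raw counts above depend on how many test samples fall in each class; a short sketch (using the confusion_matrix function already imported) row-normalizes them into per-class proportions:
# Sketch: row-normalized confusion matrix (each row sums to 1 over the true class)
cm_counts = confusion_matrix(y_test, y_pred)
print(np.round(cm_counts / cm_counts.sum(axis=1, keepdims=True), 3))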
In [17]:
from yellowbrick.classifier import ClassBalance
fig = plt.figure()
visualizer = ClassBalance(bayes, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
fig.savefig('ml_graphs/bayes_class_balance.png')
In [18]:
import pickle
bayes_model = 'bayes_model.sav'
# Save fitted model to disk
with open(bayes_model, 'wb') as f:
    pickle.dump(bayes, f)
In [19]:
# Load the saved model back from disk and evaluate it on the test set
with open(bayes_model, 'rb') as f:
    loaded_model = pickle.load(f)
result = loaded_model.score(X_test, y_test)
print(result)
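As a final sanity check, using only objects already in memory, the reloaded model should reproduce the original model's predictions exactly:
# Sketch: verify the round-tripped model matches the in-memory model
assert np.array_equal(loaded_model.predict(X_test), bayes.predict(X_test))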