In [1]:
%matplotlib inline
import os
import json
import time
import pickle
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yellowbrick as yb
# Set the default seaborn colour palette to 'RdBu' with 10 colours
sns.set_palette('RdBu', 10)
In [2]:
URL = 'https://raw.githubusercontent.com/georgetown-analytics/classroom-occupancy/master/models/sensor_data_ml.csv'


def fetch_data(fname='sensor_data_ml.csv'):
    """Download the CSV at ``URL`` and save it under ``fname``.

    Parameters
    ----------
    fname : str
        Output file name; resolved against the current working directory.

    Returns
    -------
    str
        Absolute path of the file that was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # A timeout keeps a stalled connection from hanging the kernel
    response = requests.get(URL, timeout=30)
    # Fail loudly instead of writing an HTTP error page to disk as if it were the CSV
    response.raise_for_status()
    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)
    return outpath


# Fetch the data from the URL and remember where it was saved
DATA = fetch_data()
In [3]:
# Import sensor data from the path returned by fetch_data(), so the download
# location and the read location cannot drift apart
df = pd.read_csv(DATA, index_col='datetime', parse_dates=True)
In [4]:
# Replace the CSV headers with short names: sensor readings first,
# then the two occupancy columns
df.columns = (
    ['temp', 'humidity', 'co2', 'light', 'light_st', 'noise',
     'bluetooth', 'images', 'door']
    + ['occupancy_count', 'occupancy_level']
)
In [5]:
# Print the frame summary produced by DataFrame.info()
df.info()
# Last expression of the cell, so the describe() table is rendered as the cell output
df.describe()
Out[5]:
In [6]:
# Number of rows per classroom occupancy level
df['occupancy_level'].value_counts()
Out[6]:
In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the multiclass target.
# The encoded array is only displayed as the cell output; it is not stored.
encoder = LabelEncoder()
encoder.fit_transform(df.occupancy_level)
Out[7]:
In [13]:
# Create feature and target arrays
# NOTE(review): 'occupancy_count' stays in X; if occupancy_level is derived
# from it, this leaks the target into the features — confirm and drop if so.
X = df.drop('occupancy_level', axis=1).values
# Keep y as a plain ndarray: the splits below index it with integer position
# arrays, which a Series with a DatetimeIndex only supports through the
# deprecated positional fallback.
y = df['occupancy_level'].values
In [14]:
from sklearn.model_selection import TimeSeriesSplit

# Build a 12-fold time-ordered splitter and keep only its final fold
# as the working train/test partition
tscv = TimeSeriesSplit(n_splits=12)
train_index, test_index = list(tscv.split(X))[-1]
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]
In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Baseline k-NN with default hyper-parameters, fitted on the current training fold
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Cross-validate the baseline on the training fold using the time-series splitter
cv_scores = cross_val_score(knn, X_train, y_train, cv=tscv)
print(cv_scores)
print('Average 12-Fold CV Score: {:.4f}'.format(cv_scores.mean()))
In [16]:
# Initial classification report
from sklearn.metrics import classification_report

# Re-split with the default number of folds; after the loop the variables
# hold the final fold.
tscv = TimeSeriesSplit()
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Refit on the new training fold. The model carried over from the previous
# cell was trained on the 12-split fold, whose rows overlap this test fold,
# so scoring it here would report accuracy on data it had already seen.
knn.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = knn.predict(X_test)

# Compute and print the classification report and training and test scores
print('kNN Classification Report: \n{}'.format(classification_report(y_test, y_pred)))
print('Training set score: {:.4f}'.format(knn.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(knn.score(X_test, y_test)))
In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# Model-complexity curve: train/test accuracy of a scaled k-NN pipeline
# for k = 1..14
neighbors = np.arange(1, 15)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over different values of k
for i, k in enumerate(neighbors):
    # Setup a scaled k-NN classifier with k neighbors
    pipeline = Pipeline([('scaler', RobustScaler()),
                         ('knn', KNeighborsClassifier(n_neighbors=k))])
    # Fit the classifier to the training data
    pipeline.fit(X_train, y_train)
    # Compute accuracy on the training set
    train_accuracy[i] = pipeline.score(X_train, y_train)
    # Compute accuracy on the testing set
    test_accuracy[i] = pipeline.score(X_test, y_test)

# Plot the results
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training Accuracy')
plt.title('k-NN: Varying Number of Neighbors')
plt.xlabel('Number of k-NN Neighbors')
# Both curves share this axis, so label it generically rather than 'Test Accuracy'
plt.ylabel('Accuracy')
plt.legend(loc='best')
# savefig does not create missing directories; make sure the target exists
os.makedirs('ml_graphs', exist_ok=True)
plt.savefig('ml_graphs/knn_model_complexity_curve.png')
In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

# Candidate neighbour counts 1..14 for the k-NN step of the pipeline
param_grid = {'kneighborsclassifier__n_neighbors': np.arange(1, 15)}
# Robust scaling followed by k-NN
pipeline = make_pipeline(RobustScaler(), KNeighborsClassifier())
# Grid-search over the time-series folds; `knn` now names the fitted search object
knn = GridSearchCV(pipeline, param_grid=param_grid, cv=tscv).fit(X_train, y_train)
print('kNN Best estimator:\n{}'.format(knn.best_estimator_))
In [20]:
# Report the grid search's best cross-validated score and the parameters that produced it
print(
    'kNN Model (Tuned)',
    'Best Score: {:.4f}'.format(knn.best_score_),
    'Best Parameters: {}'.format(knn.best_params_),
    sep='\n',
)
In [21]:
# Predictions of the tuned search object on the held-out fold
y_pred = knn.predict(X_test)
# Classification report followed by the training and test scores
print(
    'kNN Classification Report: \n{}'.format(classification_report(y_test, y_pred)),
    'Training set score: {:.4f}'.format(knn.score(X_train, y_train)),
    'Test set score: {:.4f}'.format(knn.score(X_test, y_test)),
    sep='\n',
)
In [22]:
from sklearn.metrics import f1_score

# Print the F1 score under each averaging strategy, one line per strategy
for template, averaging in (
    ('F1 Score - micro: {:.4f}', 'micro'),
    ('F1 Score - weighted: {:.4f}', 'weighted'),
    ('F1 Score - macro: {:.4f}', 'macro'),
):
    print(template.format(f1_score(y_test, y_pred, average=averaging)))
In [24]:
from sklearn.metrics import precision_score, recall_score

# Micro-averaged F1, precision and recall for the current predictions
print('Micro')
for template, metric in (
    ('F1 Score: {:.4f}', f1_score),
    ('Precision Score: {:.4f}', precision_score),
    ('Recall Score: {:.4f}', recall_score),
):
    print(template.format(metric(y_test, y_pred, average='micro')))
In [27]:
# yellowbrick ClassificationReport visualizer wrapped around the `knn` object
from yellowbrick.classifier import ClassificationReport
# Display names handed to the visualizer
# NOTE(review): presumably in the sorted order of the occupancy_level labels — confirm
classes = ['Empty', 'High', 'Low', 'Mid-Level']
visualizer = ClassificationReport(knn, classes=classes)
# New pyplot figure opened before the visualizer is fitted and scored
fig = plt.figure()
# NOTE(review): fit() may re-run the wrapped estimator's fit on X_train — confirm
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# NOTE(review): poof() appears to have been renamed show() in newer yellowbrick releases — confirm against the installed version
g = visualizer.poof()
#plt.savefig('ml_graphs/knn_classification_report.png')
In [29]:
from sklearn.metrics import confusion_matrix

# Heading line followed by the confusion matrix of true vs. predicted labels
print('kNN Confusion Matrix', confusion_matrix(y_test, y_pred), sep='\n')
In [28]:
# yellowbrick ClassBalance visualizer wrapped around the `knn` object
# NOTE(review): newer yellowbrick releases may expose ClassBalance from a different module with a different signature — confirm against the installed version
from yellowbrick.classifier import ClassBalance
# Display names handed to the visualizer
# NOTE(review): presumably in the sorted order of the occupancy_level labels — confirm
classes = ['Empty', 'High', 'Low', 'Mid-Level']
visualizer = ClassBalance(knn, classes=classes)
# New pyplot figure opened before the visualizer is fitted and scored
fig = plt.figure()
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# NOTE(review): poof() appears to have been renamed show() in newer yellowbrick releases — confirm
g = visualizer.poof()
#plt.savefig('ml_graphs/knn_class_balance.png')
In [31]:
import pickle

knn_model = 'knn_model.sav'
# Save the fitted model to disk; the context manager flushes and closes the
# file handle, which a bare open() passed to pickle.dump never does.
with open(knn_model, 'wb') as model_file:
    pickle.dump(knn, model_file)
In [32]:
# Reload the model written by this notebook and score it on the held-out fold.
# The context manager closes the file handle once the model is read.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open(knn_model, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)