SVM Classifer

Dataset Information

No. of Features: 12
No. of Instances: 4492

Data Ingestion


In [1]:
%matplotlib inline

import os
import json
import time
import pickle
import requests
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import yellowbrick as yb
sns.set_palette('RdBu', 10)

In [2]:
URL = 'https://raw.githubusercontent.com/georgetown-analytics/classroom-occupancy/master/models/sensor_data_ml.csv'

def fetch_data(fname='sensor_data_ml.csv'):
    response = requests.get(URL)
    outpath  = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)
    
    return outpath

# Defining fetching data from the URL
DATA = fetch_data()

In [3]:
# Import sensor data
df = pd.read_csv('sensor_data_ml.csv', index_col='datetime', parse_dates=True)

In [4]:
# Rename columns
df.columns = ['temp', 'humidity', 'co2', 'light', 'light_st', 'noise',
              'bluetooth', 'images', 'door', 'occupancy_count', 'occupancy_level']

In [5]:
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4492 entries, 2017-03-25 09:05:00 to 2017-06-10 16:47:00
Data columns (total 11 columns):
temp               4492 non-null float64
humidity           4492 non-null float64
co2                4492 non-null float64
light              4492 non-null float64
light_st           4492 non-null float64
noise              4492 non-null float64
bluetooth          4492 non-null float64
images             4492 non-null float64
door               4492 non-null float64
occupancy_count    4492 non-null float64
occupancy_level    4492 non-null object
dtypes: float64(10), object(1)
memory usage: 421.1+ KB
Out[5]:
temp humidity co2 light light_st noise bluetooth images door occupancy_count occupancy_level
datetime
2017-03-25 09:05:00 22.600000 36.900000 781.000000 430.000000 1.0 511.000000 1.000000 15.242697 0.000000 0.000000 empty
2017-03-25 09:06:00 23.800000 38.954167 765.465279 428.533744 1.0 503.515931 11.399457 15.242697 0.000000 0.000000 empty
2017-03-25 09:07:00 23.850000 38.900000 768.458333 423.576500 1.0 510.548913 19.916667 15.242697 0.083333 4.416667 low
2017-03-25 09:08:00 23.900000 38.766667 777.791667 423.053571 1.0 506.504630 29.750000 15.242697 0.000000 23.416667 mid-level
2017-03-25 09:09:00 23.908333 38.733333 770.864583 438.607904 1.0 500.092672 35.860577 15.242697 0.000000 30.000000 high

Features & Target Arrays


In [6]:
# Breakdown of classroom occupancy levels
df.occupancy_level.value_counts()


Out[6]:
high         2881
mid-level     781
empty         482
low           348
Name: occupancy_level, dtype: int64

In [7]:
# Encode multiclass target variable
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit_transform(df['occupancy_level'])


Out[7]:
array([0, 0, 2, ..., 2, 2, 2], dtype=int64)

In [8]:
# Create feature and target arrays
X = df.drop('occupancy_level', axis=1).values
y = df['occupancy_level']

In [35]:
# Use TimeSeriesSplit to create training and test set split indices
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=12)

for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

SVC


In [36]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Create a SVC classifer object: svc
svc = SVC().fit(X_train, y_train)

print('SVC Cross-Validation Scores')
cv_scores = cross_val_score(svc, X_train, y_train, cv=tscv)
print(cv_scores)
print('Average CV Score: {:.4f}'.format(np.mean(cv_scores)))


SVC Cross-Validation Scores
[ 0.76489028  0.63009404  0.71786834  0.74294671  0.57053292  0.70219436
  0.47021944  0.75548589  0.62695925  0.42633229  0.54231975  0.71473354]
Average CV Score: 0.6387

In [37]:
from sklearn.metrics import classification_report

# Predict test set labels: y_pred
y_pred = svc.predict(X_test)

print(classification_report(y_test, y_pred))
print('Training set score: {:.4f}'.format(svc.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(svc.score(X_test, y_test)))


C:\Users\kmcintyre\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
             precision    recall  f1-score   support

      empty       0.00      0.00      0.00        61
       high       0.57      1.00      0.73       198
        low       0.00      0.00      0.00        41
  mid-level       0.00      0.00      0.00        45

avg / total       0.33      0.57      0.42       345

Training set score: 0.9988
Test set score: 0.5739

Hyperparameter Tuning


In [38]:
# Scale and tune hyperparameters using a SVM classifer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

pipe = Pipeline([('scaler', RobustScaler()),
                 ('clf', SVC())])

# Specify the hyperparameter space
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.00]

param_grid = [{'clf__kernel': ['linear'], 'clf__C': param_range,
               'clf__class_weight': [None, 'balanced']},
              {'clf__kernel': ['rbf'], 'clf__C': param_range,
               'clf__class_weight': [None, 'balanced'], 'clf__gamma': param_range}]
      
# Instantiate the GridSearchCV object: svc
grid = GridSearchCV(pipe, param_grid, cv=tscv)

# Fit to the training set
svc = grid.fit(X_train, y_train)

print('Best Estimator:\n{}'.format(svc.best_estimator_))


Best Estimator:
Pipeline(steps=[('scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('clf', SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [39]:
print('SVM Model')
print('Best Score: {:.4f}'.format(svc.best_score_))
print('Best Parameters: {}'.format(svc.best_params_))


SVM Model
Best Score: 0.9747
Best Parameters: {'clf__kernel': 'linear', 'clf__class_weight': None, 'clf__C': 100.0}

Classification Report


In [40]:
from sklearn.metrics import classification_report

# Predict test set labels: y_pred
y_pred = svc.predict(X_test)

print('SVC Classification Report: \n{}'.format(classification_report(y_test, y_pred)))
print('Training set score: {:.4f}'.format(svc.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(svc.score(X_test, y_test)))


SVC Classification Report: 
             precision    recall  f1-score   support

      empty       1.00      1.00      1.00        61
       high       1.00      1.00      1.00       198
        low       0.98      1.00      0.99        41
  mid-level       1.00      0.98      0.99        45

avg / total       1.00      1.00      1.00       345

Training set score: 0.9993
Test set score: 0.9971

In [42]:
from sklearn.metrics import f1_score

print('SVC Multiclass Scores')
print('Micro average f1 score: {:.4f}'.format(f1_score(y_test, y_pred, average='micro')))
print('Weighted average f1 score: {:.4f}'.format(f1_score(y_test, y_pred, average='weighted')))
print('Macro average f1 score: {:.4f}'.format(f1_score(y_test, y_pred, average='macro')))


SVC Multiclass Scores
Micro average f1 score: 0.9971
Weighted average f1 score: 0.9971
Macro average f1 score: 0.9942

In [43]:
from yellowbrick.classifier import ClassificationReport
classes = ['Empty', 'High', 'Low', 'Mid-Level']

fig = plt.figure()
visualizer = ClassificationReport(svc, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
fig.savefig('ml_graphs/svc_classification_report.png')


Confusion Matrix


In [44]:
from sklearn.metrics import confusion_matrix

print('SVC Confusion Matrix')
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))


SVC Confusion Matrix
Confusion Matrix
[[ 61   0   0   0]
 [  0 198   0   0]
 [  0   0  41   0]
 [  0   0   1  44]]

Class Balance


In [47]:
from yellowbrick.classifier import ClassBalance
classes = ['Empty', 'High', 'Low', 'Mid-Level']

fig = plt.figure()
visualizer = ClassBalance(svc, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
fig.savefig('ml_graphs/svc_class_balance.png')


Save Model


In [48]:
import pickle

svc_model = 'svc_model.sav'

# Save fitted model to disk
pickle.dump(svc, open(svc_model, 'wb'))

In [49]:
loaded_model = pickle.load(open(svc_model, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)


0.997101449275