In [5]:
    
%matplotlib inline
import os
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
    
In [6]:
    
# Load dataset
df = pd.read_csv('dataset-1min.csv')
# Drop duplicates
df = df.drop_duplicates()
df.head(3)
    
    Out[6]:
In [7]:
    
# Determine the shape of the data
print("{} instances with {} features\n".format(*df.shape))
# Determine the frequency of each class
print(df.groupby('occupancy_category')['occupancy_category'].count())
    
    
In [8]:
    
# Helper function to encode occupancy_category (derived from the number of people)
# as a numeric code
def occupancy(row):
    if row['occupancy_category'] == 'very-low':
        return '1'
    elif row['occupancy_category'] == 'low':
        return '2'
    elif row['occupancy_category'] == 'fair':
        return '3'
    else:
        return '4'

df['occupancy_code'] = df.apply(occupancy, axis=1)
df.head(3)
    
    Out[8]:
In [9]:
    
# Select the model features and define the class labels
features = [
    'temperature',
    'humidity',
    'co2',
    'light',
    'noise',
    'bluetooth_devices',
    'occupancy_code'
]
classes = [
    'very-low',
    'low',
    'fair',
    'high'
] 
df = df[features]
df.shape
    
    Out[9]:
In [10]:
    
# Separate the features from the target (the last column, occupancy_code)
data   = df.iloc[:, 0:-1]
target = df.iloc[:, -1]
print(data.shape)
print(target.shape)
    
    
In [11]:
    
# Split into test and train data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target)
    
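If the occupancy classes turn out to be imbalanced (the frequency printout above shows how the categories are distributed), a stratified split keeps those proportions consistent between the training and test sets. A minimal sketch, assuming the same data and target variables; the test_size and random_state values are illustrative and not from the original run:

# Optional: stratify on the target so each class keeps its proportion in both
# splits (test_size and random_state are illustrative choices, not originals)
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.25, stratify=target, random_state=42
)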
In [12]:
    
# Standardize the data (fit the scaler on the training set only to avoid leakage)
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)
    
In [13]:
    
from sklearn import metrics
from sklearn.model_selection import KFold
from yellowbrick.classifier import ClassificationReport, ROCAUC, ClassBalance
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
    
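ClassBalance is imported above but never used; it can confirm visually whether the four occupancy classes are evenly represented before the classifiers are compared. A minimal sketch, assuming this Yellowbrick version follows the same fit/score/poof pattern as ClassificationReport; the RandomForestClassifier instance is only a placeholder estimator:

# Hypothetical usage of the imported ClassBalance visualizer; the estimator
# and class names mirror the ones used elsewhere in this notebook
balance = ClassBalance(RandomForestClassifier(), classes=classes)
balance.fit(X_train, y_train)
balance.score(X_test, y_test)
g = balance.poof()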
In [14]:
    
def fit_and_evaluate(dataset, model, label, **kwargs):
    """
    Because of the Scikit-Learn API, we can create a function to
    do all of the fit and evaluate work on our behalf!
    """
    start  = time.time() # Start the clock! 
    scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}

    # Split the dataset into features and target for cross-validation
    X = dataset.iloc[:, 0:-1].values
    y = dataset.iloc[:, -1].values

    for train, test in KFold(n_splits=12, shuffle=True).split(X):
        
        # Fit and evaluate a fresh estimator on each fold
        estimator = model(**kwargs)
        estimator.fit(X[train], y[train])
        
        expected  = y[test]
        predicted = estimator.predict(X[test])
        
        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))

    # Report
    print("Build and Validation of {} took {:0.3f} seconds".format(label, time.time()-start))
    print("Validation scores are as follows:\n")
    print(pd.DataFrame(scores).mean())
    
In [15]:
    
# Perform SVC Classification
svc = SVC()
fit_and_evaluate(df, SVC, "SVM Classifier")
    
    
In [16]:
    
visualizer = ClassificationReport(svc, classes=classes)
visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test)  
g = visualizer.poof()
    
    
In [28]:
    
# Perform kNN Classification
knn = KNeighborsClassifier()
fit_and_evaluate(df, KNeighborsClassifier, "kNN Classifier", n_neighbors=12)
    
    
In [29]:
    
visualizer = ClassificationReport(knn, classes=classes)
visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test)  
g = visualizer.poof()
    
    
In [30]:
    
# Perform Random Forest Classification
rfc = RandomForestClassifier()
fit_and_evaluate(df, RandomForestClassifier, "Random Forest Classifier")
    
    
In [31]:
    
visualizer = ClassificationReport(rfc, classes=classes)
visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test)  
g = visualizer.poof()
    
    
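ROCAUC is also imported above but never exercised; it adds a per-class ROC curve view to complement the classification reports. A minimal sketch, reusing the random forest from the previous cell and assuming the multiclass target is supported by this Yellowbrick version's ROCAUC:

# Hypothetical ROC/AUC visualization for the random forest classifier,
# following the same fit/score/poof pattern as ClassificationReport
visualizer = ROCAUC(rfc, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()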
In [ ]: