In [1]:
# Imports for the whole notebook (kept in one top cell).
# NOTE(review): `sklearn.cross_validation` and `sklearn.grid_search` were
# deprecated in scikit-learn 0.18 and removed in 0.20; on modern versions the
# same names live in `sklearn.model_selection`. Left as-is because this
# (Python 2) notebook targets a legacy environment — confirm before upgrading.
# NOTE(review): several of these imports (e.g. Pipeline, PCA, SelectKBest,
# the extra classifiers) are not used in the visible cells below; they may be
# used elsewhere or be leftovers from earlier experiments.
import numpy
import pandas
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, make_scorer, f1_score
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from math import ceil, sqrt
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
import os.path
import xgboost
import tsne
In [2]:
# Load the combined dataset: prefer the plain CSV, fall back to the gzipped copy.
if os.path.isfile("../data/processed/completeDataset.csv"):
    data = pandas.read_csv("../data/processed/completeDataset.csv", sep=',', quotechar='"')
elif os.path.isfile("../data/processed/completeDataset.csv.gz"):
    data = pandas.read_csv("../data/processed/completeDataset.csv.gz", compression='gzip', sep=',', quotechar='"')
else:
    # Fail fast: the original code only printed a warning here, which left
    # `data` undefined and produced a confusing NameError two lines later.
    raise IOError("The dataset is not in ../data/processed !")
# Quick sanity peek: overall shape plus the first and last few rows.
print(data.shape)
print(data.head(3))
print("...")
print(data.tail(3))
In [3]:
# Clean up the data, mirroring the R preprocessing shown in the comments below
# We only look for predicting 4 states of activity and 3 of social, the rest (incl.NA) we bunch in 'Other'
#fulldata$Activity.clean <- ifelse(is.na(as.character(fulldata$Activity.win)) |
# as.character(fulldata$Activity.win)=='OFF' |
# as.character(fulldata$Activity.win)=='TDT' |
# as.character(fulldata$Activity.win)=='TEC',
# 'Other',as.character(fulldata$Activity.win))
#fulldata$Social.clean <- ifelse(is.na(as.character(fulldata$Social.win)),
# 'Other',as.character(fulldata$Social.win))
#names(fulldata)[7562:7563] <- c('Activity','Social')
#fulldata <- fulldata[,-c(1,4,5,6)]
#fulldata$Activity <- factor(fulldata$Activity)
#fulldata$Social <- factor(fulldata$Social)
#test <- fulldata[fulldata$session=='case2-day3-session1-teacher2' | fulldata$session=='case1-day1-session1-teacher1',]
#train <- fulldata[fulldata$session!='case2-day3-session1-teacher2' & fulldata$session!='case1-day1-session1-teacher1',]
# Cleans up a vector of values, by assigning NaNs and others to a certain value
def clean_values(x, others=(), value='Other'):
    """Collapse missing values and selected labels into a single catch-all label.

    Parameters
    ----------
    x : object
        A single label value (may be NaN/None).
    others : iterable, optional
        Additional labels that should also be collapsed into `value`.
        Default is an immutable tuple (the original used a mutable list
        default, a classic Python pitfall; membership testing is unchanged).
    value : str, optional
        The catch-all label returned for missing entries and `others`.

    Returns
    -------
    object
        `value` if `x` is null or found in `others`, otherwise `x` unchanged.
    """
    if pandas.isnull(x) or (x in others):
        return value
    return x
# Collapse the rare activity codes (and NaN) into a single 'Other' bucket;
# for the social label only NaN is collapsed.
others = ['OFF', 'TDT', 'TEC']
data['Activity.clean'] = data['Activity.win'].apply(lambda v: clean_values(v, others))
data['Social.clean'] = data['Social.win'].apply(lambda v: clean_values(v))
data.shape
Out[3]:
In [4]:
# Drop the unnamed index column written by the CSV export, plus the raw label
# columns superseded by the cleaned ones, then give the cleaned labels their
# final names. A method chain (instead of repeated reassignment with
# inplace=True) leaves `data` untouched and keeps this cell idempotent on
# re-run.
# NOTE(review): the original rename passed index=str, which casts every index
# label to a string; preserved here, but confirm it is intentional.
cleandata = (data
             .drop(data.columns[[0]], axis=1)
             .drop(['timestamp.orig', 'Activity.win', 'Social.win'], axis=1)
             .rename(index=str,
                     columns={'Activity.clean': 'Activity', 'Social.clean': 'Social'}))
Both the training and testing datasets have the following general structure:

- **Rows** represent the features of each 10s window (overlapping, sliding every 5s), ordered by session ID and timestamp (in ms).
- **Columns** are the features themselves (with more-or-less-cryptic column names) — up to 7559 of them!
In [5]:
# Uncomment any of these to inspect the actual column names per slice:
# print cleandata.columns[0]
# print cleandata.columns[1]
# print cleandata.columns[2:12].values
# print cleandata.columns[12:152].values
# print cleandata.columns[152:6557].values
# print cleandata.columns[6557:7557].values
# print cleandata.columns[7557:].values
# Feature columns of X: everything between the two leading columns and the
# two label columns at the end.
feature_names = cleandata.columns[2:7557]
# Index ranges of each sensor modality *within* feature_names (not within
# cleandata). NOTE(review): boundaries are hard-coded to this dataset's
# layout — confirm against the commented column dumps above if data changes.
idx_eyetracking = range(0,10)
idx_acc = range(10,150)
idx_audio = range(150,6555)
idx_video = range(6555,7555)
# Example: inspect the audio feature names.
#print feature_names[idx_audio].values
In [6]:
# Categorical metadata, label encoders, and a leave-one-session-out split.
# The encoders are fit on the FULL dataset so train and test share the same
# label -> integer mapping (no unseen-label errors at transform time).
sessions = cleandata['session'].unique()
activities = cleandata['Activity'].unique()
socials = cleandata['Social'].unique()
# Idiom fix: LabelEncoder.fit returns self, so the original
# create-then-rebind two-step was redundant.
label_encoderA = LabelEncoder().fit(cleandata['Activity'].values)
label_encoderS = LabelEncoder().fit(cleandata['Social'].values)
# Hold out a single session for testing.
# TODO: loop over all sessions (leave-one-session-out CV) instead of
# hard-coding the 10th one.
# NOTE(review): assumes at least 10 distinct sessions exist — confirm.
s = sessions[9]
test = cleandata.loc[cleandata['session'] == s]
train = cleandata.loc[cleandata['session'] != s]
print(train.shape)
print(test.shape)
In [7]:
# Split features from the two targets (Activity and Social) in both
# partitions, as plain numpy arrays.
X_train = train[feature_names].values
Y_trainA = train['Activity'].values
Y_trainS = train['Social'].values
X_test = test[feature_names].values
Y_testA = test['Activity'].values
Y_testS = test['Social'].values
# Sanity peek at a small corner of the feature matrix and the first labels.
print(X_train[:5, :15])
print(Y_trainA[:5])
In [11]:
# Model preparation for the Activity target, using ALL features.
# (Uncomment the next line to restrict to the eye-tracking subset instead.)
#X = X_train[:,idx_eyetracking]
X = X_train
Y = Y_trainA
label_encoded_y = label_encoderA.transform(Y)
Xval, Yval = X_test, Y_testA
print(X.shape)
In [12]:
# Model training: fit an XGBoost classifier with default hyperparameters on
# the full training features and the integer-encoded Activity labels.
model = xgboost.XGBClassifier()
model.fit(X, label_encoded_y)
# Printing the fitted model shows its hyperparameter configuration.
print model
In [13]:
# Model evaluation on the held-out session (Activity target).
label_encoded_yval = label_encoderA.transform(Yval)
Ypred = model.predict(Xval)
print(classification_report(label_encoded_yval, Ypred))
print(confusion_matrix(label_encoded_yval, Ypred))
print("Accuracy %f " % accuracy_score(label_encoded_yval, Ypred))
# Extra metrics left disabled: cohen_kappa_score requires scikit-learn 0.18+.
#print "AUC %f " % roc_auc_score(label_encoded_yval, Ypred, average='macro')
#print "Kappa %f " % cohen_kappa_score(label_encoded_yval, Ypred, average='macro')
#TODO: Install scikit learn 0.18+
In [14]:
# Model preparation for the Social target, using ALL features.
# (Uncomment the next line to restrict to the eye-tracking subset instead.)
#X = X_train[:,idx_eyetracking]
X = X_train
Y = Y_trainS
label_encoded_y = label_encoderS.transform(Y)
Xval, Yval = X_test, Y_testS
print(X.shape)
In [15]:
# Model training: fit an XGBoost classifier with default hyperparameters on
# the full training features and the integer-encoded Social labels.
# NOTE: this rebinds the module-level `model` used by the evaluation cell.
model = xgboost.XGBClassifier()
model.fit(X, label_encoded_y)
# Printing the fitted model shows its hyperparameter configuration.
print model
In [17]:
# Model evaluation on the held-out session (Social target).
label_encoded_yval = label_encoderS.transform(Yval)
Ypred = model.predict(Xval)
print(classification_report(label_encoded_yval, Ypred))
print(confusion_matrix(label_encoded_yval, Ypred))
print("Accuracy %f " % accuracy_score(label_encoded_yval, Ypred))
# Extra metrics left disabled: cohen_kappa_score requires scikit-learn 0.18+.
#print "AUC %f " % roc_auc_score(label_encoded_yval, Ypred, average='macro')
#print "Kappa %f " % cohen_kappa_score(label_encoded_yval, Ypred, average='macro')
#TODO: Install scikit learn 0.18+