1. Preparation

Load libraries



In [1]:

    
import numpy
import pandas
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, make_scorer, f1_score
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from math import ceil, sqrt
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
import os.path
import xgboost
import tsne

Load dataset



In [2]:

    
# Load the complete dataset, preferring the plain CSV and falling back to the gzipped copy.
if os.path.isfile("../data/processed/completeDataset.csv"):
    data = pandas.read_csv("../data/processed/completeDataset.csv", sep=',', quotechar='"')
elif os.path.isfile("../data/processed/completeDataset.csv.gz"):
    data = pandas.read_csv("../data/processed/completeDataset.csv.gz", compression='gzip', sep=',', quotechar='"')
else:
    # Stop here with a clear message: merely printing would leave `data` undefined,
    # and the shape/head prints just below would then fail with a NameError.
    raise IOError("The dataset is not in ../data/processed !")

# Quick look at the size and at the first/last rows of the raw table.
# Single-argument print(...) behaves the same on Python 2 and also runs on Python 3.
print(data.shape)
print(data.head(3))
print("...")
print(data.tail(3))









    



(5561, 7561)
   Unnamed: 0                       session  timestamp  timestamp.orig  \
0         351  case1-day1-session1-teacher1       5000          100100   
1           1  case1-day1-session1-teacher1      10000          105100   
2         110  case1-day1-session1-teacher1      15000          110100   

  Activity.win Social.win  value.Mean  value.SD  value.Fix  value.Sac  \
0          TDT        CLS    3.731296  0.610973          0   0.271604   
1          TDT        CLS    3.623267  0.579418          0   0.294837   
2          TDT        CLS    3.663333  0.578927          0   0.230081   

   value.Fix.Dur  value.Fix.Disp  value.Sac.Dur  value.Sac.Amp  value.Sac.Len  \
0     155.517241      103.428029      77.047619      22.980952     210.109938   
1     195.000000      140.122022      78.210526      25.407895     221.507133   
2     193.250000      114.539547      75.232558      19.411628     171.338448   

   value.Sac.Vel  value.X.Mean  value.X.SD  value.X.Max  value.X.Min      
0       2.842520      1.968844    3.606027       13.810       -5.660 ...  
1       2.896652      0.820018    4.076379       13.810       -9.864 ...  
2       2.230033     -1.459000    2.536654       10.151       -9.864 ...  

[3 rows x 7561 columns]
...
      Unnamed: 0                       session  timestamp  timestamp.orig  \
5558        5399  case2-day4-session2-teacher2    2815000         2843900   
5559        5400  case2-day4-session2-teacher2    2820000         2848900   
5560        5401  case2-day4-session2-teacher2    2825000         2853900   

     Activity.win Social.win  value.Mean  value.SD  value.Fix  value.Sac  \
5558          NaN        NaN    2.938600  0.364288          0   0.129524   
5559          NaN        NaN    2.775017  0.266534          0   0.127137   
5560          NaN        NaN         NaN       NaN        NaN        NaN   

      value.Fix.Dur  value.Fix.Disp  value.Sac.Dur  value.Sac.Amp  \
5558     179.870968       89.115134      93.666667      13.161111   
5559     163.848485       64.801448      87.222222      11.288889   
5560            NaN             NaN            NaN            NaN   

      value.Sac.Len  value.Sac.Vel  value.X.Mean  value.X.SD  value.X.Max  \
5558     185.952369       1.938144      9.632364    0.629687       13.111   
5559     145.475801       1.758783      9.574206    0.498873       13.111   
5560            NaN            NaN           NaN         NaN          NaN   

      value.X.Min      
5558        8.351 ...  
5559        8.351 ...  
5560          NaN ...  

[3 rows x 7561 columns]



In [3]:

    
# Clean up the data the same way the R script did (the original R code is kept below for reference)
# We only try to predict 4 activity states and 3 social states; everything else (incl. NA) is lumped into 'Other'
#fulldata$Activity.clean <- ifelse(is.na(as.character(fulldata$Activity.win)) | 
#                                      as.character(fulldata$Activity.win)=='OFF' |
#                                      as.character(fulldata$Activity.win)=='TDT' |
#                                      as.character(fulldata$Activity.win)=='TEC',
#                                  'Other',as.character(fulldata$Activity.win))

#fulldata$Social.clean <- ifelse(is.na(as.character(fulldata$Social.win)),
#                                  'Other',as.character(fulldata$Social.win))


#names(fulldata)[7562:7563] <- c('Activity','Social')
#fulldata <- fulldata[,-c(1,4,5,6)]
#fulldata$Activity <- factor(fulldata$Activity)
#fulldata$Social <- factor(fulldata$Social)


#test <- fulldata[fulldata$session=='case2-day3-session1-teacher2' | fulldata$session=='case1-day1-session1-teacher1',]
#train <- fulldata[fulldata$session!='case2-day3-session1-teacher2' & fulldata$session!='case1-day1-session1-teacher1',]

# Label clean-up helper: works on ONE value at a time (use it with Series.apply)
def clean_values(x, others=[], value='Other'):
    """Collapse a missing or unwanted label into a catch-all value.

    x      -- the label to clean (a single scalar, possibly null/NaN)
    others -- labels that should also be replaced
    value  -- replacement returned for null or unwanted labels

    Returns `value` when `x` is null or contained in `others`,
    otherwise returns `x` unchanged.
    """
    # The null check comes first so that `x in others` is never evaluated for nulls
    is_unwanted = pandas.isnull(x) or (x in others)
    return value if is_unwanted else x

# Activity labels we do not try to predict; like missing values they become 'Other'
others = ['OFF', 'TDT', 'TEC']

# Activity: nulls and the labels listed above are replaced
data['Activity.clean'] = data['Activity.win'].apply(clean_values, others=others)
# Social: only nulls are replaced (clean_values is called with its defaults)
data['Social.clean'] = data['Social.win'].apply(clean_values)

data.shape









    



/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:92: DeprecationWarning: DisplayFormatter._ipython_display_formatter_default is deprecated: use @default decorator instead.
  def _ipython_display_formatter_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:98: DeprecationWarning: DisplayFormatter._formatters_default is deprecated: use @default decorator instead.
  def _formatters_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:677: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.
  def _deferred_printers_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:669: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.
  def _singleton_printers_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:672: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.
  def _type_printers_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:669: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.
  def _singleton_printers_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:672: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.
  def _type_printers_default(self):
/usr/local/lib/python2.7/dist-packages/IPython/core/formatters.py:677: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.
  def _deferred_printers_default(self):






    Out[3]:





(5561, 7563)



In [4]:

    
# Remove the first (meaningless) CSV column and the raw columns we no longer use,
# then give the cleaned target columns their final names.
# NOTE(review): index=str makes rename() apply str to the row labels as well --
# presumably carried over from the pandas docs example; confirm it is wanted.
cleandata = (
    data
    .drop(data.columns[[0]], axis=1)
    .drop(['timestamp.orig', 'Activity.win', 'Social.win'], axis=1)
    .rename(index=str, columns={'Activity.clean': 'Activity', 'Social.clean': 'Social'})
)

#print cleandata.head(3)
#print cleandata.columns[-2:].values

Dataset overview

Both the training and testing datasets have the following general structure:

''Rows'' represent the features of each 10s window (overlapping/sliding 5s), ordered by session ID and its timestamp (in ms)
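''Columns'' are the features themselves (they have more-or-less-cryptic column names): 7559 columns in total, 7555 of which are actual features (the rest are the session id, the timestamp and the two targets)!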
- [,0]: ''session id''
- [,1]: ''timestamp'' within the session (in ms)
- [,2:11]: ''eyetracking'' features (mean/sd pupil diameter, nr. of long fixations, avg. saccade speed, fixation duration, fixation dispersion, saccade duration, saccade amplitude, saccade length, saccade velocity)
- [,12:151]: ''accelerometer'' features, including X, Y, Z (mean, sd, max, min, median, and 30 FFT coefficients of each of them) and jerk (mean, sd, max, min, median, and 30 FFT coefficients of each of it)
- [,152:6556]: ''audio'' features extracted from an audio snippet of the 10s window, using openSMILE. Includes features about whether there is someone speaking (153:163), emotion recognition models (164:184), and brute-force audio spectrum features and characteristics used in various audio recognition challenges/tasks (185:6557)
- [,6557:7556]: ''video'' features extracted from an image taken in the middle of the window (the 1000 values of the last layer when passing the image through a VGG pre-trained model)
- [,7557:7558]: Activity and Social, the two orchestration graph dimensions we want to predict



In [5]:

    
# Uncomment to get the actual column names of each group
# print cleandata.columns[0]
# print cleandata.columns[1]
# print cleandata.columns[2:12].values
# print cleandata.columns[12:152].values
# print cleandata.columns[152:6557].values
# print cleandata.columns[6557:7557].values
# print cleandata.columns[7557:].values

# Names of the columns that make up X (columns 2..7556 of cleandata)
feature_names = cleandata.columns[2:7557]

# Positions of each sensor modality *within feature_names*
# (i.e. the cleandata column position minus the 2 leading columns)
idx_eyetracking = range(10)
idx_acc = range(10, 150)
idx_audio = range(150, 6555)
idx_video = range(6555, 7555)
#print feature_names[idx_audio].values

Basic split

For now, we just leave out one session by teacher 2 (later, do a loop)



In [6]:

    
# Distinct sessions and target classes found in the whole dataset
sessions = cleandata['session'].unique()
activities = cleandata['Activity'].unique()
socials = cleandata['Social'].unique()

# One encoder per target, fitted on all rows (train and test alike)
label_encoderA = LabelEncoder().fit(cleandata['Activity'].values)
label_encoderS = LabelEncoder().fit(cleandata['Social'].values)


# for s in sessions: ...
# For now a single session is held out as the test set; everything else is training data
s = sessions[9]
heldout_mask = cleandata['session'] == s
test = cleandata.loc[heldout_mask]
train = cleandata.loc[~heldout_mask]

print(train.shape)
print(test.shape)









    



(5065, 7559)
(496, 7559)



In [7]:

    
# Split each partition into its feature matrix and its two target vectors (Activity, Social)
X_train, X_test = train[feature_names].values, test[feature_names].values
Y_trainA, Y_testA = train['Activity'].values, test['Activity'].values
Y_trainS, Y_testS = train['Social'].values, test['Social'].values

# Sanity check: first 5 rows x 15 columns of the features, and the first 5 Activity labels
print(X_train[:5, :15])
print(Y_trainA[:5])









    



[[  3.73129568e+00   6.10972980e-01   0.00000000e+00   2.71603612e-01
    1.55517241e+02   1.03428029e+02   7.70476190e+01   2.29809524e+01
    2.10109938e+02   2.84252017e+00   1.96884431e+00   3.60602652e+00
    1.38100004e+01  -5.65999985e+00   7.55999982e-01]
 [  3.62326667e+00   5.79418457e-01   0.00000000e+00   2.94837273e-01
    1.95000000e+02   1.40122022e+02   7.82105263e+01   2.54078947e+01
    2.21507133e+02   2.89665210e+00   8.20018070e-01   4.07637903e+00
    1.38100004e+01  -9.86400032e+00   7.59999994e-02]
 [  3.66333333e+00   5.78927236e-01   0.00000000e+00   2.30080752e-01
    1.93250000e+02   1.14539547e+02   7.52325581e+01   1.94116279e+01
    1.71338448e+02   2.23003256e+00  -1.45900001e+00   2.53665438e+00
    1.01510000e+01  -9.86400032e+00  -1.58450001e+00]
 [  3.42378738e+00   7.60500016e-01   0.00000000e+00   2.60190343e-01
    1.60333333e+02   1.02295594e+02   7.67500000e+01   2.31625000e+01
    1.72933977e+02   2.12088150e+00  -1.96377246e+00   1.44517212e+00
    2.83400011e+00  -6.68400002e+00  -2.05900002e+00]
 [  2.98644518e+00   5.94602098e-01   0.00000000e+00   3.60908974e-01
    1.25866667e+02   1.08804796e+02   8.04102564e+01   3.37589744e+01
    2.33127601e+02   2.75841294e+00  -1.91071856e+00   1.06853026e+00
    2.61400008e+00  -5.61199999e+00  -1.90499997e+00]]
['Other' 'Other' 'Other' 'Other' 'EXP']

A basic benchmark: XGBoost

How does XGBoost (a decision tree ensemble) perform on the whole dataset?

Teacher activity



In [11]:

    
# Model preparation and feature selection

# Target for this run: teacher Activity
# To train on a single modality instead, e.g.: X = X_train[:,idx_eyetracking]
X, Y = X_train, Y_trainA
Xval, Yval = X_test, Y_testA

# Integer codes of the training labels, as produced by the Activity encoder
label_encoded_y = label_encoderA.transform(Y)
print(X.shape)









    



(5065, 7555)



In [12]:

    
# Model training: XGBoost classifier with its default hyper-parameters
model = xgboost.XGBClassifier().fit(X, label_encoded_y)
print(model)









    



XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective=multi:softprob, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)



In [13]:

    
# Model evaluation: compare the predictions for the validation rows
# with the integer-encoded true Activity labels
label_encoded_yval = label_encoderA.transform(Yval)
Ypred = model.predict(Xval)

print(classification_report(label_encoded_yval, Ypred))
print(confusion_matrix(label_encoded_yval, Ypred))
print("Accuracy %f " % accuracy_score(label_encoded_yval, Ypred))
#print "AUC %f " % roc_auc_score(label_encoded_yval, Ypred, average='macro')
#print "Kappa %f " % cohen_kappa_score(label_encoded_yval, Ypred, average='macro')
#TODO: Install scikit learn 0.18+









    



             precision    recall  f1-score   support

          0       0.73      0.65      0.68       172
          1       0.63      0.39      0.48       169
          2       0.19      0.59      0.29        73
          3       0.00      0.00      0.00        36
          4       0.50      0.11      0.18        46

avg / total       0.54      0.45      0.46       496

[[111  12  46   0   3]
 [  6  66  97   0   0]
 [ 17  11  43   0   2]
 [  4   3  29   0   0]
 [ 15  12  12   2   5]]
Accuracy 0.453629 






    



/usr/lib/python2.7/dist-packages/sklearn/metrics/metrics.py:1905: UserWarning: The precision and recall are equal to zero for some labels. fbeta_score is ill defined for those labels [3]. 
  average=None)



In [14]:

    
# Model preparation and feature selection

# Target for this run: Social (the training labels are Y_trainS)
# To train on a single modality instead, e.g.: X = X_train[:,idx_eyetracking]
X, Y = X_train, Y_trainS
Xval, Yval = X_test, Y_testS

# Integer codes of the training labels, as produced by the Social encoder
label_encoded_y = label_encoderS.transform(Y)
print(X.shape)









    



(5065, 7555)



In [15]:

    
# Model training: XGBoost classifier with its default hyper-parameters
model = xgboost.XGBClassifier().fit(X, label_encoded_y)
print(model)









    



XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective=multi:softprob, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)



In [17]:

    
# Model evaluation: compare the predictions for the validation rows
# with the integer-encoded true Social labels
label_encoded_yval = label_encoderS.transform(Yval)
Ypred = model.predict(Xval)

print(classification_report(label_encoded_yval, Ypred))
print(confusion_matrix(label_encoded_yval, Ypred))
print("Accuracy %f " % accuracy_score(label_encoded_yval, Ypred))
#print "AUC %f " % roc_auc_score(label_encoded_yval, Ypred, average='macro')
#print "Kappa %f " % cohen_kappa_score(label_encoded_yval, Ypred, average='macro')
#TODO: Install scikit learn 0.18+









    



             precision    recall  f1-score   support

          0       0.87      0.87      0.87       396
          1       0.33      0.04      0.07        48
          2       0.10      0.27      0.15        11
          3       0.22      0.37      0.28        41

avg / total       0.75      0.73      0.73       496

[[343   1  10  42]
 [ 29   2   7  10]
 [  5   2   3   1]
 [ 16   1   9  15]]
Accuracy 0.731855

Other splits are possible! (TODO: create a test harness that tries all of these on our best models)

General model -- Leave one teacher out: train on data for one teacher, test on data for another teacher (we only have two teachers!)
General model -- Leave one situation out: train on data for two teachers, but leave all the sessions for one kind of situation out
Personalized model -- Leave one session out: train on data for one teacher, but leave one session out
Personalized model -- Leave one situation out: train on data for one teacher, but leave one kind of situation out (can only be done with teacher 2)