In [4]:
import numpy
import pandas
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, make_scorer, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from math import ceil, sqrt
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
In [7]:
data = pandas.read_csv("../data/processed/completeDataset.csv.gz", compression='gzip', header=0, sep=',', quotechar='"')
In [12]:
type(data)
Out[12]:
pandas.core.frame.DataFrame
... we leave out, as a test set, one session per teacher, covering a variety of Activity and Social states. This will give us a (quite optimistic) estimate of how good a "general model" (one that works across subjects) can be on data from a teacher it has seen, in a classroom situation it has seen (as there are multiple sessions for each kind of classroom situation), but with different students. A pandas sketch of this session-based split appears after the preprocessing cell below.
In [ ]:
# We only aim to predict 4 states of Activity and 3 of Social; the rest (incl. NA) are bunched into 'Other'
#fulldata$Activity.clean <- ifelse(is.na(as.character(fulldata$Activity.win)) |
# as.character(fulldata$Activity.win)=='OFF' |
# as.character(fulldata$Activity.win)=='TDT' |
# as.character(fulldata$Activity.win)=='TEC',
# 'Other',as.character(fulldata$Activity.win))
#fulldata$Social.clean <- ifelse(is.na(as.character(fulldata$Social.win)),
# 'Other',as.character(fulldata$Social.win))
#names(fulldata)[7562:7563] <- c('Activity','Social')
#fulldata <- fulldata[,-c(1,4,5,6)]
#fulldata$Activity <- factor(fulldata$Activity)
#fulldata$Social <- factor(fulldata$Social)
#test <- fulldata[fulldata$session=='case2-day3-session1-teacher2' | fulldata$session=='case1-day1-session1-teacher1',]
#train <- fulldata[fulldata$session!='case2-day3-session1-teacher2' & fulldata$session!='case1-day1-session1-teacher1',]
# Drop rows with any missing values from the training dataframe
notnull_data = data[data.notnull().all(axis=1)]
train = notnull_data.values
# data2 is assumed to hold the held-out test sessions, loaded analogously to data above
notnull_data2 = data2[data2.notnull().all(axis=1)]
test = notnull_data2.values
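The commented-out R code above shows how the original session-based split was made. Here is a minimal pandas sketch of the same idea; it is only an illustration, assuming a single dataframe (named fulldata_df here, a hypothetical name) that has a 'session' column as in the R version, and reusing the two held-out session IDs listed there:
In [ ]:
# Sketch only: how the session-based train/test split could be reproduced in pandas
# (assumes a hypothetical 'fulldata_df' with a 'session' column; session IDs copied from the R snippet above)
held_out_sessions = ['case2-day3-session1-teacher2', 'case1-day1-session1-teacher1']
is_test = fulldata_df['session'].isin(held_out_sessions)
test_df = fulldata_df[is_test]
train_df = fulldata_df[~is_test]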
Both the training and testing datasets have the following general structure:
''Rows'' represent the 10s windows (overlapping/sliding every 5s), ordered by session ID and timestamp (in ms)
''Columns'' are the features themselves (they have more-or-less-cryptic column names), up to 7559 of them!
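A quick sanity check of that structure (a minimal sketch; it only inspects the dataframe already loaded above):
In [ ]:
# Inspect the overall shape and a few of the (cryptic) column names
print(data.shape)
print(data.columns[:10].tolist())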
Since Random Forest (RF) performed quite well in most cases on our LAK paper dataset, let's try it on the whole dataset and see what comes out, as a baseline for modelling accuracy. In principle, we use AUC (area under the ROC curve) as the main metric for model comparison; a cross-validated RF/AUC sketch follows the feature/target split below.
In [ ]:
# Separate the target values (Activity and Social) from the features
X_train = train[:,3:7558].astype(float)
Y_trainA = train[:,7558] #Activity
Y_trainS = train[:,7559] #Social
X_test = test[:,3:7558].astype(float)
Y_testA = test[:,7558]
Y_testS = test[:,7559]
# Feature names of X, taken from the dataframe's column names (same order as the value matrix)
feature_names = data.columns[3:7558]
# Column index ranges for each data modality
idx_eyetracking = range(0,10)
idx_acc = range(10,150)
idx_audio = range(150,6555)
idx_video = range(6555,7555)
#print(feature_names[idx_video])
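As a first pass at that baseline, here is a minimal sketch of a cross-validated RF evaluated with multi-class AUC on the Activity target. It assumes a recent scikit-learn (where sklearn.model_selection and the 'roc_auc_ovr' scorer are available); the number of trees, folds, and random seed are arbitrary illustrative choices, not tuned values:
In [ ]:
# Sketch only: RF baseline on the Activity target, compared via one-vs-rest AUC
rf = Pipeline([
    ('scaler', StandardScaler()),  # scaling is not required for RF, kept for consistency with other models
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))
])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
aucs = cross_val_score(rf, X_train, Y_trainA, cv=cv, scoring='roc_auc_ovr')
print("Cross-validated AUC (Activity): %.3f +/- %.3f" % (aucs.mean(), aucs.std()))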