In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
In [29]:
# Location of the semicolon-separated CSV export that load_data() reads.
ftd_csv_file = (
    "/data/shared/bvFTD/"
    "AMC_VUMC_bvFTD.csv"
)
In [30]:
def load_data(csv_path=None):
    """Read the bvFTD table into a DataFrame.

    Parameters
    ----------
    csv_path : str, optional
        File to read. Defaults to the module-level ``ftd_csv_file`` so that
        existing ``load_data()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        The parsed table, one column per field in the file.
    """
    if csv_path is None:
        csv_path = ftd_csv_file
    # The separator is ';' rather than the default ',' because of SPSS.
    return pd.read_csv(csv_path, sep=';')
In [37]:
# Load the raw table; later cells rebind and modify ftd_csv_df.
ftd_csv_df = load_data()
In [38]:
# Quick look at the first five rows of the freshly loaded table.
ftd_csv_df.head(n=5)
Out[38]:
In [39]:
# Preview the columns whose names start with 'VG'.
# DataFrame.select was removed in pandas 1.0, so select with a boolean
# column mask instead.
ftd_csv_df.loc[:, ftd_csv_df.columns.str.startswith('VG')].head()
Out[39]:
In [41]:
# Preview the MMSE and FAB columns.
ftd_csv_df[['MMSE', 'FAB']].head()
Out[41]:
In [ ]:
In [42]:
# Binary target as floats: 1.0 where DxT2 is 'ProbableFTD' or 'Def. FTD',
# 0.0 for every other row.
ftd_csv_df['diagnosis_label'] = (
    ftd_csv_df['DxT2'].isin(['ProbableFTD', 'Def. FTD']).astype(np.float64)
)
In [44]:
# Number of rows labelled 1 (the labels are 0/1, so the sum is a count).
ftd_csv_df['diagnosis_label'].sum()
Out[44]:
In [48]:
def select_VG_and_MED(x):
    """Return True for column names that start with 'VG' or 'MED'.

    Names starting with 'VG_TEKST' are rejected: the free-text column has to
    be excluded.
    """
    return x.startswith(('VG', 'MED')) and not x.startswith('VG_TEKST')
In [49]:
# Collect every VG/MED cell value that is neither 'ja' nor 'nee'.
# DataFrame.select and DataFrame.as_matrix were removed in pandas 1.0, so the
# columns are picked by label and converted with to_numpy().
vg_med_columns = [col for col in ftd_csv_df.columns if select_VG_and_MED(col)]
VG_MED_col_array = ftd_csv_df.loc[:, vg_med_columns].to_numpy()
id_non_standard = (VG_MED_col_array != 'ja') & (VG_MED_col_array != 'nee')
# Distinct non-standard values; used afterwards to find the rows to drop.
outliers = np.unique(VG_MED_col_array[id_non_standard])
In [50]:
# Flag every cell whose value is one of the non-standard answers in `outliers`.
empty_string_df = ftd_csv_df.isin(outliers)
# A row is marked for removal when any of its VG/MED cells is flagged.
# DataFrame.select was removed in pandas 1.0, so the columns are picked by label.
vg_med_columns = [col for col in empty_string_df.columns if select_VG_and_MED(col)]
id_subj_remove = empty_string_df.loc[:, vg_med_columns].any(axis=1)
In [51]:
# Display the per-row boolean mask of rows marked for removal.
id_subj_remove
Out[51]:
In [52]:
# Keep only the rows that were not marked for removal.
rows_to_keep = ~id_subj_remove
ftd_csv_df = ftd_csv_df.loc[rows_to_keep]
In [53]:
# Recode the answers anywhere in the frame: 'ja' -> 1.0, then 'nee' -> 0.0.
for answer, code in (('ja', 1.), ('nee', 0.)):
    ftd_csv_df = ftd_csv_df.mask(ftd_csv_df.isin([answer]), code)
In [54]:
# Encode Sex numerically: 'm' -> 0.0, 'f' -> 1.0.
# Assign through a single .loc[rows, column] call: the chained form
# `df.Sex.loc[...] = value` may write to a temporary copy and leave the frame
# unchanged.
ftd_csv_df.loc[ftd_csv_df['Sex'] == 'm', 'Sex'] = 0.
ftd_csv_df.loc[ftd_csv_df['Sex'] == 'f', 'Sex'] = 1.
# np.float was only an alias for the builtin float and was removed in NumPy 1.24.
# Any value other than 'm'/'f' still raises here instead of passing silently.
ftd_csv_df['Sex'] = ftd_csv_df['Sex'].astype(float)
In [55]:
# The ages are stored as strings with a decimal comma, which float conversion
# cannot parse, so swap ',' for '.' first (literal replacement, not a regex).
# The builtin float replaces np.float, an alias that was removed in NumPy 1.24.
ftd_csv_df['age_Dx_T0'] = (
    ftd_csv_df['age_Dx_T0'].str.replace(',', '.', regex=False).astype(float)
)
In [56]:
# List the column names of the frame at this point in the notebook.
ftd_csv_df.columns
Out[56]:
In [57]:
# Assemble the feature matrix X, column order: VG/MED columns, MMSE, age, sex.
# DataFrame.select / as_matrix were removed in pandas 1.0 and np.float in
# NumPy 1.24; label-based selection, to_numpy() and the builtin float replace them.
vg_med_columns = [col for col in ftd_csv_df.columns if select_VG_and_MED(col)]

# VG and MED features
X_VG_MED = ftd_csv_df.loc[:, vg_med_columns].to_numpy().astype(float)
X_MMSE = ftd_csv_df.loc[:, 'MMSE'].to_numpy().astype(float)
X_sex_age = ftd_csv_df.loc[:, ['age_Dx_T0', 'Sex']].to_numpy()
# MMSE is 1-D, so give it a trailing axis before stacking column-wise.
X = np.concatenate((X_VG_MED, X_MMSE[:, np.newaxis], X_sex_age), axis=1)
In [58]:
# Shape of the assembled feature matrix.
np.shape(X)
Out[58]:
In [59]:
# Target vector: the 0/1 diagnosis labels as a NumPy integer array.
# np.int (alias of the builtin int) was removed in NumPy 1.24 and
# Series.as_matrix in pandas 1.0.
y = ftd_csv_df['diagnosis_label'].astype(int).to_numpy()
In [60]:
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# The fallback only matters for very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
In [61]:
# The three classifiers compared below.
svm = SVC(kernel='linear')
# Fixed seed so the forest's cross-validated scores are reproducible on re-run.
rforest = RandomForestClassifier(random_state=0)
# An L1 penalty needs a solver that supports it; name 'liblinear' explicitly
# because the default solver in current scikit-learn ('lbfgs') rejects
# penalty='l1'.
log_reg = LogisticRegression(penalty='l1', solver='liblinear')
In [62]:
# Score each classifier with the same settings: ROC AUC, cv=10, 20 parallel jobs.
cv_settings = dict(scoring='roc_auc', cv=10, n_jobs=20, verbose=1)
auc_svm, auc_rforest, auc_log_reg = [
    cross_val_score(estimator, X, y, **cv_settings)
    for estimator in (svm, rforest, log_reg)
]
In [65]:
# Mean ROC AUC across the folds for logistic regression.
np.mean(auc_log_reg)
Out[65]:
In [66]:
# Mean ROC AUC across the folds for the random forest.
np.mean(auc_rforest)
Out[66]:
In [67]:
# Mean ROC AUC across the folds for the linear SVM.
np.mean(auc_svm)
Out[67]:
In [ ]: