In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [29]:
ftd_csv_file = '/data/shared/bvFTD/AMC_VUMC_bvFTD.csv'

In [30]:
def load_data():
    # seperator is different from standard because of SPSS 
    return pd.read_csv(ftd_csv_file, sep=';')

In [37]:
ftd_csv_df = load_data()

In [38]:
ftd_csv_df.head()


Out[38]:
Status_scan ID_D Geboortedatum Sex age_Dx_T0 OnsetDis Date_DX_T0 Education_Verhage DxT2_filtered DxT2 ... MED_Anticholinerg MED_Antipsychotica MED_Antidepr MED_Antiepileptica MED_Parkinsonisme MMSE FAB FBI_tot_baseline SRI_tot_baseline diagnosis_cat
0 4800 5/13/1955 m 56,3148528405202 2009 9/5/2011 4 1 Subjectieve klachten ... nee nee nee nee nee 28 18 36 0
1 5039 7/24/1948 m 63,0691307323751 2009 8/19/2011 5 1 Subjectieve klachten ... nee nee nee nee nee 25 18 21 10
2 5160 3/28/1939 m 72,3449691991786 2010 8/1/2011 6 1 Subjectieve klachten ... nee nee nee nee nee 30 12 11 3
3 6546 6/28/1965 m 47,7316906228611 2010 3/22/2013 6 1 Subjectieve klachten ... nee nee nee nee nee 29 16 17 0
4 6801 10/5/1953 m 59,9096509240246 2012 9/2/2013 7 1 Subjectieve klachten ... nee nee ja nee nee 28 18 12 2

5 rows × 31 columns


In [39]:
ftd_csv_df.select(lambda x: x.startswith('VG'), axis=1).head()


Out[39]:
VG_HVZ VG_MCI VG_PSY VG_ALS VG_PARK VG_AUTISME VG_DYSLE VG_THYR VG_TEKST
0 ja nee nee nee nee nee nee nee contusio cerebri 1973 doof li '97 myocard infarct
1 nee nee ja nee nee nee nee nee depressie in 2003
2 ja nee nee nee nee nee nee nee
3 nee nee nee nee nee nee nee nee 2006 analyse onverklaarde hoofdpijn
4 nee nee nee nee nee nee nee nee

In [41]:
ftd_csv_df.loc[:, ['MMSE', 'FAB']].head()


Out[41]:
MMSE FAB
0 28 18
1 25 18
2 30 12
3 29 16
4 28 18

In [ ]:

FTD is 1, everything else is 0


In [42]:
ftd_csv_df['diagnosis_label'] = np.zeros(ftd_csv_df.shape[0])
ftd_csv_df.loc[ftd_csv_df.DxT2 == 'ProbableFTD', 'diagnosis_label'] = 1
ftd_csv_df.loc[ftd_csv_df.DxT2 == 'Def. FTD', 'diagnosis_label'] = 1

In [44]:
ftd_csv_df.diagnosis_label.sum()


Out[44]:
23.0

In [48]:
# we have to exclude the free text column 
select_VG_and_MED = lambda x: (x.startswith('VG') or x.startswith('MED')) and not x.startswith('VG_TEKST')

In [49]:
VG_MED_col_array = ftd_csv_df.select(select_VG_and_MED, axis=1).as_matrix()
id_non_standard = (VG_MED_col_array != 'ja') & (VG_MED_col_array != 'nee')
outliers = np.unique(VG_MED_col_array[id_non_standard])

In [50]:
empty_string_df = ftd_csv_df.isin(outliers)
id_subj_remove = empty_string_df.select(select_VG_and_MED, axis=1).any(axis=1)

In [51]:
id_subj_remove


Out[51]:
0      False
1      False
2      False
3      False
4      False
5      False
6       True
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14      True
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26      True
27     False
28      True
29     False
       ...  
90     False
91     False
92     False
93     False
94     False
95     False
96     False
97      True
98     False
99     False
100    False
101    False
102    False
103     True
104     True
105    False
106    False
107     True
108    False
109    False
110    False
111    False
112    False
113    False
114    False
115     True
116    False
117    False
118    False
119    False
dtype: bool

In [52]:
ftd_csv_df = ftd_csv_df.loc[~id_subj_remove]

In [53]:
ftd_csv_df = ftd_csv_df.mask(ftd_csv_df.isin(['ja']), 1.)
ftd_csv_df = ftd_csv_df.mask(ftd_csv_df.isin(['nee']), 0.)

In [54]:
ftd_csv_df.Sex.loc[ftd_csv_df.Sex == 'm'] = 0.
ftd_csv_df.Sex.loc[ftd_csv_df.Sex == 'f'] = 1.
ftd_csv_df.Sex = ftd_csv_df.Sex.astype(np.float)


/data/wbbruin/anaconda2/lib/python2.7/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [55]:
# the string float values were with a ',' and np.float don't work on that
ftd_csv_df.age_Dx_T0 = ftd_csv_df.age_Dx_T0.str.replace(',', '.').astype(np.float)

In [56]:
ftd_csv_df.columns


Out[56]:
Index([u'Status_scan', u'ID_D', u'Geboortedatum', u'Sex', u'age_Dx_T0',
       u'OnsetDis', u'Date_DX_T0', u'Education_Verhage', u'DxT2_filtered',
       u'DxT2', u'Vrij_tekst_Dx', u'VG_HVZ', u'VG_MCI', u'VG_PSY', u'VG_ALS',
       u'VG_PARK', u'VG_AUTISME', u'VG_DYSLE', u'VG_THYR', u'VG_TEKST',
       u'MED_Seditiva', u'MED_Anticholinerg', u'MED_Antipsychotica',
       u'MED_Antidepr', u'MED_Antiepileptica', u'MED_Parkinsonisme', u'MMSE',
       u'FAB', u'FBI_tot_baseline', u'SRI_tot_baseline', u'diagnosis_cat',
       u'diagnosis_label'],
      dtype='object')

In [57]:
# VG and MED features
X_VG_MED = ftd_csv_df.select(select_VG_and_MED, axis=1).as_matrix().astype(np.float)
X_MMSE = ftd_csv_df.loc[:, 'MMSE'].as_matrix().astype(np.float)
X_sex_age = ftd_csv_df.loc[:, ['age_Dx_T0', 'Sex']].as_matrix()
X = np.concatenate((X_VG_MED, X_MMSE[:, np.newaxis], X_sex_age), axis=1)

In [58]:
X.shape


Out[58]:
(103, 17)

In [59]:
y = ftd_csv_df.diagnosis_label.astype(np.int).as_matrix()

Preprocessing done: let's do machine learning!


In [60]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [61]:
svm = SVC(kernel='linear')
rforest = RandomForestClassifier()
log_reg = LogisticRegression(penalty='l1')

In [62]:
auc_svm = cross_val_score(svm, X, y, scoring='roc_auc', cv=10, n_jobs=20, verbose=1)
auc_rforest = cross_val_score(rforest, X, y, scoring='roc_auc', cv=10, n_jobs=20, verbose=1)
auc_log_reg = cross_val_score(log_reg, X, y, scoring='roc_auc', cv=10, n_jobs=20, verbose=1)


[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    0.4s finished

In [65]:
auc_log_reg.mean()


Out[65]:
0.64930555555555558

In [66]:
auc_rforest.mean()


Out[66]:
0.64652777777777781

In [67]:
auc_svm.mean()


Out[67]:
0.48819444444444449

In [ ]: