notebook.community

Edit and run



In [21]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd



In [29]:

    
ftd_csv_file = '/data/shared/bvFTD/AMC_VUMC_bvFTD.csv'



In [30]:

    
def load_data():
    # seperator is different from standard because of SPSS 
    return pd.read_csv(ftd_csv_file, sep=';')



In [37]:

    
ftd_csv_df = load_data()



In [38]:

    
ftd_csv_df.head()









    Out[38]:






  
    
      
      Status_scan
      ID_D
      Geboortedatum
      Sex
      age_Dx_T0
      OnsetDis
      Date_DX_T0
      Education_Verhage
      DxT2_filtered
      DxT2
      ...
      MED_Anticholinerg
      MED_Antipsychotica
      MED_Antidepr
      MED_Antiepileptica
      MED_Parkinsonisme
      MMSE
      FAB
      FBI_tot_baseline
      SRI_tot_baseline
      diagnosis_cat
    
  
  
    
      0
      
      4800
      5/13/1955
      m
      56,3148528405202
      2009
      9/5/2011
      4
      1
      Subjectieve klachten
      ...
      nee
      nee
      nee
      nee
      nee
      28
      18
      36
      0
      
    
    
      1
      
      5039
      7/24/1948
      m
      63,0691307323751
      2009
      8/19/2011
      5
      1
      Subjectieve klachten
      ...
      nee
      nee
      nee
      nee
      nee
      25
      18
      21
      10
      
    
    
      2
      
      5160
      3/28/1939
      m
      72,3449691991786
      2010
      8/1/2011
      6
      1
      Subjectieve klachten
      ...
      nee
      nee
      nee
      nee
      nee
      30
      12
      11
      3
      
    
    
      3
      
      6546
      6/28/1965
      m
      47,7316906228611
      2010
      3/22/2013
      6
      1
      Subjectieve klachten
      ...
      nee
      nee
      nee
      nee
      nee
      29
      16
      17
      0
      
    
    
      4
      
      6801
      10/5/1953
      m
      59,9096509240246
      2012
      9/2/2013
      7
      1
      Subjectieve klachten
      ...
      nee
      nee
      ja
      nee
      nee
      28
      18
      12
      2
      
    
  

5 rows × 31 columns



In [39]:

    
ftd_csv_df.select(lambda x: x.startswith('VG'), axis=1).head()









    Out[39]:






  
    
      
      VG_HVZ
      VG_MCI
      VG_PSY
      VG_ALS
      VG_PARK
      VG_AUTISME
      VG_DYSLE
      VG_THYR
      VG_TEKST
    
  
  
    
      0
      ja
      nee
      nee
      nee
      nee
      nee
      nee
      nee
      contusio cerebri 1973 doof li '97 myocard infarct
    
    
      1
      nee
      nee
      ja
      nee
      nee
      nee
      nee
      nee
      depressie in 2003
    
    
      2
      ja
      nee
      nee
      nee
      nee
      nee
      nee
      nee
      
    
    
      3
      nee
      nee
      nee
      nee
      nee
      nee
      nee
      nee
      2006 analyse onverklaarde hoofdpijn
    
    
      4
      nee
      nee
      nee
      nee
      nee
      nee
      nee
      nee



In [41]:

    
ftd_csv_df.loc[:, ['MMSE', 'FAB']].head()



In [ ]:

FTD is 1, everything else is 0



In [42]:

    
ftd_csv_df['diagnosis_label'] = np.zeros(ftd_csv_df.shape[0])
ftd_csv_df.loc[ftd_csv_df.DxT2 == 'ProbableFTD', 'diagnosis_label'] = 1
ftd_csv_df.loc[ftd_csv_df.DxT2 == 'Def. FTD', 'diagnosis_label'] = 1



In [44]:

    
ftd_csv_df.diagnosis_label.sum()









    Out[44]:





23.0



In [48]:

    
# we have to exclude the free text column 
select_VG_and_MED = lambda x: (x.startswith('VG') or x.startswith('MED')) and not x.startswith('VG_TEKST')



In [49]:

    
VG_MED_col_array = ftd_csv_df.select(select_VG_and_MED, axis=1).as_matrix()
id_non_standard = (VG_MED_col_array != 'ja') & (VG_MED_col_array != 'nee')
outliers = np.unique(VG_MED_col_array[id_non_standard])



In [50]:

    
empty_string_df = ftd_csv_df.isin(outliers)
id_subj_remove = empty_string_df.select(select_VG_and_MED, axis=1).any(axis=1)



In [51]:

    
id_subj_remove









    Out[51]:





0      False
1      False
2      False
3      False
4      False
5      False
6       True
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14      True
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26      True
27     False
28      True
29     False
       ...  
90     False
91     False
92     False
93     False
94     False
95     False
96     False
97      True
98     False
99     False
100    False
101    False
102    False
103     True
104     True
105    False
106    False
107     True
108    False
109    False
110    False
111    False
112    False
113    False
114    False
115     True
116    False
117    False
118    False
119    False
dtype: bool



In [52]:

    
ftd_csv_df = ftd_csv_df.loc[~id_subj_remove]



In [53]:

    
ftd_csv_df = ftd_csv_df.mask(ftd_csv_df.isin(['ja']), 1.)
ftd_csv_df = ftd_csv_df.mask(ftd_csv_df.isin(['nee']), 0.)



In [54]:

    
ftd_csv_df.Sex.loc[ftd_csv_df.Sex == 'm'] = 0.
ftd_csv_df.Sex.loc[ftd_csv_df.Sex == 'f'] = 1.
ftd_csv_df.Sex = ftd_csv_df.Sex.astype(np.float)









    



/data/wbbruin/anaconda2/lib/python2.7/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [55]:

    
# the string float values were with a ',' and np.float don't work on that
ftd_csv_df.age_Dx_T0 = ftd_csv_df.age_Dx_T0.str.replace(',', '.').astype(np.float)



In [56]:

    
ftd_csv_df.columns









    Out[56]:





Index([u'Status_scan', u'ID_D', u'Geboortedatum', u'Sex', u'age_Dx_T0',
       u'OnsetDis', u'Date_DX_T0', u'Education_Verhage', u'DxT2_filtered',
       u'DxT2', u'Vrij_tekst_Dx', u'VG_HVZ', u'VG_MCI', u'VG_PSY', u'VG_ALS',
       u'VG_PARK', u'VG_AUTISME', u'VG_DYSLE', u'VG_THYR', u'VG_TEKST',
       u'MED_Seditiva', u'MED_Anticholinerg', u'MED_Antipsychotica',
       u'MED_Antidepr', u'MED_Antiepileptica', u'MED_Parkinsonisme', u'MMSE',
       u'FAB', u'FBI_tot_baseline', u'SRI_tot_baseline', u'diagnosis_cat',
       u'diagnosis_label'],
      dtype='object')



In [57]:

    
# VG and MED features
X_VG_MED = ftd_csv_df.select(select_VG_and_MED, axis=1).as_matrix().astype(np.float)
X_MMSE = ftd_csv_df.loc[:, 'MMSE'].as_matrix().astype(np.float)
X_sex_age = ftd_csv_df.loc[:, ['age_Dx_T0', 'Sex']].as_matrix()
X = np.concatenate((X_VG_MED, X_MMSE[:, np.newaxis], X_sex_age), axis=1)



In [58]:

    
X.shape









    Out[58]:





(103, 17)



In [59]:

    
y = ftd_csv_df.diagnosis_label.astype(np.int).as_matrix()

Preprocessing done: let's do machine learning!



In [60]:

    
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



In [61]:

    
svm = SVC(kernel='linear')
rforest = RandomForestClassifier()
log_reg = LogisticRegression(penalty='l1')



In [62]:

    
auc_svm = cross_val_score(svm, X, y, scoring='roc_auc', cv=10, n_jobs=20, verbose=1)
auc_rforest = cross_val_score(rforest, X, y, scoring='roc_auc', cv=10, n_jobs=20, verbose=1)
auc_log_reg = cross_val_score(log_reg, X, y, scoring='roc_auc', cv=10, n_jobs=20, verbose=1)









    



[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    0.4s finished



In [65]:

    
auc_log_reg.mean()









    Out[65]:





0.64930555555555558



In [66]:

    
auc_rforest.mean()









    Out[66]:





0.64652777777777781



In [67]:

    
auc_svm.mean()









    Out[67]:





0.48819444444444449



In [ ]:

	ID_D	Geboortedatum	Sex	age_Dx_T0	OnsetDis	Date_DX_T0	Education_Verhage	DxT2_filtered	DxT2	...	MED_Anticholinerg	MED_Antipsychotica	MED_Antidepr	MED_Antiepileptica	MED_Parkinsonisme	MMSE	FAB	FBI_tot_baseline	SRI_tot_baseline
0	4800	5/13/1955	m	56,3148528405202	2009	9/5/2011	4	1	Subjectieve klachten	...	nee	nee	nee	nee	nee	28	18	36	0
1	5039	7/24/1948	m	63,0691307323751	2009	8/19/2011	5	1	Subjectieve klachten	...	nee	nee	nee	nee	nee	25	18	21	10
2	5160	3/28/1939	m	72,3449691991786	2010	8/1/2011	6	1	Subjectieve klachten	...	nee	nee	nee	nee	nee	30	12	11	3
3	6546	6/28/1965	m	47,7316906228611	2010	3/22/2013	6	1	Subjectieve klachten	...	nee	nee	nee	nee	nee	29	16	17	0
4	6801	10/5/1953	m	59,9096509240246	2012	9/2/2013	7	1	Subjectieve klachten	...	nee	nee	ja	nee	nee	28	18	12	2

	VG_HVZ	VG_MCI	VG_PSY	VG_ALS	VG_PARK	VG_AUTISME	VG_DYSLE	VG_THYR	VG_TEKST
0	ja	nee	nee	nee	nee	nee	nee	nee	contusio cerebri 1973 doof li '97 myocard infarct
1	nee	nee	ja	nee	nee	nee	nee	nee	depressie in 2003
2	ja	nee	nee	nee	nee	nee	nee	nee
3	nee	nee	nee	nee	nee	nee	nee	nee	2006 analyse onverklaarde hoofdpijn
4	nee	nee	nee	nee	nee	nee	nee	nee