notebook.community

Edit and run



In [1]:

    
df = pd.read_excel('BrainMets.xlsx', 'DATA', header=1)



In [2]:

    
pred = df['Prediction(Cleveland Clinic)']



In [3]:

    
df.columns









    Out[3]:





Index([u'age', u'cancer type', u'# of tumors', u'Extracranial Disease Status', u'K Score', u'ECOG', u'Prior WBRT', u'Brain Tumor Sx', u'RPA', u'Diagnosis of Primary at the same time as Brain tumor', u'Prediction(Cleveland Clinic)', u' Prediction (Lanie Francis)', u'Prediction(Flickinger)', u'Prediction(Loefler', u'Prediction(Knisely)', u'Prediction(Lunsford)', u'Prediction (Tahrini)', u'Prediction (Sheehan)', u'Prediction (Linskey)', u'Prediction(friedman)', u'Prediction(Stupp)', u'Prediction(Rakfal)', u'Prediction(Rush)', u' Prediction( Kondziolka)', u'Dead', u'Date Endpoint', u'GK 1', u'SurvivalMonths', u'Study'], dtype='object')



In [3]:



In [4]:

    
dead = df['Dead'] == 1



In [5]:

    
df.ix[~dead]['SurvivalMonths'].describe()









    Out[5]:





count    136.000000
mean      14.908028
std       13.038385
min        0.000000
25%        8.071233
50%       14.268493
75%       20.066096
max      132.032877
Name: SurvivalMonths, dtype: float64



In [6]:

    
df['cancer type'].unique()









    Out[6]:





array([u'Breast', u'NSCLC', u'SCLC', u'RCC', u'Melanoma', u'melanoma',
       u'Carcinoid', u'Breast ', u'Endometrial', u'Sarcoma', u'Colon',
       u'Rectal', u'breast', u'Prostate', u'Uterine', u'Nasopharyngeal'], dtype=object)



In [7]:

    
df2 = df.copy()



In [8]:

    
df2['cancer type']  = df2['cancer type'].str.lower()



In [9]:

    
df2['cancer type']









    Out[9]:





0     breast
1      nsclc
2      nsclc
3      nsclc
4       sclc
5      nsclc
6      nsclc
7      nsclc
8      nsclc
9      nsclc
10     nsclc
11     nsclc
12    breast
13    breast
14    breast
...
341              sclc
342             nsclc
343            breast
344             nsclc
345             nsclc
346             nsclc
347              sclc
348             nsclc
349           uterine
350    nasopharyngeal
351             nsclc
352             nsclc
353             nsclc
354             colon
355            breast
Name: cancer type, Length: 356, dtype: object



In [10]:

    
df3 = df2.copy()



In [11]:

    
cancer_types = df2['cancer type'].unique()



In [12]:

    
cancer_type_col = np.zeros(len(df2['cancer type']), dtype=int)



In [13]:

    
for (i, cancer_type) in enumerate(cancer_types):
    cancer_type_col[np.array(df2['cancer type'] == cancer_type)] = i



In [14]:

    
cancer_type_col









    Out[14]:





array([ 0,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,
        1,  0,  1,  0,  0,  0,  1,  0,  0,  0,  1,  1,  0,  0,  0,  1,  0,
        0,  0,  1,  1,  3,  1,  1,  1,  1,  2,  0,  0,  1,  1,  0,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  4,  4,  4,  4,  1,  1,  0,  4,  1,
        1,  1,  4,  1,  1,  0,  2,  1,  0,  1,  1,  3,  3,  1,  1,  1,  0,
        4,  1,  1,  1,  4,  1,  1,  1,  3,  4,  1,  4,  1,  1,  1,  1,  1,
        1,  4,  2,  1,  4,  1,  0,  4,  4,  1,  1,  1,  1,  4,  4,  0,  0,
        4,  2,  0,  3,  1,  0,  2,  4,  1,  1,  0,  1,  3,  1,  2,  2,  2,
        4,  4,  1,  3,  1,  5,  1,  0,  1,  1,  4,  1,  1,  4,  1,  1,  4,
        2,  0,  1,  4,  1,  0,  0,  1,  1,  6,  4,  1,  1,  1,  1,  1,  1,
        1,  1,  4,  4,  4,  0,  0,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,
        1,  3,  3,  1,  4,  1,  4,  1,  1,  1,  1,  7,  0,  2,  3,  1,  1,
        0,  4,  0,  1,  1,  2,  1,  4,  1,  1,  1,  3,  0,  1,  1,  0,  1,
        2,  1,  1,  2,  0,  4,  1,  0,  0,  1,  1,  1,  1,  1,  1,  1,  0,
        1,  2,  1,  1,  1,  4,  8,  0,  0,  1,  1,  1,  4,  1,  0,  2,  1,
        1,  4,  4,  1,  1,  1,  1,  1,  1,  1,  3,  1,  1,  4,  4,  4,  1,
        4,  1,  2,  4,  1,  1,  1,  9,  0,  1,  1,  1,  0,  2, 10,  4,  0,
        0,  6,  1,  0,  1,  1,  2,  1,  7,  2,  1,  1,  1,  0,  3,  1,  3,
        0,  9,  1,  4,  0,  1,  0,  1,  0,  0,  3,  0,  1,  4,  1,  1,  3,
        2,  1,  1,  1,  0,  1,  1,  2,  1,  2,  1,  4,  1,  1, 11,  0,  1,
        0,  2,  1,  0,  1,  1,  1,  2,  1, 12, 13,  1,  1,  1,  9,  0])



In [15]:

    
df3['cancer type'] = cancer_type_col



In [16]:

    
X = df3[['age', 'cancer type', '# of tumors',  'ECOG', 'Prior WBRT', 'Brain Tumor Sx', 'RPA', 'Diagnosis of Primary at the same time as Brain tumor']]



In [17]:

    
sklearn.preprocessing.OneHotEncoder?



In [18]:

    
months_to_live = 12
n_train = 250
mask = (df['Dead'] | (df['SurvivalMonths'] >= months_to_live))
y = np.array((df['Dead'] & (df['SurvivalMonths'] < months_to_live)).ix[mask] == 1)
Xa = np.array(X.ix[mask]).astype('int')
n_features = Xa.shape[1]
binarize_mask = np.ones(n_features, dtype=bool)
binarize_mask[0] = False
binarizer = sklearn.preprocessing.OneHotEncoder(categorical_features = binarize_mask)
Xa = binarizer.fit_transform(Xa).todense()
idx = np.arange(len(y))
np.random.shuffle(idx)
yshuffle = y[idx]
Xshuffle = Xa[idx]
Xtrain = Xshuffle[:n_train]
Ytrain = yshuffle[:n_train]
Xtest = Xshuffle[n_train:]
Ytest = yshuffle[n_train:]
print Xtest[[0,1,2], :]
print Ytest[[0,1,2]]
print np.mean(Ytrain)
print np.mean(Ytest)
print Xtrain.shape
print Xtest.shape









    



[[  0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   1.   0.   0.   0.   1.   1.   0.   0.   0.
    1.   0.   0.   0.   1.  45.]
 [  1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   1.   0.   0.   0.   1.   0.   0.   1.   0.
    0.   1.   0.   1.   0.  61.]
 [  1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   1.   0.   0.   1.   0.   0.   1.   0.
    0.   1.   0.   1.   0.  72.]]
[False False  True]
0.568
0.537037037037
(250, 48)
(54, 48)



In [19]:

    
lr = sklearn.linear_model.LogisticRegression()
lr.fit(Xtrain, Ytrain)
Ypred = lr.predict(Xtest)
print(np.mean(Ypred))
np.mean(Ypred == Ytest)









    



0.722222222222






    Out[19]:





0.55555555555555558



In [20]:

    
"rf = sklearn.ensemble.RandomForestClassifier(n_estimators = 300)
rf.fit(Xtrain, Ytrain)
Ypred_rf = rf.predict(Xtest)
print(np.mean(Ypred_rf))
np.mean(Ypred_rf == Ytest)









    



  File "<ipython-input-20-d07f529efe31>", line 1
    "rf = sklearn.ensemble.RandomForestClassifier(n_estimators = 300)
                                                                    ^
SyntaxError: EOL while scanning string literal



In [21]:

    
et = sklearn.ensemble.ExtraTreesClassifier(n_estimators = 300)
et.fit(Xtrain, Ytrain)
Ypred_et = et.predict(Xtest)
print(np.mean(Ypred_et))
np.mean(Ypred_et == Ytest)









    



0.425925925926






    Out[21]:





0.55555555555555558



In [22]:

    
rf = sklearn.ensemble.RandomForestRegressor(n_estimators=200)



In [23]:

    
rf.fit(Xtrain, Ytrain)









    Out[23]:





RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           min_density=None, min_samples_leaf=1, min_samples_split=2,
           n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
           verbose=0)



In [24]:

    
Ypred_rf = rf.predict(Xtest)



In [25]:

    
pd.DataFrame({'pred_rf': Ypred_rf, 'pred': Ypred,  'actual':Ytest})









    Out[25]:






  
    
      
      actual
      pred
      pred_rf
    
  
  
    
      0 
       False
        True
       0.683917
    
    
      1 
       False
       False
       0.180000
    
    
      2 
        True
       False
       0.255000
    
    
      3 
       False
        True
       0.895000
    
    
      4 
        True
       False
       0.325000
    
    
      5 
       False
        True
       0.590000
    
    
      6 
        True
        True
       0.235000
    
    
      7 
       False
        True
       0.702083
    
    
      8 
        True
        True
       0.435000
    
    
      9 
       False
        True
       0.695000
    
    
      10
        True
       False
       0.255000
    
    
      11
       False
       False
       0.070000
    
    
      12
        True
        True
       0.743750
    
    
      13
       False
       False
       0.412583
    
    
      14
        True
       False
       0.270000
    
    
      15
        True
       False
       0.725000
    
    
      16
        True
        True
       0.785000
    
    
      17
       False
        True
       0.727500
    
    
      18
        True
        True
       0.731310
    
    
      19
        True
        True
       0.747500
    
    
      20
        True
        True
       0.845000
    
    
      21
        True
       False
       0.255000
    
    
      22
        True
       False
       0.190000
    
    
      23
       False
        True
       0.440000
    
    
      24
        True
        True
       0.370000
    
    
      25
        True
        True
       0.515000
    
    
      26
        True
        True
       0.570000
    
    
      27
        True
        True
       0.942500
    
    
      28
       False
        True
       0.677500
    
    
      29
        True
        True
       0.560000
    
    
      30
       False
        True
       0.780000
    
    
      31
       False
       False
       0.120000
    
    
      32
        True
        True
       0.415000
    
    
      33
       False
        True
       0.535000
    
    
      34
       False
        True
       0.590000
    
    
      35
        True
        True
       0.580000
    
    
      36
       False
       False
       0.320000
    
    
      37
       False
       False
       0.845000
    
    
      38
       False
        True
       0.493333
    
    
      39
       False
       False
       0.405000
    
    
      40
       False
        True
       0.660000
    
    
      41
        True
        True
       0.560000
    
    
      42
        True
        True
       0.445000
    
    
      43
        True
        True
       0.772500
    
    
      44
       False
        True
       0.130000
    
    
      45
        True
        True
       0.375000
    
    
      46
       False
        True
       0.265333
    
    
      47
       False
        True
       0.730000
    
    
      48
        True
        True
       0.525000
    
    
      49
       False
       False
       0.605000
    
    
      50
        True
        True
       0.845000
    
    
      51
        True
        True
       0.497500
    
    
      52
        True
        True
       0.800000
    
    
      53
       False
        True
       0.507583
    
  

54 rows × 3 columns



In [26]:

    
np.mean(np.abs(Ytest - Ypred) > np.abs(Ytest - Ypred_rf))









    Out[26]:





0.44444444444444442



In [27]:

    
np.mean(np.abs(Ytest - Ypred_rf))









    Out[27]:





0.49055136684303363



In [28]:

    
((~df['Dead']) & (df['SurvivalMonths']  < 4)).sum()









    Out[28]:





18



In [28]:



In [28]:



In [28]:



In [28]:



In [28]:



In [28]:



In [28]:



In [ ]:

	actual	pred	pred_rf
0	False	True	0.683917
1	False	False	0.180000
2	True	False	0.255000
3	False	True	0.895000
4	True	False	0.325000
5	False	True	0.590000
6	True	True	0.235000
7	False	True	0.702083
8	True	True	0.435000
9	False	True	0.695000
10	True	False	0.255000
11	False	False	0.070000
12	True	True	0.743750
13	False	False	0.412583
14	True	False	0.270000
15	True	False	0.725000
16	True	True	0.785000
17	False	True	0.727500
18	True	True	0.731310
19	True	True	0.747500
20	True	True	0.845000
21	True	False	0.255000
22	True	False	0.190000
23	False	True	0.440000
24	True	True	0.370000
25	True	True	0.515000
26	True	True	0.570000
27	True	True	0.942500
28	False	True	0.677500
29	True	True	0.560000
30	False	True	0.780000
31	False	False	0.120000
32	True	True	0.415000
33	False	True	0.535000
34	False	True	0.590000
35	True	True	0.580000
36	False	False	0.320000
37	False	False	0.845000
38	False	True	0.493333
39	False	False	0.405000
40	False	True	0.660000
41	True	True	0.560000
42	True	True	0.445000
43	True	True	0.772500
44	False	True	0.130000
45	True	True	0.375000
46	False	True	0.265333
47	False	True	0.730000
48	True	True	0.525000
49	False	False	0.605000
50	True	True	0.845000
51	True	True	0.497500
52	True	True	0.800000
53	False	True	0.507583