In [1]:
df = pd.read_excel('BrainMets.xlsx', 'DATA', header=1)

In [2]:
pred = df['Prediction(Cleveland Clinic)']

In [3]:
df.columns


Out[3]:
Index([u'age', u'cancer type', u'# of tumors', u'Extracranial Disease Status', u'K Score', u'ECOG', u'Prior WBRT', u'Brain Tumor Sx', u'RPA', u'Diagnosis of Primary at the same time as Brain tumor', u'Prediction(Cleveland Clinic)', u' Prediction (Lanie Francis)', u'Prediction(Flickinger)', u'Prediction(Loefler', u'Prediction(Knisely)', u'Prediction(Lunsford)', u'Prediction (Tahrini)', u'Prediction (Sheehan)', u'Prediction (Linskey)', u'Prediction(friedman)', u'Prediction(Stupp)', u'Prediction(Rakfal)', u'Prediction(Rush)', u' Prediction( Kondziolka)', u'Dead', u'Date Endpoint', u'GK 1', u'SurvivalMonths', u'Study'], dtype='object')

In [3]:


In [4]:
dead = df['Dead'] == 1

In [5]:
df.ix[~dead]['SurvivalMonths'].describe()


Out[5]:
count    136.000000
mean      14.908028
std       13.038385
min        0.000000
25%        8.071233
50%       14.268493
75%       20.066096
max      132.032877
Name: SurvivalMonths, dtype: float64

In [6]:
df['cancer type'].unique()


Out[6]:
array([u'Breast', u'NSCLC', u'SCLC', u'RCC', u'Melanoma', u'melanoma',
       u'Carcinoid', u'Breast ', u'Endometrial', u'Sarcoma', u'Colon',
       u'Rectal', u'breast', u'Prostate', u'Uterine', u'Nasopharyngeal'], dtype=object)

In [7]:
df2 = df.copy()

In [8]:
df2['cancer type']  = df2['cancer type'].str.lower()

In [9]:
df2['cancer type']


Out[9]:
0     breast
1      nsclc
2      nsclc
3      nsclc
4       sclc
5      nsclc
6      nsclc
7      nsclc
8      nsclc
9      nsclc
10     nsclc
11     nsclc
12    breast
13    breast
14    breast
...
341              sclc
342             nsclc
343            breast
344             nsclc
345             nsclc
346             nsclc
347              sclc
348             nsclc
349           uterine
350    nasopharyngeal
351             nsclc
352             nsclc
353             nsclc
354             colon
355            breast
Name: cancer type, Length: 356, dtype: object

In [10]:
df3 = df2.copy()

In [11]:
cancer_types = df2['cancer type'].unique()

In [12]:
cancer_type_col = np.zeros(len(df2['cancer type']), dtype=int)

In [13]:
for (i, cancer_type) in enumerate(cancer_types):
    cancer_type_col[np.array(df2['cancer type'] == cancer_type)] = i

In [14]:
cancer_type_col


Out[14]:
array([ 0,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,
        1,  0,  1,  0,  0,  0,  1,  0,  0,  0,  1,  1,  0,  0,  0,  1,  0,
        0,  0,  1,  1,  3,  1,  1,  1,  1,  2,  0,  0,  1,  1,  0,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  4,  4,  4,  4,  1,  1,  0,  4,  1,
        1,  1,  4,  1,  1,  0,  2,  1,  0,  1,  1,  3,  3,  1,  1,  1,  0,
        4,  1,  1,  1,  4,  1,  1,  1,  3,  4,  1,  4,  1,  1,  1,  1,  1,
        1,  4,  2,  1,  4,  1,  0,  4,  4,  1,  1,  1,  1,  4,  4,  0,  0,
        4,  2,  0,  3,  1,  0,  2,  4,  1,  1,  0,  1,  3,  1,  2,  2,  2,
        4,  4,  1,  3,  1,  5,  1,  0,  1,  1,  4,  1,  1,  4,  1,  1,  4,
        2,  0,  1,  4,  1,  0,  0,  1,  1,  6,  4,  1,  1,  1,  1,  1,  1,
        1,  1,  4,  4,  4,  0,  0,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,
        1,  3,  3,  1,  4,  1,  4,  1,  1,  1,  1,  7,  0,  2,  3,  1,  1,
        0,  4,  0,  1,  1,  2,  1,  4,  1,  1,  1,  3,  0,  1,  1,  0,  1,
        2,  1,  1,  2,  0,  4,  1,  0,  0,  1,  1,  1,  1,  1,  1,  1,  0,
        1,  2,  1,  1,  1,  4,  8,  0,  0,  1,  1,  1,  4,  1,  0,  2,  1,
        1,  4,  4,  1,  1,  1,  1,  1,  1,  1,  3,  1,  1,  4,  4,  4,  1,
        4,  1,  2,  4,  1,  1,  1,  9,  0,  1,  1,  1,  0,  2, 10,  4,  0,
        0,  6,  1,  0,  1,  1,  2,  1,  7,  2,  1,  1,  1,  0,  3,  1,  3,
        0,  9,  1,  4,  0,  1,  0,  1,  0,  0,  3,  0,  1,  4,  1,  1,  3,
        2,  1,  1,  1,  0,  1,  1,  2,  1,  2,  1,  4,  1,  1, 11,  0,  1,
        0,  2,  1,  0,  1,  1,  1,  2,  1, 12, 13,  1,  1,  1,  9,  0])

In [15]:
df3['cancer type'] = cancer_type_col

In [16]:
X = df3[['age', 'cancer type', '# of tumors',  'ECOG', 'Prior WBRT', 'Brain Tumor Sx', 'RPA', 'Diagnosis of Primary at the same time as Brain tumor']]

In [17]:
sklearn.preprocessing.OneHotEncoder?

In [18]:
months_to_live = 12
n_train = 250
mask = (df['Dead'] | (df['SurvivalMonths'] >= months_to_live))
y = np.array((df['Dead'] & (df['SurvivalMonths'] < months_to_live)).ix[mask] == 1)
Xa = np.array(X.ix[mask]).astype('int')
n_features = Xa.shape[1]
binarize_mask = np.ones(n_features, dtype=bool)
binarize_mask[0] = False
binarizer = sklearn.preprocessing.OneHotEncoder(categorical_features = binarize_mask)
Xa = binarizer.fit_transform(Xa).todense()
idx = np.arange(len(y))
np.random.shuffle(idx)
yshuffle = y[idx]
Xshuffle = Xa[idx]
Xtrain = Xshuffle[:n_train]
Ytrain = yshuffle[:n_train]
Xtest = Xshuffle[n_train:]
Ytest = yshuffle[n_train:]
print Xtest[[0,1,2], :]
print Ytest[[0,1,2]]
print np.mean(Ytrain)
print np.mean(Ytest)
print Xtrain.shape
print Xtest.shape


[[  0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   1.   0.   0.   0.   1.   1.   0.   0.   0.
    1.   0.   0.   0.   1.  45.]
 [  1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   1.   0.   0.   0.   1.   0.   0.   1.   0.
    0.   1.   0.   1.   0.  61.]
 [  1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   1.   0.   0.   1.   0.   0.   1.   0.
    0.   1.   0.   1.   0.  72.]]
[False False  True]
0.568
0.537037037037
(250, 48)
(54, 48)

In [19]:
lr = sklearn.linear_model.LogisticRegression()
lr.fit(Xtrain, Ytrain)
Ypred = lr.predict(Xtest)
print(np.mean(Ypred))
np.mean(Ypred == Ytest)


0.722222222222
Out[19]:
0.55555555555555558

In [20]:
"rf = sklearn.ensemble.RandomForestClassifier(n_estimators = 300)
rf.fit(Xtrain, Ytrain)
Ypred_rf = rf.predict(Xtest)
print(np.mean(Ypred_rf))
np.mean(Ypred_rf == Ytest)


  File "<ipython-input-20-d07f529efe31>", line 1
    "rf = sklearn.ensemble.RandomForestClassifier(n_estimators = 300)
                                                                    ^
SyntaxError: EOL while scanning string literal

In [21]:
et = sklearn.ensemble.ExtraTreesClassifier(n_estimators = 300)
et.fit(Xtrain, Ytrain)
Ypred_et = et.predict(Xtest)
print(np.mean(Ypred_et))
np.mean(Ypred_et == Ytest)


0.425925925926
Out[21]:
0.55555555555555558

In [22]:
rf = sklearn.ensemble.RandomForestRegressor(n_estimators=200)

In [23]:
rf.fit(Xtrain, Ytrain)


Out[23]:
RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           min_density=None, min_samples_leaf=1, min_samples_split=2,
           n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
           verbose=0)

In [24]:
Ypred_rf = rf.predict(Xtest)

In [25]:
pd.DataFrame({'pred_rf': Ypred_rf, 'pred': Ypred,  'actual':Ytest})


Out[25]:
actual pred pred_rf
0 False True 0.683917
1 False False 0.180000
2 True False 0.255000
3 False True 0.895000
4 True False 0.325000
5 False True 0.590000
6 True True 0.235000
7 False True 0.702083
8 True True 0.435000
9 False True 0.695000
10 True False 0.255000
11 False False 0.070000
12 True True 0.743750
13 False False 0.412583
14 True False 0.270000
15 True False 0.725000
16 True True 0.785000
17 False True 0.727500
18 True True 0.731310
19 True True 0.747500
20 True True 0.845000
21 True False 0.255000
22 True False 0.190000
23 False True 0.440000
24 True True 0.370000
25 True True 0.515000
26 True True 0.570000
27 True True 0.942500
28 False True 0.677500
29 True True 0.560000
30 False True 0.780000
31 False False 0.120000
32 True True 0.415000
33 False True 0.535000
34 False True 0.590000
35 True True 0.580000
36 False False 0.320000
37 False False 0.845000
38 False True 0.493333
39 False False 0.405000
40 False True 0.660000
41 True True 0.560000
42 True True 0.445000
43 True True 0.772500
44 False True 0.130000
45 True True 0.375000
46 False True 0.265333
47 False True 0.730000
48 True True 0.525000
49 False False 0.605000
50 True True 0.845000
51 True True 0.497500
52 True True 0.800000
53 False True 0.507583

54 rows × 3 columns


In [26]:
np.mean(np.abs(Ytest - Ypred) > np.abs(Ytest - Ypred_rf))


Out[26]:
0.44444444444444442

In [27]:
np.mean(np.abs(Ytest - Ypred_rf))


Out[27]:
0.49055136684303363

In [28]:
((~df['Dead']) & (df['SurvivalMonths']  < 4)).sum()


Out[28]:
18

In [28]:


In [28]:


In [28]:


In [28]:


In [28]:


In [28]:


In [28]:


In [ ]: