In [1]:
# Imports for the whole notebook, gathered in the first cell so that a fresh
# kernel (Restart -> Run All) has every module the later cells refer to.
import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing

# Load the 'DATA' sheet of the workbook. header=1 takes the column labels from
# the second row of the sheet (0-indexed row 1) instead of the first.
df = pd.read_excel('BrainMets.xlsx', 'DATA', header=1)
In [2]:
# Keep the 'Prediction(Cleveland Clinic)' column available as its own Series.
pred = df.loc[:, 'Prediction(Cleveland Clinic)']
In [3]:
# Display the column labels of the loaded sheet.
df.columns
Out[3]:
In [3]:
In [4]:
# Boolean mask: True for every row whose 'Dead' value equals 1.
dead = df['Dead'].eq(1)
In [5]:
# Summary statistics of 'SurvivalMonths' for the rows where `dead` is False.
# A single .loc[row_mask, column] lookup replaces the old `.ix[...][...]` form:
# the .ix indexer is deprecated and no longer exists in current pandas, and one
# .loc call also avoids chained indexing.
df.loc[~dead, 'SurvivalMonths'].describe()
Out[5]:
In [6]:
# List the distinct raw values found in the 'cancer type' column.
df['cancer type'].unique()
Out[6]:
In [7]:
# Work on an independent (deep) copy so that `df` keeps the values as loaded.
df2 = df.copy(deep=True)
In [8]:
# Lower-case the 'cancer type' strings in the copy so that values differing
# only in letter case become the same value.
df2['cancer type'] = df2['cancer type'].str.lower()
In [9]:
# Display the lower-cased 'cancer type' column to check the result.
df2['cancer type']
Out[9]:
In [10]:
# Second independent (deep) copy; later cells overwrite its 'cancer type'
# column while `df2` keeps the lower-cased strings.
df3 = df2.copy(deep=True)
In [11]:
# Distinct lower-cased cancer type values, in order of first appearance.
cancer_types = pd.unique(df2['cancer type'])
In [12]:
# One integer slot per row of df2, all starting at 0; filled in by the loop
# in the following cell.
cancer_type_col = np.zeros(df2.shape[0], dtype=int)
In [13]:
# Replace each cancer type string by the position of that string in
# `cancer_types`, writing the integer codes into `cancer_type_col`.
# NOTE(review): a missing value never compares equal to anything, so such rows
# would keep the initial 0 and be merged with the first category - confirm the
# column has no missing entries.
for (i, cancer_type) in enumerate(cancer_types):
    # The loop body must be indented; without it the cell is an IndentationError.
    cancer_type_col[np.array(df2['cancer type'] == cancer_type)] = i
In [14]:
# Display the integer codes assigned to each row.
cancer_type_col
Out[14]:
In [15]:
# Swap the string column in df3 for the integer codes, row for row.
df3['cancer type'] = pd.Series(cancer_type_col, index=df3.index)
In [16]:
# Feature table: the eight columns used as model inputs. 'age' comes first;
# the train/test cell relies on that position when it skips one-hot encoding
# for column 0.
X = df3[[
    'age',
    'cancer type',
    '# of tumors',
    'ECOG',
    'Prior WBRT',
    'Brain Tumor Sx',
    'RPA',
    'Diagnosis of Primary at the same time as Brain tumor',
]]
In [17]:
# IPython-only syntax: the trailing `?` opens the help text for OneHotEncoder,
# which the next cell uses. It is not valid plain Python and yields no value.
sklearn.preprocessing.OneHotEncoder?
In [18]:
# Build the labelled data set, one-hot encode the categorical features and
# split the shuffled rows into a training part and a test part.

# --- Settings ---------------------------------------------------------------
months_to_live = 12   # horizon (in 'SurvivalMonths' units) defining the label
n_train = 250         # number of shuffled rows that go into the training set
random_seed = 0       # fixes the shuffle so the split is the same on every run

# --- Labels -----------------------------------------------------------------
# Compare 'Dead' with 1 (as the `dead` mask earlier in the notebook does)
# so the logic does not depend on the column being stored as bool or as 0/1.
is_dead = df['Dead'] == 1
survival = df['SurvivalMonths']

# Keep rows that are either marked dead or have at least `months_to_live`
# recorded; rows that are not dead and fall short of the horizon are dropped
# (presumably because their outcome at the horizon is unknown - confirm).
mask = is_dead | (survival >= months_to_live)

# Positive class: marked dead with fewer than `months_to_live` months recorded.
# Boolean .loc selection replaces the removed `.ix` indexer.
y = np.array((is_dead & (survival < months_to_live)).loc[mask])

# --- Features ---------------------------------------------------------------
Xa = np.array(X.loc[mask]).astype('int')
n_features = Xa.shape[1]

# One-hot encode every column except column 0 ('age' in X), which stays numeric.
binarize_mask = np.ones(n_features, dtype=bool)
binarize_mask[0] = False
# NOTE(review): `categorical_features` is only accepted by older scikit-learn
# releases; newer ones express this with a ColumnTransformer - confirm the
# installed version before upgrading.
binarizer = sklearn.preprocessing.OneHotEncoder(categorical_features = binarize_mask)
Xa = binarizer.fit_transform(Xa).todense()

# --- Shuffle and split ------------------------------------------------------
# A dedicated, seeded generator makes the split reproducible without touching
# NumPy's global random state.
idx = np.arange(len(y))
np.random.RandomState(random_seed).shuffle(idx)
yshuffle = y[idx]
Xshuffle = Xa[idx]

# Fail early instead of silently producing an empty test set.
assert n_train < len(y), 'n_train must be smaller than the number of usable rows'

Xtrain = Xshuffle[:n_train]
Ytrain = yshuffle[:n_train]
Xtest = Xshuffle[n_train:]
Ytest = yshuffle[n_train:]

# --- Sanity checks ----------------------------------------------------------
# Single-argument print(...) calls behave the same on Python 2 and Python 3 and
# match the print(...) style used by the model cells.
print(Xtest[[0,1,2], :])   # first three encoded test rows
print(Ytest[[0,1,2]])      # and their labels
print(np.mean(Ytrain))     # share of positive labels in the training set
print(np.mean(Ytest))      # share of positive labels in the test set
print(Xtrain.shape)
print(Xtest.shape)
In [19]:
# Logistic regression with default settings, trained on the training split.
lr = sklearn.linear_model.LogisticRegression().fit(Xtrain, Ytrain)

# Class predictions for the held-out rows.
Ypred = lr.predict(Xtest)

# Mean of the predictions, printed for comparison with the label means above.
print(Ypred.mean())

# Cell output: share of test rows where the prediction equals the label.
(Ypred == Ytest).mean()
Out[19]:
In [20]:
# Random forest classifier with 300 trees, trained on the training split.
# (A stray double quote in front of this statement made the cell a syntax
# error; it has been removed.)
rf = sklearn.ensemble.RandomForestClassifier(n_estimators = 300)
rf.fit(Xtrain, Ytrain)

# Class predictions for the held-out rows.
Ypred_rf = rf.predict(Xtest)

# Mean of the predictions, then (as the cell output) the share of test rows
# where the prediction equals the label.
print(np.mean(Ypred_rf))
np.mean(Ypred_rf == Ytest)
In [21]:
# Extra-trees classifier with 300 trees, trained on the training split.
et = sklearn.ensemble.ExtraTreesClassifier(n_estimators = 300).fit(Xtrain, Ytrain)

# Class predictions for the held-out rows.
Ypred_et = et.predict(Xtest)

# Mean of the predictions, printed for comparison with the other models.
print(Ypred_et.mean())

# Cell output: share of test rows where the prediction equals the label.
(Ypred_et == Ytest).mean()
Out[21]:
In [22]:
# Switch to a regression forest with 200 trees. This rebinds `rf`, which until
# now referred to the RandomForestClassifier fitted earlier in the notebook.
rf = sklearn.ensemble.RandomForestRegressor(n_estimators=200)
In [23]:
# Fit the regressor on the same training split used for the classifiers.
# NOTE(review): Ytrain holds True/False labels, so the regressor presumably
# treats them as 1/0 targets and its outputs act as scores - confirm.
rf.fit(Xtrain, Ytrain)
Out[23]:
In [24]:
# Regressor outputs for the test rows. This overwrites the class predictions
# that the earlier classifier cell stored under the same name `Ypred_rf`.
Ypred_rf = rf.predict(Xtest)
In [25]:
# Side-by-side table of the regressor output, the logistic-regression class
# prediction and the true label for every test row.
pd.DataFrame({
    'pred_rf': Ypred_rf,
    'pred': Ypred,
    'actual': Ytest,
})
Out[25]:
In [26]:
# Share of test rows where the logistic-regression prediction is further from
# the label than the regressor output is.
# Ytest is a boolean array, and Ypred takes its values from the boolean
# training labels. NumPy 1.13 and later refuse to subtract one boolean array
# from another, so both are cast to float (True -> 1.0, False -> 0.0) before
# the difference is taken; the resulting distances are unchanged.
np.mean(
    np.abs(Ytest.astype(float) - Ypred.astype(float))
    > np.abs(Ytest.astype(float) - Ypred_rf)
)
Out[26]:
In [27]:
# Mean absolute difference between the test labels and the regressor output.
np.abs(Ytest - Ypred_rf).mean()
Out[27]:
In [28]:
# Count the rows that are not marked dead yet have fewer than 4 months
# recorded in 'SurvivalMonths'.
# The negation is written as an explicit comparison with 1, mirroring how the
# `dead` mask is built: applying `~` to a 0/1 integer column is a bitwise NOT
# (yielding -1/-2) rather than a logical negation, whereas `!= 1` gives a
# proper boolean mask for both integer and boolean columns.
((df['Dead'] != 1) & (df['SurvivalMonths'] < 4)).sum()
Out[28]:
In [28]:
In [28]:
In [28]:
In [28]:
In [28]:
In [28]:
In [28]:
In [ ]: