In [7]:
import numpy as np
import pandas as pd
In [91]:
cns_all = pd.read_csv("CNS_all.csv")
In [92]:
cns_all["Age"] = cns_all["Age"].apply(lambda x : 1 if x > 60 else 0)
In [93]:
cns_all["Race"] = cns_all["Race"].apply(lambda x : "White" if x == "White" else "Other")
In [94]:
def stage_mask(stage):
    if stage == "I" or stage == "II":
        return "I/II"
    elif stage == "III" or stage == "IV":
        return "III/IV"
    else:
        return "Missing"

def LDH_mask(ldh):
    if ldh == "NO":
        return 0
    elif ldh == "YES":
        return 1
    else:
        return None
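As an aside (not run as part of the pipeline), the same recodes can be written without helper functions using dict-based Series.map, where unmapped values become NaN and fillna supplies the catch-all; a minimal sketch:

# Sketch: equivalent dict-based recodes (alternative to the helpers above)
stage_map = {"I": "I/II", "II": "I/II", "III": "III/IV", "IV": "III/IV"}
cns_all["Stage"] = cns_all["Stage"].map(stage_map).fillna("Missing")
cns_all["LDH"] = cns_all["LDH"].map({"NO": 0, "YES": 1})  # anything else -> NaN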
In [95]:
cns_all["Stage"] = cns_all["Stage"].apply(stage_mask)
In [96]:
cns_all["IPI Score"] = cns_all["IPI Score"].apply(lambda x: "L/LI" if x < 3 else "H/HI")
In [97]:
cns_all["PS"] = cns_all["PS"].apply(lambda x: 1 if x > 1 else 0)
In [98]:
cns_all["LDH"] = cns_all["LDH"].apply(LDH_mask)
In [99]:
cns_all["B Symp 1"] = cns_all["B Symp 1"].apply(lambda x : 0 if x == "NO" else 1)
cns_all["B symp 2"] = cns_all["B symp 2"].apply(lambda x : 0 if x == "NO" else 1)
cns_all["B symp 3"] = cns_all["B symp 3"].apply(lambda x : 0 if x == "NO" else 1)
In [103]:
b_symp_any = cns_all["B Symp 1"] + cns_all["B symp 2"] + cns_all["B symp 3"]
In [106]:
cns_all["B Symp"] = b_symp_any.apply(lambda x: 1 if x > 0 else 0)
In [108]:
del cns_all["B Symp 1"]
del cns_all["B symp 2"]
del cns_all["B symp 3"]
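For reference, the whole B-symptom collapse (binarize, sum, threshold, delete) can be done in one step; a sketch, assuming the three columns are already 0/1:

# Sketch: one-step collapse of the three B-symptom flags into "B Symp"
b_cols = ["B Symp 1", "B symp 2", "B symp 3"]
cns_all["B Symp"] = (cns_all[b_cols].sum(axis=1) > 0).astype(int)
cns_all = cns_all.drop(b_cols, axis=1)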
In [112]:
cns_all[">1 extranodal"] = cns_all[">1 extranodal"].apply(lambda x : 0 if x == "NO" else 1)
In [115]:
cns_all["BM Involv"] = cns_all["BM Involv"].apply(LDH_mask)
In [120]:
cns_all["PB Involv"] = cns_all["PB Involv"].apply(LDH_mask)
In [129]:
cns = cns_all["CNS "].apply(LDH_mask)
In [131]:
cns_all["cns"] = cns
del cns_all["CNS "]
In [135]:
cns_all["cns"] = cns_all["cns"].apply(lambda x: 1 if x == 1 else 0)
In [136]:
cns_all
Out[136]:
In [137]:
import matplotlib
import json
import requests
from sklearn import ensemble, cross_validation
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
In [138]:
le = preprocessing.LabelEncoder()
In [139]:
def fit_and_transform(column_name, df):
    le.fit(df[column_name])
    df[column_name] = le.transform(df[column_name])

columns = ["Gender", "Race", "Stage", "IPI Score"]
for column in columns:
    fit_and_transform(column, cns_all)
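One caveat with the shared le above: each fit overwrites the previous column's mapping, so earlier encodings cannot be inverted later. An alternative sketch (not to be run after the cell above, since it would re-encode) that keeps one encoder per column:

# Sketch: per-column encoders, retained so codes can be mapped back to labels
encoders = {}
for column in columns:
    enc = preprocessing.LabelEncoder()
    cns_all[column] = enc.fit_transform(cns_all[column])
    encoders[column] = enc
# e.g. encoders["Race"].inverse_transform([0, 1]) recovers the original labels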
In [168]:
cns_dropped = cns_all.dropna()
cns_filled = cns_all.fillna("missing")
In [170]:
for column in cns_filled.columns.unique():
    fit_and_transform(column, cns_filled)
In [343]:
del cns_filled[">1 extranodal"]
In [344]:
target = cns_filled['cns']
main_data = cns_filled.ix[:,:-1]
In [345]:
X, y = shuffle(main_data, target, random_state=13)
offset = 150
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
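Equivalently, the shuffle-and-slice split can be done with train_test_split from the same cross_validation module; a sketch using a fractional hold-out rather than the fixed offset of 150:

# Sketch: library equivalent of the manual shuffle + slice above
X_tr, X_te, y_tr, y_te = cross_validation.train_test_split(
    main_data, target, test_size=0.25, random_state=13)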
In [346]:
params = {'n_estimators': 10, 'learning_rate': .001,
          'max_depth': 2, 'random_state': 0, 'loss': 'deviance'}
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
clf_score = clf.score(X_test, y_test)
print("MSE: %.4f" % mse)
print("Accuracy: %.4f" % clf_score)
In [347]:
import pylab as pl
# Plot feature importance
feature_importance = clf.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
pl.subplot(1, 2, 2)
pl.barh(pos, feature_importance[sorted_idx], align='center')
pl.yticks(pos, main_data.columns[sorted_idx])
pl.xlabel('Relative Importance')
pl.title('Variable Importance')
Out[347]:
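The same importances as text, largest first; a sketch reusing feature_importance and sorted_idx from the cell above:

# Sketch: print relative importances alongside column names
for idx in sorted_idx[::-1]:
    print("%-16s %6.1f" % (main_data.columns[idx], feature_importance[idx]))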
In [342]:
# Plot training deviance
test_score = np.zeros((params['n_estimators'],))
for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)
pl.figure(figsize=(12, 6))
pl.subplot(1, 2, 1)
pl.title('Deviance')
pl.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
        label='Training Set Deviance')
pl.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
        label='Test Set Deviance')
pl.legend(loc='upper right')
pl.xlabel('Boosting Iterations')
pl.ylabel('Deviance')
Out[342]:
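A related check: staged_predict reports test accuracy after each boosting iteration, which shows whether n_estimators=10 at learning_rate=.001 is enough for the deviance curves above to separate. A sketch:

# Sketch: per-iteration test accuracy for the boosted model
for i, y_stage in enumerate(clf.staged_predict(X_test)):
    print("iteration %2d: accuracy %.4f" % (i + 1, np.mean(y_stage == y_test)))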
In [261]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
svm_clf = SVC()
neighbors_clf = KNeighborsClassifier()
random_forest_clf = RandomForestClassifier()
clfs = [("SVM", SVC()), ("KNN", KNeighborsClassifier()), ("Random Forest", RandomForestClassifier())]
for name, clf in clfs:
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    clf_score = clf.score(X_test, y_test)
    print(name + ": -----------------")
    print("MSE: %.4f" % mse)
    print("Accuracy: %.4f" % clf_score)
    f1scores = cross_validation.cross_val_score(clf, X_test, y_test, cv=3, scoring='f1')
    print("F1 Score: %0.2f (+/- %0.2f)" % (f1scores.mean(), f1scores.std() * 2))
    precision = cross_validation.cross_val_score(clf, X_test, y_test, cv=3, scoring='precision')
    print("Precision Score: %0.2f (+/- %0.2f)" % (precision.mean(), precision.std() * 2))
    recall = cross_validation.cross_val_score(clf, X_test, y_test, cv=3, scoring='recall')
    print("Recall Score: %0.2f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))
In [210]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.lda import LDA
In [267]:
pca = PCA(n_components=11)
X_r = pca.fit(X).transform(X)
lda = LDA(n_components=2)
X_r2 = lda.fit(X, y).transform(X)
target_names = cns_filled.columns.unique()
# Percentage of variance explained by each component
print('explained variance ratio: %s'
      % str(pca.explained_variance_ratio_))
plt.figure(figsize=(12, 8))
# y is binary (0/1), so only the first two (color, label) pairs draw anything;
# note target_names holds column names, not class names.
for c, i, target_name in zip("brgcmy", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
plt.legend(loc="upper left")
plt.title('PCA')
plt.figure(figsize=(12, 8))
# Note: this second figure replots the PCA projection X_r; with a binary
# target, LDA yields a single discriminant axis, so X_r2 has only one column.
for c, i, target_name in zip("brgcmy", [0, 1, 2, 3, 4, 5, 6], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
plt.legend(loc="upper left")
plt.title('LDA')
plt.show()
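A quick way to judge how many of the 11 components matter is the cumulative explained-variance curve. A sketch:

# Sketch: cumulative variance explained by the first k principal components
print(np.cumsum(pca.explained_variance_ratio_))
plt.plot(np.arange(1, 12), np.cumsum(pca.explained_variance_ratio_), 'o-')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()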
In [268]:
from sklearn import linear_model
In [363]:
logreg = linear_model.LogisticRegression(C=1)
In [364]:
logclf = logreg.fit(X_train, y_train)
In [365]:
logclf.score(X_test, y_test)
Out[365]:
In [366]:
logclf.coef_
Out[366]:
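The raw coef_ array is unlabeled; pairing it with the column names makes it readable. A sketch:

# Sketch: logistic-regression coefficients labeled by feature
print(pd.Series(logclf.coef_[0], index=main_data.columns))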
In [289]:
from __future__ import print_function
import numpy as np
import statsmodels.api as sm
In [357]:
logit_mod = sm.Logit(y_train, X_train)
logit_res = logit_mod.fit(method='bfgs', disp=0)
print('Parameters: ', logit_res.params)
In [358]:
margeff = logit_res.get_margeff()
print(margeff.summary())
In [359]:
print(logit_res.summary())
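One caveat: sm.Logit(y_train, X_train) as written fits without an intercept. If an intercept is wanted, the usual statsmodels pattern is sm.add_constant; a sketch:

# Sketch: same logit with an explicit intercept column added
logit_mod_c = sm.Logit(y_train, sm.add_constant(X_train))
logit_res_c = logit_mod_c.fit(method='bfgs', disp=0)
print(logit_res_c.summary())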
In [371]:
cns_filled
Out[371]:
In [ ]: