In [7]:
import numpy as np
import pandas as pd

In [91]:
cns_all = pd.read_csv("CNS_all.csv")

In [92]:
cns_all["Age"] = cns_all["Age"].apply(lambda x : 1 if x > 60 else 0)

In [93]:
cns_all["Race"] = cns_all["Race"].apply(lambda x : "White" if x == "White" else "Other")

In [94]:
def stage_mask(stage):
    """Collapse Ann Arbor stage into early (I/II) vs. advanced (III/IV)."""
    if stage == "I" or stage == "II":
        return "I/II"
    elif stage == "III" or stage == "IV":
        return "III/IV"
    else:
        return "Missing"

def LDH_mask(ldh):
    """Map YES/NO flags to 1/0; anything else becomes a missing value."""
    if ldh == "NO":
        return 0
    elif ldh == "YES":
        return 1
    else:
        return None
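
The same mappings can be written with Series.map and plain dicts, where unmapped values come back as NaN instead of needing an explicit else branch. A sketch, assuming the same stage and YES/NO coding:

In [ ]:
# Sketch: dict-based recoding with Series.map (unmapped values become NaN)
stage_map = {"I": "I/II", "II": "I/II", "III": "III/IV", "IV": "III/IV"}
yes_no_map = {"NO": 0, "YES": 1}
cns_all["Stage"] = cns_all["Stage"].map(stage_map).fillna("Missing")
cns_all["LDH"] = cns_all["LDH"].map(yes_no_map)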

In [95]:
cns_all["Stage"] = cns_all["Stage"].apply(stage_mask)

In [96]:
cns_all["IPI Score"] = cns_all["IPI Score"].apply(lambda x: "L/LI" if x < 3 else "H/HI")

In [97]:
cns_all["PS"] = cns_all["PS"].apply(lambda x: 1 if x > 1 else 0)

In [98]:
cns_all["LDH"] = cns_all["LDH"].apply(LDH_mask)

In [99]:
cns_all["B Symp 1"] = cns_all["B Symp 1"].apply(lambda x : 0 if x == "NO" else 1)
cns_all["B symp 2"] = cns_all["B symp 2"].apply(lambda x : 0 if x == "NO" else 1)
cns_all["B symp 3"] = cns_all["B symp 3"].apply(lambda x : 0 if x == "NO" else 1)

In [103]:
# Sum of the three indicators; a value > 0 means at least one B symptom
b_symp_any = cns_all["B Symp 1"] + cns_all["B symp 2"] + cns_all["B symp 3"]

In [106]:
cns_all["B Symp"] = b_symp_any.apply(lambda x: 1 if x > 0 else 0)

In [108]:
del cns_all["B Symp 1"]
del cns_all["B symp 2"]
del cns_all["B symp 3"]

In [112]:
cns_all[">1 extranodal"] = cns_all[">1 extranodal"].apply(lambda x : 0 if x == "NO" else 1)

In [115]:
cns_all["BM Involv"] = cns_all["BM Involv"].apply(LDH_mask)

In [120]:
cns_all["PB Involv"] = cns_all["PB Involv"].apply(LDH_mask)

In [129]:
cns = cns_all["CNS "].apply(LDH_mask)

In [131]:
cns_all["cns"] = cns
del cns_all["CNS "]

In [135]:
cns_all["cns"] = cns_all["cns"].apply(lambda x: 1 if x == 1 else 0)

In [136]:
cns_all


Out[136]:
Age Gender Race Stage IPI Score PS LDH >1 extranodal BM Involv PB Involv B Symp cns
0 1 M White III/IV L/LI 0 NaN 0 1 NaN 0 0
1 0 F White III/IV H/HI 1 0 1 0 NaN 1 0
2 0 M White I/II L/LI 0 0 0 0 0 0 0
3 0 F Other Missing L/LI 0 1 0 1 1 0 0
4 0 M White III/IV H/HI 1 1 0 1 0 1 0
5 0 M Other III/IV H/HI 0 1 1 1 NaN 1 0
6 0 M Other III/IV L/LI 0 NaN 1 0 NaN 1 0
7 1 M White III/IV L/LI 0 NaN 0 0 NaN 1 0
8 1 F White III/IV L/LI 0 NaN 0 1 1 1 0
9 1 M Other I/II L/LI 0 0 0 NaN 1 0 0
10 1 M White I/II L/LI 0 0 0 NaN NaN 1 0
11 0 F White Missing L/LI 0 NaN 1 NaN NaN 1 0
12 1 F White I/II L/LI 0 0 0 NaN NaN 0 0
13 0 M White III/IV L/LI 0 1 0 1 1 0 0
14 0 M White III/IV H/HI 0 1 1 1 1 1 0
15 0 F White I/II L/LI 0 0 0 NaN 1 1 0
16 0 M White III/IV L/LI 0 NaN 0 0 NaN 0 0
17 1 M White III/IV H/HI 1 1 0 1 NaN 1 0
18 0 M Other III/IV L/LI 0 NaN 1 1 NaN 1 0
19 0 F White I/II L/LI 0 NaN 0 NaN NaN 1 0
20 0 F Other Missing L/LI 0 NaN 0 1 1 0 0
21 1 M White I/II L/LI 0 NaN 0 NaN NaN 0 0
22 1 M White III/IV L/LI 0 NaN 0 0 NaN 0 1
23 1 M Other I/II L/LI 0 NaN 0 0 0 0 0
24 0 F White I/II L/LI 0 0 0 0 0 0 0
25 1 M White III/IV H/HI 0 NaN 1 1 1 0 0
26 1 F White III/IV H/HI 0 1 0 0 0 0 0
27 0 M Other III/IV L/LI 0 0 1 0 0 0 1
28 0 F Other I/II L/LI 0 0 0 0 0 1 0
29 0 M White Missing L/LI 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ...
229 1 M Other III/IV L/LI 0 0 0 1 1 0 0
230 1 M Other Missing L/LI 0 0 0 1 1 1 0
231 1 M White Missing L/LI 0 NaN 0 1 1 0 0
232 0 M Other Missing L/LI 0 0 0 NaN 0 1 0
233 0 M Other III/IV L/LI 0 0 1 1 1 1 1
234 1 M White III/IV H/HI 0 1 0 0 NaN 1 0
235 1 F White III/IV H/HI 0 0 1 0 0 1 0
236 0 M White III/IV L/LI 0 1 0 0 NaN 0 0
237 0 F Other Missing L/LI 0 NaN 0 0 NaN 0 0
238 0 F Other III/IV L/LI 0 1 0 1 1 1 0
239 0 F Other III/IV L/LI 0 1 0 NaN 1 1 0
240 0 F White Missing L/LI 0 1 0 1 NaN 0 0
241 0 M White I/II L/LI 0 1 0 0 NaN 1 0
242 1 M Other III/IV H/HI 0 NaN 1 0 NaN 0 0
243 0 F White I/II L/LI 0 0 0 0 0 1 0
244 0 M White III/IV H/HI 0 1 1 0 NaN 1 0
245 1 F White III/IV H/HI 0 1 1 NaN NaN 1 0
246 0 M Other III/IV H/HI 0 1 1 0 NaN 1 0
247 1 F White III/IV L/LI 0 0 0 NaN 1 1 0
248 1 M Other III/IV H/HI 0 1 1 1 NaN 1 0
249 0 M White III/IV L/LI 0 0 1 0 NaN 0 0
250 0 M White III/IV L/LI 0 NaN 0 1 1 0 0
251 1 F Other I/II L/LI 0 0 0 0 0 1 0
252 0 M White I/II L/LI 0 0 0 0 NaN 1 0
253 0 M Other III/IV L/LI 0 1 0 1 1 1 0
254 1 F White III/IV H/HI 0 1 0 1 NaN 1 0
255 1 M Other Missing L/LI 0 0 0 NaN NaN 1 1
256 0 M Other III/IV L/LI 0 0 0 NaN 1 1 0
257 1 F White III/IV L/LI 0 0 0 0 NaN 1 0
258 1 M White I/II L/LI 0 NaN 0 NaN NaN 1 0

259 rows × 12 columns


In [137]:
import matplotlib
from sklearn import ensemble, cross_validation
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

In [138]:
le = preprocessing.LabelEncoder()

In [139]:
def fit_and_transform(column_name, df):
    """Label-encode one column in place (refits the shared encoder)."""
    le.fit(df[column_name])
    df[column_name] = le.transform(df[column_name])

columns = ["Gender", "Race", "Stage", "IPI Score"]

for column in columns:
    fit_and_transform(column, cns_all)
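
Refitting one shared LabelEncoder works here, but each column's mapping is discarded as soon as the next column is fit. Keeping one encoder per column preserves the ability to invert the encoding later; a sketch of that alternative, starting from the raw labels:

In [ ]:
# Sketch: one LabelEncoder per column so each mapping can be inverted later
encoders = {}
for column in ["Gender", "Race", "Stage", "IPI Score"]:
    enc = preprocessing.LabelEncoder()
    cns_all[column] = enc.fit_transform(cns_all[column])
    encoders[column] = enc
# e.g. encoders["Race"].inverse_transform(cns_all["Race"]) recovers the labels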

In [168]:
# Two ways to handle missing values: drop incomplete rows, or fill with a
# sentinel so the encoder can treat "missing" as its own category
cns_dropped = cns_all.dropna()
cns_filled = cns_all.fillna("missing")
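
Before choosing between the two, it is worth checking how much data each column is missing; a sketch:

In [ ]:
# Sketch: per-column missing counts, to judge dropna() vs. fillna()
print(cns_all.isnull().sum())
print("rows kept by dropna(): %d of %d" % (len(cns_dropped), len(cns_all)))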

In [170]:
# Label-encode every column of the filled frame (columns are already unique,
# so no .unique() is needed); the "missing" sentinel becomes its own code
for column in cns_filled.columns:
    fit_and_transform(column, cns_filled)

In [343]:
del cns_filled[">1 extranodal"]

In [344]:
# 'cns' is the last column, so everything before it is a feature
target = cns_filled['cns']
main_data = cns_filled.ix[:, :-1]

In [345]:
# Shuffle once, then hold out the last 109 of 259 rows as a test set
X, y = shuffle(main_data, target, random_state=13)
offset = 150
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
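
The same split can be produced with cross_validation.train_test_split, which shuffles and slices in one call; a sketch:

In [ ]:
# Sketch: equivalent hold-out split via train_test_split
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    main_data, target, test_size=109, random_state=13)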

In [346]:
# Gradient boosting with logistic (deviance) loss: shallow trees, few stages
params = {'n_estimators': 10, 'learning_rate': .001,
          'max_depth': 2, 'random_state': 0, 'loss': 'deviance'}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)
# For a 0/1 target, MSE of hard predictions equals the misclassification rate
mse = mean_squared_error(y_test, clf.predict(X_test))
clf_score = clf.score(X_test, y_test)
print("MSE: %.4f" % mse)
print("Accuracy: %.4f" % clf_score)


MSE: 0.0367
Accuracy: 0.9633
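
An accuracy of 0.9633 should be read against the class balance: positive cns cases are rare, so a model that always predicts 0 scores nearly the same. A quick baseline check, as a sketch:

In [ ]:
# Sketch: accuracy of always predicting the majority class on the test set
majority_acc = max(y_test.mean(), 1 - y_test.mean())
print("majority-class baseline accuracy: %.4f" % majority_acc)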

In [347]:
import pylab as pl
# Plot feature importance
feature_importance = clf.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
pl.subplot(1, 2, 2)
pl.barh(pos, feature_importance[sorted_idx], align='center')
pl.yticks(pos, main_data.columns[sorted_idx])
pl.xlabel('Relative Importance')
pl.title('Variable Importance')


Out[347]:
<matplotlib.text.Text at 0x10bd6afd0>

In [342]:
# Plot training deviance
test_score = np.zeros((params['n_estimators'],))

for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)

pl.figure(figsize=(12, 6))
pl.subplot(1, 2, 1)
pl.title('Deviance')
pl.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
        label='Training Set Deviance')
pl.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
        label='Test Set Deviance')
pl.legend(loc='upper right')
pl.xlabel('Boosting Iterations')
pl.ylabel('Deviance')


Out[342]:
<matplotlib.text.Text at 0x10bd17690>

In [261]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

svm_clf = SVC()
neighbors_clf = KNeighborsClassifier()
random_forest_clf = RandomForestClassifier()
clfs = [("SVM", svm_clf), ("KNN", neighbors_clf), ("Random Forest", random_forest_clf)]

# Loop variable is named 'model' so it does not clobber the gradient-boosting
# 'clf' used by the deviance plot above
for name, model in clfs:
    model.fit(X_train, y_train)
    mse = mean_squared_error(y_test, model.predict(X_test))
    model_score = model.score(X_test, y_test)
    print(name + ": -----------------")
    print("MSE: %.4f" % mse)
    print("Accuracy: %.4f" % model_score)
    # Cross-validated F1/precision/recall, computed on the held-out set only
    f1scores = cross_validation.cross_val_score(model, X_test, y_test, cv=3, scoring='f1')
    print("F1 Score: %0.2f (+/- %0.2f)" % (f1scores.mean(), f1scores.std() * 2))
    precision = cross_validation.cross_val_score(model, X_test, y_test, cv=3, scoring='precision')
    print("Precision Score: %0.2f (+/- %0.2f)" % (precision.mean(), precision.std() * 2))
    recall = cross_validation.cross_val_score(model, X_test, y_test, cv=3, scoring='recall')
    print("Recall Score: %0.2f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))


SVM: -----------------
MSE: 0.0367
Accuracy: 0.9633
F1 Score: 0.00 (+/- 0.00)
Precision Score: 0.00 (+/- 0.00)
Recall Score: 0.00 (+/- 0.00)
KNN: -----------------
MSE: 0.0367
Accuracy: 0.9633
F1 Score: 0.00 (+/- 0.00)
Precision Score: 0.00 (+/- 0.00)
Recall Score: 0.00 (+/- 0.00)
Random Forest: -----------------
MSE: 0.0642
Accuracy: 0.9358
F1 Score: 0.00 (+/- 0.00)
Precision Score: 0.00 (+/- 0.00)
Recall Score: 0.00 (+/- 0.00)
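
Zero F1, precision, and recall across the board confirm that all three models are simply predicting the majority class. Class weighting is one standard response; a sketch (class_weight='auto' in sklearn of this vintage, renamed 'balanced' in later releases):

In [ ]:
# Sketch: reweight classes so the rare positive class is not ignored
weighted_svm = SVC(class_weight='auto')
weighted_svm.fit(X_train, y_train)
print("weighted SVM accuracy: %.4f" % weighted_svm.score(X_test, y_test))
f1 = cross_validation.cross_val_score(weighted_svm, X_test, y_test, cv=3, scoring='f1')
print("weighted SVM F1: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std() * 2))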

In [210]:
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.lda import LDA

In [267]:
pca = PCA(n_components=11)
X_r = pca.fit(X).transform(X)

lda = LDA(n_components=2)
X_r2 = lda.fit(X, y).transform(X)

target_names = cns_filled.columns.unique()

# Percentage of variance explained for each components
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))


plt.figure(figsize=(12, 8))
for c, i, target_name in zip("brgcmy", [0, 1, 2,3, 4, 5, 6, 7, 8, 9, 10], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
plt.legend(loc="upper left")
plt.title('PCA')

plt.figure(figsize=(12, 8))
for c, i, target_name in zip("brgcmy", [0, 1, 2,3, 4, 5, 6], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
plt.legend(loc="upper left")
plt.title('LDA')

plt.show()


explained variance ratio: [ 0.2259346   0.17002742  0.1201034   0.1041173   0.09072686  0.07485279
  0.06850171  0.06528011  0.05315184  0.01826943  0.00903455]
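
The first two components carry under 40% of the variance, so the 2-D scatter above is a lossy view of the data. A quick cumulative check, as a sketch:

In [ ]:
# Sketch: cumulative explained variance across all 11 components
print(np.cumsum(pca.explained_variance_ratio_))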

In [268]:
from sklearn import linear_model

In [363]:
logreg = linear_model.LogisticRegression(C=1)

In [364]:
logclf = logreg.fit(X_train, y_train)

In [365]:
logclf.score(X_test, y_test)


Out[365]:
0.96330275229357798

In [366]:
logclf.coef_


Out[366]:
array([[ 0.08445391,  0.08189909, -0.66691392, -0.34062076]])
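The raw coefficient array is hard to read without feature names; pairing them up, as a sketch (assumes main_data still holds the four columns used in this fit):

In [ ]:
# Sketch: pair each logistic-regression coefficient with its feature name
for name, coef in zip(main_data.columns, logclf.coef_[0]):
    print("%s: %.4f" % (name, coef))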

In [289]:
from __future__ import print_function
import numpy as np
import statsmodels.api as sm

In [357]:
# statsmodels does not add an intercept automatically, so this logit is fit
# through the origin; see sm.add_constant for the usual fix
logit_mod = sm.Logit(y_train, X_train)
logit_res = logit_mod.fit(method='bfgs', disp=0)
print('Parameters: ', logit_res.params)


Parameters:  [-0.44677339  0.01126037 -1.09695065 -1.07596921]
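
The negative pseudo R-squared in the summary two cells below is a symptom of the missing intercept. Refitting with a constant term, as a sketch:

In [ ]:
# Sketch: include an intercept via sm.add_constant before fitting
logit_res_c = sm.Logit(y_train, sm.add_constant(X_train)).fit(disp=0)
print(logit_res_c.summary())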

In [358]:
margeff = logit_res.get_margeff()
print(margeff.summary())


        Logit Marginal Effects       
=====================================
Dep. Variable:                      y
Method:                          dydx
At:                           overall
==============================================================================
                dy/dx    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1            -0.0412      0.030     -1.394      0.163        -0.099     0.017
x2             0.0010      0.038      0.028      0.978        -0.073     0.075
x3            -0.1012      0.026     -3.856      0.000        -0.153    -0.050
x4            -0.0993      0.043     -2.318      0.020        -0.183    -0.015
==============================================================================

In [359]:
print(logit_res.summary())


                           Logit Regression Results                           
==============================================================================
Dep. Variable:                      y   No. Observations:                  150
Model:                          Logit   Df Residuals:                      146
Method:                           MLE   Df Model:                            3
Date:                Thu, 29 Jan 2015   Pseudo R-squ.:                -0.06124
Time:                        03:29:56   Log-Likelihood:                -46.921
converged:                       True   LL-Null:                       -44.214
                                        LLR p-value:                     1.000
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1            -0.4468      0.330     -1.354      0.176        -1.093     0.200
x2             0.0113      0.407      0.028      0.978        -0.787     0.810
x3            -1.0970      0.308     -3.556      0.000        -1.702    -0.492
x4            -1.0760      0.486     -2.215      0.027        -2.028    -0.124
==============================================================================

In [371]:
cns_filled


Out[371]:
Stage BM Involv PB Involv B Symp cns
0 1 1 2 0 0
1 1 0 2 1 0
2 0 0 0 0 0
3 2 1 1 0 0
4 1 1 0 1 0
5 1 1 2 1 0
6 1 0 2 1 0
7 1 0 2 1 0
8 1 1 1 1 0
9 0 2 1 0 0
10 0 2 2 1 0
11 2 2 2 1 0
12 0 2 2 0 0
13 1 1 1 0 0
14 1 1 1 1 0
15 0 2 1 1 0
16 1 0 2 0 0
17 1 1 2 1 0
18 1 1 2 1 0
19 0 2 2 1 0
20 2 1 1 0 0
21 0 2 2 0 0
22 1 0 2 0 1
23 0 0 0 0 0
24 0 0 0 0 0
25 1 1 1 0 0
26 1 0 0 0 0
27 1 0 0 0 1
28 0 0 0 1 0
29 2 0 0 0 0
... ... ... ... ... ...
229 1 1 1 0 0
230 2 1 1 1 0
231 2 1 1 0 0
232 2 2 0 1 0
233 1 1 1 1 1
234 1 0 2 1 0
235 1 0 0 1 0
236 1 0 2 0 0
237 2 0 2 0 0
238 1 1 1 1 0
239 1 2 1 1 0
240 2 1 2 0 0
241 0 0 2 1 0
242 1 0 2 0 0
243 0 0 0 1 0
244 1 0 2 1 0
245 1 2 2 1 0
246 1 0 2 1 0
247 1 2 1 1 0
248 1 1 2 1 0
249 1 0 2 0 0
250 1 1 1 0 0
251 0 0 0 1 0
252 0 0 2 1 0
253 1 1 1 1 0
254 1 1 2 1 0
255 2 2 2 1 1
256 1 2 1 1 0
257 1 0 2 1 0
258 0 2 2 1 0

259 rows × 5 columns


In [ ]: