In [73]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# import custom modules written by Julio
from ispy1 import predictive_statistics
# reload modules without restarting the kernel (makes development easier)
import importlib
importlib.reload(predictive_statistics);
In [2]:
df = pd.read_csv('./data/I-SPY_1_clean_data.csv')
df.head(2)
Out[2]:
In [3]:
# allocate outcome
outcome = 'PCR'
y = predictive_statistics.labels_to_numbers(df, outcome);
# check how imbalanced the data are
df[outcome].value_counts(normalize = True).plot.barh();
plt.title('Normalized Sample size for PCR')
plt.xlabel('group size (%)');
plt.savefig('Sample_Size_PCR.png')
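`labels_to_numbers` lives in the custom `ispy1` module, so its body does not appear in this notebook. A minimal sketch of what it is assumed to do (encode a binary label column such as Yes/No as integers 0/1); the name `labels_to_numbers_sketch` is hypothetical:
In [ ]:
# Hypothetical sketch, not the ispy1 implementation: encode a binary
# label column as 0/1 using sklearn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder

def labels_to_numbers_sketch(df, column):
    # unique labels are mapped (alphabetically) to 0..n_classes-1
    return LabelEncoder().fit_transform(df[column].values)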
In [4]:
# allocate continuous predictors
cont_predictors = ['age','MRI_LD_Baseline', 'MRI_LD_1_3dAC', 'MRI_LD_Int_Reg', 'MRI_LD_PreSurg']
X_cont = df[cont_predictors].values
# allocate clinical predictors
cat_predictors = ['White', 'ER+', 'PR+', 'HR+'];
X_cat = pd.get_dummies(df[cat_predictors], drop_first=True).values
# allocate a single predictor matrix X
X = np.concatenate( (X_cont, X_cat), axis=1)
# allocate outcome
outcome = 'PCR'
y = predictive_statistics.labels_to_numbers(df, outcome);
In [5]:
from sklearn import metrics
def mymetrics(y_true, y_pred):
    # sklearn metrics take the true labels first, then the predictions
    print(metrics.classification_report(y_true, y_pred))
    k = np.round(metrics.cohen_kappa_score(y_true, y_pred), 3)
    auc = np.round(metrics.roc_auc_score(y_true, y_pred), 3)
    accuracy = np.round(metrics.accuracy_score(y_true, y_pred), 3)
    print("Kappa = " + str(k))
    print("AUC = " + str(auc))
    print("Accuracy = " + str(accuracy))

# degenerate baseline: predict everything negative except a single positive
y_crazy = np.zeros_like(y)
y_crazy[np.argwhere(y > 0)[0]] = 1
mymetrics(y, y_crazy)
In [6]:
# standard
auc1, kappa1, fpr1, tpr1 = predictive_statistics.Logistic_Regression(X, y, oversample = False)
# oversampled
auc2, kappa2, fpr2, tpr2 = predictive_statistics.Logistic_Regression(X, y, oversample = True, K_neighbors = 4)
title ='Effect of oversampling on Logistic Regression for PCR'
predictive_statistics.plot_compare_roc(fpr1, tpr1,fpr2, tpr2, auc1, auc2, title = title)
plt.savefig('PCR_Logistic.png')
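`Logistic_Regression` is defined in `ispy1.predictive_statistics`, so only its calls appear here. A minimal sketch of what it is assumed to do, judging by its signature and return values (hold out a test split, optionally oversample the training data with ADASYN, fit sklearn's `LogisticRegression`, and return the test AUC, Cohen's kappa, and ROC curve); all names below are hypothetical:
In [ ]:
# Hypothetical sketch of the custom wrapper, assuming imbalanced-learn
# (imblearn) provides the ADASYN oversampler used when oversample=True.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, cohen_kappa_score
from imblearn.over_sampling import ADASYN

def logistic_roc_sketch(X, y, oversample=False, K_neighbors=4, seed=0):
    Xtr, Xte, ytr, yte = train_test_split(X, y, stratify=y, random_state=seed)
    if oversample:
        # ADASYN synthesizes minority-class samples from K nearest neighbors
        Xtr, ytr = ADASYN(n_neighbors=K_neighbors, random_state=seed).fit_resample(Xtr, ytr)
    clf = LogisticRegression(max_iter=1000).fit(Xtr, ytr)
    scores = clf.predict_proba(Xte)[:, 1]
    fpr, tpr, _ = roc_curve(yte, scores)
    return roc_auc_score(yte, scores), cohen_kappa_score(yte, clf.predict(Xte)), fpr, tpr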
In [7]:
# standard
auc1, kappa1, fpr1, tpr1, forest = predictive_statistics.RandomForest_Classifier(X, y,oversample=False)
t ='Feature Importance for RFC unbalanced'
predictive_statistics.plot_forest_feature_importances_(forest, cont_predictors + cat_predictors, title = t)
plt.savefig('RFC_std_PCR.png')
# imbalanced learning (ADASYN oversampling)
auc2, kappa2, fpr2, tpr2, Forest = predictive_statistics.RandomForest_Classifier(X, y, oversample=True, K_neighbors = 5)
t ='Feature Importance for RFC ADASYN'
predictive_statistics.plot_forest_feature_importances_(Forest, cont_predictors + cat_predictors, title = t)
plt.savefig('RFC_Adasyn_PCR.png')
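`plot_forest_feature_importances_` presumably ranks the fitted forest's impurity-based `feature_importances_` and draws a labeled bar chart; a hypothetical sketch:
In [ ]:
# Hypothetical sketch: bar chart of a fitted forest's feature importances,
# most important first.
import numpy as np
import matplotlib.pyplot as plt

def plot_importances_sketch(forest, feature_names, title=''):
    imp = forest.feature_importances_      # mean impurity decrease per feature
    order = np.argsort(imp)[::-1]          # descending importance
    plt.bar(range(len(imp)), imp[order])
    plt.xticks(range(len(imp)), np.array(feature_names)[order], rotation=90)
    plt.ylabel('importance')
    plt.title(title)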
In [8]:
# compare
title ='Effect of oversampling on RFC for PCR'
predictive_statistics.plot_compare_roc(fpr1, tpr1,fpr2, tpr2, auc1, auc2, title = title)
plt.savefig('RFC_compare_PCR.png')
In [9]:
# sensitivity of ADASYN oversampling to the number of neighbors
K = [4, 5, 6]
for k in K:
    predictive_statistics.Logistic_Regression(X, y, oversample=True, K_neighbors=k)
In [10]:
# allocate outcome
outcome = 'Alive'
y = predictive_statistics.labels_to_numbers(df, outcome);
# check how imbalanced the data are
df[outcome].value_counts(normalize = True).plot.barh();
plt.title('Normalized Sample size for Alive')
plt.xlabel('group size (%)');
plt.savefig('Sample_Size_Alive.png')
In [11]:
# standard
auc1, kappa1, fpr1, tpr1 = predictive_statistics.Logistic_Regression(X, y)
# imbalanced learning (ADASYN oversampling)
auc2, kappa2, fpr2, tpr2 = predictive_statistics.Logistic_Regression(X, y, oversample=True, K_neighbors = 4)
In [12]:
title ='Effect of oversampling on Logistic Regression for Alive'
predictive_statistics.plot_compare_roc(fpr1, tpr1,fpr2, tpr2, auc1, auc2, title = title)
plt.savefig('Alive_Logistic.png')
In [13]:
# standard
auc1, kappa1, fpr1, tpr1, _= predictive_statistics.RandomForest_Classifier(X, y)
# imbalanced learning (ADASYN oversampling)
auc2, kappa2, fpr2, tpr2, _ = predictive_statistics.RandomForest_Classifier(X, y, oversample=True, K_neighbors = 6)
title ='Effect of oversampling on RFC for Alive'
predictive_statistics.plot_compare_roc(fpr1, tpr1,fpr2, tpr2, auc1, auc2, title = title)
plt.savefig('Alive_RFC.png')
In [14]:
# allocate PCR as an additional predictor
pcr = predictive_statistics.labels_to_numbers(df, 'PCR').reshape(-1, 1)
newX = np.concatenate((X,pcr), axis = 1)
In [15]:
# standard
auc1, kappa1, fpr1, tpr1 = predictive_statistics.Logistic_Regression(newX, y)
# imbalanced learning (ADASYN oversampling)
auc2, kappa2, fpr2, tpr2 = predictive_statistics.Logistic_Regression(newX, y, oversample=True, K_neighbors = 10)
In [16]:
title ='Effect of oversampling on Logistic Regression for Alive (PCR included)'
predictive_statistics.plot_compare_roc(fpr1, tpr1,fpr2, tpr2, auc1, auc2, title = title)
In [17]:
# standard
auc1, kappa1, fpr1, tpr1 , _ = predictive_statistics.RandomForest_Classifier(newX, y)
In [18]:
# imbalanced learning (ADASYN oversampling)
auc2, kappa2, fpr2, tpr2, _ = predictive_statistics.RandomForest_Classifier(newX, y, oversample=True, K_neighbors = 10)
In [19]:
rcb = pd.get_dummies(df['RCB']).values
newX = np.concatenate((X,rcb), axis = 1)
# standard
auc1, kappa1, fpr1, tpr1 = predictive_statistics.Logistic_Regression(newX, y)
auc2, kappa2, fpr2, tpr2, _= predictive_statistics.RandomForest_Classifier(newX, y)
In [20]:
# compare
title ='Predicting Survival including RCB as a predictor'
predictive_statistics.plot_compare_roc(fpr1, tpr1,fpr2, tpr2, auc1, auc2, title = title)
plt.legend(['Logistic Regression','Random Forest Classifier']);
plt.savefig('LG_vs_RFC_Alive_RCB.png')
In [21]:
# imbalanced learning (ADASYN oversampling)
auc3, kappa3, fpr3, tpr3 = predictive_statistics.Logistic_Regression(newX, y, oversample=True, K_neighbors = 4)
auc4, kappa4, fpr4, tpr4, _= predictive_statistics.RandomForest_Classifier(newX, y, oversample=True, K_neighbors = 4)
In [38]:
title ='Predicting Survival including RCB as a predictor (ADASYN)'
predictive_statistics.plot_compare_roc(fpr3, tpr3,fpr4, tpr4, auc3, auc4, title = title)
plt.legend(['Logistic Regression','Random Forest Classifier']);
#plt.savefig('LG_vs_RFC_Alive_RCB.png')
In [ ]:
# metrics
mae = metrics.median_absolute_error

def mae_report(Ytest, Yhat, outcome_):
    error = np.round(mae(Ytest, Yhat), decimals=3)
    # report
    print('\n')
    print('==' * 40)
    print('The median absolute error for the testing data set of ' + outcome_ + ' is: ' + str(error))
    print('==' * 40)

def train_test_report(predictor, Xtrain, Ytrain, Xtest, Ytest, outcome):
    # train
    predictor.fit(Xtrain, Ytrain)
    # test
    Yhat = predictor.predict(Xtest)
    # report
    mae_report(Ytest, Yhat, outcome)

# lsq
import statsmodels.api as sm

def lsq(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
    # train (note: sm.OLS fits without an intercept unless a constant column is added)
    OLS = sm.OLS(Ytrain, Xtrain).fit()
    print(OLS.summary())
    # test
    Yhat = OLS.predict(Xtest)
    # report
    mae_report(Ytest, Yhat, outcome)

# SVR
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# GridSearchCV utility
def gridsearch(regressor, grid):
    # greater_is_better=False negates the error so that GridSearchCV's
    # maximization selects the lowest-error model
    optimized_regressor = GridSearchCV(regressor,
                                       param_grid=grid,
                                       cv=3, verbose=0, n_jobs=-1,
                                       scoring=metrics.make_scorer(metrics.median_absolute_error,
                                                                   greater_is_better=False))
    return optimized_regressor

def svr(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
    # define regressor
    regressor = SVR()
    # define parameter grid search
    grid = dict(kernel=['rbf', 'linear', 'sigmoid'],
                C=np.arange(1, 11, 1),
                epsilon=np.arange(1, 11, 1),
                gamma=np.linspace(1/10, 10, 3))
    # perform grid search
    grid_search = gridsearch(regressor, grid)
    # train, test, and report
    train_test_report(grid_search, Xtrain, Ytrain, Xtest, Ytest, outcome)

# ElasticNet
from sklearn.linear_model import ElasticNet as ENet

def ElasticNet(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
    # define regressor
    regressor = ENet(max_iter=5000)
    # define parameter grid search
    grid = dict(alpha=np.arange(1, 20, .5), l1_ratio=np.arange(.1, 1, .05))
    # perform grid search
    grid_search = gridsearch(regressor, grid)
    # train, test, and report
    train_test_report(grid_search, Xtrain, Ytrain, Xtest, Ytest, outcome)

# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor as RFR

RANDOM_STATE = 0  # assumed seed; RANDOM_STATE is not defined elsewhere in this notebook

def RandomForestRegressor(Xtrain, Ytrain, Xtest, Ytest, outcome=''):
    # define regressor
    regressor = RFR(criterion='mse', random_state=RANDOM_STATE)
    num_features = Xtrain.shape[1]
    # define parameter grid search
    grid = dict(n_estimators=np.arange(5, 100, 5),
                max_features=np.arange(1, num_features, 1),
                max_depth=[None, 1, 2, 3, 4, 5])
    # perform grid search
    grid_search = gridsearch(regressor, grid)
    # train, test, and report
    train_test_report(grid_search, Xtrain, Ytrain, Xtest, Ytest, outcome)
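One subtlety in `gridsearch` above: `median_absolute_error` is a loss, while `GridSearchCV` always maximizes its scorer, so `greater_is_better=False` is needed for the search to pick the lowest-error model. A self-contained toy check of that sign convention (hypothetical data, not the notebook's):
In [ ]:
# Toy check: with greater_is_better=False, make_scorer returns the
# negated error, so maximizing the scorer minimizes the error.
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression

Xtoy = np.arange(10, dtype=float).reshape(-1, 1)
ytoy = 2 * Xtoy.ravel() + 1
model = LinearRegression().fit(Xtoy, ytoy)
scorer = metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
print(scorer(model, Xtoy, ytoy))  # ~ -0.0: the negated median absolute error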
In [57]:
from sklearn.preprocessing import StandardScaler
# allocate continuous predictors
cont_predictors = ['age','MRI_LD_Baseline', 'MRI_LD_1_3dAC', 'MRI_LD_Int_Reg', 'MRI_LD_PreSurg']
contX = df[cont_predictors].values
contX = StandardScaler().fit_transform(contX)
# allocate categorical predictors
cat_pred = ['PCR','White', 'ER+', 'PR+', 'HR+'];
catX = pd.get_dummies(df[cat_pred], drop_first=True).values
# concatenate predictors
X = np.concatenate( (catX, contX), axis=1)
In [58]:
# outcome
y = df.RFS.values / 30  # convert days to (approximate) months
#split
X_train, X_test, y_train, y_test = predictive_statistics.split_data(X, y, False)
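`split_data` is another `ispy1` helper; it is assumed to wrap sklearn's `train_test_split`, with the boolean flag controlling stratification (False here, since RFS is a continuous outcome). A hypothetical sketch:
In [ ]:
# Hypothetical sketch, not the ispy1 implementation.
from sklearn.model_selection import train_test_split

def split_data_sketch(X, y, stratify=False, test_size=0.25, seed=0):
    strat = y if stratify else None    # stratify only makes sense for class labels
    return train_test_split(X, y, test_size=test_size, stratify=strat, random_state=seed)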
In [59]:
# LSQ
predictive_statistics.lsq(X_train, y_train, X_test, y_test, outcome =' RFS (months)')
In [60]:
importlib.reload(predictive_statistics);
In [61]:
predictive_statistics.ElasticNet(X_train, y_train, X_test, y_test, outcome =' RFS (months)');
In [62]:
predictive_statistics.svr(X_train, y_train, X_test, y_test, outcome =' RFS (months)');
In [63]:
predictive_statistics.RandomForestRegressor(X_train, y_train, X_test, y_test, outcome =' RFS (months)');
In [74]:
y = df.Survival_length.values / 30  # convert days to (approximate) months
#split
X_train, X_test, y_train, y_test = predictive_statistics.split_data(X, y, False)
In [75]:
# LSQ
predictive_statistics.lsq(X_train, y_train, X_test, y_test, outcome ='Survival_length (months)')
In [76]:
predictive_statistics.svr(X_train, y_train, X_test, y_test, outcome ='Survival length (months)');
In [71]:
predictive_statistics.ElasticNet(X_train, y_train, X_test, y_test, outcome = 'Survival length (months)');
In [72]:
predictive_statistics.RandomForestRegressor(X_train, y_train, X_test, y_test, outcome = 'Survival length (months)');
In [ ]: