In [1]:
# import custom modules wrote by julio
import seaborn as sns
import pandas as pd
%matplotlib inline
#from capstone_01 import clean_data
from ispy1 import inferential_statistics
# reload modules without restartign the kernel (makes development easier)
# import importlib
#importlib.reload(inferential_statistics);
In [2]:
df = pd.read_csv('./data/I-SPY_1_clean_data.csv')
df.head(2)
Out[2]:
In [3]:
# example of contingency table
inferential_statistics.contingency_table('PCR', 'ER+',df)
Out[3]:
In [4]:
# Perform chi-2 test on all categorical variables
predictors = ['White', 'ER+', 'PR+', 'HR+','Right_Breast']
outcome = 'PCR'
inferential_statistics.categorical_data(outcome, predictors, df)
Out[4]:
In [5]:
predictors = ['White', 'ER+', 'PR+', 'HR+','Right_Breast','PCR']
outcome = 'Alive'
inferential_statistics.categorical_data(outcome, predictors, df)
Out[5]:
In [6]:
predictor= ['age']
outcome = 'PCR'
anova_table, OLS = inferential_statistics.linear_models(df, outcome, predictor);
sns.boxplot(x= outcome, y=predictor[0], data=df, palette="Set3");
In [7]:
predictor= ['age']
outcome = 'Alive'
anova_table, OLS = inferential_statistics.linear_models(df, outcome, predictor);
sns.boxplot(x= outcome, y=predictor[0], data=df, palette="Set3");
In [8]:
# create a boxplot to visualize this interaction
ax = sns.boxplot(x= 'PCR', y='age', hue ='Alive',data=df, palette="Set3");
ax.set_title('Interactions between age, survival, and PCR');
In [9]:
# create dataframe only for patients with PCR = Yes
df_by_PCR = df.loc[df.PCR=='No',:]
df_by_PCR.head()
# Anova age vs Alive
predictor= ['age']
outcome = 'Alive'
anova_table, OLS = inferential_statistics.linear_models(df_by_PCR, outcome, predictor);
In [10]:
# estimate the effect size
mri_features = ['age']
outcome = 'Alive'
# Effect Size
inferential_statistics.effect_size( df_by_PCR, mri_features, outcome)
Out[10]:
age
has an important effect on Alive
for patients with PCR
= Yes
In [11]:
R = inferential_statistics.anova_MRI('PCR', df);
Estimate the effect size
In [12]:
mri_features = ['MRI_LD_Baseline', 'MRI_LD_1_3dAC', 'MRI_LD_Int_Reg', 'MRI_LD_PreSurg']
outcome = 'PCR'
# Effect Size
inferential_statistics.effect_size( df, mri_features, outcome)
Out[12]:
In [13]:
outcome = 'Alive'
R = inferential_statistics.anova_MRI(outcome, df);
In [14]:
mri_features = ['MRI_LD_Baseline', 'MRI_LD_1_3dAC', 'MRI_LD_Int_Reg', 'MRI_LD_PreSurg']
outcome = 'Alive'
# Effect Size
inferential_statistics.effect_size( df, mri_features, outcome)
Out[14]:
In [15]:
# predictors and outcomes
predictors= ['MRI_LD_Baseline', 'MRI_LD_1_3dAC', 'MRI_LD_Int_Reg', 'MRI_LD_PreSurg']
# split data and run anova
PCR_outcomes = ['No','Yes']
for out in PCR_outcomes:
df_by_PCR = df.loc[df.PCR == out,:]
print('Outcome = Alive' + ' | ' + 'PCR = ' + out)
# Anova
anova_table, OLS = inferential_statistics.linear_models(df_by_PCR, 'Alive', predictors);
# Effect Size
print(inferential_statistics.effect_size( df_by_PCR, predictors, 'Alive'))
print('\n' * 2)
MRI_LD_Baseline
) is not a statistically different between patients who achieved complete pathological response (PCR
)and those who did not. While all other MRI measurements are statistically different between PCR = Yes
, and PCR = No
In [16]:
## 3. Inferential_statistics: Continous vs Categorical (ANOVA)