In [1]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import pavelstatsutils
%matplotlib inline
    
In [2]:
    
patients = pd.read_csv("patients.csv")
controls = pd.read_csv("controls.csv")
    
In [3]:
    
df = pd.concat([patients, controls])
df.head()
    
    Out[3]:
In [4]:
    
df.describe()
    
    Out[4]:
In [5]:
    
df.loc[df['PATIENT'] == 1].describe()
    
    Out[5]:
In [6]:
    
df.loc[df['PATIENT'] == 0].describe()
    
    Out[6]:
In [7]:
    
sns.pairplot(df.dropna(how='any'), vars=[x for x in df.columns if x!='PATIENT'], hue="PATIENT", size=2)
    
    Out[7]:
    
In [8]:
    
corr_r, corr_p, corr_b = pavelstatsutils.corr(df)
    
In [9]:
    
corr_r
    
    Out[9]:
In [10]:
    
corr_p
    
    Out[10]:
In [11]:
    
corr_b
    
    Out[11]:
In [12]:
    
sns.heatmap(corr_r)
    
    Out[12]:
    
In [13]:
    
sns.heatmap(corr_b)
    
    Out[13]:
    
In [14]:
    
pat_corr_r, pat_corr_p, pat_corr_b = pavelstatsutils.corr(df.loc[df['PATIENT'] == 1].drop(['PATIENT'], axis=1))
    
In [15]:
    
pat_corr_r
    
    Out[15]:
In [16]:
    
pat_corr_p
    
    Out[16]:
In [17]:
    
pat_corr_b
    
    Out[17]:
In [18]:
    
sns.heatmap(pat_corr_r)
    
    Out[18]:
    
In [19]:
    
sns.heatmap(pat_corr_b)
    
    Out[19]:
    
In [20]:
    
con_corr_r, con_corr_p, con_corr_b = pavelstatsutils.corr(df.loc[df['PATIENT'] == 0].drop(['PATIENT'], axis=1))
    
In [21]:
    
con_corr_r
    
    Out[21]:
In [22]:
    
con_corr_p
    
    Out[22]:
In [23]:
    
con_corr_b
    
    Out[23]:
In [24]:
    
sns.heatmap(con_corr_r)
    
    Out[24]:
    
In [25]:
    
sns.heatmap(con_corr_b)
    
    Out[25]:
    
In [26]:
    
sns.heatmap( pat_corr_b ^ con_corr_b )
    
    Out[26]:
    
In [27]:
    
f, axes = plt.subplots(1, len(df.columns)-2, figsize=(10, 10), sharex=True)
i = 0
for column in df.columns:
    if column != "PATIENT" and column != "gender":
        sns.boxplot(x="PATIENT", y=column, data=df, ax=axes[i])
        i = i + 1
#plt.setp(axes, yticks=[])
plt.tight_layout()
    
    
Shapiro: Shapiro-Wilk's test for normality.
Levene: Levene's test for homoscedasticity.
In [28]:
    
prereq = {}
prereq_p = {}
for column in df.columns:
    if column != 'PATIENT':
        prereq_p[column] = []
        prereq[column] = []
        
        #All Normality Shapiro-Wilk test
        W, p = scipy.stats.shapiro(df[column].dropna())
        normality = p > 0.05
        prereq_p[column].append(p)
        prereq[column].append(normality)
        
        #Patients Normality Shapiro-Wilk test
        W, p = scipy.stats.shapiro(df.loc[df['PATIENT'] == 1, column].dropna())
        normality = p > 0.05
        prereq_p[column].append(p)
        prereq[column].append(normality)
        #Controls Normality Shapiro-Wilk test
        W, p = scipy.stats.shapiro(df.loc[df['PATIENT'] == 0, column].dropna())
        normality = p > 0.05
        prereq_p[column].append(p)
        prereq[column].append(normality)
        #Patients & Controls Homoscedasticity
        W, p = scipy.stats.levene(df.loc[df['PATIENT'] == 1, column].dropna(), df.loc[df['PATIENT'] == 0, column].dropna())
        homoscedasticity = p > 0.05
        prereq_p[column].append(p)
        prereq[column].append(homoscedasticity)
        
prerequisities = pd.DataFrame(prereq, index=['all_normality', 'patients_normality', 'controls_normality', 'homoscedasticity'])
prerequisities_p = pd.DataFrame(prereq_p, index=['all_Shapiro', 'patients_Shapiro', 'controls_Shapiro', 'Levene'])
    
In [29]:
    
prerequisities
    
    Out[29]:
In [30]:
    
prerequisities_p
    
    Out[30]:
In [31]:
    
test = {}
for column in df.columns:
    if column != 'PATIENT':
        test[column] = []
        
        homoscedasticity = prerequisities.loc['homoscedasticity'][column]
        
        #Student's T-test
        if homoscedasticity:
            t, p = scipy.stats.ttest_ind(
                df.loc[df['PATIENT'] == 1, column].dropna(),
                df.loc[df['PATIENT'] == 0, column].dropna(),
                equal_var=homoscedasticity
            )
            test[column].append(p) #Student's T-test (prerequisities fullfilled)
            test[column].append('') #Welsh T-test
        
        #Welsh T-test
        else:
            t, p = scipy.stats.ttest_ind(
                df.loc[df['PATIENT'] == 1, column].dropna(),
                df.loc[df['PATIENT'] == 0, column].dropna(),
                equal_var=homoscedasticity
            )
            test[column].append('') #Student's T-test (prerequisities not fullfilled)
            test[column].append(p)
        
        #Mann-Whitney U-test
        u, p = scipy.stats.mannwhitneyu(
                df.loc[df['PATIENT'] == 1, column].dropna(),
                df.loc[df['PATIENT'] == 0, column].dropna()
        )
        test[column].append(p)
test = pd.DataFrame(test, index=['Student_T-test', 'Welsh_T-test', 'Mann-Whitney_U-test'])
    
In [32]:
    
test
    
    Out[32]:
In [33]:
    
print "p = {}".format( 0.05/float(len(test.columns)) )
    
    
In [34]:
    
rbd = df[df['PATIENT'] == 1]
    
In [35]:
    
rbd = rbd.drop(['PATIENT', 'SN_area', 'SN_index', '3rd_ventricle'], axis=1 )
    
In [36]:
    
rbd
    
    Out[36]:
In [37]:
    
sns.pairplot(rbd.dropna(how='any'), vars=rbd.columns, size=2)
    
    Out[37]: