In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import pavelstatsutils
%matplotlib inline
In [2]:
patients = pd.read_csv("patients.csv")
controls = pd.read_csv("controls.csv")
In [3]:
df = pd.concat([patients, controls])
df.head()
Out[3]:
In [4]:
df.describe()
Out[4]:
In [5]:
df.loc[df['PATIENT'] == 1].describe()
Out[5]:
In [6]:
df.loc[df['PATIENT'] == 0].describe()
Out[6]:
In [7]:
sns.pairplot(df.dropna(how='any'), vars=[x for x in df.columns if x!='PATIENT'], hue="PATIENT", size=2)
Out[7]:
In [8]:
corr_r, corr_p, corr_b = pavelstatsutils.corr(df)
In [9]:
corr_r
Out[9]:
In [10]:
corr_p
Out[10]:
In [11]:
corr_b
Out[11]:
In [12]:
sns.heatmap(corr_r)
Out[12]:
In [13]:
sns.heatmap(corr_b)
Out[13]:
In [14]:
pat_corr_r, pat_corr_p, pat_corr_b = pavelstatsutils.corr(df.loc[df['PATIENT'] == 1].drop(['PATIENT'], axis=1))
In [15]:
pat_corr_r
Out[15]:
In [16]:
pat_corr_p
Out[16]:
In [17]:
pat_corr_b
Out[17]:
In [18]:
sns.heatmap(pat_corr_r)
Out[18]:
In [19]:
sns.heatmap(pat_corr_b)
Out[19]:
In [20]:
con_corr_r, con_corr_p, con_corr_b = pavelstatsutils.corr(df.loc[df['PATIENT'] == 0].drop(['PATIENT'], axis=1))
In [21]:
con_corr_r
Out[21]:
In [22]:
con_corr_p
Out[22]:
In [23]:
con_corr_b
Out[23]:
In [24]:
sns.heatmap(con_corr_r)
Out[24]:
In [25]:
sns.heatmap(con_corr_b)
Out[25]:
In [26]:
sns.heatmap( pat_corr_b ^ con_corr_b )
Out[26]:
In [27]:
f, axes = plt.subplots(1, len(df.columns)-2, figsize=(10, 10), sharex=True)
i = 0
for column in df.columns:
if column != "PATIENT" and column != "gender":
sns.boxplot(x="PATIENT", y=column, data=df, ax=axes[i])
i = i + 1
#plt.setp(axes, yticks=[])
plt.tight_layout()
Shapiro: Shapiro-Wilk's test for normality.
Levene: Levene's test for homoscedasticity.
In [28]:
prereq = {}
prereq_p = {}
for column in df.columns:
if column != 'PATIENT':
prereq_p[column] = []
prereq[column] = []
#All Normality Shapiro-Wilk test
W, p = scipy.stats.shapiro(df[column].dropna())
normality = p > 0.05
prereq_p[column].append(p)
prereq[column].append(normality)
#Patients Normality Shapiro-Wilk test
W, p = scipy.stats.shapiro(df.loc[df['PATIENT'] == 1, column].dropna())
normality = p > 0.05
prereq_p[column].append(p)
prereq[column].append(normality)
#Controls Normality Shapiro-Wilk test
W, p = scipy.stats.shapiro(df.loc[df['PATIENT'] == 0, column].dropna())
normality = p > 0.05
prereq_p[column].append(p)
prereq[column].append(normality)
#Patients & Controls Homoscedasticity
W, p = scipy.stats.levene(df.loc[df['PATIENT'] == 1, column].dropna(), df.loc[df['PATIENT'] == 0, column].dropna())
homoscedasticity = p > 0.05
prereq_p[column].append(p)
prereq[column].append(homoscedasticity)
prerequisities = pd.DataFrame(prereq, index=['all_normality', 'patients_normality', 'controls_normality', 'homoscedasticity'])
prerequisities_p = pd.DataFrame(prereq_p, index=['all_Shapiro', 'patients_Shapiro', 'controls_Shapiro', 'Levene'])
In [29]:
prerequisities
Out[29]:
In [30]:
prerequisities_p
Out[30]:
In [31]:
test = {}
for column in df.columns:
if column != 'PATIENT':
test[column] = []
homoscedasticity = prerequisities.loc['homoscedasticity'][column]
#Student's T-test
if homoscedasticity:
t, p = scipy.stats.ttest_ind(
df.loc[df['PATIENT'] == 1, column].dropna(),
df.loc[df['PATIENT'] == 0, column].dropna(),
equal_var=homoscedasticity
)
test[column].append(p) #Student's T-test (prerequisities fullfilled)
test[column].append('') #Welsh T-test
#Welsh T-test
else:
t, p = scipy.stats.ttest_ind(
df.loc[df['PATIENT'] == 1, column].dropna(),
df.loc[df['PATIENT'] == 0, column].dropna(),
equal_var=homoscedasticity
)
test[column].append('') #Student's T-test (prerequisities not fullfilled)
test[column].append(p)
#Mann-Whitney U-test
u, p = scipy.stats.mannwhitneyu(
df.loc[df['PATIENT'] == 1, column].dropna(),
df.loc[df['PATIENT'] == 0, column].dropna()
)
test[column].append(p)
test = pd.DataFrame(test, index=['Student_T-test', 'Welsh_T-test', 'Mann-Whitney_U-test'])
In [32]:
test
Out[32]:
In [33]:
print "p = {}".format( 0.05/float(len(test.columns)) )
In [34]:
rbd = df[df['PATIENT'] == 1]
In [35]:
rbd = rbd.drop(['PATIENT', 'SN_area', 'SN_index', '3rd_ventricle'], axis=1 )
In [36]:
rbd
Out[36]:
In [37]:
sns.pairplot(rbd.dropna(how='any'), vars=rbd.columns, size=2)
Out[37]: