In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import scipy.stats
import redcaputils
import statsmodels.formula.api as smf
%matplotlib inline
pd.set_option('display.max_columns', None)
In [2]:
# Load the two cohorts and stack them row-wise into a single frame.
# Each file's own row labels are kept (no ignore_index), so index values
# may repeat in `df`.
patients, controls = (
    pd.read_csv(csv_path) for csv_path in ("patients.csv", "controls.csv")
)
df = pd.concat([patients, controls])
In [3]:
# Pairwise Pearson correlation of every column in `variables` against every
# other one. Produces three square DataFrames sharing the same labels:
#   corr_r - correlation coefficient r
#   corr_p - two-sided p-value
#   corr_b - True where the p-value survives the Bonferroni correction
# NOTE(review): pearsonr needs numeric input, so this assumes every column in
# `variables` is numeric - confirm against the CSV schema.
alpha = 0.05
variables = df.columns  # or restrict to a subset of columns
variablesNo = len(variables)

# Number of tests actually performed: the matrix is symmetric and the diagonal
# (a variable against itself) is not a test, leaving n * (n - 1) / 2 pairs.
comparisonsNo = variablesNo * (variablesNo - 1) // 2
# With a single column there is nothing to correct for; avoid dividing by zero.
Bonferoni = alpha / float(comparisonsNo) if comparisonsNo else alpha

corr_r = {}
corr_p = {}
corr_b = {}  # survives Bonferroni
for variable in variables:
    corr_r[variable] = []
    corr_p[variable] = []
    corr_b[variable] = []
    # Iterate over the same `variables` used for the outer loop and the index,
    # so restricting `variables` to a subset keeps the matrices square.
    for column in variables:
        if variable == column:
            # Diagonal: fixed values, no test is run.
            r, p, survives = 1, 0, True
        else:
            # Pairwise deletion: keep only rows where both columns are present.
            corrdf = df[[variable, column]].dropna(how='any')
            if len(corrdf) < 2:
                # pearsonr raises on fewer than two observations (e.g. two
                # columns that never co-occur); record the pair as undefined.
                r, p = np.nan, np.nan
            else:
                r, p = scipy.stats.pearsonr(corrdf[variable], corrdf[column])
            # A NaN p-value compares False, i.e. it does not survive.
            survives = bool(p < Bonferoni)
        corr_r[variable].append(r)
        corr_p[variable].append(p)
        corr_b[variable].append(survives)

corr_r = pd.DataFrame(corr_r, index=variables)
corr_p = pd.DataFrame(corr_p, index=variables)
corr_b = pd.DataFrame(corr_b, index=variables)
In [4]:
# Correlation coefficients; rows and columns are both sorted by label so the
# two axes share one order and the diagonal stays on the diagonal.
corr_r.sort_index(axis=0).sort_index(axis=1)
Out[4]:
In [5]:
# p-values; rows and columns are both sorted by label so the two axes share
# one order and the diagonal stays on the diagonal.
corr_p.sort_index(axis=0).sort_index(axis=1)
Out[5]:
In [6]:
# Heatmap of the correlation coefficients. Both axes are sorted by label so
# rows and columns appear in the same order and the plot is symmetric.
sns.heatmap(corr_r.sort_index(axis=0).sort_index(axis=1))
Out[6]:
In [7]:
# Bonferroni survival flags; rows and columns are both sorted by label so the
# two axes share one order and the diagonal stays on the diagonal.
corr_b.sort_index(axis=0).sort_index(axis=1)
Out[7]:
In [8]:
# Heatmap of the Bonferroni survival flags. Both axes are sorted by label so
# rows and columns appear in the same order. The boolean mask is cast to 0/1
# so plotting does not depend on how the installed seaborn handles bool input.
sns.heatmap(corr_b.sort_index(axis=0).sort_index(axis=1).astype(int))
Out[8]: