reference on statsmodel ols and disagnostic plots http://mpastell.com/pweave/_downloads/linear_regression.html (visualization) http://www.statsmodels.org/dev/examples/notebooks/generated/regression_diagnostics.html (tests)
In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
alpha = pd.read_csv('../data/PD_whole_tree.txt', sep='\t')
In [3]:
alpha.tail(10)
Out[3]:
In [4]:
alpha.shape
Out[4]:
In [5]:
# look at only the highest rarefaction depth
alpha_high = alpha.loc[alpha['sequences per sample'] == 5870]
In [6]:
# take average of 10 iterations as alpha value
alpha_high = alpha_high.drop(['Unnamed: 0', 'sequences per sample', 'iteration'], axis=1)
In [7]:
alpha_high.head()
Out[7]:
In [8]:
alpha_high.shape
Out[8]:
In [9]:
alpha_avg = pd.DataFrame(alpha_high.mean(axis=0), columns=['alpha_pd'])
In [10]:
mf = pd.read_csv('../data/mapping_cleaned_MrOS.txt', sep='\t', dtype=str, index_col='#SampleID')
In [11]:
mf.head()
Out[11]:
In [12]:
table = pd.merge(mf, alpha_avg, left_index=True, right_index=True)
In [13]:
table.head()
Out[13]:
In [14]:
print(mf.shape, table.shape)
In [15]:
# check
print(alpha_avg.head())
print(table.loc[table.index=='SD8637'].alpha_pd)
print(table.loc[table.index=='PO7016'].alpha_pd)
print(table.loc[table.index=='MN1789'].alpha_pd)
print(table.loc[table.index=='MN1868'].alpha_pd)
print(table.loc[table.index=='PA3814'].alpha_pd)
In [16]:
table.to_csv('../data/mapping_PDalpha.txt', sep='\t')
In [17]:
df = table[['OHVD3', 'OHV1D3', 'OHV24D3', 'ratio_activation', 'ratio_catabolism', 'alpha_pd']]
df = df.apply(pd.to_numeric, errors='coerce') # still need to convert, as their types changed in 'table'
In [18]:
print(df.shape)
In [19]:
df.head()
Out[19]:
In [20]:
df.describe()
Out[20]:
In [21]:
df.to_csv('../data/vitamin_pd.txt', sep='\t')
In [22]:
var = df.columns.drop('alpha_pd')
In [23]:
i = 0
col_list_palette = sns.xkcd_palette(['sky blue'])
sns.set_palette(col_list_palette)
ax = sns.regplot(x=var[i], y="alpha_pd", label=var[i],data=df)
ax.set_xlabel('25(OH)2D3', fontsize=20)
ax.set_ylabel('PD alpha diversity', fontsize=20)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/PD_VD3_reg.pdf')
ax.savefig('../figures/PD_VD3_reg.png')
In [24]:
i = 1
col_list_palette = sns.xkcd_palette(['aquamarine'])
sns.set_palette(col_list_palette)
ax = sns.regplot(x=var[i], y="alpha_pd", label=var[i],data=df)
ax.set_xlabel('1,25(OH)2D3', fontsize=20)
ax.set_ylabel('PD alpha diversity', fontsize=20)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/PD_V1D3_reg.pdf')
ax.savefig('../figures/PD_V1D3_reg.png')
In [25]:
i = 2
col_list_palette = sns.xkcd_palette(['light orange'])
sns.set_palette(col_list_palette)
ax = sns.regplot(x=var[i], y="alpha_pd", label=var[i],data=df)
ax.set_xlabel('24,25(OH)2D3', fontsize=20)
ax.set_ylabel('PD alpha diversity', fontsize=20)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/PD_V24D3_reg.pdf')
ax.savefig('../figures/PD_V24D3_reg.png')
In [26]:
i = 3
col_list_palette = sns.xkcd_palette(['pale purple'])
sns.set_palette(col_list_palette)
ax = sns.regplot(x=var[i], y="alpha_pd", label=var[i],data=df)
ax.set_xlabel(var[i], fontsize=20)
ax.set_ylabel('PD alpha diversity', fontsize=20)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/PD_RatioAct_reg.pdf')
ax.savefig('../figures/PD_RatioAct_reg.png')
In [27]:
i = 4
col_list_palette = sns.xkcd_palette(['red'])
sns.set_palette(col_list_palette)
ax = sns.regplot(x=var[i], y="alpha_pd", label=var[i],data=df)
ax.set_xlabel(var[i], fontsize=20)
ax.set_ylabel('PD alpha diversity', fontsize=20)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/PD_RatioCat_reg.pdf')
ax.savefig('../figures/PD_RatioCat_reg.png')
In [28]:
# all 5 VitD measurements
sns.set(color_codes=True)
var = df.columns.drop('alpha_pd')
sns.set_palette("Set2", n_colors=len(var))
for i in range(len(var)):
ax = sns.regplot(x=var[i], y="alpha_pd", label=var[i], data=df)
ax.set(xlabel='VitaminD measurement', ylabel='PD alpha diversity')
ax.legend()
plt.xlim(-5, 120)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/PD_5VitD_reg.pdf')
ax.savefig('../figures/PD_5VitD_reg.png')
In [29]:
out = []
for i in range(len(var)):
tmp = df[['alpha_pd', var[i]]].dropna(axis=0, how='any')
y = tmp['alpha_pd']
X = tmp[var[i]]
results = smf.OLS(y, sm.add_constant(X)).fit()
#print(results.summary())
# normality test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
normtest = lzip(name, test)[1][1]
# condition number
cn = np.linalg.cond(results.model.exog)
# heteroskedasticity tests (null: the residual variance does not depend on the variables in x)
name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
heter = lzip(name, test)[1][1]
# linearity test (null: is linear)
name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
linear = lzip(name, test)[1][1]
out.append(['alpha_pd', var[i], results.params[1], results.pvalues[1],
results.rsquared_adj, normtest, cn, heter, linear])
out = pd.DataFrame(out, columns=['y', 'X', 'slope', 'pvalue', 'adjusted R-square',
'norm test P-val', 'condition number', 'hetero test P-val', 'linear P-val'])
out
# fine with non-normal residuals
# reference: https://stats.stackexchange.com/questions/29731/regression-when-the-ols-residuals-are-not-normally-distributed
Out[29]:
This is not a concern, see here (https://stats.stackexchange.com/questions/75054/how-do-i-perform-a-regression-on-non-normal-data-which-remain-non-normal-when-tr)
In [30]:
# plot the data and fit
plt.plot(X, y, 'ro')
plt.plot(X, results.fittedvalues, 'b')
plt.xlabel(var[i])
plt.ylabel('PD Alpha')
Out[30]:
In [31]:
# histogram of normalized residuals
plt.hist(results.resid_pearson)
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
Out[31]:
In [32]:
# normality of residual tests
# Jarque-Bera test
name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(results.resid)
print(lzip(name, test))
# Omni test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
print(lzip(name, test))
In [33]:
# cooks distance
influence = results.get_influence()
(c, p) = influence.cooks_distance # c is the distance and p is p-value
plt.stem(np.arange(len(c)), c, markerfmt=',')
Out[33]:
In [34]:
# influence test
from statsmodels.stats.outliers_influence import OLSInfluence
test_class = OLSInfluence(results)
test_class.dfbetas[:5,:]
Out[34]:
In [35]:
# residuals against leverage
from statsmodels.graphics.regressionplots import plot_leverage_resid2
fig, ax = plt.subplots(figsize=(8,6))
fig = plot_leverage_resid2(results, ax = ax)
In [36]:
# multicolinearity
np.linalg.cond(results.model.exog) # condition number
Out[36]:
In [37]:
# heteroskedasticity tests (whether residuals have unequal variance)
# Breush-Pagan test
name = ['Lagrange multiplier statistic', 'p-value'
'f-value', 'f p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
print(lzip(name, test))
# Goldfeld-Quand test
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(results.resid, results.model.exog)
print(lzip(name, test))
In [38]:
# linearity test
name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
lzip(name, test)
Out[38]:
In [ ]: