In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
import statsmodels.api as sm
In [8]:
table = pd.read_csv('../data/genus.txt', sep='\t', index_col=0)
In [9]:
table.head()
Out[9]:
In [10]:
# log transformation
dat = np.log(table + 0.000001)
In [11]:
dat.head()
Out[11]:
In [ ]:
In [14]:
var = table.columns[0:15]
print(var)
In [15]:
vars_vd = np.array(['OHVD3', 'OHV1D3', 'OHV24D3', 'ratio_activation', 'ratio_catabolism'])
In [16]:
# check assumptions
df = dat.copy()
out = []
k = 0
for i in range(len(var)):
tmp = df[[var[i], vars_vd[k]]].dropna(axis=0, how='any')
y = tmp[vars_vd[k]]
X = tmp[var[i]]
results = smf.OLS(y, sm.add_constant(X)).fit()
# normality test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
normtest = lzip(name, test)[1][1]
# condition number
cn = np.linalg.cond(results.model.exog)
# heteroskedasticity tests (null: the residual variance does not depend on the variables in x)
name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
heter = lzip(name, test)[1][1]
# linearity test (null: is linear)
# name = ['t value', 'p value']
# test = sms.linear_harvey_collier(results)
# linear = lzip(name, test)[1][1]
out.append([vars_vd[k], var[i], results.params[1], results.pvalues[1],
results.rsquared_adj, normtest, cn, heter])
out = pd.DataFrame(out, columns=['y', 'X', 'slope', 'pvalue', 'adjusted R-square',
'norm test P-val', 'condition number', 'hetero test P-val'])
out_OHVD3 = out.loc[out.pvalue <= 0.05]
out_OHVD3
Out[16]:
In [22]:
k = 0
bact = out_OHVD3.X.values
sns.set(color_codes=True)
sns.set_palette("husl", n_colors=len(bact))
for i in range(len(bact)):
ax = sns.regplot(x=vars_vd[k], y=bact[i],
label=bact[i], data=dat)
ax.set(xlabel='25(OH)D3 (ng/ml)', ylabel='Relative abundance')
ax.legend()
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax = ax.get_figure()
ax.tight_layout()
In [23]:
## OHV1D3
out = []
k = 1
for i in range(len(var)):
tmp = df[[var[i], vars_vd[k]]].dropna(axis=0, how='any')
y = tmp[vars_vd[k]]
X = tmp[var[i]]
results = smf.OLS(y, sm.add_constant(X)).fit()
# normality test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
normtest = lzip(name, test)[1][1]
# condition number
cn = np.linalg.cond(results.model.exog)
# heteroskedasticity tests (null: the residual variance does not depend on the variables in x)
name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
heter = lzip(name, test)[1][1]
# linearity test (null: is linear)
# name = ['t value', 'p value']
# test = sms.linear_harvey_collier(results)
# linear = lzip(name, test)[1][1]
out.append([vars_vd[k], var[i], results.params[1], results.pvalues[1],
results.rsquared_adj, normtest, cn, heter])
out = pd.DataFrame(out, columns=['y', 'X', 'slope', 'pvalue', 'adjusted R-square',
'norm test P-val', 'condition number', 'hetero test P-val'])
out_OHV1D3 = out.loc[out.pvalue <= 0.05]
out_OHV1D3
Out[23]:
In [25]:
k = 1
bact = out_OHV1D3.X.values
sns.set(color_codes=True)
sns.set_palette("husl", n_colors=len(bact))
for i in range(len(bact)):
ax = sns.regplot(x=vars_vd[k], y=bact[i],
label=bact[i], data=dat)
ax.set(xlabel='1,25(OH)2D3 (ng/ml)', ylabel='Relative abundance')
ax.legend()
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax = ax.get_figure()
ax.tight_layout()
In [26]:
# OHV24D3
out = []
k = 2
for i in range(len(var)):
tmp = df[[var[i], vars_vd[k]]].dropna(axis=0, how='any')
y = tmp[vars_vd[k]]
X = tmp[var[i]]
results = smf.OLS(y, sm.add_constant(X)).fit()
# normality test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
normtest = lzip(name, test)[1][1]
# condition number
cn = np.linalg.cond(results.model.exog)
# heteroskedasticity tests (null: the residual variance does not depend on the variables in x)
name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
heter = lzip(name, test)[1][1]
# linearity test (null: is linear)
# name = ['t value', 'p value']
# test = sms.linear_harvey_collier(results)
# linear = lzip(name, test)[1][1]
out.append([vars_vd[k], var[i], results.params[1], results.pvalues[1],
results.rsquared_adj, normtest, cn, heter])
out = pd.DataFrame(out, columns=['y', 'X', 'slope', 'pvalue', 'adjusted R-square',
'norm test P-val', 'condition number', 'hetero test P-val'])
out_OHV24D3 = out.loc[out.pvalue <= 0.05]
out_OHV24D3
Out[26]:
In [27]:
k = 2
bact = out_OHV24D3.X.values
sns.set(color_codes=True)
sns.set_palette("husl", n_colors=len(bact))
for i in range(len(bact)):
ax = sns.regplot(x=vars_vd[k], y=bact[i],
label=bact[i], data=table)
ax.set(xlabel='24,25(OH)2D3 (ng/ml)', ylabel='Relative abundance')
ax.legend()
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax = ax.get_figure()
ax.tight_layout()
In [ ]: