In [1]:
cd ..
In [2]:
import NotebookImport
from Parallel.Age_HIV_Features import *
It would be useful to know how many of the CpGs associated with age (26,927), and HIV (81,361) are also strongly differentially methylated across cell-types. One could for example, plot a cell-type association p-value histogram for these sites using p-values from this Jaffe and Irizarry table: http://www.genomebiology.com/2014/15/2/R31/suppl/S3 If it is highly enriched for small p-values, I would be cautious about claiming that the cell type effects have been fully corrected for as the authors believe.
In [3]:
dm_cell = pd.read_csv('./data/Jaffee_Supplementary_Table_2.csv', index_col=0,
low_memory=False, skiprows=[0])
In [4]:
g_hiv.value_counts()
Out[4]:
In [5]:
vv = features['HIV (BH)']
r_hiv = pd.DataFrame({c: fisher_exact_test(vv, dm_cell['p.value'] < c)
for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_hiv.index.name = 'p-value cutoff'
count = pd.Series({c: sum(dm_cell['p.value'].ix[vv.index] < c)
for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_hiv['count'] = count
r_hiv = r_hiv[['count','odds_ratio','p']]
In [6]:
r_hiv
Out[6]:
In [7]:
r_hiv = pd.DataFrame({c: fisher_exact_test(g_hiv, dm_cell['p.value'] < c)
for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_hiv.index.name = 'p-value cutoff'
count = pd.Series({c: sum(dm_cell['p.value'].ix[g_hiv.index] < c)
for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_hiv['count'] = count
r_hiv = r_hiv[['count','odds_ratio','p']]
In [8]:
r_hiv
Out[8]:
In [9]:
fc = ((dm_cell.CD4T_mean - dm_cell.CD8T_mean).abs() /
.5*(dm_cell.CD4T_mean + dm_cell.CD8T_mean))
r_hiv_fc = pd.DataFrame({p: fisher_exact_test(g_hiv, fc > c)
for p,c in fc.quantile([.05,.1,.25,.5,.75,.9,.95]).iteritems()}).T
r_hiv_fc
Out[9]:
In [10]:
g_age.value_counts()
Out[10]:
In [11]:
r_age = pd.DataFrame({c: fisher_exact_test(g_age, dm_cell['p.value'] < c)
for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_age.index.name = 'p-value cutoff'
count = pd.Series({c: sum(dm_cell['p.value'].ix[g_age.index] < c)
for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_age['count'] = count
r_age = r_age[['count','odds_ratio','p']]
In [12]:
r_age
Out[12]:
In [13]:
count
Out[13]:
In [14]:
fc = ((dm_cell.CD4T_mean - dm_cell.CD8T_mean).abs() /
.5*(dm_cell.CD4T_mean + dm_cell.CD8T_mean))
r_age_fc = pd.DataFrame({p: fisher_exact_test(g_age, fc > c)
for p,c in fc.quantile([.05,.1,.25,.5,.75,.9,.95]).iteritems()}).T
count = pd.Series({p: sum(fc > c)
for p,c in fc.quantile([.05,.1,.25,.5,.75,.9,.95]).iteritems()})
r_age_fc['count'] = count
r_age_fc = r_age_fc[['count','odds_ratio','p']]
r_age_fc
Out[14]:
It would be reassuring if the link between the HIV infection and aging signatures (from "Unsupervised analysis shows shared phenotypes of HIV and age") still hold when analysis is restricted to CpGs with no cell-type dependency (e.g. cell type composition p-value>0.1).
In [15]:
df_hiv2 = df_hiv.ix[:, duration.index]
df_hiv2 = df_hiv2.dropna(1, how='all')
dd = logit_adj(df_hiv2)
m = dd.mean(1)
s = dd.std(1)
df_norm = dd.subtract(m, axis=0).divide(s, axis=0)
In [16]:
idx = ti(dm_cell['p.value'] > .1).intersection(ti(g_age))
len(idx)
Out[16]:
In [17]:
U,S,vH = frame_svd(df_norm.ix[probe_idx].ix[idx])
p = S ** 2 / sum(S ** 2)
p[:5]
Out[17]:
In [18]:
fig, ax = subplots(1,1, figsize=(4,3))
rr = 1*vH[0]
k = pred_c.index
hiv = duration != 'Control'
age = age
sns.regplot(*match_series(age.ix[k], rr.ix[ti(hiv==0)]),
ax=ax, label='HIV+', ci=None)
sns.regplot(*match_series(age.ix[k], rr.ix[ti(hiv>0)]),
ax=ax, label='Control', ci=None)
ax.set_ylabel('First PC of validated markers', size=12)
ax.set_xlabel('Chronological age (years)', size=14)
ax.set_yticks([0])
ax.axhline(0, ls='--', lw=2.5, color='grey', zorder=-1)
ax.set_xbound(23,70)
ax.set_ybound(-.25,.28)
prettify_ax(ax)
fig.tight_layout()
In [141]:
import statsmodels.api as sm
In [142]:
y = vH[0]
intercept = pd.Series(1, index=y.index)
X = pd.concat([intercept, age, hiv], axis=1, keys=['Intercept', 'age', 'HIV'])
X = X.dropna().ix[y.index]
m1 = sm.OLS(y, X).fit()
X = pd.concat([intercept, age, hiv, hiv*age], axis=1, keys=['Intercept', 'age',
'HIV', 'int'])
X = X.dropna().ix[y.index]
m2 = sm.OLS(y, X).fit()
m2.compare_lr_test(m1)
Out[142]:
In [143]:
m1.pvalues
Out[143]:
In [144]:
m2.pvalues
Out[144]:
In Figure 1A how many sites identified from EPIC as age associated were not identified in Hannum et al? This data would be more informative if represented as a Venn diagram to show the overlapping 26,927 sites between the two analyses.
In [202]:
rr1 = bhCorrection(p_vals.in_set_s1) < .01
rr1.value_counts()
Out[202]:
In [203]:
rr2 = bhCorrection(p_vals.in_set_s3) < .01
rr2.value_counts()
Out[203]:
In [204]:
fisher_exact_test(rr1, rr2)
Out[204]:
In [206]:
rr1.name = 'Hannum'
rr2.name = 'EPIC'
venn_pandas(rr1, rr2)
Out[206]:
It seems as though the odds ratios for HIV and age are mostly opposite except in CpG islands and gene bodies. Wouldn't this argue for the effects of HIV and aging being separate except for at these two locations?
In [208]:
fisher_exact_test(g_age, g_hiv)
Out[208]:
In [211]:
import Setup.DX_Imports as dx
In [215]:
by_annotation = pd.DataFrame({k: fisher_exact_test(g_age, g_hiv.ix[ti(v)])
for k,v in dx.probe_sets.iteritems()}).T
In [217]:
by_annotation.sort('odds_ratio')
Out[217]:
In [365]:
a2 = age[(age > 25) & (age < 69)]
d = pd.DataFrame({s: {c: pearson_pandas(v.ix[ti(study==s)], a2).rho
for c,v in cell_counts.iteritems()}
for s in ['s1','s3','HIV Control','HIV Long','HIV Short']})
d
Out[365]:
In [358]:
a2 = age[(age > 25) & (age < 69)]
idx = ti(study.isin(['s1','HIV Control','s3']))
d = pd.Series({c: pearson_pandas(v.ix[idx], a2).rho
for c,v in cell_counts.iteritems()})
d
Out[358]:
In [362]:
fig, axs = subplots(1,3, figsize=(12,4))
series_scatter(a2, cell_counts.CD8T.ix[idx], s=20, ax=axs[0], ann=None)
axs[0].annotate('$r={0:.2f}$'.format(d['CD8T']), (50, -.04), size=18)
series_scatter(a2, cell_counts.CD4T.ix[idx], s=20, ax=axs[1], ann=None)
axs[1].annotate('$r={0:.2f}$'.format(d['CD4T']), (50, -.04), size=18)
ratio = np.log((cell_counts.CD4T + .01) / (cell_counts.CD8T + .01))
ratio.name = 'log ratio'
series_scatter(a2.ix[idx], ratio.clip(-5,5), s=20, ax=axs[2], ann=None)
axs[2].annotate('$r={0:.2f}$'.format(pearson_pandas(ratio, a2).rho),
(50, -2.8), size=18)
for ax in axs:
prettify_ax(ax)
ax.set_xbound(25,70)
fig.tight_layout()