We know that cell composition is a key confounder when looking at changes in the methylome and how they relate to HIV infected patients. It is well understood that individuals with HIV have lower CD4 counts, for this reason we have pretty comprehesive blood work for these cases. Unfortunately we do not have the same blood work for the controls. For this reason we are using the estimateCellCounts function in the R minfi package to get cell composition for all of our samples.
The estimateCellCounts function uses differentially methylated probes across different flow-sorted cell populations to estimate blood composition via mixture modeling. For this we need arrays measuring methylation of each cell population we want to estimate. While we do not have the exact same cell-types reported in our blood composition report, we can benchmark this method against those that we do have.
In [1]:
import os
if os.getcwd().endswith('Benchmarks'):
os.chdir('..')
Imports and helper functions from Imports notebook.
In [2]:
import NotebookImport
from HIV_Age_Advancement import *
In [3]:
labs['CD4 Absolute'].hist()
Out[3]:
Looking for changes in cell composition with HIV
In [4]:
duration = duration.ix[pred_c.index]
In [5]:
screen_feature(duration=='Control', kruskal_pandas, cell_counts.T, align=False)
Out[5]:
In [6]:
fig, axs = subplots(2,1, figsize=(6,9), sharex=True)
o = ['Control','HIV Short','HIV Long']
violin_plot_pandas(duration, cell_counts.CD4T, order=o, ax=axs[0])
violin_plot_pandas(duration, cell_counts.CD8T, order=o, ax=axs[1])
for ax in axs:
prettify_ax(ax)
ax.set_xlabel('')
axs[0].set_ylabel('CD4T Cell Percentage')
axs[1].set_ylabel('CD8T Cell Percentage')
fig.tight_layout()
fig.savefig(FIGDIR + 'tcells.png', dpi=300)
In [7]:
((cell_counts.CD8T + .01) / (cell_counts.CD4T + .01)).groupby(duration=='Control').mean()
Out[7]:
In [8]:
violin_plot_pandas(duration, np.log2((cell_counts.CD4T + .01) / (cell_counts.CD8T + .01)),
order=['Control','HIV Short','HIV Long'])
In [9]:
cell_types = ['Neutrophil %','Lymphocyte %','Monocyte %','Eosinophil %','Basophil %']
labs[cell_types].sum(1).order()
Out[9]:
In [10]:
keepers = labs.index.difference(['RG065','RG175','RG279','RA182','RM285'])
keepers = keepers.intersection(duration.index)
l3 = labs[cell_types].div(labs[cell_types].sum(1), axis=0) * 100
l3 = l3.ix[keepers]
In [11]:
reg_cd4 = linear_regression(labs['CD4 Absolute'].ix[keepers],
cell_counts.CD4T)
reg_cd8 = linear_regression(labs['CD8 Absolute'].ix[keepers],
cell_counts.CD8T)
In [12]:
import seaborn as sns
sns.set_context("paper", font_scale=1.5, rc={"lines.linewidth": 2.5})
sns.set_style("white")
In [13]:
fig, axs2 = subplots(2,3, figsize=(12,8))
axs = axs2[0]
ll = {'color':'black','alpha':.7, 'ls':'-'}
plot_regression((l3['Neutrophil %'] + l3['Eosinophil %'] + l3['Basophil %']).dropna(),
cell_counts.Gran.ix[keepers] * 100, ax=axs[0],
line_args=[ll,{'alpha':0}],
s=30, color='grey', alpha=1, edgecolor='black')
axs[0].set_xlabel('Estimated granulocytes (%)')
axs[0].set_ylabel('Measured granulocytes (%)');
plot_regression(l3['Lymphocyte %'].ix[keepers],
cell_counts[['Bcell','NK','CD4T','CD8T']].sum(1) * 100,
ax=axs[1], line_args=[ll,{'alpha':0}],
s=30, color='grey', alpha=1, edgecolor='black')
axs[1].set_xlabel('Estimated lymphocytes (%)')
axs[1].set_ylabel('Measured lymphocytes (%)');
plot_regression(l3['Monocyte %'].ix[keepers], cell_counts['Mono'] * 100,
ax=axs[2], line_args=[ll,{'alpha':0}],
s=30, color='grey', alpha=1, edgecolor='black')
axs[2].set_xlabel('Estimated monocytes (%)')
axs[2].set_ylabel('Measured monocytes (%)')
axs = axs2[1]
plot_regression((labs['CD4 %'].ix[keepers] * l3['Lymphocyte %']) / 100.,
cell_counts.CD4T * 100,
ax=axs[0], line_args=[ll,{'alpha':0}],
s=30, color='grey', alpha=1, edgecolor='black')
axs[0].set_xlabel('Estimated CD4 T cells (%)')
axs[0].set_ylabel('Measured CD4 T cells (%)')
plot_regression((labs['CD8 %'].ix[keepers] * l3['Lymphocyte %']) / 100.,
cell_counts.CD8T * 100,
ax=axs[1], line_args=[ll,{'alpha':0}],
s=30, color='grey', alpha=1, edgecolor='black')
axs[1].set_xlabel('Estimated CD8 T cells (%)')
axs[1].set_ylabel('Measured CD8 T cells (%)')
plot_regression(np.log2(labs['CD4/CD8 ratio'].ix[keepers]),
np.log2(((cell_counts.CD4T + .01) / (cell_counts.CD8T + .01))),
ax=axs[2], line_args=[ll,{'alpha':0}],
s=30, color='grey', alpha=1, edgecolor='black')
axs[2].set_ylabel('Log2 CD4/CD8 (estimated)')
axs[2].set_xlabel('Log2 CD4/CD8 (measured)')
axs[2].set_xbound(-5.5,3)
axs[2].set_ybound(-7.5,3)
letters = list(map(chr, range(97, 123)))[:6]
for i,ax in enumerate(axs2.flatten()):
ax.text(-0.1, 1.1, letters[i], transform=ax.transAxes,
fontsize=20, fontweight='bold', va='top', ha='right')
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'figS1.pdf', transparent=False)
In [14]:
c1 = (labs['CD4 %'].ix[keepers] * l3['Lymphocyte %']) / 100.
c2 = cell_counts.CD4T * 100
In [15]:
fig, ax = subplots(figsize=(2.5,2.5))
series_scatter((c1).ix[ti(c2 < 2)], cell_counts.NK * 100, ax=ax)
prettify_ax(ax)
ax.set_xlabel('CD4 (measured)')
ax.set_ylabel('NK (estimated)')
fig.tight_layout()
fig.savefig(FIGDIR + 'inset.png', dpi=300)
Real adjustment is done in BMIQ_Normalization notebook.
In [16]:
betas = pd.read_hdf(HDFS_DIR + 'dx_methylation.h5', 'betas')
betas = betas['s2'].ix[:, duration.index]
betas = betas.groupby(level=0).first()
In [17]:
flow_sorted_data = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5','flow_sorted_data')
cell_type = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5', 'label_map')
In [18]:
n2 = flow_sorted_data.groupby(cell_type, axis=1).median()
avg = n2[cell_counts.columns].dot(cell_counts.T)
In [19]:
b1 = logit_adj(betas).T.corrwith(cell_counts.CD8T)
b2 = logit_adj(df_hiv).ix[:, betas.columns].T.corrwith(cell_counts.CD8T)
In [20]:
ss = avg.std(1)
In [21]:
b1 = b1.groupby(level=0).first()
b2 = b2.groupby(level=0).first()
In [22]:
idx = b1.abs() > .01
(b1[idx].abs() < b2[idx].abs()).value_counts()
Out[22]:
In [23]:
from scipy.stats import binom_test
In [24]:
binom_test((b1.abs() < b2.abs()).value_counts())
Out[24]:
In [25]:
fig, axs = subplots(1,2, figsize=(9,3))
ss.clip_upper(.05).hist(bins=30, ax=axs[0])
axs[0].set_xticks([0,.01,.02,.03,.04,.05])
axs[0].set_xticklabels([0,.01,.02,.03,.04,'0.05+'])
axs[0].set_xlabel('adjustment magnitude')
axs[0].set_ylabel('# of patients')
axs[1].hexbin(*match_series(b1.abs(), b2.abs()), gridsize=50,
bins='log')
axs[1].set_aspect(1)
axs[1].plot([0,.9],[0,.9], color='grey', lw=4, ls='--', dash_capstyle='round')
axs[1].set_xlabel('CD8T coorelation \n(pre-adjustment)')
axs[1].set_ylabel('CD8T coorelation \n(post-adjustment)')
for ax in axs:
prettify_ax(ax)
fig.tight_layout()
In [26]:
flow_sorted_data.T.groupby(cell_type == 'CD8T').mean().diff().ix[1].order().ix[500:].head(10)
Out[26]:
In [27]:
p = 'cg19163395'
In [28]:
diff = avg.ix[p].ix[duration.index] - avg.ix[p].median()
diff.hist()
Out[28]:
In [29]:
series_scatter(diff, cell_counts.CD8T)
In [30]:
fig, axs = subplots(1,2, figsize=(9,3))
cell_counts.CD8T.ix[duration.index].hist(color='grey', ax=axs[0])
axs[0].set_xlabel('% CD8T (estimated)')
axs[0].set_xticks([0,.1,.2,.3])
axs[0].set_ylabel('# of patients')
box_plot_pandas(cell_type[cell_type.isin(cell_counts.columns)],
flow_sorted_data.ix[p], ax=axs[1],
order=['CD8T','NK','Bcell','CD4T','Gran','Mono'])
axs[0].set_xlim(-.02,.4)
for ax in axs:
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'Adjust_ab.pdf')
In [31]:
fig, axs = subplots(1,2, figsize=(9,3), sharey=True)
series_scatter(cell_counts.CD8T, betas.ix[p].ix[diff.index],
ax=axs[0], color=colors[1], alpha=.7, s=15)
series_scatter(cell_counts.CD8T, betas.ix[p].ix[diff.index] - diff,
ax=axs[1], color=colors[1], alpha=.7, s=15)
for ax in axs:
prettify_ax(ax)
ax.set_xlabel('% CD8T (estimated)')
ax.set_xticks([0,.1,.2,.3])
ax.set_xlim(-.02,.4)
fig.tight_layout()
fig.savefig(FIGDIR + 'Adjust_cd.pdf')
In [32]:
fig, axs = subplots(1,2, figsize=(9,4))
ss.clip_upper(.05).hist(bins=30, ax=axs[0])
axs[0].set_xticks([0,.01,.02,.03,.04,.05])
axs[0].set_xticklabels([0,.01,.02,.03,.04,'0.05+'])
axs[0].set_xlabel('adjustment magnitude')
axs[0].set_ylabel('# of probes')
axs[1].hexbin(*match_series(b1.abs(), b2.abs()), gridsize=50,
bins='log')
axs[1].set_aspect(1)
axs[1].plot([0,.9],[0,.9], color='grey', lw=4, ls='--', dash_capstyle='round')
axs[1].set_xlabel('CD8T coorelation \n(pre-adjustment)')
axs[1].set_ylabel('CD8T coorelation \n(post-adjustment)')
for ax in axs:
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'Adjust_ef.pdf')
In [33]:
fig, axs = subplots(1,2, figsize=(4,3), sharey=True)
hiv = (duration=='Control').map({True:'HIV+',False:'HIV-'})
hiv.name = ''
box_plot_pandas(hiv, betas.ix[p], ax=axs[0])
box_plot_pandas(hiv, (betas - diff).ix[p], ax=axs[1])
for ax in axs:
prettify_ax(ax)
ax.set_xticklabels(['HIV-','HIV+'])
fig.tight_layout()
In [34]:
from Benchmarks.Model_Comparison_MF import *
In [35]:
pts = mc_adj_c.index
rr = screen_feature(age.ix[pts], pearson_pandas, cell_counts.T, align=False)
rr
Out[35]:
In [36]:
pts = mc_adj_c.index
rr = screen_feature(mc_adj_c.ix[pts] - age, pearson_pandas, cell_counts.T, align=False)
rr
Out[36]:
In [37]:
fig, ax = subplots(figsize=(5,3))
rr.rho.plot(kind='bar', ax=ax)
ax.set_ylabel('coorelation with age')
ax.set_xlim(-.5, 5.5)
prettify_ax(ax)
In [38]:
fig, ax = subplots(figsize=(5,3))
rr.rho.plot(kind='bar', ax=ax)
ax.set_ylabel('coorelation with age')
ax.set_xlim(-.5, 5.5)
prettify_ax(ax)