We know that there is a considerable amount of confounding between our HIV-associated aging signal and the patient's cellular composition. Patients with HIV inherently have different cellular compositions, including lower CD4 T-cell counts and higher proportions of other cell types. In addition, we know that in the normal aging process the composition of blood changes over time. It is very hard to determine whether apparent age advancement is due to age-associated blood composition changes that happen as a direct consequence of HIV infection, or whether HIV infection causes accelerated aging, which in turn results in an adjustment of the blood makeup.
In [1]:
import NotebookImport
from IPython.display import clear_output
from HIV_Age_Advancement import *
from Setup.DX_Imports import *
In [2]:
import statsmodels.api as sm
import seaborn as sns
# Global figure styling for the notebook: plain white axes background and
# the "paper" context with enlarged fonts and thicker lines.
sns.set_style("white")
sns.set_context(
    "paper",
    font_scale=1.7,
    rc={"lines.linewidth": 2.5},
)
In [3]:
# Regression plot of a2 against p2 on a single 5x4 axes.
# NOTE(review): a2 / p2 are provided by the star imports above; presumably
# a2 is chronological age and p2 the model-predicted age -- confirm.
fig, ax = subplots(figsize=(5,4))
plot_regression(a2, p2, ax=ax)
fig.tight_layout()
Interestingly, a lot of the patients off the diagonal in the recently diagnosed group have detectable HIV RNA in the blood plasma.
In [4]:
# Split samples on whether the 'LLQ PLASMA' lab field reads '>LLQ'.
# NOTE(review): presumably '>LLQ' means plasma viral load above the lower
# limit of quantification (detectable HIV RNA) -- confirm.
_llq_plasma = labs['LLQ PLASMA']
_other_ids = ti(_llq_plasma != '>LLQ')
_above_llq_ids = ti(_llq_plasma == '>LLQ')

fig, ax = subplots(figsize=(5, 4))
# Regression panel uses only the samples that are not flagged '>LLQ' ...
plot_regression(a2.ix[_other_ids], p2, ax=ax)
# ... and the '>LLQ' samples are overlaid on top in a separate colour.
series_scatter(
    a2.ix[_above_llq_ids],
    p2,
    color=colors[0],
    ax=ax,
    ann=None,
)
fig.tight_layout()
In [5]:
# Three panels sharing one y-axis (years):
#   0: age vs. estimated years since HIV diagnosis
#   1: age per duration group (controls excluded)
#   2: estimated years since HIV diagnosis per duration group
#
# The quantity plotted here is 'estimated duration hiv (months)' / 12, i.e.
# time since diagnosis in years.  It is kept under its own name and label so
# that it is not mistaken for the age at diagnosis (age minus this value),
# which is what `age_at_dx` holds in the following cell.
fig, axs = subplots(1, 3, figsize=(14, 4), sharey=True)
years_since_dx = clinical['estimated duration hiv (months)'] / 12.
years_since_dx.name = 'years_since_dx'
series_scatter(age, years_since_dx.ix[duration.index], ax=axs[0])
violin_plot_pandas(duration[duration != 'Control'], age, ax=axs[1])
violin_plot_pandas(duration, years_since_dx, ax=axs[2])
for ax in axs:
    prettify_ax(ax)
In [6]:
# Age at diagnosis: current age minus the estimated HIV duration, with the
# duration converted from months to years.
age_at_dx = age - clinical['estimated duration hiv (months)'] / 12.
age_at_dx.name = 'age_at_dx'

# Three panels on a shared y-axis:
#   0: age vs. age at diagnosis (samples that have a duration label)
#   1: age per duration group, controls excluded
#   2: age at diagnosis per duration group
fig, axs = subplots(1, 3, figsize=(14, 4), sharey=True)
series_scatter(
    age,
    age_at_dx.ix[duration.index],
    ax=axs[0],
)
violin_plot_pandas(
    duration[duration != 'Control'],
    age,
    ax=axs[1],
)
violin_plot_pandas(
    duration,
    age_at_dx,
    ax=axs[2],
)
for ax in axs:
    prettify_ax(ax)
In [7]:
# Age advancement is p2 - a2, restricted to the samples that carry a
# duration label, with missing values dropped.
# NOTE(review): presumably p2 is predicted age and a2 chronological age --
# confirm against the module that defines them.
age_advancement = (p2 - a2).ix[duration.index].dropna()
age_advancement.name = 'age_advancement'

# Remove the linear dependence on age: regress age advancement on age and
# subtract the fitted line (slope * age + intercept).
reg = linear_regression(age, age_advancement)
age_adj = (age_advancement - age * reg['slope']).dropna()
age_adj = age_adj - reg.intercept
# The series name is used as the axis label when plotted; spelling fixed
# ("advancment" -> "advancement").
age_adj.name = 'age advancement (adjusted)'
In [8]:
# Side-by-side check of the age adjustment on a shared y-axis:
# raw age advancement (left) and age-adjusted advancement (right), each
# scattered against age.
fig, axs = subplots(1, 2, figsize=(10, 4), sharey=True)
series_scatter(
    age_advancement,
    age,
    ax=axs[0],
)
series_scatter(
    age_adj,
    age,
    ax=axs[1],
)
for ax in axs:
    prettify_ax(ax)
fig.tight_layout()
In [9]:
# Residual of pred_c relative to age, for samples with a duration label.
residual = (pred_c - age).ix[duration.index]
residual.name = 'residual'

# Regress the residual on age, then subtract the fitted line
# (slope * age, followed by the intercept) to get an age-adjusted residual.
reg = linear_regression(age, residual)
resid_adj = (residual - age * reg['slope']).dropna() - reg.intercept
resid_adj.name = 'residual (adjusted)'
In [10]:
# Raw residual (left) and age-adjusted residual (right), each scattered
# against age on a shared y-axis.
fig, axs = subplots(1, 2, figsize=(10, 4), sharey=True)
series_scatter(
    residual,
    age,
    ax=axs[0],
)
series_scatter(
    resid_adj,
    age,
    ax=axs[1],
)
for ax in axs:
    prettify_ax(ax)
fig.tight_layout()
In [11]:
# (an earlier draft grouped r = p2 - a2 here instead of `residual`)
# Iterating a groupby yields (group_key, sub-Series) pairs, so a, b and c are
# each a 2-tuple and element [1] is that group's residual values.  The
# unpacking only succeeds when `duration` has exactly three distinct groups.
# NOTE(review): groupby returns groups in sorted key order by default, so
# presumably a = 'Control', b = 'HIV Long', c = 'HIV Short' -- confirm.
a,b,c = residual.groupby(duration)
In [12]:
# Bartlett's test for equal variances of the residuals between the first and
# third duration groups (a and c from the groupby unpacking), NaNs removed.
sp.stats.bartlett(a[1].dropna(), c[1].dropna())
Out[12]:
In [13]:
# Bartlett's test for equal variances of the residuals across all three
# duration groups (a, b and c from the groupby unpacking), NaNs removed.
sp.stats.bartlett(a[1].dropna(), b[1].dropna(), c[1].dropna())
Out[13]:
In [14]:
# Distribution of p2 - a2 within each duration group.
violin_plot_pandas(duration, p2 - a2)
Data from the labs
In [15]:
# l2: lab columns stored as int64 / float64, minus columns that are
# entirely missing.
_is_numeric_col = labs.dtypes.isin([dtype('int64'), dtype('float64')])
l2 = labs.ix[:, _is_numeric_col].dropna(axis=1, how='all')

# l3: lab columns with fewer than 6 distinct values (Series.unique() keeps
# NaN, so a missing value counts as one of them).
_n_distinct = labs.apply(lambda col: len(col.unique()), axis=0)
l3 = labs.ix[:, ti(_n_distinct < 6)]
In [16]:
# Rank correlation between the residual and the log2-transformed
# 'CD4/CD8 ratio' lab value, over all samples.
spearman_pandas(residual, np.log2(l2['CD4/CD8 ratio']))
Out[16]:
In [17]:
# Same comparison as the Spearman cell, using pearson_pandas instead.
pearson_pandas(residual, np.log2(l2['CD4/CD8 ratio']))
Out[17]:
In [18]:
# Residual vs. log2 CD4/CD8 ratio, restricted to the 'HIV Long' group.
# Note this uses the unadjusted `residual`.
spearman_pandas(residual.ix[ti(duration=='HIV Long')],
np.log2(l2['CD4/CD8 ratio']))
Out[18]:
In [19]:
# Age-adjusted residual vs. log2 CD4/CD8 ratio, restricted to 'HIV Short'.
# NOTE(review): the 'HIV Long' cell above uses the unadjusted `residual`
# while this one uses `resid_adj` -- confirm the difference is intended.
spearman_pandas(resid_adj.ix[ti(duration=='HIV Short')],
np.log2(l2['CD4/CD8 ratio']))
Out[19]:
In [20]:
# Age-adjusted residual vs. log2 CD4/CD8 ratio over all samples.
spearman_pandas(resid_adj, np.log2(l2['CD4/CD8 ratio']))
Out[20]:
In [21]:
# Scatter of the residual against the log2 CD4/CD8 ratio.
series_scatter(residual, np.log2(l2['CD4/CD8 ratio']))
In [22]:
# Rebuild the two lab tables from `labs`:
#   l2 -- int64 / float64 columns, minus columns that are entirely missing
#   l3 -- columns with fewer than 6 distinct values (NaN included)
l2 = labs.ix[:, labs.dtypes.isin([dtype('int64'), dtype('float64')])]
l2 = l2.dropna(axis=1, how='all')
l3 = labs.ix[:, ti(labs.apply(lambda col: len(col.unique()), axis=0) < 6)]

# Restrict both tables to samples that have a duration label, after removing
# five hand-picked sample ids.
# NOTE(review): the reason these five ids are excluded is not recorded in
# the notebook -- confirm and document.
keepers = (
    labs.index
    .difference(['RG065','RG175','RG279','RA182','RM285'])
    .intersection(duration.index)
)
l2, l3 = l2.ix[keepers], l3.ix[keepers]
In [23]:
# Give the grouping series an explicit name.
# NOTE(review): the 'duration' entry in the `order=` lists of the following
# violin plots presumably comes from this name via combine() -- confirm.
duration.name = 'duration'
In [24]:
# Cross the two flags "'LLQ PLASMA' is '>LLQ'" and "duration is 'HIV Long'"
# into one categorical and show the age distribution per category.
# NOTE(review): the category labels in `order` presumably derive from the
# names of the two input series -- confirm against combine().
_llq_by_duration = combine(
    labs['LLQ PLASMA'] == '>LLQ',
    duration == 'HIV Long',
)
violin_plot_pandas(
    _llq_by_duration,
    age,
    order=['neither','duration','both','LLQ PLASMA'],
)
In [25]:
# Same '>LLQ' x 'HIV Long' categories as the age plot, this time showing the
# distribution of age advancement in each category.
_llq_by_duration = combine(
    labs['LLQ PLASMA'] == '>LLQ',
    duration == 'HIV Long',
)
violin_plot_pandas(
    _llq_by_duration,
    age_advancement,
    order=['neither','duration','both','LLQ PLASMA'],
)
In [26]:
# Natural log of the 'rnvalue PLASMA' lab value, only for samples flagged
# '>LLQ', scattered against age advancement.
_flagged_above_llq = labs['LLQ PLASMA'] == '>LLQ'
_log_plasma_rna = np.log(labs['rnvalue PLASMA'][_flagged_above_llq])
series_scatter(_log_plasma_rna, age_advancement)
In [27]:
# Screen every row of l2.T (one row per numeric lab column) against age
# advancement with pearson_pandas and show the first rows of the result.
# NOTE(review): presumably the result is sorted by significance, so head()
# shows the strongest associations -- confirm against screen_feature().
screen_feature(age_advancement, pearson_pandas, l2.T, align=False).head()
Out[27]:
In [28]:
# Bucket age advancement into 5-year bins (floor of value / 5) and clamp the
# bin index to the range [-1, 2], then rank-correlate the bin with MCV.
bins = np.floor(age_advancement / 5.).clip(-1, 2)
spearman_pandas(bins, l2.MCV)
Out[28]:
In [29]:
# Label the clamped 5-year bins of age advancement: bin -1 -> '< 0',
# bin 0 -> '0-5', bins 1 and 2 are merged into '5+'.
_bin_labels = {-1: '< 0', 0:'0-5', 1:'5+', 2:'5+'}
bins = np.floor(age_advancement / 5.).clip(-1, 2).map(_bin_labels)

# Box plots per bin: MCV on the left, the 'age' column of l2 on the right.
fig, axs = subplots(1, 2, figsize=(6, 4))
box_plot_pandas(
    bins,
    l2.MCV,
    order=['< 0','0-5','5+'],
    ax=axs[0],
)
box_plot_pandas(
    bins,
    l2['age'],
    order=['< 0','0-5','5+'],
    ax=axs[1],
)
for ax in axs:
    prettify_ax(ax)
fig.tight_layout()
In [30]:
# Scatter of age advancement against MCV, written to the figure directory
# as a 300 dpi PNG.
fig, ax = subplots(figsize=(5, 4))
series_scatter(
    age_advancement,
    l2.MCV,
    ax=ax,
    color=colors[3],
    edgecolor='black',
)
prettify_ax(ax)
fig.tight_layout()
# FIGDIR is concatenated directly with the file name, so it has to end with
# a path separator.
fig.savefig(
    FIGDIR + 'mcv_age_advancement.png',
    dpi=300,
)
Cell composition from mixture model estimates
In [31]:
# Screen every row of cell_counts.T (one row per estimated cell type)
# against age advancement with spearman_pandas; the full table is displayed.
screen_feature(age_advancement, spearman_pandas, cell_counts.T, align=False)
Out[31]:
While we see a significant association between NK cell concentration and increasing age advancement, this does not seem to be specific to HIV+ patients.
In [32]:
# Residual vs. estimated NK cell fraction, with a separate regression for
# controls and for HIV+ samples on the same axes.
fig, ax = subplots(1, 1, figsize=(4, 3))
rr = cell_counts.NK
k = pred_c.index
# Boolean flag: False for 'Control', True for every other duration group.
hiv = duration != 'Control'

# hiv == 0 selects the controls and hiv > 0 the HIV+ samples; the legend
# labels must follow that selection (they were previously swapped).
sns.regplot(*match_series(residual.ix[k], rr.ix[ti(hiv == 0)]),
            ax=ax, label='Control')
sns.regplot(*match_series(residual.ix[k], rr.ix[ti(hiv > 0)]),
            ax=ax, label='HIV+')
# Draw the legend so the two labels are actually visible on the figure.
ax.legend(loc='best')
prettify_ax(ax)
Here we are looking at biological age, MCV, and NK cell count. We constructed a similar model with monocyte count as well but found that it did not add to the model fit.
In [33]:
# 0/1 HIV indicator as floats: 1.0 for every non-'Control' duration group.
hiv = (duration != 'Control').astype(float)
hiv.name = 'HIV'

# Rename the age-adjusted series so it carries the name 'age_advancement'.
# NOTE(review): the model cell that follows passes `age_advancement`, not
# `age_adj`, to process_factors -- confirm which series was intended.
age_adj.name = 'age_advancement'
In [34]:
# Linear model (fitted in R through rpy2) of age advancement on age, MCV and
# NK cell fraction, using standardized factors.
# NOTE(review): `age` is named 'bio_age' here, while later cells name `age`
# 'chron_age' and give 'bio_age' to `pred_c` -- confirm which age this model
# is meant to use.
age.name = 'bio_age'
# Estimated HIV duration in years; not referenced again inside this cell.
duration_t = clinical['estimated duration hiv (months)'] / 12.
duration.name = 'duration'
monocytes = labs['Monocyte %']
monocytes.name = 'monocytes'

# The series names become the column names that the R formula refers to.
df = process_factors([age_advancement, duration, age, age_at_dx,
                      l2.MCV, l2.MCH, cell_counts.NK, cell_counts.CD4T,
                      monocytes], standardize=True)
fmla = robjects.Formula('age_advancement ~ bio_age + MCV + NK')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
# Show only the last three blank-line-separated sections of the R summary.
# print(...) with a single argument behaves the same on Python 2 and 3.
print('\n\n'.join(str(s).split('\n\n')[-3:]))
In [35]:
# Column names used by the R formulas in the model cells below:
# `age` becomes 'chron_age' and `pred_c` becomes 'bio_age'.
age.name = 'chron_age'
pred_c.name = 'bio_age'

# 0/1 HIV indicator as floats: 1.0 for every non-'Control' duration group.
hiv = (duration != 'Control').astype(float)
hiv.name = 'HIV'
In [36]:
# Linear model (fitted in R through rpy2) of the residual on chronological
# age, HIV status and all six estimated cell-type fractions, without
# standardizing the factors.
hiv = (duration != 'Control').astype(float)
hiv.name = 'HIV'
# The formula refers to `age` as 'chron_age'; set the name here so the cell
# does not depend on another cell having renamed it first.
age.name = 'chron_age'

df = process_factors([residual, hiv, age, cell_counts.NK, cell_counts.CD4T,
                      cell_counts.CD8T, cell_counts.Bcell, cell_counts.Mono,
                      cell_counts.Gran], standardize=False)
fmla = robjects.Formula('residual ~ chron_age + HIV + NK + CD4T + CD8T + '
                        'Bcell + Mono + Gran')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
# Show only the last three blank-line-separated sections of the R summary.
# print(...) with a single argument behaves the same on Python 2 and 3.
print('\n\n'.join(str(s).split('\n\n')[-3:]))
In [37]:
# Linear model (fitted in R through rpy2) of the residual on bio_age, HIV
# status and NK cell fraction, without standardizing the factors.
hiv = (duration != 'Control').astype(float)
hiv.name = 'HIV'
# The formula refers to `pred_c` as 'bio_age', and `age` is passed alongside
# it; name both here so the columns are distinct and the cell does not
# depend on another cell having renamed them first.
age.name = 'chron_age'
pred_c.name = 'bio_age'

df = process_factors([residual, hiv, pred_c, age, cell_counts.NK,
                      cell_counts.CD4T, cell_counts.CD8T,
                      cell_counts.Bcell, cell_counts.Mono,
                      cell_counts.Gran], standardize=False)
fmla = robjects.Formula('residual ~ bio_age + HIV + NK')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
# Show only the last three blank-line-separated sections of the R summary.
# print(...) with a single argument behaves the same on Python 2 and 3.
print('\n\n'.join(str(s).split('\n\n')[-3:]))
In [38]:
# Linear model (fitted in R through rpy2) of bio_age (`pred_c`) on
# chronological age and the six estimated cell-type fractions.  HIV status
# is in the data frame but deliberately not in the formula.
hiv = (duration != 'Control').astype(float)
hiv.name = 'HIV'
age.name = 'chron_age'
pred_c.name = 'bio_age'

# process_factors is called with its default settings here (no explicit
# `standardize` argument).
df = process_factors([residual, hiv, pred_c, age, cell_counts.NK,
                      cell_counts.CD4T, cell_counts.CD8T,
                      cell_counts.Bcell, cell_counts.Mono,
                      cell_counts.Gran])
fmla = robjects.Formula('bio_age ~ chron_age + NK + CD4T + CD8T + '
                        'Bcell + Mono + Gran')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
# Show only the last three blank-line-separated sections of the R summary.
# print(...) with a single argument behaves the same on Python 2 and 3.
print('\n\n'.join(str(s).split('\n\n')[-3:]))
In [39]:
# Ratio of two hard-coded numbers.
# NOTE(review): presumably both values were copied by hand from a model
# summary printed above -- confirm their source; they will go stale if the
# models are re-fitted.
1.4299 / 2.3176
Out[39]:
In [40]:
# Linear model (fitted in R through rpy2) of the residual on chronological
# age and the six estimated cell-type fractions.  HIV status is in the data
# frame but not in the formula; the fitted model `m` is what the later cell
# extracts residuals from.
hiv = (duration != 'Control').astype(float)
hiv.name = 'HIV'
age.name = 'chron_age'
pred_c.name = 'bio_age'

# process_factors is called with its default settings here (no explicit
# `standardize` argument).
df = process_factors([residual, hiv, pred_c, age, cell_counts.NK,
                      cell_counts.CD4T, cell_counts.CD8T,
                      cell_counts.Bcell, cell_counts.Mono,
                      cell_counts.Gran])
fmla = robjects.Formula('residual ~ chron_age + NK + CD4T + CD8T + '
                        'Bcell + Mono + Gran')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
# Show only the last three blank-line-separated sections of the R summary.
# print(...) with a single argument behaves the same on Python 2 and 3.
print('\n\n'.join(str(s).split('\n\n')[-3:]))
In [41]:
def rmse(v):
    """Return the root-mean-square of ``v``.

    ``v`` must support elementwise ``** 2`` and expose ``.mean()``, e.g. a
    numpy array or a pandas Series.
    """
    mean_of_squares = (v ** 2).mean()
    return mean_of_squares ** .5
In [42]:
# Take the residuals of the previously fitted model `m` and regress them on
# HIV status alone.
v = robjects.r.residuals(m)
# NOTE(review): the index is built from list(v.names[0]); this assumes
# v.names[0] holds the sample ids -- confirm against the rpy2 version in use.
r2 = pd.Series(pandas2ri.ri2py(v), index=list(v.names[0]))
# Named 'residual' so the formula below picks up these second-stage
# residuals.
r2.name = 'residual'
hiv = (duration != 'Control').astype(float)
hiv.name = 'HIV'

df = process_factors([r2, hiv, pred_c, cell_counts.NK, cell_counts.CD4T,
                      cell_counts.CD8T, cell_counts.Bcell, cell_counts.Mono,
                      cell_counts.Gran])
fmla = robjects.Formula('residual ~ HIV')
m1 = robjects.r.lm(fmla, df)
s = robjects.r.summary(m1)
# Show only the last three blank-line-separated sections of the R summary.
# print(...) with a single argument behaves the same on Python 2 and 3.
print('\n\n'.join(str(s).split('\n\n')[-3:]))