HIV Methylation Age Advancement



In [1]:

    
import NotebookImport
from Setup.Imports import *









    




importing IPython notebook from Setup/Imports






    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
from Setup.MethylationAgeModels import *









    




importing IPython notebook from Setup/MethylationAgeModels



In [3]:

    
from Setup.Read_HIV_Data import *









    




importing IPython notebook from Setup/Read_HIV_Data



In [4]:

    
hiv = (duration=='Control').map({False: 'HIV+', True: 'HIV-'})
hiv.name = 'HIV Status'



In [5]:

    
hiv.value_counts()









    Out[5]:





HIV+    137
HIV-     42
dtype: int64

Run Age Predictions on HIV Dataset



In [6]:

    
def model_plot(prediction):
    fig, axs = subplots(1,2, figsize=(10,4), sharex=True, sharey=True)
    plot_regression(age, prediction.ix[ti(hiv=='HIV-')], ax=axs[0])
    plot_regression(age, prediction.ix[ti(hiv=='HIV+')], ax=axs[1])
    axs[0].set_title('HIV-')
    axs[1].set_title('HIV+')

Hannum Model



In [7]:

    
pred = run_hannum_model(df_hiv)



In [8]:

    
#Do not import
model_plot(pred)



In [9]:

    
get_error(age, pred, denominator='x', groups=hiv)









    Out[9]:






  
    
      
      HIV+
      HIV-
    
  
  
    
      % error
      21.14
      12.07
    
    
      error (years)
      9.07
      5.27

Preforming a linear adjustment on the control data.



In [10]:

    
reg = linear_regression(age, pred.ix[ti(duration=='Control')])
pred_adj = (pred - reg['intercept']) / reg['slope']



In [11]:

    
#Do not import
model_plot(pred_adj)



In [12]:

    
get_error(age, pred_adj, denominator='x', groups=hiv)









    Out[12]:






  
    
      
      HIV+
      HIV-
    
  
  
    
      % error
      12.99
      8.62
    
    
      error (years)
      5.65
      3.77

Horvath Model



In [13]:

    
pred_horvath = run_horvath_model(df_hiv_n)



In [14]:

    
#Do not import
model_plot(pred_horvath)



In [15]:

    
reg = linear_regression(age, pred_horvath.ix[ti(duration=='Control')])
pred_horvath_adj = (pred_horvath - reg['intercept']) / reg['slope']



In [16]:

    
#Do not import
model_plot(pred_horvath_adj)



In [17]:

    
get_error(age, pred_horvath_adj, denominator='x', groups=hiv)









    Out[17]:






  
    
      
      HIV+
      HIV-
    
  
  
    
      % error
      15.67
      8.66
    
    
      error (years)
      6.78
      3.74

Quality Control

Look at detection p-value and concordance of the models



In [18]:

    
#Do not import
fig, axs = subplots(1,2, figsize=(9,4), sharex=True, sharey=True)
plot_regression(pred_adj.ix[ti(hiv=='HIV-')], pred_horvath_adj, ax=axs[0])
plot_regression(pred_adj.ix[ti(hiv=='HIV+')], pred_horvath_adj, ax=axs[1])
fig.tight_layout()



In [19]:

    
get_error(pred_adj, pred_horvath_adj, groups=hiv)









    Out[19]:






  
    
      
      HIV+
      HIV-
    
  
  
    
      % error
      7.05
      7.66
    
    
      error (years)
      3.56
      3.38

Dropping 6 patients with Low Concordance

2/6 have bad probes on the sites used in the models



In [20]:

    
diff = ((pred_adj - pred_horvath_adj) / ((pred_adj + pred_horvath_adj) * .5)).abs()
diff = diff.groupby(level=0).first()
diff.name = 'Absolute difference in models'



In [21]:

    
#Do not import
fig, axs = subplots(1,2, figsize=(10,4))
diff.hist(ax=axs[0])
axs[0].set_xlabel(diff.name)
violin_plot_pandas(duration, diff, ax=axs[1])
fig.tight_layout()



In [22]:

    
in_model = detection_p[detection_p.level_0.isin(hannum_model.index.union(horvath_model.index))]
(diff).ix[in_model.Sample_Name.unique()].dropna().order()









    Out[22]:





Sample_Name
METI-1     0.01
METI-24    0.02
METI-4     0.04
RP285      0.11
METI-5     0.13
RM067      0.14
RP126      0.29
METI-8     0.36
Name: Absolute difference in models, dtype: float64



In [23]:

    
pt = ti(diff < .2)



In [24]:

    
o = ti(diff > .2)
o









    Out[24]:





Index([u'METI-12', u'METI-8', u'RM292', u'RP040', u'RP126'], dtype='object', name=u'Sample_Name')



In [25]:

    
#Do not import
fig, axs = subplots(1,2, figsize=(9,4), sharex=True, sharey=True)
plot_regression(pred_adj.ix[ti(hiv=='HIV-')].ix[pt], pred_horvath_adj, ax=axs[0])
plot_regression(pred_adj.ix[ti(hiv=='HIV+')].ix[pt], pred_horvath_adj, ax=axs[1])
series_scatter(pred_adj.ix[ti(hiv=='HIV-')].ix[o], pred_horvath_adj, ax=axs[1],
               color=colors[0], ann=None)
series_scatter(pred_adj.ix[ti(hiv=='HIV+')].ix[o], pred_horvath_adj, ax=axs[0],
               color=colors[0], ann=None)

fig.tight_layout()
fig.savefig(FIGDIR + 'hiv_model_agreement_filter.png', dpi=300)



In [29]:

    
#Do not import
v = detection_p.groupby('Sample_Name').size()
v.name = '# probes with poor detection'
series_scatter(diff, v)



In [30]:

    
detection_p[detection_p.Sample_Name.isin(o)].groupby('Sample_Name').size().order()









    Out[30]:





Sample_Name
RM292        463
METI-12     2123
RP040       2369
RP126      16562
METI-8     18208
dtype: int64



In [31]:

    
#pt = ti(((pred_adj - pred_horvath_adj)).abs().dropna() < 10)

Merging Hannum and Horvath

Without filter.



In [32]:

    
pred_c = (pred_horvath_adj + pred_adj) / 2
pred_c = pred_c.ix[duration.index]
pred_c.name = 'Predicted Age (Combined)'
reg = linear_regression(age, pred_c.ix[ti(duration=='Control')])
pred_c = (pred_c - reg['intercept']) / reg['slope']



In [33]:

    
#Do not import
model_plot(pred_c)



In [34]:

    
get_error(age, pred_c, denominator='x', groups=hiv)









    Out[34]:






  
    
      
      HIV+
      HIV-
    
  
  
    
      % error
      14.00
      8.16
    
    
      error (years)
      6.05
      3.49

With QC filter



In [35]:

    
pred_c = (pred_horvath_adj + pred_adj) / 2
pred_c = pred_c.ix[duration.index].ix[pt]
pred_c.name = 'Predicted Age (Combined)'
reg = linear_regression(age, pred_c.ix[ti(duration=='Control')])
pred_c = (pred_c - reg['intercept']) / reg['slope']



In [36]:

    
#Do not import
model_plot(pred_c)



In [37]:

    
get_error(age, pred_c, denominator='x', groups=hiv)









    Out[37]:






  
    
      
      HIV+
      HIV-
    
  
  
    
      % error
      13.36
      7.51
    
    
      error (years)
      5.80
      3.30



In [38]:

    
#Do not import
fig, axs = subplots(1,3, figsize=(15,4), sharex=True, sharey=True)
plot_regression(age, pred_c.ix[ti(duration=='Control')], ax=axs[0])
plot_regression(age, pred_c.ix[ti(duration=='HIV Short')], ax=axs[1])
plot_regression(age, pred_c.ix[ti(duration=='HIV Long')], ax=axs[2])
axs[2].set_xlim(20,75)
axs[2].set_ylim(20,75);



In [39]:

    
get_error(age, pred_c, denominator='x', groups=duration)









    Out[39]:






  
    
      
      Control
      HIV Long
      HIV Short
    
  
  
    
      % error
      7.51
      11.50
      19.08
    
    
      error (years)
      3.30
      5.32
      7.26

Comparing Age Advancement vs. Age Acceleration



In [40]:

    
#Do not import
fig, axs = subplots(2,1, figsize=(5,7))

violin_plot_pandas(duration, pred_c / clinical.age, 
                   order=['Control','HIV Short','HIV Long'],
                   ax=axs[0])
axs[0].set_ylabel('Age Acceleration (AMAR)')

violin_plot_pandas(duration, pred_c - clinical.age, 
                   order=['Control','HIV Short','HIV Long'],
                   ax=axs[1])
axs[1].set_ylabel('Age Advance (Predicted - Actual)')

for ax in axs:
    prettify_ax(ax)
fig.tight_layout()

Comparing short vs. long term infected HIV+ subjects



In [41]:

    
kruskal_pandas(duration.ix[ti(hiv == 'HIV+')], (pred_c - age))









    Out[41]:





H    0.16
p    0.69
dtype: float64

Association of age residuals with HIV status.



In [42]:

    
kruskal_pandas(hiv, (pred_c - age))









    Out[42]:





H    3.38e+01
p    6.03e-09
dtype: float64

Mean age resisuals broken down by HIV status. Not that HIV- is ~0 by construction as the full dataset is normalized by the controls.



In [44]:

    
(pred_c - age).dropna().groupby(hiv).mean()









    Out[44]:





HIV Status
HIV+    4.92e+00
HIV-    7.99e-16
dtype: float64

Looking at Predicted Time of Onset

The idea of age acceleration, only really makes sense in this context as a person should age normally until the onset of the disease



In [45]:

    
p2 = (pred_c - (clinical.age - (clinical['estimated duration hiv (months)'] / 12.)))
p2.name = 'Biological Time Since Onset'
a2 = (clinical['estimated duration hiv (months)'] / 12.)
a2.name = 'Actual Time Since Onset'
a2 = a2.ix[pt].dropna()
p2 = p2.ix[pt].dropna()

All HIV Combined



In [46]:

    
#Do not import
fig, ax = subplots(figsize=(5,4))
plot_regression(a2, p2, ax=ax)
fig.tight_layout()

This is just the same linear model as the plot above.



In [103]:

    
#Do not import
p4 = p2 
a2.name = 'chron_age'
p4.name = 'bio_age'
df = process_factors([a2, p4], standardize=False)
fmla = robjects.Formula('bio_age ~ chron_age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])









    



Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.2138     0.9326    5.59 1.25e-07 ***
chron_age     0.9790     0.0600   16.32  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.886 on 132 degrees of freedom
Multiple R-squared:  0.6685,	Adjusted R-squared:  0.666 
F-statistic: 266.2 on 1 and 132 DF,  p-value: < 2.2e-16



In [105]:

    
#Do not import
print robjects.r.confint(m)









    



                2.5 %   97.5 %
(Intercept) 3.3689637 7.058680
chron_age   0.8602881 1.097651

Short and Long Duration Split



In [106]:

    
#Do not import
fig, axs = subplots(1,2, figsize=(10,4))
plot_regression(a2.ix[ti(duration=='HIV Short')], p2, ax=axs[0])
axs[0].set_xbound(0,5)
plot_regression(a2.ix[ti(duration=='HIV Long')], p2, ax=axs[1])