Process Normalized Cross-Study Dataset

For differential expression analysis, we processed all of the data together on a high memory node. This was done using the minfi package in R, and all data were normalized using the preprocessQuantile function. For a full script of the R pipeline see the Methylation_Normalization_MINFI notebook.

Imports and helper functions from Imports notebook.



In [1]:

    
cd ..









    



/cellar/users/agross/TCGA_Code/Methlation



In [2]:

    
import NotebookImport
from Setup.Imports import *
from Setup.MethylationAgeModels import *









    




importing IPython notebook from Setup/Imports






    



Populating the interactive namespace from numpy and matplotlib






    




importing IPython notebook from Setup/MethylationAgeModels

Read in Validation Data



In [3]:

    
path = '/cellar/users/agross/TCGA_Code/Methlation/data/Validation/'



In [16]:

    
betas = pd.read_csv(path + 'raw_data.csv', index_col=0)

manifest1 = pd.read_csv(path + 'Sample Sheet_20151008.csv', skiprows=7)
manifest1.Sample_Name = manifest1.Sample_Name.map(lambda s: s[3:] + '_Neu')
manifest2 = pd.read_csv(path + 'Sample Sheet_20151102.csv', skiprows=7)
manifest2.Sample_Name = manifest2.Sample_Name.map(lambda s: s.strip().replace(' ','_'))
manifest = pd.concat([manifest1, manifest2])
mapping = pd.Series({'{}_{}'.format(s.Sentrix_ID, s.Sentrix_Position): s.Sample_Name
           for i,s in manifest.iterrows()})
betas = betas.rename(columns=mapping)
raw_betas = betas
raw_betas.shape









    Out[16]:





(485512, 96)

Looking at the quality scores on the probes



In [5]:

    
detection = pd.read_csv(path + 'detection_p.csv', index_col=0)
detection = detection.rename(columns=mapping)



In [6]:

    
dp = detection.replace(0., np.nan).stack().order()
bad_probes = (dp > .01).groupby(level=1).sum() 
bad_probes.index = bad_probes.index.map(lambda s: s.strip().replace(' ','_'))
bad_probes.order().tail()









    Out[6]:





206_Neu       272
366_Neu       281
366_CD4      1564
381_CD4     87028
352_CD4    314144
dtype: float64



In [17]:

    
raw_betas = raw_betas.ix[:, raw_betas.columns.difference(ti(bad_probes > 10000))]



In [9]:

    
from IPython.display import clear_output
from rpy2.robjects import pandas2ri
pandas2ri.activate()



In [10]:

    
robjects.r.library('WGCNA');
robjects.r.source("/cellar/users/agross/Data/MethylationAge/Horvath/NORMALIZATION.R")
clear_output()



In [11]:

    
f = '/cellar/users/agross/TCGA_Code/Methlation/data/Hannum_gold_standard.csv'
gold_standard_ah = pd.read_csv(f, index_col=0, header=None, squeeze=True)
gold_standard_ah.name = 'gs'



In [12]:

    
gold_standard = pd.read_csv('/cellar/users/agross/Data/MethylationAge/Horvath/probeAnnotation21kdatMethUsed.csv', index_col=0)
horvath = pd.read_table('/cellar/users/agross/TCGA_Code/Methlation/data/Horvath_Model.csv', index_col=0, skiprows=[0,1])
intercept = horvath.CoefficientTraining['(Intercept)']
horvath = horvath.iloc[1:]



In [13]:

    
fig, ax = subplots(1,1, figsize=(12,4))
gold_standard_ah.ix[gold_standard.index].hist(bins=100, ax=ax, alpha=.5)
betas.ix[gold_standard.index].median(1).hist(bins=100, ax=ax, alpha=.5)









    Out[13]:





<matplotlib.axes.AxesSubplot at 0x7fab6ed97c10>



In [14]:

    
fig, ax = subplots(1,1, figsize=(12,4))
gold_standard.goldstandard2.hist(bins=100, ax=ax, alpha=.5)
betas.ix[gold_standard.goldstandard2.index].median(1).hist(bins=100, ax=ax, alpha=.5)









    Out[14]:





<matplotlib.axes.AxesSubplot at 0x7fab6ed85210>

The pure cell population has a very different distribution than the reference provided by Horvath.



In [15]:

    
gold_standard.goldstandard2.shape









    Out[15]:





(21368,)



In [11]:

    
if False:
    gs = gold_standard.goldstandard2
    gs.name = 'gs'
    df = raw_betas.ix[gs.index].dropna()
    df_r = robjects.r.t(pandas2ri.py2ri(df))
  
    gs = list(gs.ix[df.index])
    gs_r = robjects.FloatVector(gs)

    data_n = robjects.r.BMIQcalibration(df_r, gs_r)
    clear_output()

    data = pandas2ri.ri2py_dataframe(data_n).T
    data.index = data_n.colnames
    data.columns = [c.replace('X','') for c in data_n.rownames]
    data.to_csv(path + 'BMIQ_Horvath.csv')
else:
    data = pd.read_csv(path + 'BMIQ_Horvath.csv', index_col=0)
data_horvath = data



In [28]:

    
cell_counts = pd.read_csv(path + 'cell_counts.csv', index_col=0)
cell_counts = cell_counts.rename(index=mapping)

flow_sorted_data = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5','flow_sorted_data')
cell_type = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5', 'label_map')

n2 = flow_sorted_data.groupby(cell_type, axis=1).mean()
avg = n2[cell_counts.columns].dot(cell_counts.T)



In [29]:

    
cc = avg.mean(1)
adj = (data_horvath - avg).add(cc, axis=0)
adj = adj.dropna(how='all', axis=0)

data_s = adj



In [40]:

    
control_color = (0.2980392156862745, 0.4470588235294118, 0.6901960784313725)
hiv_color = (0.3333333333333333, 0.6588235294117647, 0.40784313725490196)

def model_plot(prediction):
    fig, axs = subplots(1,3, figsize=(14,4),sharey=False)
    plot_regression(age, prediction.ix[ti(hiv=='HIV-')], ax=axs[0], 
                   color=control_color, alpha=1, edgecolor='black', s=30, 
                   line_args=[{'color':'grey','alpha':1, 'ls':'--'},
                              {'alpha':0}])
    plot_regression(age, prediction.ix[ti(hiv=='HIV+')], ax=axs[1], 
                    color=hiv_color, alpha=1, edgecolor='black', s=30, 
                    line_args=[{'color':hiv_color,'alpha':1, 'ls':'-'},
                               {'alpha':1, 'color':'grey', 'ls':'--'}])
    axs[0].set_title('HIV-')
    axs[1].set_title('HIV+')
    
    #violin_plot_pandas(hiv, prediction - age, ax=axs[2])
    #axs[2].set_ylabel('Age Advancment')
    #prettify_ax(axs[2])
    for ax in axs[[0,1]]:
        ax.set_xlim(20,75)
        ax.set_ylim(20,75)
        
    ax = axs[2]
    zscore = lambda s: (s - s.mean()) / s.std()
    residual = prediction - age
    e2 = residual.groupby(hiv).apply(zscore)
    #return residual, e2
    sns.violinplot(residual[e2.abs() < 3], hiv, ax=ax, 
                   order=['HIV-','HIV+'],
                   inner='points', inner_kws={"ms": 8}, alpha=.5,
                   bw='scott', cut=0, color=[control_color,
                                             hiv_color, hiv_color])
    x = hiv.map({'HIV-':1, 'HIV+':2})
    series_scatter(x, residual[e2.abs() > 3], color='white', edgecolor='red',
                   alpha=1, s=20, lw=1, ax=ax, ann=None)
    ax.set_ylabel('Biological Age - Chronological Age')
    ax.set_xlabel('')
    prettify_ax(ax)
    fig.tight_layout()



In [42]:

    
from Validation.Process_Clinical import *



In [56]:

    
cell_type = cell_type.ix[data_horvath.columns]



In [332]:

    
pred_horvath = run_horvath_model(data_s.ix[:, ti(cell_type == 'Neu')])
reg = linear_regression(age, pred_horvath.ix[ti(hiv=='HIV-')])
pred_horvath_adj = (pred_horvath - reg['intercept']) / reg['slope']



In [333]:

    
model_plot(pred_horvath_adj)



In [334]:

    
#Do not import
p4 = pred_horvath_adj 
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~ 1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])









    



Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.41589    1.96239  -0.212    0.833    
hivHIV+      0.03962    1.12538   0.035    0.972    
age          1.01002    0.04317  23.397   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.831 on 45 degrees of freedom
Multiple R-squared:  0.9264,	Adjusted R-squared:  0.9231 
F-statistic: 283.1 on 2 and 45 DF,  p-value: < 2.2e-16



In [335]:

    
print robjects.r.confint(m)









    



                 2.5 %   97.5 %
(Intercept) -4.3683399 3.536563
hivHIV+     -2.2270245 2.306257
age          0.9230685 1.096964



In [316]:

    
pred_horvath = run_horvath_model(data_s.ix[:, ti(cell_type == 'CD4')])
reg = linear_regression(age, pred_horvath.ix[ti(hiv=='HIV-')])
pred_horvath_adj = (pred_horvath - reg['intercept']) / reg['slope']



In [317]:

    
model_plot(pred_horvath_adj)



In [320]:

    
(pred_horvath_adj - age).dropna().groupby(hiv).mean()









    Out[320]:





hiv
HIV+    4.81e+00
HIV-    3.26e-15
dtype: float64



In [318]:

    
anova(hiv, (pred_horvath_adj - age).dropna()).ix['p'] / 2









    Out[318]:





0.00027571345113770523



In [298]:

    
get_error(age, pred_horvath_adj, denominator='x', groups=hiv)









    Out[298]:






  
    
      
      HIV+
      HIV-
    
  
  
    
      % error
      13.14
      8.80
    
    
      error (years)
      5.34
      3.69



In [68]:

    
path = '/cellar/users/agross/TCGA_Code/Methlation/data/Validation/'

betas = pd.read_csv(path + 'beta_qn.csv', index_col=0)
betas = betas.rename(columns=mapping)



In [69]:

    
cc = avg.mean(1)
adj = (betas - avg).add(cc, axis=0)
adj = adj.dropna(how='all', axis=1)



In [226]:

    
df = adj
df = df.ix[gold_standard_ah.index]
df = df.T.fillna(gold_standard_ah).T



In [229]:

    
df_cd4 = df.ix[:, ti(cell_type == 'CD4')]
df_neu = df.ix[:, ti(cell_type == 'Neu')]



In [234]:

    
pearson_pandas(df_cd4.mean(1), df_neu.mean(1))









    Out[234]:





rho    1
p      0
dtype: float64



In [258]:

    
def normalize_BMIQ(df, gs=None):
    df_r = robjects.r.t(pandas2ri.py2ri(df))
    if gs is None:
        gs = df.mean(1)
    gs = list(gs)
    gs_r = robjects.FloatVector(gs)

    data_n = robjects.r.BMIQcalibration(df_r, gs_r)
    clear_output()

    data = pandas2ri.ri2py_dataframe(data_n).T
    data.index = data_n.colnames
    data.columns = data_n.rownames
    data.columns = data.columns.str.replace('X','')
    return data



In [260]:

    
data_neu = normalize_BMIQ(df_neu)



In [239]:

    
data_cd4 = normalize_BMIQ(df_cd4)



In [266]:

    
df = data_neu.join(data_cd4)
df.to_csv(path + 'BMIQ_Hannum2.csv')



In [336]:

    
pred_hannum = run_hannum_model(data_neu.ix[:, ti(cell_type == 'Neu')])
reg = linear_regression(age, pred_hannum.ix[ti(hiv=='HIV-')].dropna())
pred_hannum_adj = (pred_hannum - reg['intercept']) / reg['slope']



In [337]:

    
pred_hannum_adj_n = pred_hannum_adj



In [338]:

    
model_plot(pred_hannum_adj)



In [340]:

    
#Do not import
p4 = pred_hannum_adj 
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~ 1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])









    



Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.7298     2.1502   0.339   0.7359    
hivHIV+       2.5410     1.2331   2.061   0.0451 *  
age           0.9824     0.0473  20.770   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.197 on 45 degrees of freedom
Multiple R-squared:  0.912,	Adjusted R-squared:  0.9081 
F-statistic: 233.2 on 2 and 45 DF,  p-value: < 2.2e-16



In [341]:

    
print robjects.r.confint(m)









    



                  2.5 %   97.5 %
(Intercept) -3.60085076 5.060522
hivHIV+      0.05747613 5.024576
age          0.88715463 1.077691



In [331]:

    
violin_plot_pandas(hiv, pred_hannum_adj - age)



In [264]:

    
(pred_hannum_adj - age).groupby(hiv).mean()









    Out[264]:





hiv
HIV+    2.46e+00
HIV-    5.56e-15
dtype: float64



In [309]:

    
anova(hiv, pred_hannum_adj - age).ix['p'] / 2









    Out[309]:





0.023238963548359123



In [310]:

    
pred_hannum = run_hannum_model(data_cd4.ix[:, ti(cell_type == 'CD4')])
reg = linear_regression(age, pred_hannum.ix[ti(hiv=='HIV-')].dropna())
pred_hannum_adj = (pred_hannum - reg['intercept']) / reg['slope']



In [311]:

    
model_plot(pred_hannum_adj)
plt.gcf().axes[2].set_ylim(-10,20)









    Out[311]:





(-10, 20)



In [313]:

    
(pred_hannum_adj - age).groupby(hiv).mean()









    Out[313]:





hiv
HIV+    3.95e+00
HIV-   -7.11e-15
dtype: float64



In [312]:

    
anova(hiv, pred_hannum_adj - age).ix['p'] / 2









    Out[312]:





0.0060418969582970595



In [247]:

    
pn = pred_hannum_adj_n.copy()
pn.name = 'Neu'
pn.index = pn.index.map(lambda s: s.split('_')[0])
pt = pred_hannum_adj.copy()
pt.name = 'CD4'
pt.index = pt.index.map(lambda s: s.split('_')[0])

pa = age.copy()
pa.index = pa.index.map(lambda s: s.split('_')[0])



In [248]:

    
plot_regression(pn - pa, (pt - pa).dropna())



In [249]:

    
plot_regression((pred_hannum_adj - age), (pred_horvath_adj - age).dropna())



In [250]:

    
plot_regression(pred_hannum_adj.dropna(), pred_horvath_adj.dropna())



In [273]:

    
diff = ((pred_hannum_adj - pred_horvath_adj) / ((pred_hannum_adj + pred_horvath_adj) * .5)).abs()
diff = diff.groupby(level=0).first()
diff.name = 'Absolute difference in models'
diff.hist()









    Out[273]:





<matplotlib.axes.AxesSubplot at 0x7fab0181db90>



In [274]:

    
diff.order().tail()









    Out[274]:





218_CD4    0.20
201_CD4    0.27
369_CD4    0.32
385_CD4    0.40
366_CD4    0.40
Name: Absolute difference in models, dtype: float64



In [275]:

    
(pred_horvath_adj - age).groupby(hiv).mean()









    Out[275]:





hiv
HIV+    4.81e+00
HIV-    3.26e-15
dtype: float64



In [276]:

    
(pred_hannum_adj - age).groupby(hiv).mean()









    Out[276]:





hiv
HIV+    3.95e+00
HIV-   -7.11e-15
dtype: float64



In [277]:

    
pred_c = (pred_hannum_adj + pred_horvath_adj) / 2.
reg = linear_regression(age, pred_c.ix[ti(hiv=='HIV-')])
pred_c_adj = (pred_c - reg['intercept']) / reg['slope']
pred_c_adj = pred_c_adj.ix[ti(diff < .3)]



In [278]:

    
model_plot(pred_c_adj)



In [279]:

    
violin_plot_pandas(hiv, pred_c_adj - age)



In [280]:

    
#Do not import
p4 = pred_c.ix[pred_c_adj.index]
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~  1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])









    



Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.20608    1.94852   0.106    0.916    
hivHIV+      5.68002    1.11716   5.084 9.05e-06 ***
age          0.99507    0.04316  23.055  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.589 on 40 degrees of freedom
Multiple R-squared:  0.9386,	Adjusted R-squared:  0.9356 
F-statistic: 305.9 on 2 and 40 DF,  p-value: < 2.2e-16



In [281]:

    
#Do not import
p4 = pred_horvath.ix[pred_c_adj.index]
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~   1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])









    



Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.55471    1.91080   2.384    0.022 *  
hivHIV+      4.82536    1.09554   4.405 7.72e-05 ***
age          0.81549    0.04232  19.267  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.52 on 40 degrees of freedom
Multiple R-squared:  0.9148,	Adjusted R-squared:  0.9106 
F-statistic: 214.9 on 2 and 40 DF,  p-value: < 2.2e-16



In [324]:

    
#Do not import
p4 = pred_c_adj 
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~ 1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])









    



Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.20608    1.94852   0.106    0.916    
hivHIV+      5.68002    1.11716   5.084 9.05e-06 ***
age          0.99507    0.04316  23.055  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.589 on 40 degrees of freedom
Multiple R-squared:  0.9386,	Adjusted R-squared:  0.9356 
F-statistic: 305.9 on 2 and 40 DF,  p-value: < 2.2e-16

One sided p-value



In [325]:

    
robjects.r.pt(robjects.r.coef(s)[7], s.rx('df')[0][1], lower=False)[0]









    Out[325]:





4.5233768676129315e-06



In [326]:

    
robjects.r.pt(robjects.r.coef(s)[5], s.rx('df')[0][1], lower=False)[0]









    Out[326]:





0.48289437675454266



In [327]:

    
print robjects.r.confint(m)









    



                 2.5 %   97.5 %
(Intercept) -3.7320304 4.144186
hivHIV+      3.4221458 7.937892
age          0.9078443 1.082303



In [286]:

    
(pred_c_adj - age).groupby(hiv).mean()









    Out[286]:





hiv
HIV+    5.66e+00
HIV-   -4.44e-15
dtype: float64



In [342]:

    
model_plot(pred_c_adj)