Reviewer comments



In [1]:

    
cd ..









    



/cellar/users/agross/TCGA_Code/Methlation



In [2]:

    
import NotebookImport
from Parallel.Age_HIV_Features import *









    




importing IPython notebook from Parallel/Age_HIV_Features






    




importing IPython notebook from HIV_Age_Advancement






    




importing IPython notebook from Setup/Imports






    



Populating the interactive namespace from numpy and matplotlib






    




importing IPython notebook from Setup/MethylationAgeModels






    




importing IPython notebook from Setup/Read_HIV_Data

It would be useful to know how many of the CpGs associated with age (26,927), and HIV (81,361) are also strongly differentially methylated across cell-types. One could for example, plot a cell-type association p-value histogram for these sites using p-values from this Jaffe and Irizarry table: http://www.genomebiology.com/2014/15/2/R31/suppl/S3 If it is highly enriched for small p-values, I would be cautious about claiming that the cell type effects have been fully corrected for as the authors believe.



In [3]:

    
dm_cell = pd.read_csv('./data/Jaffee_Supplementary_Table_2.csv', index_col=0,
                      low_memory=False, skiprows=[0])

HIV association with cell-type specific probes



In [4]:

    
g_hiv.value_counts()









    Out[4]:





False    391683
True      81361
dtype: int64



In [5]:

    
vv = features['HIV (BH)']

r_hiv = pd.DataFrame({c: fisher_exact_test(vv, dm_cell['p.value'] < c)
                      for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_hiv.index.name = 'p-value cutoff'
count = pd.Series({c: sum(dm_cell['p.value'].ix[vv.index] < c)
                      for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_hiv['count'] = count
r_hiv = r_hiv[['count','odds_ratio','p']]



In [6]:

    
r_hiv









    Out[6]:






  
    
      
      count
      odds_ratio
      p
    
    
      p-value cutoff
      
      
      
    
  
  
    
      1.00e-06
      88949
      0.87
      3.25e-41
    
    
      1.00e-04
      144460
      0.86
      7.09e-63
    
    
      1.00e-03
      183835
      0.88
      2.79e-52
    
    
      1.00e-02
      235900
      0.94
      2.52e-14
    
    
      5.00e-02
      285508
      1.01
      3.68e-01
    
    
      1.00e-01
      312476
      1.05
      3.23e-07



In [7]:

    
r_hiv = pd.DataFrame({c: fisher_exact_test(g_hiv, dm_cell['p.value'] < c)
                      for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_hiv.index.name = 'p-value cutoff'
count = pd.Series({c: sum(dm_cell['p.value'].ix[g_hiv.index] < c)
                      for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_hiv['count'] = count
r_hiv = r_hiv[['count','odds_ratio','p']]



In [8]:

    
r_hiv









    Out[8]:






  
    
      
      count
      odds_ratio
      p
    
    
      p-value cutoff
      
      
      
    
  
  
    
      1.00e-06
      88949
      0.95
      2.33e-08
    
    
      1.00e-04
      144460
      0.98
      1.80e-02
    
    
      1.00e-03
      183835
      1.00
      5.73e-01
    
    
      1.00e-02
      235900
      1.06
      3.35e-15
    
    
      5.00e-02
      285508
      1.13
      4.85e-47
    
    
      1.00e-01
      312476
      1.16
      5.79e-67



In [9]:

    
fc = ((dm_cell.CD4T_mean - dm_cell.CD8T_mean).abs() / 
      .5*(dm_cell.CD4T_mean + dm_cell.CD8T_mean))
r_hiv_fc = pd.DataFrame({p: fisher_exact_test(g_hiv, fc > c)
                      for p,c in fc.quantile([.05,.1,.25,.5,.75,.9,.95]).iteritems()}).T
r_hiv_fc

Age association with cell-type specific probes



In [10]:

    
g_age.value_counts()









    Out[10]:





False    446117
True      26927
dtype: int64



In [11]:

    
r_age = pd.DataFrame({c: fisher_exact_test(g_age, dm_cell['p.value'] < c)
                      for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_age.index.name = 'p-value cutoff'
count = pd.Series({c: sum(dm_cell['p.value'].ix[g_age.index] < c)
                      for c in [.1, .05,.01,.001,.0001,.000001]}).T
r_age['count'] = count
r_age = r_age[['count','odds_ratio','p']]



In [12]:

    
r_age









    Out[12]:






  
    
      
      count
      odds_ratio
      p
    
    
      p-value cutoff
      
      
      
    
  
  
    
      1.00e-06
      88949
      2.56
      0
    
    
      1.00e-04
      144460
      3.54
      0
    
    
      1.00e-03
      183835
      4.28
      0
    
    
      1.00e-02
      235900
      5.50
      0
    
    
      5.00e-02
      285508
      6.45
      0
    
    
      1.00e-01
      312476
      6.94
      0



In [13]:

    
count









    Out[13]:





1.00e-06     88949
1.00e-04    144460
1.00e-03    183835
1.00e-02    235900
5.00e-02    285508
1.00e-01    312476
dtype: int64



In [14]:

    
fc = ((dm_cell.CD4T_mean - dm_cell.CD8T_mean).abs() / 
      .5*(dm_cell.CD4T_mean + dm_cell.CD8T_mean))
r_age_fc = pd.DataFrame({p: fisher_exact_test(g_age, fc > c)
                      for p,c in fc.quantile([.05,.1,.25,.5,.75,.9,.95]).iteritems()}).T

count = pd.Series({p: sum(fc > c)
                      for p,c in fc.quantile([.05,.1,.25,.5,.75,.9,.95]).iteritems()})
r_age_fc['count'] = count
r_age_fc = r_age_fc[['count','odds_ratio','p']]
r_age_fc

It would be reassuring if the link between the HIV infection and aging signatures (from "Unsupervised analysis shows shared phenotypes of HIV and age") still hold when analysis is restricted to CpGs with no cell-type dependency (e.g. cell type composition p-value>0.1).



In [15]:

    
df_hiv2 = df_hiv.ix[:, duration.index]
df_hiv2 = df_hiv2.dropna(1, how='all')

dd = logit_adj(df_hiv2)
m = dd.mean(1)
s = dd.std(1)
df_norm = dd.subtract(m, axis=0).divide(s, axis=0)



In [16]:

    
idx = ti(dm_cell['p.value'] > .1).intersection(ti(g_age))
len(idx)









    Out[16]:





1646



In [17]:

    
U,S,vH = frame_svd(df_norm.ix[probe_idx].ix[idx])
p = S ** 2 / sum(S ** 2)
p[:5]









    Out[17]:





array([ 0.12555382,  0.0939205 ,  0.04203538,  0.03512384,  0.02747315])



In [18]:

    
fig, ax = subplots(1,1, figsize=(4,3))
rr = 1*vH[0]
k = pred_c.index
hiv = duration != 'Control'
age = age

sns.regplot(*match_series(age.ix[k], rr.ix[ti(hiv==0)]),
            ax=ax, label='HIV+', ci=None)
sns.regplot(*match_series(age.ix[k], rr.ix[ti(hiv>0)]),
            ax=ax, label='Control', ci=None)
ax.set_ylabel('First PC of validated markers', size=12)
ax.set_xlabel('Chronological age (years)', size=14)

ax.set_yticks([0])
ax.axhline(0, ls='--', lw=2.5, color='grey', zorder=-1)
ax.set_xbound(23,70)
ax.set_ybound(-.25,.28)
prettify_ax(ax)
fig.tight_layout()



In [141]:

    
import statsmodels.api as sm



In [142]:

    
y = vH[0]
intercept = pd.Series(1, index=y.index)

X = pd.concat([intercept, age, hiv], axis=1, keys=['Intercept', 'age', 'HIV'])
X = X.dropna().ix[y.index]
m1 = sm.OLS(y, X).fit()

X = pd.concat([intercept, age, hiv, hiv*age], axis=1, keys=['Intercept', 'age', 
                                                            'HIV', 'int'])
X = X.dropna().ix[y.index]
m2 = sm.OLS(y, X).fit()

m2.compare_lr_test(m1)









    Out[142]:





(7.0503496130212397, 0.0079249782022338278, 1.0)



In [143]:

    
m1.pvalues









    Out[143]:





Intercept    2.99e-13
age          3.19e-09
HIV          1.13e-11
dtype: float64



In [144]:

    
m2.pvalues









    Out[144]:





Intercept    8.73e-04
age          4.17e-02
HIV          2.99e-01
int          8.75e-03
dtype: float64

In Figure 1A how many sites identified from EPIC as age associated were not identified in Hannum et al? This data would be more informative if represented as a Venn diagram to show the overlapping 26,927 sites between the two analyses.



In [202]:

    
rr1 = bhCorrection(p_vals.in_set_s1) < .01
rr1.value_counts()









    Out[202]:





False    411452
True      61592
dtype: int64



In [203]:

    
rr2 = bhCorrection(p_vals.in_set_s3) < .01
rr2.value_counts()









    Out[203]:





False    442224
True      30820
dtype: int64



In [204]:

    
fisher_exact_test(rr1, rr2)









    Out[204]:





odds_ratio    18.81
p              0.00
dtype: float64



In [206]:

    
rr1.name = 'Hannum'
rr2.name = 'EPIC'
venn_pandas(rr1, rr2)









    Out[206]:





<matplotlib_venn._venn2.Venn2 instance at 0x7f95eb2a3fc8>

It seems as though the odds ratios for HIV and age are mostly opposite except in CpG islands and gene bodies. Wouldn't this argue for the effects of HIV and aging being separate except for at these two locations?



In [208]:

    
fisher_exact_test(g_age, g_hiv)









    Out[208]:





odds_ratio    1.29e+00
p             3.88e-59
dtype: float64



In [211]:

    
import Setup.DX_Imports as dx









    




importing IPython notebook from Setup/DX_Imports



In [215]:

    
by_annotation = pd.DataFrame({k: fisher_exact_test(g_age, g_hiv.ix[ti(v)])
                              for k,v in dx.probe_sets.iteritems()}).T



In [217]:

    
by_annotation.sort('odds_ratio')









    Out[217]:






  
    
      
      odds_ratio
      p
    
  
  
    
      Promoter
       1.22
        2.50e-08
    
    
      Gene Body
       1.24
        4.04e-13
    
    
      TSS
       1.33
        2.57e-26
    
    
      DHS Site
       1.46
        8.45e-29
    
    
      Enhancer
       1.46
        1.73e-26
    
    
      PRC2
       1.68
        3.32e-81
    
    
      CpG Island
       1.70
       9.32e-110



In [365]:

    
a2 = age[(age > 25) & (age < 69)]
d = pd.DataFrame({s: {c: pearson_pandas(v.ix[ti(study==s)], a2).rho
                      for c,v in cell_counts.iteritems()}
                  for s in ['s1','s3','HIV Control','HIV Long','HIV Short']})
d









    Out[365]:






  
    
      
      HIV Control
      HIV Long
      HIV Short
      s1
      s3
    
  
  
    
      Bcell
      -0.06
      -0.20
      -0.26
      -0.12
       0.08
    
    
      CD4T
      -0.14
       0.03
      -0.20
      -0.02
       0.03
    
    
      CD8T
      -0.25
       0.01
       0.05
      -0.21
      -0.13
    
    
      Gran
       0.21
       0.01
      -0.05
       0.05
      -0.06
    
    
      Mono
       0.20
      -0.09
       0.26
       0.21
      -0.04
    
    
      NK
      -0.11
       0.10
       0.24
       0.08
       0.16



In [358]:

    
a2 = age[(age > 25) & (age < 69)]
idx = ti(study.isin(['s1','HIV Control','s3']))
d = pd.Series({c: pearson_pandas(v.ix[idx], a2).rho
                      for c,v in cell_counts.iteritems()})
d









    Out[358]:





Bcell   -0.04
CD4T    -0.02
CD8T    -0.16
Gran     0.01
Mono     0.10
NK       0.10
dtype: float64



In [362]:

    
fig, axs = subplots(1,3, figsize=(12,4))

series_scatter(a2, cell_counts.CD8T.ix[idx], s=20, ax=axs[0], ann=None)
axs[0].annotate('$r={0:.2f}$'.format(d['CD8T']), (50, -.04), size=18)
series_scatter(a2, cell_counts.CD4T.ix[idx], s=20, ax=axs[1], ann=None)
axs[1].annotate('$r={0:.2f}$'.format(d['CD4T']), (50, -.04), size=18)
ratio = np.log((cell_counts.CD4T + .01) / (cell_counts.CD8T + .01))
ratio.name = 'log ratio'
series_scatter(a2.ix[idx], ratio.clip(-5,5), s=20, ax=axs[2], ann=None)
axs[2].annotate('$r={0:.2f}$'.format(pearson_pandas(ratio, a2).rho), 
                (50, -2.8), size=18)
for ax in axs:
    prettify_ax(ax)
    ax.set_xbound(25,70)
fig.tight_layout()

	odds_ratio	p
0.05	0.91	2.52e-07
0.10	0.78	1.52e-82
0.25	0.81	5.25e-117
0.50	0.85	2.12e-96
0.75	0.86	5.12e-61
0.90	0.77	2.74e-76
0.95	0.76	9.46e-46

	count	odds_ratio	p
0.05	433822	4.95	2.46e-283
0.10	410989	3.06	0.00e+00
0.25	342491	2.25	0.00e+00
0.50	228327	1.62	1.12e-308
0.75	114164	1.76	0.00e+00
0.90	45666	1.99	2.32e-311
0.95	22833	1.81	1.63e-123

	count	odds_ratio	p
p-value cutoff
1.00e-06	88949	0.87	3.25e-41
1.00e-04	144460	0.86	7.09e-63
1.00e-03	183835	0.88	2.79e-52
1.00e-02	235900	0.94	2.52e-14
5.00e-02	285508	1.01	3.68e-01
1.00e-01	312476	1.05	3.23e-07

	odds_ratio	p
Promoter	1.22	2.50e-08
Gene Body	1.24	4.04e-13
TSS	1.33	2.57e-26
DHS Site	1.46	8.45e-29
Enhancer	1.46	1.73e-26
PRC2	1.68	3.32e-81
CpG Island	1.70	9.32e-110

	HIV Control	HIV Long	HIV Short	s1	s3
Bcell	-0.06	-0.20	-0.26	-0.12	0.08
CD4T	-0.14	0.03	-0.20	-0.02	0.03
CD8T	-0.25	0.01	0.05	-0.21	-0.13
Gran	0.21	0.01	-0.05	0.05	-0.06
Mono	0.20	-0.09	0.26	0.21	-0.04
NK	-0.11	0.10	0.24	0.08	0.16