For differential expression analysis, we processed all of the data together on a high memory node. This was done using the minfi package in R, and all data were normalized using the preprocessQuantile function. For a full script of the R pipeline see the Methylation_Normalization_MINFI notebook.
Imports and helper functions from Imports notebook.
In [1]:
cd ..
In [2]:
import NotebookImport
from Setup.Imports import *
from Setup.MethylationAgeModels import *
In [3]:
path = '/cellar/users/agross/TCGA_Code/Methlation/data/Validation/'
In [16]:
betas = pd.read_csv(path + 'raw_data.csv', index_col=0)
manifest1 = pd.read_csv(path + 'Sample Sheet_20151008.csv', skiprows=7)
manifest1.Sample_Name = manifest1.Sample_Name.map(lambda s: s[3:] + '_Neu')
manifest2 = pd.read_csv(path + 'Sample Sheet_20151102.csv', skiprows=7)
manifest2.Sample_Name = manifest2.Sample_Name.map(lambda s: s.strip().replace(' ','_'))
manifest = pd.concat([manifest1, manifest2])
mapping = pd.Series({'{}_{}'.format(s.Sentrix_ID, s.Sentrix_Position): s.Sample_Name
for i,s in manifest.iterrows()})
betas = betas.rename(columns=mapping)
raw_betas = betas
raw_betas.shape
Out[16]:
In [5]:
detection = pd.read_csv(path + 'detection_p.csv', index_col=0)
detection = detection.rename(columns=mapping)
In [6]:
dp = detection.replace(0., np.nan).stack().order()
bad_probes = (dp > .01).groupby(level=1).sum()
bad_probes.index = bad_probes.index.map(lambda s: s.strip().replace(' ','_'))
bad_probes.order().tail()
Out[6]:
In [17]:
raw_betas = raw_betas.ix[:, raw_betas.columns.difference(ti(bad_probes > 10000))]
In [9]:
from IPython.display import clear_output
from rpy2.robjects import pandas2ri
pandas2ri.activate()
In [10]:
robjects.r.library('WGCNA');
robjects.r.source("/cellar/users/agross/Data/MethylationAge/Horvath/NORMALIZATION.R")
clear_output()
In [11]:
f = '/cellar/users/agross/TCGA_Code/Methlation/data/Hannum_gold_standard.csv'
gold_standard_ah = pd.read_csv(f, index_col=0, header=None, squeeze=True)
gold_standard_ah.name = 'gs'
In [12]:
gold_standard = pd.read_csv('/cellar/users/agross/Data/MethylationAge/Horvath/probeAnnotation21kdatMethUsed.csv', index_col=0)
horvath = pd.read_table('/cellar/users/agross/TCGA_Code/Methlation/data/Horvath_Model.csv', index_col=0, skiprows=[0,1])
intercept = horvath.CoefficientTraining['(Intercept)']
horvath = horvath.iloc[1:]
In [13]:
fig, ax = subplots(1,1, figsize=(12,4))
gold_standard_ah.ix[gold_standard.index].hist(bins=100, ax=ax, alpha=.5)
betas.ix[gold_standard.index].median(1).hist(bins=100, ax=ax, alpha=.5)
Out[13]:
In [14]:
fig, ax = subplots(1,1, figsize=(12,4))
gold_standard.goldstandard2.hist(bins=100, ax=ax, alpha=.5)
betas.ix[gold_standard.goldstandard2.index].median(1).hist(bins=100, ax=ax, alpha=.5)
Out[14]:
The pure cell population has a very different distribution than the reference provided by Horvath.
In [15]:
gold_standard.goldstandard2.shape
Out[15]:
In [11]:
if False:
gs = gold_standard.goldstandard2
gs.name = 'gs'
df = raw_betas.ix[gs.index].dropna()
df_r = robjects.r.t(pandas2ri.py2ri(df))
gs = list(gs.ix[df.index])
gs_r = robjects.FloatVector(gs)
data_n = robjects.r.BMIQcalibration(df_r, gs_r)
clear_output()
data = pandas2ri.ri2py_dataframe(data_n).T
data.index = data_n.colnames
data.columns = [c.replace('X','') for c in data_n.rownames]
data.to_csv(path + 'BMIQ_Horvath.csv')
else:
data = pd.read_csv(path + 'BMIQ_Horvath.csv', index_col=0)
data_horvath = data
In [28]:
cell_counts = pd.read_csv(path + 'cell_counts.csv', index_col=0)
cell_counts = cell_counts.rename(index=mapping)
flow_sorted_data = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5','flow_sorted_data')
cell_type = pd.read_hdf(HDFS_DIR + 'methylation_annotation.h5', 'label_map')
n2 = flow_sorted_data.groupby(cell_type, axis=1).mean()
avg = n2[cell_counts.columns].dot(cell_counts.T)
In [29]:
cc = avg.mean(1)
adj = (data_horvath - avg).add(cc, axis=0)
adj = adj.dropna(how='all', axis=0)
data_s = adj
In [40]:
control_color = (0.2980392156862745, 0.4470588235294118, 0.6901960784313725)
hiv_color = (0.3333333333333333, 0.6588235294117647, 0.40784313725490196)
def model_plot(prediction):
fig, axs = subplots(1,3, figsize=(14,4),sharey=False)
plot_regression(age, prediction.ix[ti(hiv=='HIV-')], ax=axs[0],
color=control_color, alpha=1, edgecolor='black', s=30,
line_args=[{'color':'grey','alpha':1, 'ls':'--'},
{'alpha':0}])
plot_regression(age, prediction.ix[ti(hiv=='HIV+')], ax=axs[1],
color=hiv_color, alpha=1, edgecolor='black', s=30,
line_args=[{'color':hiv_color,'alpha':1, 'ls':'-'},
{'alpha':1, 'color':'grey', 'ls':'--'}])
axs[0].set_title('HIV-')
axs[1].set_title('HIV+')
#violin_plot_pandas(hiv, prediction - age, ax=axs[2])
#axs[2].set_ylabel('Age Advancment')
#prettify_ax(axs[2])
for ax in axs[[0,1]]:
ax.set_xlim(20,75)
ax.set_ylim(20,75)
ax = axs[2]
zscore = lambda s: (s - s.mean()) / s.std()
residual = prediction - age
e2 = residual.groupby(hiv).apply(zscore)
#return residual, e2
sns.violinplot(residual[e2.abs() < 3], hiv, ax=ax,
order=['HIV-','HIV+'],
inner='points', inner_kws={"ms": 8}, alpha=.5,
bw='scott', cut=0, color=[control_color,
hiv_color, hiv_color])
x = hiv.map({'HIV-':1, 'HIV+':2})
series_scatter(x, residual[e2.abs() > 3], color='white', edgecolor='red',
alpha=1, s=20, lw=1, ax=ax, ann=None)
ax.set_ylabel('Biological Age - Chronological Age')
ax.set_xlabel('')
prettify_ax(ax)
fig.tight_layout()
In [42]:
from Validation.Process_Clinical import *
In [56]:
cell_type = cell_type.ix[data_horvath.columns]
In [332]:
pred_horvath = run_horvath_model(data_s.ix[:, ti(cell_type == 'Neu')])
reg = linear_regression(age, pred_horvath.ix[ti(hiv=='HIV-')])
pred_horvath_adj = (pred_horvath - reg['intercept']) / reg['slope']
In [333]:
model_plot(pred_horvath_adj)
In [334]:
#Do not import
p4 = pred_horvath_adj
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~ 1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])
In [335]:
print robjects.r.confint(m)
In [316]:
pred_horvath = run_horvath_model(data_s.ix[:, ti(cell_type == 'CD4')])
reg = linear_regression(age, pred_horvath.ix[ti(hiv=='HIV-')])
pred_horvath_adj = (pred_horvath - reg['intercept']) / reg['slope']
In [317]:
model_plot(pred_horvath_adj)
In [320]:
(pred_horvath_adj - age).dropna().groupby(hiv).mean()
Out[320]:
In [318]:
anova(hiv, (pred_horvath_adj - age).dropna()).ix['p'] / 2
Out[318]:
In [298]:
get_error(age, pred_horvath_adj, denominator='x', groups=hiv)
Out[298]:
In [68]:
path = '/cellar/users/agross/TCGA_Code/Methlation/data/Validation/'
betas = pd.read_csv(path + 'beta_qn.csv', index_col=0)
betas = betas.rename(columns=mapping)
In [69]:
cc = avg.mean(1)
adj = (betas - avg).add(cc, axis=0)
adj = adj.dropna(how='all', axis=1)
In [226]:
df = adj
df = df.ix[gold_standard_ah.index]
df = df.T.fillna(gold_standard_ah).T
In [229]:
df_cd4 = df.ix[:, ti(cell_type == 'CD4')]
df_neu = df.ix[:, ti(cell_type == 'Neu')]
In [234]:
pearson_pandas(df_cd4.mean(1), df_neu.mean(1))
Out[234]:
In [258]:
def normalize_BMIQ(df, gs=None):
df_r = robjects.r.t(pandas2ri.py2ri(df))
if gs is None:
gs = df.mean(1)
gs = list(gs)
gs_r = robjects.FloatVector(gs)
data_n = robjects.r.BMIQcalibration(df_r, gs_r)
clear_output()
data = pandas2ri.ri2py_dataframe(data_n).T
data.index = data_n.colnames
data.columns = data_n.rownames
data.columns = data.columns.str.replace('X','')
return data
In [260]:
data_neu = normalize_BMIQ(df_neu)
In [239]:
data_cd4 = normalize_BMIQ(df_cd4)
In [266]:
df = data_neu.join(data_cd4)
df.to_csv(path + 'BMIQ_Hannum2.csv')
In [336]:
pred_hannum = run_hannum_model(data_neu.ix[:, ti(cell_type == 'Neu')])
reg = linear_regression(age, pred_hannum.ix[ti(hiv=='HIV-')].dropna())
pred_hannum_adj = (pred_hannum - reg['intercept']) / reg['slope']
In [337]:
pred_hannum_adj_n = pred_hannum_adj
In [338]:
model_plot(pred_hannum_adj)
In [340]:
#Do not import
p4 = pred_hannum_adj
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~ 1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])
In [341]:
print robjects.r.confint(m)
In [331]:
violin_plot_pandas(hiv, pred_hannum_adj - age)
In [264]:
(pred_hannum_adj - age).groupby(hiv).mean()
Out[264]:
In [309]:
anova(hiv, pred_hannum_adj - age).ix['p'] / 2
Out[309]:
In [310]:
pred_hannum = run_hannum_model(data_cd4.ix[:, ti(cell_type == 'CD4')])
reg = linear_regression(age, pred_hannum.ix[ti(hiv=='HIV-')].dropna())
pred_hannum_adj = (pred_hannum - reg['intercept']) / reg['slope']
In [311]:
model_plot(pred_hannum_adj)
plt.gcf().axes[2].set_ylim(-10,20)
Out[311]:
In [313]:
(pred_hannum_adj - age).groupby(hiv).mean()
Out[313]:
In [312]:
anova(hiv, pred_hannum_adj - age).ix['p'] / 2
Out[312]:
In [247]:
pn = pred_hannum_adj_n.copy()
pn.name = 'Neu'
pn.index = pn.index.map(lambda s: s.split('_')[0])
pt = pred_hannum_adj.copy()
pt.name = 'CD4'
pt.index = pt.index.map(lambda s: s.split('_')[0])
pa = age.copy()
pa.index = pa.index.map(lambda s: s.split('_')[0])
In [248]:
plot_regression(pn - pa, (pt - pa).dropna())
In [249]:
plot_regression((pred_hannum_adj - age), (pred_horvath_adj - age).dropna())
In [250]:
plot_regression(pred_hannum_adj.dropna(), pred_horvath_adj.dropna())
In [273]:
diff = ((pred_hannum_adj - pred_horvath_adj) / ((pred_hannum_adj + pred_horvath_adj) * .5)).abs()
diff = diff.groupby(level=0).first()
diff.name = 'Absolute difference in models'
diff.hist()
Out[273]:
In [274]:
diff.order().tail()
Out[274]:
In [275]:
(pred_horvath_adj - age).groupby(hiv).mean()
Out[275]:
In [276]:
(pred_hannum_adj - age).groupby(hiv).mean()
Out[276]:
In [277]:
pred_c = (pred_hannum_adj + pred_horvath_adj) / 2.
reg = linear_regression(age, pred_c.ix[ti(hiv=='HIV-')])
pred_c_adj = (pred_c - reg['intercept']) / reg['slope']
pred_c_adj = pred_c_adj.ix[ti(diff < .3)]
In [278]:
model_plot(pred_c_adj)
In [279]:
violin_plot_pandas(hiv, pred_c_adj - age)
In [280]:
#Do not import
p4 = pred_c.ix[pred_c_adj.index]
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~ 1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])
In [281]:
#Do not import
p4 = pred_horvath.ix[pred_c_adj.index]
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~ 1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])
In [324]:
#Do not import
p4 = pred_c_adj
p4.name = 'bio_age'
hiv.name = 'hiv'
df = process_factors([p4, hiv, age], standardize=False)
df.colnames = ['bio_age','hiv','age']
fmla = robjects.Formula('bio_age ~ 1 + hiv + age')
m = robjects.r.lm(fmla, df)
s = robjects.r.summary(m)
print '\n\n'.join(str(s).split('\n\n')[-3:])
One sided p-value
In [325]:
robjects.r.pt(robjects.r.coef(s)[7], s.rx('df')[0][1], lower=False)[0]
Out[325]:
In [326]:
robjects.r.pt(robjects.r.coef(s)[5], s.rx('df')[0][1], lower=False)[0]
Out[326]:
In [327]:
print robjects.r.confint(m)
In [286]:
(pred_c_adj - age).groupby(hiv).mean()
Out[286]:
In [342]:
model_plot(pred_c_adj)