For differential expression analysis, we processed all of the data together on a high memory node. This was done using the minfi package in R, and all data were normalized using the preprocessQuantile function. For a full script of the R pipeline see the Methylation_Normalization_MINFI notebook.
Imports and helper functions from Imports notebook.
In [1]:
cd ..
In [2]:
import NotebookImport
from Setup.Imports import *
In [3]:
path = '/cellar/users/agross/TCGA_Code/Methlation/data/Validation/'
In [4]:
df_hiv = pd.read_csv(path + 'betas_BMIQ.csv', index_col=0)
In [5]:
hiv_ann = pd.read_excel(path + 'Methylome human NEUvs WB.xlsx',
sheetname='HIV', skiprows=2, index_col=0)
control_ann = pd.read_excel(path + 'Methylome human NEUvs WB.xlsx',
sheetname='Negative', skiprows=2, index_col=0)
ann = pd.concat([hiv_ann, control_ann], keys=['HIV+','HIV-'])
ann = ann.reset_index()
ann = ann.rename(columns={'Age (years)': 'age', 'level_0':'HIV'})
ann = ann.set_index('PATID')
ann = ann.ix[df_hiv.columns]
In [6]:
from IPython.display import clear_output
from rpy2.robjects import pandas2ri
pandas2ri.activate()
In [7]:
import statsmodels.api as sm
In [8]:
hiv = ann.HIV == 'HIV+'
intercept = pd.Series(1, index=hiv.index)
age = ann.age
X = pd.concat([intercept, hiv, age], axis=1,
keys=['Intercept','HIV', 'age'])
X_reduced = X[['Intercept','HIV']]
w = (len(hiv) - hiv.map(hiv.value_counts())).astype(float) / len(hiv)
w = w.ix[X.index]
X = X.ix[df_hiv.columns]
In [9]:
%%time
p = {}
d_hiv = {}
df = logit_adj(df_hiv)
for i,y in df.iterrows():
y.name = 'marker'
y = logit_adj(y)
mod_all = sm.WLS(y, X, weights=w)
res_ref = mod_all.fit()
p[i] = res_ref.params
mod_reduced = sm.WLS(y, X_reduced, weights=w)
m1 = mod_reduced.fit()
d_hiv[i] = res_ref.compare_lr_test(m1)
coef = pd.DataFrame(p).T
d_age = pd.DataFrame(d_hiv, index=['LR','p','df']).T
In [11]:
monocyte_corr = pd.read_csv('/cellar/users/agross/Data/Methylation_Controls/monocyte_age_corr.csv',
index_col=0, squeeze=True)
In [15]:
import Parallel.Age_HIV_Features as fx
In [10]:
from Setup.Read_HIV_Data import *
In [32]:
import Setup.DX_Imports as dx
In [12]:
ann = dx.probe_annotations.sort(['Chromosome','Genomic_Coordinate'])
In [13]:
dx = bhCorrection(d_hiv.p) < .05
dx.value_counts()
Out[13]:
In [14]:
mm = df_hiv.mean(1)
In [15]:
import Parallel.Age_HIV_Features as fx
In [16]:
a,b = match_series((fx.mm > .5), (p.HIV < 0))
a,b = match_series(a,b)
direction = a == b
hiv_bh = dx
hiv_bh = (hiv_bh & (direction == False) & (fx.g_age == False)).dropna()
hiv_bh = hiv_bh.ix[probe_idx].ix[ann.index].dropna()
#hiv_bh = hiv_bh & (t_hiv.abs() > 1)
hiv_bh.value_counts()
Out[16]:
In [17]:
dx_gran = bhCorrection(d_hiv.p) < .01
dx_gran.value_counts()
Out[17]:
In [197]:
dx_gran_u = bhCorrection(d_hiv.p.ix[ti(fx.g_hiv == True)]) < .01
dx_gran_d = bhCorrection(d_hiv.p.ix[ti(fx.g_hiv == False)]) < .01
dx_gran_u.mean() / dx_gran_d.mean()
Out[197]:
In [199]:
dx_gran_t = bhCorrection(d_hiv.p.ix[ti(fx.g_hiv == True)]) < .01
dx_gran_t.value_counts()
Out[199]:
In [162]:
fisher_exact_test(direction, dx_gran)
Out[162]:
In [130]:
fx.g_age.value_counts()
Out[130]:
In [141]:
r = pd.concat([fx.g_age, dx_gran, fx.g_hiv],
keys=['age','pure','mixed'], axis=1).dropna().astype(int).astype(str).apply(lambda s: ''.join(s), axis=1)
In [150]:
fisher_exact_test(fx.g_age, d_hiv.p.rank() < sum(fx.g_hiv))
Out[150]:
In [153]:
d = df_hiv.groupby(hiv, axis=1).mean()
dx = d[1] - d[0]
In [161]:
fisher_exact_test(combine(fx.g_age, fx.g_hiv) == 'both', dx_gran)
Out[161]:
In [159]:
fisher_exact_test(fx.g_age, dx_gran)
Out[159]:
In [190]:
fisher_exact_test(fx.g_hiv, dx_gran)
Out[190]:
In [ ]:
In [63]:
fisher_exact_test(fx.g_hiv_nom, d_hiv.p < .01)
Out[63]:
In [18]:
fisher_exact_test(fx.g_hiv.ix[ti(direction == False)], dx_gran)
Out[18]:
In [20]:
fisher_exact_test(fx.g_hiv.ix[ti(direction)], dx_gran)
Out[20]:
In [21]:
pd.crosstab(fx.g_hiv, dx_gran)
Out[21]:
In [22]:
ann.Gene_Symbol.ix[ti(fx.g_hiv & dx_gran)].value_counts().head()
Out[22]:
In [23]:
dd = dx_gran.ix[ti(fx.g_hiv)]
m = dd.mean()
w = 10
t = lambda s: sp.stats.binom_test(s, w, p=m)
v = pd.concat([pd.rolling_mean(s, w, center=True) for i,s in
dd.groupby(ann.Chromosome)])
lookup = pd.Series({i: t(i) for i in (v.dropna() * w).unique()})
biom_p = (v.dropna() * w).map(lookup).order()
In [25]:
fisher_exact_test(dx_gran.ix[ti(direction==False)], ann.Gene_Symbol=='HCP5')
Out[25]:
In [133]:
ann.ix[ti(hiv_bh)].Gene_Symbol.value_counts().head()
Out[133]:
In [26]:
m = hiv_bh.mean()
w = 200
t = lambda s: sp.stats.binom_test(s, w, p=m)
v = pd.concat([pd.rolling_mean(s, w, center=True) for i,s in
hiv_bh.groupby(ann.Chromosome)])
lookup = pd.Series({i: t(i) for i in (v.dropna() * w).unique()})
biom_p = (v.dropna() * w).map(lookup).order()
In [27]:
def manhattan(vec, chrom, coords, ax=None, ybound=None,
flow='up', ticks=True, gap=3e7):
fig, ax = init_ax(ax, figsize=(9,3))
x = 0
chr_coords = []
for i,c in enumerate(map(str, range(1,23))):
v = vec.ix[ti(chrom == c)].dropna()
series_scatter(coords + x, v, s=10, ax=ax,
color=colors[i % 5],
ann=None, alpha=1, rasterized=True)
chr_len = coords.ix[v.index].max()
x = x + chr_len + gap
chr_coords += [x - (chr_len / 2.)]
ax.set_xbound(gap, x + gap)
if ybound is not None:
ax.set_ybound(ybound[0],ybound[1])
ax.set_xlabel('Chromosome')
if ticks:
ax.set_xticks(chr_coords)
ax.set_xticklabels(map(str, range(1,23)))
else:
ax.set_xticks([])
top = flow == 'down'
prettify_ax(ax, top)
In [28]:
manhattan(-1*np.log10(biom_p), ann.Chromosome, ann.Genomic_Coordinate,
ybound=[0,12], ticks=False)
In [29]:
full_region = ((ann.Chromosome == '6') &
ann.Genomic_Coordinate.isin(range(25000000, 35000000)))
histone_region = ((ann.Chromosome == '6') &
ann.Genomic_Coordinate.isin(range(25000000, 29570008)))
hla_region = ((ann.Chromosome == '6') &
ann.Genomic_Coordinate.isin(range(29570008, 33377112)))
In [30]:
gwas = pd.read_csv('./data/Euro_CHAVI_Setpoint_liftover.csv', index_col=0)
gwas.chromosome = gwas.chromosome.astype(str)
In [17]:
hiv.sort_index()
Out[17]:
In [55]:
tt = dx.ttest_df(hiv, df_hiv)
In [75]:
fig, axs = subplots(2,1, figsize=(10,6), sharex=True)
ax=axs[0]
k = ti((gwas.chromosome == '6') &
gwas.Map.isin(range(20000000, 40000000)))
x = gwas.Map.ix[k]
rr = -1*gwas['-logP']
series_scatter(x, rr, color=colors[0],
s=20, ax=ax, ann=None,
edgecolor='grey')
ax.set_ylabel('GWAS P-value')
ax.set_yticks(range(-12, 1, 2))
ax.set_yticklabels(range(-12, 1, 2), size=14)
ax.set_ylim(-12,0)
ax.set_xlabel('')
#ax.axvspan(29570008, 33377112, alpha=.1)
ax.set_xbound(x.min(), x.max())
prettify_ax(ax, top=True)
k = ti((ann.Chromosome == '6') &
ann.Genomic_Coordinate.isin(range(20000000, 40000000)))
x = ann.Genomic_Coordinate.ix[k].order()
ax = axs[1]
d = (tt[tt.abs() > 0] > 0).ix[ann.index].dropna()
v = pd.concat([pd.rolling_mean(s, 200, center=True) for i,s in
d.groupby(ann.Chromosome)])
v = (v - v.mean()) / v.std()
series_scatter(x[::5], v.ix[x.index], s=20, ann=None, color=colors[0],
edgecolor='grey', ax=ax, alpha=1)
ax.axhline(0, ls='--', color='black', lw=3)
ax.set_ylabel('Enrichment')
ax.spines['bottom'].set_visible(False)
#ax.axvspan(29570008, 33377112, alpha=.1)
ax.set_xbound(x.min(), x.max())
prettify_ax(ax)
ax.set_xlabel('')
Out[75]:
In [76]:
fisher_exact_test(dx, hla_region)
Out[76]:
In [58]:
fx.g_hiv_b.value_counts()
Out[58]:
In [59]:
fisher_exact_test(fx.g_hiv_b, dx)
Out[59]:
In [79]:
In [81]:
DX = dx
In [84]:
def draw_dist_by_annotation(f, ax):
lw = 2.5
draw_dist(f.ix[ti(DX.probe_sets['Promoter'])], ax=ax)
draw_dist(f.ix[ti(DX.probe_sets['CpG Island'])], ax=ax)
draw_dist(f.ix[ti(DX.probe_sets['PRC2'])], ax=ax)
draw_dist(f, ax=ax, colors='grey')
ax.set_yticks([])
ax.set_xticks([0,.5,1])
ax.set_ylabel('Density')
ax.set_xlabel('Fraction with Increased Methylation')
ax.legend(('Promoter','CpG Island','PRC2','All Probes'))
prettify_ax(ax)
return ax
In [85]:
pd.crosstab()
In [89]:
probes_sets = {'PRC2': dx.prc2_probes, 'CpG island': dx.cpg_island,
'DHS': dx.dhs_site, 'Enhancer': dx.enhancer,
'Gene body': dx.gene_body, 'TSS': dx.gene_tss,
'Promoter': dx.promoter}
In [182]:
d2 = bhCorrection(d_hiv.p) < .01
d2 = d2.ix[ti(fx.mm < .5)]
d2.value_counts()
Out[182]:
In [183]:
associations = {}
for j,b in probes_sets.iteritems():
associations[j] = fisher_exact_test(d2,b.ix[fx.probe_idx])
associations = pd.concat(associations)
In [184]:
associations.unstack()
Out[184]:
In [185]:
np.log2(associations.ix[:,'odds_ratio']).plot(kind='bar')
Out[185]:
In [174]:
fisher_exact_test(fx.mm < .5, fx.g_hiv)
Out[174]:
In [178]:
draw_dist(fx.mm, fx.g_hiv)
In [181]:
draw_dist(fx.mm, dx_gran)
In [173]:
fisher_exact_test(fx.mm < .5, dx_gran)
Out[173]:
In [186]:
d2 = bhCorrection(d_hiv.p) < .01
d2 = d2.ix[ti(fx.mm > .5)]
d2.value_counts()
Out[186]:
In [187]:
associations = {}
for j,b in probes_sets.iteritems():
associations[j] = fisher_exact_test(d2,b.ix[fx.probe_idx])
associations = pd.concat(associations)
In [188]:
associations.unstack()
Out[188]:
In [124]:
d2 = bhCorrection(d_hiv.p.ix[ti(fx.g_hiv)]) < .01
d2 = d2.ix[fx.g_hiv.index].fillna(False)
d2.value_counts()
Out[124]:
In [125]:
associations = {}
for j,b in probes_sets.iteritems():
associations[j] = fisher_exact_test(d2, b.ix[fx.probe_idx])
associations = pd.concat(associations)
In [126]:
associations.unstack()
Out[126]:
In [127]:
np.log2(associations.ix[:,'odds_ratio']).plot(kind='bar')
Out[127]: