In [15]:
import pandas as pd
In [14]:
DATA_STORE = '/data_ssd/methylation_annotation_2.h5'
In [2]:
store = pd.HDFStore(DATA_STORE)
islands = pd.read_hdf(DATA_STORE, 'islands')
locations = pd.read_hdf(DATA_STORE, 'locations')
other = pd.read_hdf(DATA_STORE, 'other')
snps = pd.read_hdf(DATA_STORE, 'snps')
probe_annotations = pd.read_hdf(DATA_STORE, 'probe_annotations')
probe_to_island = store['probe_to_island']
island_to_gene = store['island_to_gene']
In [3]:
def map_to_islands(s):
'''
s is a Series of measurments on the probe level.
'''
on_island = s.groupby(island_to_gene.Islands_Name).mean().order()
v = pd.concat([island_to_gene, on_island], axis=1).set_index(0)[1]
islands_mapped_to_genes = v.groupby(level=0).mean().order()
return on_island, islands_mapped_to_genes
In [4]:
def island_plot_maker(df, split, islands, ann, colors=None):
'''
df: a DataFrame of probe beta values
islands: a DataFrame mapping probes to CpG islands and
annotations
ann: a DataFrame mapping probes to gene annotations
and genomic coordinates of probe
'''
if colors is None:
colors = colors_st
groups = split.dropna().unique()
assert len(groups) == 2
def f(region):
p = ti(islands.Islands_Name == region)
p3 = ann.ix[p].join(islands.ix[p]).sort('Genomic_Coordinate')
p = p3.index
in_island = ti(p3.Relation_to_Island == 'Island')
fig, ax = subplots(figsize=(10,4))
for i,g in enumerate(groups):
ax.scatter(p3.Genomic_Coordinate, df[ti(split == g)].ix[p].mean(1),
color=colors[i], label=g)
ax.axvspan(p3.Genomic_Coordinate.ix[in_island[0]] - 30,
p3.Genomic_Coordinate.ix[in_island[-1]] + 30,
alpha=.2, color=colors[2], label='Island')
ax.set_xlabel('Genomic Coordinate')
ax.set_ylabel('Beta Value')
ax.legend(loc='lower right', fancybox=True)
prettify_ax(ax)
return f
In [5]:
cpg_island = probe_to_island.Relation_to_Island == 'Island'
dhs_site = other.DHS == 'TRUE'
enhancer = other.Enhancer == 'TRUE'
gene_body = other.UCSC_RefGene_Group.str.contains('Body')
gene_tss = other.UCSC_RefGene_Group.str.contains('TSS')
promoter = other.Regulatory_Feature_Group.str.contains('Promoter_Associated')
PRC2 probe annotations are initiallized in PRC2 Probes notbook.
In [16]:
p = '/cellar/users/agross/TCGA_Code/MethylTools/Data/PRC2_Binding/'
prc2_probes = pd.read_csv(p + 'mapped_to_methylation_probes.csv',
index_col=0)
prc2_probes = prc2_probes.sum(1)>2
In [17]:
probe_sets = {'PRC2': prc2_probes, 'CpG Island': cpg_island,
'DHS Site': dhs_site, 'Enhancer': enhancer,
'Gene Body': gene_body, 'TSS': gene_tss,
'Promoter': promoter}