Read in Probe Annotations


In [15]:
import pandas as pd

In [14]:
DATA_STORE = '/data_ssd/methylation_annotation_2.h5'

In [2]:
store = pd.HDFStore(DATA_STORE)
islands = pd.read_hdf(DATA_STORE, 'islands')
locations = pd.read_hdf(DATA_STORE, 'locations')
other = pd.read_hdf(DATA_STORE, 'other')
snps = pd.read_hdf(DATA_STORE, 'snps')
probe_annotations = pd.read_hdf(DATA_STORE, 'probe_annotations')
probe_to_island = store['probe_to_island']
island_to_gene = store['island_to_gene']

Auxilary function to map a data-vector from probes onto CpG Islands


In [3]:
def map_to_islands(s):
    '''
    s is a Series of measurments on the probe level.
    '''
    on_island = s.groupby(island_to_gene.Islands_Name).mean().order()
    
    v = pd.concat([island_to_gene, on_island], axis=1).set_index(0)[1]
    islands_mapped_to_genes = v.groupby(level=0).mean().order()
    return on_island, islands_mapped_to_genes

Helper for making CpG island plots


In [4]:
def island_plot_maker(df, split, islands, ann, colors=None):
    '''
    df:      a DataFrame of probe beta values
    islands: a DataFrame mapping probes to CpG islands and 
             annotations
    ann:     a DataFrame mapping probes to gene annotations
             and genomic coordinates of probe
    '''
    if colors is None:
        colors = colors_st
    groups = split.dropna().unique()
    assert len(groups) == 2
    
    def f(region):
        p = ti(islands.Islands_Name == region)
        p3 = ann.ix[p].join(islands.ix[p]).sort('Genomic_Coordinate')
        p = p3.index
        in_island = ti(p3.Relation_to_Island == 'Island')
        
        fig, ax = subplots(figsize=(10,4))
        for i,g in enumerate(groups):
            ax.scatter(p3.Genomic_Coordinate, df[ti(split == g)].ix[p].mean(1),
                       color=colors[i], label=g)
        ax.axvspan(p3.Genomic_Coordinate.ix[in_island[0]] - 30, 
                   p3.Genomic_Coordinate.ix[in_island[-1]] + 30, 
                   alpha=.2, color=colors[2], label='Island')
        ax.set_xlabel('Genomic Coordinate')
        ax.set_ylabel('Beta Value')
        ax.legend(loc='lower right', fancybox=True)
        prettify_ax(ax)
    return f

Create annotation probe sets


In [5]:
cpg_island = probe_to_island.Relation_to_Island == 'Island'
dhs_site = other.DHS == 'TRUE'
enhancer = other.Enhancer == 'TRUE'
gene_body = other.UCSC_RefGene_Group.str.contains('Body')
gene_tss = other.UCSC_RefGene_Group.str.contains('TSS')
promoter = other.Regulatory_Feature_Group.str.contains('Promoter_Associated')

PRC2 probe annotations are initiallized in PRC2 Probes notbook.


In [16]:
p = '/cellar/users/agross/TCGA_Code/MethylTools/Data/PRC2_Binding/'
prc2_probes = pd.read_csv(p + 'mapped_to_methylation_probes.csv',
                          index_col=0)
prc2_probes = prc2_probes.sum(1)>2

In [17]:
probe_sets = {'PRC2': prc2_probes, 'CpG Island': cpg_island,
              'DHS Site': dhs_site, 'Enhancer': enhancer,
              'Gene Body': gene_body, 'TSS': gene_tss,
              'Promoter': promoter}