Helpers for Finding Differentially Methylated Probes


In [1]:
import os
if os.getcwd().endswith('Setup'):
    os.chdir('..')

In [2]:
import NotebookImport
from Setup.Imports import *


/cellar/users/agross/anaconda2/lib/python2.7/site-packages/IPython/nbformat/current.py:19: UserWarning: IPython.nbformat.current is deprecated.

- use IPython.nbformat for read/write/validate public API
- use IPython.nbformat.vX directly to composing notebooks of a particular version

  """)
importing IPython notebook from Setup/Imports
Populating the interactive namespace from numpy and matplotlib

In [3]:
from scipy.special import logit

logit_adj = lambda df: logit(df.clip(.001, .999))

Couple of minor tweaks


In [4]:
def boxplot_panel(hit_vec, response_df):
    """
    Draws a series of paired boxplots with the rows of the response_df
    split according to hit_vec.  
    """
    b = response_df.copy()
    #b.columns = pd.MultiIndex.from_arrays([b.columns, hit_vec.ix[b.columns]])
    b = b.T
    v1, v2 = hit_vec.unique()
    test = lambda v: Stats.anova(hit_vec, v)
    res = b.apply(test).T
    #p = res.p.order()
    p = res.p
    b = b.ix[:, p.index]
    
    l1 = list(b.ix[ti(hit_vec == v1)].as_matrix().T)
    l2 = list(b.ix[ti(hit_vec == v2)].as_matrix().T)

    boxes = [x for t in zip(l1, l2) for x in t]
    ax1, bp = paired_boxplot(boxes)
    
    y_lim = (response_df.T.quantile(.9).max()) * 1.2
    pts = [(i * 3.5 + .5, y_lim) for i, n in enumerate(p) if n < .0000001]
    if len(pts) > 0:
        s1 = ax1.scatter(*zip(*pts), marker='$**$', label='$p<10^{-5}$', s=200)
    else:
        s1 = None
    pts = [(i * 3.5 + .5, y_lim) for i, n in enumerate(p) if (n < .01)
           and (n > .0000001)]
    if len(pts) > 0:
        s2 = ax1.scatter(*zip(*pts), marker='$*$', label='$p<10^{-2}$', s=30)
    else:
        s2 = None
    #ax1.set_xticklabels(b.columns)
    ax1.set_xticklabels('')
    #ax1.set_ybound(-.2,.3)
    #ax1.legend(bp['boxes'][:2] + [s2, s1],
    #           (v1, v2, '$p<10^{-2}$', '$p<10^{-5}$'),
    #           loc='best', scatterpoints=1)

Some additional functions I should add to my statistics module


In [6]:
def entropy(p):
    '''
    Entropy of a methylaion vector. Here we assume 50% methylation is 
    random and 0 or 1 constitute an informative measument. 
    '''
    q = 1. - p
    s = np.sum((p*np.log(p)) + (q*np.log(q))) / (np.log(.5) * len(p))
    s.name = 'Entropy'
    return s

Some functions for dealing with entropy of a matrix


In [8]:
def svd_entropy(df):
    U,S,vH = frame_svd(df)
    p = S ** 2 / sum(S ** 2)
    entropy = -1 * sum((p * np.log(p))) / log(len(p))
    return entropy

def entropy_gain(split, df):
    df = df.ix[:, split.index]
    h_all = svd_entropy(df)
    h_1 = svd_entropy(df.ix[:, ti(split)])
    h_0 = svd_entropy(df.ix[:, ti(split==False)])
    ratio = h_all - (h_1*split.mean() + h_0*(1-split.mean()))
    return pd.Series({'gain':ratio, 'h_all': h_all, 'h_0':h_0, 'h_1':h_1})

Helper functions for diffential expression


In [9]:
def ttest_df(split_vec, df):
    dmean = df.T.groupby(split_vec).mean().T
    dvar = df.T.groupby(split_vec).var().T
    dn = df.T.groupby(split_vec).count().astype(float).T
    s12 = ((((dn[True] - 1)*dvar[True]) + ((dn[False] - 1)*dvar[False])) / 
           (dn.sum(1) - 2)) ** .5
    t = (dmean[True] - dmean[False]) / (s12 * np.sqrt((1/dn[True]) + (1/dn[False])))
    t = t.dropna()
    return t

def effect_size(split_vec, df):
    dmean = df.T.groupby(split_vec).mean().T
    return pd.concat([dmean[True], dmean[False], dmean[True] - dmean[False]],
                     axis=1, keys=['g1','g2','d'])

Read in Gene Sets

  • I don't like this thing with the paths, hopfully fix later

In [10]:
from Data.Annotations import read_in_pathways

gs, gl = read_in_pathways('/cellar/users/agross/TCGA_Code/TCGA/Extra_Data/c2.cp.v3.0.symbols_edit.csv')
gs = pd.DataFrame({p: pd.Series(1, index=s) for p,s in gs.iteritems()})
gs = gs.ix[gl.keys()].fillna(0)
gene_sets = gs

Read in Probe Annotations from Data-Store


In [11]:
STORE = HDFS_DIR + 'methylation_annotation_2.h5'
islands = pd.read_hdf(STORE, 'islands')
locations = pd.read_hdf(STORE, 'locations')
other = pd.read_hdf(STORE, 'other')
snps = pd.read_hdf(STORE, 'snps')
probe_annotations = pd.read_hdf(STORE, 'probe_annotations')

Auxilary function to map a data-vector from probes onto CpG Islands

  • I've got some globals going on in here
  • If I keep it, I'm probalby going to have to move this to a class

In [12]:
isl = islands.sort(['Islands_Name','Relation_to_Island']).dropna()
isl = isl[isl.Islands_Name.isin(ti(isl.Islands_Name.value_counts() > 7))]

ot = other.Regulatory_Feature_Name
ot = ot[ot.isin(ti(ot.value_counts()> 7))]

gb = pd.concat([isl, probe_annotations], axis=1)
gb = gb[gb.Gene_Symbol.notnull() & gb.Islands_Name.notnull()]

g2 = gb.sort('Islands_Name')

top_gene = lambda s: s.Gene_Symbol.value_counts().index[0]
island_to_gene = g2.groupby('Islands_Name').apply(top_gene)

def map_to_islands(s):
    '''
    s is a Series of measurments on the probe level.
    '''
    on_island = s.groupby(isl.Islands_Name).mean().order()
    
    v = pd.concat([island_to_gene, on_island], axis=1).set_index(0)[1]
    islands_mapped_to_genes = v.groupby(level=0).mean().order()
    return on_island, islands_mapped_to_genes

Helper for making CpG island plots


In [1]:
def island_plot_maker(df, split, islands, ann, colors=None):
    '''
    df:      a DataFrame of probe beta values
    islands: a DataFrame mapping probes to CpG islands and 
             annotations
    ann:     a DataFrame mapping probes to gene annotations
             and genomic coordinates of probe
    '''
    if colors is None:
        colors = colors_st
    groups = split.dropna().unique()
    assert len(groups) == 2
    
    def f(region):
        p = ti(islands.Islands_Name == region)
        p3 = ann.ix[p].join(islands.ix[p]).sort('Genomic_Coordinate')
        p = p3.index
        in_island = ti(p3.Relation_to_Island == 'Island')
        
        fig, ax = subplots(figsize=(10,4))
        for i,g in enumerate(groups):
            ax.scatter(p3.Genomic_Coordinate, df[ti(split == g)].ix[p].mean(1),
                       color=colors[i], label=g)
        ax.axvspan(p3.Genomic_Coordinate.ix[in_island[0]] - 30, 
                   p3.Genomic_Coordinate.ix[in_island[-1]] + 30, 
                   alpha=.2, color=colors[2], label='Island')
        ax.set_xlabel('Genomic Coordinate')
        ax.set_ylabel('Beta Value')
        ax.legend(loc='lower right', fancybox=True)
        prettify_ax(ax)
    return f

Create annotation probe sets


In [14]:
cpg_island = isl.Relation_to_Island == 'Island'
dhs_site = other.DHS == 'TRUE'
enhancer = other.Enhancer == 'TRUE'
gene_body = other.UCSC_RefGene_Group.str.contains('Body')
gene_tss = other.UCSC_RefGene_Group.str.contains('TSS')
promoter = other.Regulatory_Feature_Group.str.contains('Promoter_Associated')

PRC2 probe annotations are initiallized in PRC2 Probes notbook.


In [15]:
p = '/cellar/users/agross/Data/GeneSets/PRC2_Binding/'
prc2_probes = pd.read_csv(p + 'mapped_to_methylation_probes.csv',
                          index_col=0)
prc2_probes = prc2_probes.sum(1)>2

In [16]:
probe_sets = {'PRC2': prc2_probes, 'CpG Island': cpg_island,
              'DHS Site': dhs_site, 'Enhancer': enhancer,
              'Gene Body': gene_body, 'TSS': gene_tss,
              'Promoter': promoter}