Imports


In [2]:
import pandas as pd
import os as os

In [1]:
# Path of the `.h5` annotation store. The original location is kept as the
# default; set METHYLATION_DATA_STORE in the environment to run the notebook
# against a store that lives somewhere else.
DATA_STORE = os.environ.get('METHYLATION_DATA_STORE',
                            '/data_ssd/methylation_annotation_2.h5')

Logit adjustment (with clipping of extreme values), as recommended for methylation data.


In [4]:
from scipy.special import logit

def logit_adj(df):
    '''
    Logit-transform methylation values after clipping them to the
    interval [.001, .999], so that values of exactly 0 or 1 do not map to
    infinite logits.
    '''
    bounded = df.clip(.001, .999)
    return logit(bounded)

Some functions for dealing with entropy of methylation data


In [9]:
def entropy(p):
    '''
    Entropy of a methylation vector. Here we assume 50% methylation is
    random and 0 or 1 constitute an informative measurement.

    Computes sum(p*log(p) + q*log(q)) / (log(.5) * len(p)) with q = 1 - p,
    i.e. the mean binary entropy in bits: 1 when every value is .5.

    p may be a DataFrame, in which case the sum runs down each column and
    a Series named 'Entropy' is returned, or a Series / 1-d array, in
    which case a scalar is returned.

    Values of exactly 0 or 1 produce 0 * log(0) = NaN terms. pandas
    reductions skip NaN by default, so for pandas inputs those terms drop
    out of the sum while the row still counts towards len(p).
    '''
    q = 1. - p
    # log(0) and 0 * -inf are expected for p in {0, 1}; keep numpy quiet.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = (p * np.log(p)) + (q * np.log(q))
    # Reduce explicitly along axis 0 instead of relying on how
    # np.sum(DataFrame) dispatches with axis=None.
    s = terms.sum(axis=0) / (np.log(.5) * len(p))
    # Only a Series result can carry a name; scalars would raise here.
    if isinstance(s, pd.Series):
        s.name = 'Entropy'
    return s


def svd_entropy(df):
    '''
    Entropy of a matrix as calculated by the spread of the singular values
    in its decomposition.

    The squared singular values returned by frame_svd are normalised to
    sum to one, and the Shannon entropy of that distribution is divided by
    log(number of singular values).

    Singular values that are exactly zero are left out of the sum (taking
    0 * log(0) as 0), so a rank-deficient matrix does not yield NaN. With
    a single singular value the normaliser log(1) is 0 and the result is
    NaN.
    '''
    U, S, vH = frame_svd(df)
    # assumes S is a 1-d collection of singular values, as the len() based
    # normalisation implies -- TODO confirm against frame_svd
    power = np.asarray(S, dtype=float) ** 2
    p = power / power.sum()
    nonzero = p[p > 0]
    return -1 * np.sum(nonzero * np.log(nonzero)) / np.log(len(p))


def entropy_gain(split, df):
    '''
    Entropy gain of a matrix as the result of being split by a binary vector.

    split is a boolean vector whose index labels columns of df. df is
    restricted to those columns, then svd_entropy is computed for the
    whole matrix (h_all) and for the columns selected by ti(split) (h_1)
    and ti(split == False) (h_0). The gain is h_all minus the average of
    h_1 and h_0 weighted by split.mean() and 1 - split.mean().

    Returns a Series with entries 'gain', 'h_all', 'h_0' and 'h_1'.
    '''
    # .ix was removed in pandas 1.0; columns are selected by label.
    df = df.loc[:, split.index]
    frac_1 = split.mean()
    h_all = svd_entropy(df)
    # NOTE(review): ti is defined elsewhere; presumably it returns the
    # index labels at which its argument is True -- confirm.
    h_1 = svd_entropy(df.loc[:, ti(split)])
    h_0 = svd_entropy(df.loc[:, ti(split == False)])
    gain = h_all - (h_1 * frac_1 + h_0 * (1 - frac_1))
    return pd.Series({'gain': gain, 'h_all': h_all, 'h_0': h_0, 'h_1': h_1})

Read in Gene Sets


In [2]:
from Data.Annotations import unstack_geneset_csv
import pkg_resources

# Resolve the gene-set CSV named 'c2.cp.v5.csv' inside the MethylTools
# package and bind the path to GENE_SETS so the loader call below receives it.
GENE_SETS = pkg_resources.resource_filename('MethylTools', 'c2.cp.v5.csv')
gene_sets = unstack_geneset_csv(GENE_SETS)