In [2]:
import pandas as pd
import os as os
In [1]:
# Location of the methylation annotation store (presumably an HDF5 file,
# judging by the .h5 extension). The historical absolute path is kept as the
# default so existing setups keep working; set METHYLATION_DATA_STORE in the
# environment to point the notebook at a different location without editing it.
DATA_STORE = os.environ.get('METHYLATION_DATA_STORE',
                            '/data_ssd/methylation_annotation_2.h5')
Logit adjustment, with clipping of extreme values, is recommended for methylation data.
In [4]:
from scipy.special import logit
def logit_adj(df, lower=.001, upper=.999):
    '''
    Logit-transform values after clipping them into [lower, upper].

    Clipping keeps exact 0 and 1 away from the logit, which would otherwise
    map them to -inf / +inf.

    Parameters
    ----------
    df : object exposing ``.clip(lower, upper)`` (e.g. a pandas DataFrame
        or Series) holding values on the [0, 1] scale.
    lower, upper : float, optional
        Clipping bounds applied before the transform. The defaults
        reproduce the original fixed bounds of .001 and .999.

    Returns
    -------
    The result of ``scipy.special.logit`` applied to the clipped values.
    '''
    return logit(df.clip(lower, upper))
In [9]:
def entropy(p):
    '''
    Normalized binary entropy of methylation values.

    Here we assume 50% methylation is random and 0 or 1 constitute an
    informative measurement: values of 0.5 score 1, values approaching
    0 or 1 score close to 0.

    Parameters
    ----------
    p : pandas DataFrame or Series (or 1-d array) of values in [0, 1].
        For a DataFrame the entropy is computed down each column.

    Returns
    -------
    pandas Series named 'Entropy' (one value per column) when `p` is a
    DataFrame; a scalar when `p` is a single vector.
    '''
    q = 1. - p
    terms = (p * np.log(p)) + (q * np.log(q))
    # Sum explicitly along axis 0 instead of relying on np.sum() dispatching
    # to DataFrame.sum(axis=None), whose column-wise behavior is deprecated
    # in recent pandas. With pandas inputs NaN terms (e.g. from 0 * log(0))
    # are skipped by the default skipna, while len(p) still counts every row.
    s = terms.sum(axis=0) / (np.log(.5) * len(p))
    # A single vector reduces to a plain scalar, which cannot carry a name;
    # only label the per-column Series result.
    if isinstance(s, pd.Series):
        s.name = 'Entropy'
    return s
def svd_entropy(df):
    '''
    Entropy of a matrix as calculated by the spread of the singular values
    in its decomposition.

    The squared singular values are normalized to sum to one and their
    Shannon entropy is divided by log(number of singular values).

    Parameters
    ----------
    df : matrix-like object accepted by `frame_svd` (presumably a pandas
        DataFrame -- confirm against that helper).

    Returns
    -------
    The normalized entropy value. Note that a decomposition with a single
    singular value makes the denominator log(1) == 0.
    '''
    U, S, vH = frame_svd(df)
    p = S ** 2 / sum(S ** 2)
    # Use np.log for the normalizer as well: the bare `log` name only exists
    # if some star-import happens to provide it, and np.log is what the rest
    # of this expression already uses.
    h = -1 * sum((p * np.log(p))) / np.log(len(p))
    return h
def entropy_gain(split, df):
    '''
    Entropy gain of a matrix as the result of being split by a binary vector.

    Parameters
    ----------
    split : pandas Series of boolean values whose index labels select
        columns of `df`.
    df : pandas DataFrame; it is first restricted to the columns named in
        `split.index`.

    Returns
    -------
    pandas Series with entries:
        'gain'  -- h_all minus the average of h_1 and h_0 weighted by the
                   fraction of True / False entries in `split`
        'h_all' -- svd_entropy of all selected columns
        'h_0'   -- svd_entropy of the columns where `split` is False
        'h_1'   -- svd_entropy of the columns where `split` is True
    '''
    # .ix was removed in pandas 1.0; .loc is the label-based equivalent.
    df = df.loc[:, split.index]
    h_all = svd_entropy(df)
    # NOTE(review): `ti` is defined elsewhere; this assumes it returns the
    # index labels at which its boolean argument is True -- confirm.
    h_1 = svd_entropy(df.loc[:, ti(split)])
    h_0 = svd_entropy(df.loc[:, ti(split == False)])
    frac_1 = split.mean()
    ratio = h_all - (h_1 * frac_1 + h_0 * (1 - frac_1))
    return pd.Series({'gain': ratio, 'h_all': h_all, 'h_0': h_0, 'h_1': h_1})
In [2]:
from Data.Annotations import unstack_geneset_csv
import pkg_resources
# Resolve the gene-set CSV shipped inside the MethylTools package and keep the
# path: previously the result of this call was discarded, leaving GENE_SETS
# undefined for the next line on a fresh kernel.
GENE_SETS = pkg_resources.resource_filename('MethylTools', 'c2.cp.v5.csv')
gene_sets = unstack_geneset_csv(GENE_SETS)