In [1]:
import os
if os.getcwd().endswith('Setup'):
os.chdir('..')
In [2]:
import NotebookImport
from Setup.Imports import *
In [3]:
from scipy.special import logit
logit_adj = lambda df: logit(df.clip(.001, .999))
Couple of minor tweaks
In [4]:
def boxplot_panel(hit_vec, response_df):
"""
Draws a series of paired boxplots with the rows of the response_df
split according to hit_vec.
"""
b = response_df.copy()
#b.columns = pd.MultiIndex.from_arrays([b.columns, hit_vec.ix[b.columns]])
b = b.T
v1, v2 = hit_vec.unique()
test = lambda v: Stats.anova(hit_vec, v)
res = b.apply(test).T
#p = res.p.order()
p = res.p
b = b.ix[:, p.index]
l1 = list(b.ix[ti(hit_vec == v1)].as_matrix().T)
l2 = list(b.ix[ti(hit_vec == v2)].as_matrix().T)
boxes = [x for t in zip(l1, l2) for x in t]
ax1, bp = paired_boxplot(boxes)
y_lim = (response_df.T.quantile(.9).max()) * 1.2
pts = [(i * 3.5 + .5, y_lim) for i, n in enumerate(p) if n < .0000001]
if len(pts) > 0:
s1 = ax1.scatter(*zip(*pts), marker='$**$', label='$p<10^{-5}$', s=200)
else:
s1 = None
pts = [(i * 3.5 + .5, y_lim) for i, n in enumerate(p) if (n < .01)
and (n > .0000001)]
if len(pts) > 0:
s2 = ax1.scatter(*zip(*pts), marker='$*$', label='$p<10^{-2}$', s=30)
else:
s2 = None
#ax1.set_xticklabels(b.columns)
ax1.set_xticklabels('')
#ax1.set_ybound(-.2,.3)
#ax1.legend(bp['boxes'][:2] + [s2, s1],
# (v1, v2, '$p<10^{-2}$', '$p<10^{-5}$'),
# loc='best', scatterpoints=1)
In [6]:
def entropy(p):
'''
Entropy of a methylaion vector. Here we assume 50% methylation is
random and 0 or 1 constitute an informative measument.
'''
q = 1. - p
s = np.sum((p*np.log(p)) + (q*np.log(q))) / (np.log(.5) * len(p))
s.name = 'Entropy'
return s
In [8]:
def svd_entropy(df):
U,S,vH = frame_svd(df)
p = S ** 2 / sum(S ** 2)
entropy = -1 * sum((p * np.log(p))) / log(len(p))
return entropy
def entropy_gain(split, df):
df = df.ix[:, split.index]
h_all = svd_entropy(df)
h_1 = svd_entropy(df.ix[:, ti(split)])
h_0 = svd_entropy(df.ix[:, ti(split==False)])
ratio = h_all - (h_1*split.mean() + h_0*(1-split.mean()))
return pd.Series({'gain':ratio, 'h_all': h_all, 'h_0':h_0, 'h_1':h_1})
In [9]:
def ttest_df(split_vec, df):
dmean = df.T.groupby(split_vec).mean().T
dvar = df.T.groupby(split_vec).var().T
dn = df.T.groupby(split_vec).count().astype(float).T
s12 = ((((dn[True] - 1)*dvar[True]) + ((dn[False] - 1)*dvar[False])) /
(dn.sum(1) - 2)) ** .5
t = (dmean[True] - dmean[False]) / (s12 * np.sqrt((1/dn[True]) + (1/dn[False])))
t = t.dropna()
return t
def effect_size(split_vec, df):
dmean = df.T.groupby(split_vec).mean().T
return pd.concat([dmean[True], dmean[False], dmean[True] - dmean[False]],
axis=1, keys=['g1','g2','d'])
In [10]:
from Data.Annotations import read_in_pathways
gs, gl = read_in_pathways('/cellar/users/agross/TCGA_Code/TCGA/Extra_Data/c2.cp.v3.0.symbols_edit.csv')
gs = pd.DataFrame({p: pd.Series(1, index=s) for p,s in gs.iteritems()})
gs = gs.ix[gl.keys()].fillna(0)
gene_sets = gs
In [11]:
STORE = HDFS_DIR + 'methylation_annotation_2.h5'
islands = pd.read_hdf(STORE, 'islands')
locations = pd.read_hdf(STORE, 'locations')
other = pd.read_hdf(STORE, 'other')
snps = pd.read_hdf(STORE, 'snps')
probe_annotations = pd.read_hdf(STORE, 'probe_annotations')
In [12]:
isl = islands.sort(['Islands_Name','Relation_to_Island']).dropna()
isl = isl[isl.Islands_Name.isin(ti(isl.Islands_Name.value_counts() > 7))]
ot = other.Regulatory_Feature_Name
ot = ot[ot.isin(ti(ot.value_counts()> 7))]
gb = pd.concat([isl, probe_annotations], axis=1)
gb = gb[gb.Gene_Symbol.notnull() & gb.Islands_Name.notnull()]
g2 = gb.sort('Islands_Name')
top_gene = lambda s: s.Gene_Symbol.value_counts().index[0]
island_to_gene = g2.groupby('Islands_Name').apply(top_gene)
def map_to_islands(s):
'''
s is a Series of measurments on the probe level.
'''
on_island = s.groupby(isl.Islands_Name).mean().order()
v = pd.concat([island_to_gene, on_island], axis=1).set_index(0)[1]
islands_mapped_to_genes = v.groupby(level=0).mean().order()
return on_island, islands_mapped_to_genes
In [1]:
def island_plot_maker(df, split, islands, ann, colors=None):
'''
df: a DataFrame of probe beta values
islands: a DataFrame mapping probes to CpG islands and
annotations
ann: a DataFrame mapping probes to gene annotations
and genomic coordinates of probe
'''
if colors is None:
colors = colors_st
groups = split.dropna().unique()
assert len(groups) == 2
def f(region):
p = ti(islands.Islands_Name == region)
p3 = ann.ix[p].join(islands.ix[p]).sort('Genomic_Coordinate')
p = p3.index
in_island = ti(p3.Relation_to_Island == 'Island')
fig, ax = subplots(figsize=(10,4))
for i,g in enumerate(groups):
ax.scatter(p3.Genomic_Coordinate, df[ti(split == g)].ix[p].mean(1),
color=colors[i], label=g)
ax.axvspan(p3.Genomic_Coordinate.ix[in_island[0]] - 30,
p3.Genomic_Coordinate.ix[in_island[-1]] + 30,
alpha=.2, color=colors[2], label='Island')
ax.set_xlabel('Genomic Coordinate')
ax.set_ylabel('Beta Value')
ax.legend(loc='lower right', fancybox=True)
prettify_ax(ax)
return f
In [14]:
cpg_island = isl.Relation_to_Island == 'Island'
dhs_site = other.DHS == 'TRUE'
enhancer = other.Enhancer == 'TRUE'
gene_body = other.UCSC_RefGene_Group.str.contains('Body')
gene_tss = other.UCSC_RefGene_Group.str.contains('TSS')
promoter = other.Regulatory_Feature_Group.str.contains('Promoter_Associated')
PRC2 probe annotations are initiallized in PRC2 Probes notbook.
In [15]:
p = '/cellar/users/agross/Data/GeneSets/PRC2_Binding/'
prc2_probes = pd.read_csv(p + 'mapped_to_methylation_probes.csv',
index_col=0)
prc2_probes = prc2_probes.sum(1)>2
In [16]:
probe_sets = {'PRC2': prc2_probes, 'CpG Island': cpg_island,
'DHS Site': dhs_site, 'Enhancer': enhancer,
'Gene Body': gene_body, 'TSS': gene_tss,
'Promoter': promoter}