In [1]:
%matplotlib inline
In [2]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
In [3]:
import os
import pickle
import pandas as pd
Here I am using a few of my own packages. They are available on GitHub under theandygross and should all be installable with python setup.py install.
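For example, each package can be installed from a clone of its repository. This is a sketch only; &lt;package&gt; is a placeholder, not an actual repository name:
In [ ]:
!git clone https://github.com/theandygross/<package>.git
!cd <package> && python setup.py install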
In [5]:
from Stats.Scipy import *
from Stats.Survival import *
from Helpers.Pandas import *
from Helpers.LinAlg import *
from Figures.FigureHelpers import *
from Figures.Pandas import *
from Figures.Boxplots import *
from Figures.Regression import *
#from Figures.Survival import draw_survival_curve, survival_and_stats
#from Figures.Survival import draw_survival_curves
#from Figures.Survival import survival_stat_plot
In [6]:
import Data.Firehose as FH
from Data.Containers import get_run
In [7]:
import NotebookImport
from Global_Parameters import *
In [8]:
pd.set_option('display.precision', 3)
pd.set_option('display.width', 300)
plt.rcParams['font.size'] = 12
In [9]:
# Color schemes for paper taken from http://colorbrewer2.org/
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
colors_st = ['#CA0020', '#F4A582', '#92C5DE', '#0571B0']
colors_th = ['#E66101', '#FDB863', '#B2ABD2', '#5E3C99']
In [10]:
import seaborn as sns
sns.set_context('paper', font_scale=1.5)
sns.set_style('white')
This reads in data that was pre-processed in the ./Preprocessing/init_RNA notebook.
In [11]:
codes = pd.read_hdf(RNA_SUBREAD_STORE, 'codes')
matched_tn = pd.read_hdf(RNA_SUBREAD_STORE, 'matched_tn')
rna_df = pd.read_hdf(RNA_SUBREAD_STORE, 'all_rna')
In [12]:
data_portal = pd.read_hdf(RNA_STORE, 'matched_tn')
genes = data_portal.index.intersection(matched_tn.index)
pts = data_portal.columns.intersection(matched_tn.columns)
rna_df = rna_df.loc[genes]
matched_tn = matched_tn.loc[genes, pts]
In [13]:
from Data.Annotations import unstack_geneset_csv
gene_sets = unstack_geneset_csv(GENE_SETS)
gene_sets = gene_sets.reindex(rna_df.index).fillna(0)
Initialize a function for calling model-based gene set analysis (MGSA) through rpy2.
In [14]:
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

pandas2ri.activate()
mgsa = importr('mgsa')
In [15]:
gs_r = robjects.ListVector({i: robjects.StrVector(list(ti(g > 0)))
                            for i, g in gene_sets.items()})

def run_mgsa(vec):
    v = robjects.r.c(*ti(vec))
    r = mgsa.mgsa(v, gs_r)
    res = pandas2ri.ri2pandas(mgsa.setsResults(r))
    return res
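A minimal usage sketch (hypothetical: hit_genes stands for any boolean Series indexed by gene; the 'estimate' column name follows the mgsa package's setsResults output, so treat it as an assumption):
In [ ]:
# Hypothetical usage: rank gene sets by MGSA posterior probability
# for a set of hit genes (hit_genes is a boolean Series over genes).
mgsa_results = run_mgsa(hit_genes)
mgsa_results.sort_values('estimate', ascending=False).head()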
Because we run the binomial test across 450k probes in the same test space, the same test gets recomputed many times. Here I memoize the function to cache results rather than recompute them. This eats up a couple of GB of memory, but should be reasonable.
In [16]:
from scipy.stats import binom_test

def memoize(f):
    memo = {}
    def helper(x, y, z):
        if (x, y, z) not in memo:
            memo[(x, y, z)] = f(x, y, z)
        return memo[(x, y, z)]
    return helper

binom_test_mem = memoize(binom_test)

def binomial_test_screen(df, fc=1.5, p=.5):
    """
    Run a binomial test on a DataFrame.

    df:
        DataFrame of measurements. Should have a column multi-index with
        subjects on the first level and TCGA tissue type ('01' = primary
        tumor, '11' = solid tissue normal) on the second level.
    fc:
        Fold-change cutoff to use.
    p:
        Null success probability for the binomial test.
    """
    a, b = df.xs('01', axis=1, level=1), df.xs('11', axis=1, level=1)
    dx = a - b
    dx = dx[dx.abs() > np.log2(fc)]
    n = dx.count(axis=1)
    counts = (dx > 0).sum(axis=1)
    cn = pd.concat([counts, n], axis=1)
    cn = cn[cn.sum(axis=1) > 0]
    b_test = cn.apply(lambda s: binom_test_mem(s[0], s[1], p), axis=1)
    dist = (1. * cn[0] / cn[1])
    tab = pd.concat([cn[0], cn[1], dist, b_test],
                    keys=['num_ox', 'num_dx', 'frac', 'p'],
                    axis=1)
    return tab
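A minimal usage sketch against the matched tumor/normal matrix loaded above (the thresholds here are hypothetical, not the paper's):
In [ ]:
# Hypothetical usage: screen for genes with a consistent direction of
# change between tumors ('01') and matched normals ('11').
tab = binomial_test_screen(matched_tn, fc=1.5)
tab[tab.p < 1e-5].sort_values('frac', ascending=False).head()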
Here I add linewidth (lw) and number-of-bins (bins) arguments to draw_dist. These changes should eventually get pushed upstream.
In [17]:
def draw_dist(vec, split=None, ax=None, legend=True, colors=None, lw=2, bins=300):
    """
    Draw a smooth distribution from data with an optional splitting factor.
    """
    _, ax = init_ax(ax)
    if split is None:
        split = pd.Series('s', index=vec.index)
        colors = {'s': colors} if colors is not None else None
    for l, v in vec.groupby(split):
        if colors is None:
            smooth_dist(v, bins=bins).plot(label=l, lw=lw, ax=ax)
        else:
            smooth_dist(v, bins=bins).plot(label=l, lw=lw, ax=ax,
                                           color=colors[l])
    if legend and len(split.unique()) > 1:
        ax.legend(loc='upper left', frameon=False)
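A minimal usage sketch (the gene choice is hypothetical; it assumes 'TP53' is present in matched_tn's index):
In [ ]:
# Hypothetical usage: draw the pooled expression distribution for one gene.
fig, ax = subplots(figsize=(5, 3))
draw_dist(matched_tn.loc['TP53'], ax=ax, lw=2, bins=300)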
Some helper functions for fast calculation of odds ratios on matrices.
In [18]:
from scipy import stats

def odds_ratio_df(a, b):
    """Row-wise odds ratios between two aligned binary DataFrames."""
    a = a.astype(int)
    b = b.astype(int)
    flip = lambda v: (v == 0).astype(int)
    a11 = (a.add(b) == 2).sum(axis=1)
    a10 = (a.add(flip(b)) == 2).sum(axis=1)
    a01 = (flip(a).add(b) == 2).sum(axis=1)
    a00 = (flip(a).add(flip(b)) == 2).sum(axis=1)
    odds_ratio = (1. * a11 * a00) / (1. * a10 * a01)
    df = pd.concat([a00, a01, a10, a11], axis=1,
                   keys=['00', '01', '10', '11'])
    return odds_ratio, df

def fet(s):
    # Fisher's exact test on one row of 2x2 contingency-table counts.
    odds, p = stats.fisher_exact([[s['00'], s['01']],
                                  [s['10'], s['11']]])
    return p
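A minimal usage sketch (events_a and events_b are hypothetical, aligned binary DataFrames with features on the rows and samples on the columns):
In [ ]:
# Hypothetical usage: row-wise association between two binary event matrices.
odds, counts = odds_ratio_df(events_a, events_b)
p_values = counts.apply(fet, axis=1)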
In [1]:
def filter_pathway_hits(hits, gs, cutoff=.00001):
    '''
    Takes a vector of p-values indexed by gene set, and a DataFrame of
    binary gene-set membership. Uses the ordering defined by hits to do
    a greedy filtering of redundant gene sets.
    '''
    l = [hits.index[0]]
    for gg in hits.index:
        if gg in l:
            continue
        # Keep a gene set only if it is not significantly associated
        # with any gene set already retained.
        if all(chi2_cont_test(gs[gg], gs[g2])['p'] >= cutoff for g2 in l):
            l.append(gg)
    hits_filtered = hits.loc[l]
    return hits_filtered
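A minimal usage sketch (pathway_p is hypothetical: a Series of per-gene-set p-values, sorted ascending so the strongest hits come first):
In [ ]:
# Hypothetical usage: greedily prune redundant gene sets.
top_sets = filter_pathway_hits(pathway_p.sort_values(), gene_sets, cutoff=1e-5)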