Global Imports


In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots

External Package Imports


In [3]:
import os as os
import pickle as pickle
import pandas as pd

Module Imports

Here I am using a few of my own packages, they are availible on Github under theandygross and should all be instalable by python setup.py.


In [5]:
from Stats.Scipy import *
from Stats.Survival import *

from Helpers.Pandas import *
from Helpers.LinAlg import *

from Figures.FigureHelpers import *
from Figures.Pandas import *
from Figures.Boxplots import *
from Figures.Regression import *
#from Figures.Survival import draw_survival_curve, survival_and_stats
#from Figures.Survival import draw_survival_curves
#from Figures.Survival import survival_stat_plot

In [6]:
import Data.Firehose as FH
from Data.Containers import get_run

Import Global Parameters

  • These need to be changed before you will be able to sucessfully run this code

In [7]:
import NotebookImport
from Global_Parameters import *


importing IPython notebook from Global_Parameters

Tweaking Display Parameters


In [8]:
pd.set_option('precision', 3)
pd.set_option('display.width', 300)
plt.rcParams['font.size'] = 12

In [9]:
'''Color schemes for paper taken from http://colorbrewer2.org/'''
colors = plt.rcParams['axes.color_cycle']
colors_st = ['#CA0020', '#F4A582', '#92C5DE', '#0571B0']
colors_th = ['#E66101', '#FDB863', '#B2ABD2', '#5E3C99']

In [10]:
import seaborn as sns
sns.set_context('paper',font_scale=1.5)
sns.set_style('white')

Read in All of the Expression Data

This reads in data that was pre-processed in the ./Preprocessing/init_RNA notebook.


In [11]:
codes = pd.read_hdf(RNA_SUBREAD_STORE, 'codes')
matched_tn = pd.read_hdf(RNA_SUBREAD_STORE, 'matched_tn')
rna_df = pd.read_hdf(RNA_SUBREAD_STORE, 'all_rna')

In [12]:
data_portal = pd.read_hdf(RNA_STORE, 'matched_tn')
genes = data_portal.index.intersection(matched_tn.index)
pts = data_portal.columns.intersection(matched_tn.columns)
rna_df = rna_df.ix[genes]
matched_tn = matched_tn.ix[genes, pts]

Read in Gene-Sets for GSEA


In [13]:
from Data.Annotations import unstack_geneset_csv

gene_sets = unstack_geneset_csv(GENE_SETS)
gene_sets = gene_sets.ix[rna_df.index].fillna(0)

Initialize function for calling model-based gene set enrichment


In [14]:
from rpy2 import robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

mgsa = robjects.packages.importr('mgsa')

In [15]:
gs_r = robjects.ListVector({i: robjects.StrVector(list(ti(g>0))) for i,g in 
                             gene_sets.iteritems()})
def run_mgsa(vec):
    v = robjects.r.c(*ti(vec))
    r = mgsa.mgsa(v, gs_r)
    res = pandas2ri.ri2pandas(mgsa.setsResults(r))
    return res

Function Tweaks

Running the binomial test across 450k probes in the same test space, we rerun the same test a lot. Here I memoize the function to cache results and not recompute them. This eats up a couple GB of memory but should be reasonable.


In [16]:
from scipy.stats import binom_test

def memoize(f):
    memo = {}
    def helper(x,y,z):
        if (x,y,z) not in memo:            
            memo[(x,y,z)] = f(x,y,z)
        return memo[(x,y,z)]
    return helper

binom_test_mem = memoize(binom_test)

def binomial_test_screen(df, fc=1.5, p=.5):
    """
    Run a binomial test on a DataFrame.

    df:
        DataFrame of measurements.  Should have a multi-index with
        subjects on the first level and tissue type ('01' or '11')
        on the second level.
    fc:
        Fold-chance cutoff to use
    """
    a, b = df.xs('01', 1, 1), df.xs('11', 1, 1)
    dx = a - b
    dx = dx[dx.abs() > np.log2(fc)]
    n = dx.count(1)
    counts = (dx > 0).sum(1)
    cn = pd.concat([counts, n], 1)
    cn = cn[cn.sum(1) > 0]
    b_test = cn.apply(lambda s: binom_test_mem(s[0], s[1], p), axis=1)
    dist = (1.*cn[0] / cn[1])
    tab = pd.concat([cn[0], cn[1], dist, b_test],
                    keys=['num_ox', 'num_dx', 'frac', 'p'],
                    axis=1)
    return tab

Added linewidth and number of bins arguments. This should get pushed eventually.


In [17]:
def draw_dist(vec, split=None, ax=None, legend=True, colors=None, lw=2, bins=300):
    """
    Draw a smooth distribution from data with an optional splitting factor.
    """
    _, ax = init_ax(ax)
    if split is None:
        split = pd.Series('s', index=vec.index)
        colors = {'s': colors} if colors is not None else None
    for l,v in vec.groupby(split):
        if colors is None:
            smooth_dist(v, bins=bins).plot(label=l, lw=lw, ax=ax)
        else:
            smooth_dist(v, bins=bins).plot(label=l, lw=lw, ax=ax, color=colors[l])
    if legend and len(split.unique()) > 1:
        ax.legend(loc='upper left', frameon=False)

Some helper functions for fast calculation of odds ratios on matricies.


In [18]:
def odds_ratio_df(a,b):
    a = a.astype(int)
    b = b.astype(int)
    flip = lambda v: (v == 0).astype(int)

    a11 = (a.add(b) == 2).sum(axis=1)
    a10 = (a.add(flip(b)) == 2).sum(axis=1)
    a01 = (flip(a).add(b) == 2).sum(axis=1)
    a00 = (flip(a).add(flip(b)) == 2).sum(axis=1)
    odds_ratio = (1.*a11 * a00) / (1.*a10 * a01)
    df = pd.concat([a00, a01, a10, a11], axis=1,
                   keys=['00','01','10','11'])
    return odds_ratio, df

def fet(s):
    odds, p = stats.fisher_exact([[s['00'],s['01']],
                                  [s['10'],s['11']]])
    return p

filter_pathway_hits


In [1]:
def filter_pathway_hits(hits, gs, cutoff=.00001):
    '''
    Takes a vector of p-values and a DataFrame of binary defined gene-sets. 
    Uses the ordering defined by hits to do a greedy filtering on the gene sets. 
    '''
    l = [hits.index[0]]
    for gg in hits.index:
        flag = 0
        for g2 in l:
            if gg in l:
                flag = 1
                break
            elif (chi2_cont_test(gs[gg], gs[g2])['p'] < cutoff):
                flag = 1
                break
        if flag == 0:
            l.append(gg)
    hits_filtered = hits.ix[l]
    return hits_filtered