Global Imports


In [4]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
rcParams['axes.color_cycle']


Out[2]:
['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00', '#ffff33', '#a65628']

External Package Imports


In [ ]:
from __future__ import division

In [2]:
import os as os
import pickle as pickle
import pandas as pd

Module Imports


In [7]:
from Helpers.LinAlg import *
from Helpers.Pandas import *
from Helpers.RPY2 import *
from Stats.Scipy import *
from Stats.Regression import *

from Figures.FigureHelpers import *
from Figures.Pandas import *
from Figures.Boxplots import *
from Figures.Regression import *

Tweaking Display Parameters


In [5]:
pd.set_option('precision', 3)
pd.set_option('display.width', 300)
plt.rcParams['font.size'] = 14

In [ ]:
import seaborn as sns

sns.set_context("paper", font_scale=1.7, rc={"lines.linewidth": 2.5})
sns.set_style("white")

In [6]:
'''Color schemes for paper taken from http://colorbrewer2.org/'''
colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00', 
          '#ffff33', '#a65628']
colors_st = ['#CA0020', '#F4A582', '#92C5DE', '#0571B0']
colors_th = ['#E66101', '#FDB863', '#B2ABD2', '#5E3C99']

Preform a logit transformation while constraining extreme values.


In [1]:
from scipy.special import logit

logit_adj = lambda df: logit(df.clip(.001, .999))

Model Fit/Assessment Helpers


In [1]:
def detrend(x,y):
    x, y = match_series(x, y)
    reg = linear_regression(x, y)
    adj = (y - reg['intercept']) / reg['slope']
    return adj

In [3]:
def get_error(x, y, groups=None, denominator='mean'):
    if groups is not None:
        r = pd.DataFrame({g: get_error(x.ix[ti(groups==g)], y, denominator=denominator) 
                          for g in groups.unique()})
        return r
    if denominator == 'x':
        d = x
    elif denominator == 'y':
        d = y
    else:
        d = (x + y) / 2.
    
    pct_error = ((x - y) / d).abs().dropna().mean() * 100.
    avg_error = (x- y).abs().dropna().mean()
    return pd.Series({'error (years)': avg_error, '% error': pct_error})

In [ ]:
def model_fit(m, age):
    reg = linear_regression(age, m)
    error = get_error(age, m)
    vec = pd.Series({'n': int(len(m.dropna())), 
                     'r': reg['r-value'],
                     'error (years)': error['error (years)'],
                     '% error': error['% error']})
    return vec.ix[['n','r','error (years)', '% error']]

Preform two step linear adjustement.

  • First on each data-source seperately
  • Then on all data-sources combined

In [4]:
def two_step_adjustment(m1, m2, age, groups):
    m1_adj = {}
    m2_adj = {}
    k = m1.index.intersection(m2.index)
    for s in groups.unique():
        pts = ti(groups == s).intersection(k)
        m1_adj[s] = detrend(age, m1.ix[pts])
        m2_adj[s] = detrend(age, m2.ix[pts])
    m1_adj = pd.concat(m1_adj.values())
    m2_adj = pd.concat(m2_adj.values())
    mc_adj = (m1_adj + m2_adj) / 2.
    mc_adj = detrend(age, mc_adj)
    mc_adj.name = 'Predicted Age (Combined)'
    return m1_adj, m2_adj, mc_adj

Manhattan plot


In [ ]:
def manhattan(vec, chrom, coords, ax=None, ybound=None,
              flow='up', ticks=True, gap=3e7):
    fig, ax = init_ax(ax, figsize=(9,3))
    x = 0
    chr_coords = []
    for i,c in enumerate(map(str, range(1,23))):
        v = vec.ix[ti(chrom == c)].dropna()
        series_scatter(coords + x, v, s=10, ax=ax, 
                       color=colors[i % 5], 
                       ann=None, alpha=1, rasterized=True) 
        chr_len = coords.ix[v.index].max()
        x = x + chr_len + gap
        chr_coords += [x - (chr_len / 2.)]
    ax.set_xbound(gap, x + gap)
    if ybound is not None:
        ax.set_ybound(ybound[0],ybound[1])
    ax.set_xlabel('Chromosome')
    
    if ticks:
        ax.set_xticks(chr_coords)
        ax.set_xticklabels(map(str, range(1,23)))
    else:
        ax.set_xticks([])
    top = flow == 'down'
    prettify_ax(ax, top)

Distribution plot with inset option


In [ ]:
def draw_dist(vec, split=None, ax=None, legend=True, colors=None,
              lim=None, tail='right', inset=False):
    """
    Draw a smooth distribution from data with an optional splitting factor.
    """
    _, ax = init_ax(ax)
    if split is None:
        split = pd.Series('s', index=vec.index)
        colors = {'s': colors} if colors is not None else None
        
    def get_dist(v):
        dist = smooth_dist(v)
        if tail is not None:
            dist = dist[lim:] if tail is 'right' else dist[:lim]
        return dist
    
    for l,v in vec.groupby(split):
        if colors is None:
            get_dist(v).plot(label=l, lw=2, ax=ax)
        else:
            get_dist(v).plot(label=l, lw=2, ax=ax, color=colors[l])
    if legend and len(split.unique()) > 1:
        ax.legend(loc='upper left', frameon=False)
    ax.set_yticks([])
    ax.set_xlabel(vec.name)
    ax.set_ylabel('probe density')

    
    if inset:
        a = axes([.4, .4, .5, .5])
        max_y = 0
        for l,v in vec.groupby(split):
            v = smooth_dist(v)
            if colors is None:

                a.plot(v.index, v, label=l, lw=2)
            else:
                a.plot(v.index, v, label=l, lw=2, color=colors[l])
            max_y = max(max_y, v.max())
        setp(a, yticks=[])
        a.set_xlabel(vec.name)
        a.set_ylabel('probe density')
        a.set_xlim(vec.min(), vec.max())
        a.set_ylim(0, max_y*1.03)
        a.axvspan(lim, 1, color='grey', alpha=.1)
    #ax.axvspan(0, 1, color='grey', alpha=.05)
    
def odds_ratio(a,b):
    ct = pd.crosstab(a,b).astype(float)
    r = (ct.ix[0,0] * ct.ix[1,1]) / (ct.ix[0,1] * ct.ix[1,0])
    return r

Global Parameters

While it is not always the best practice to set global parameters, I find that for analyses such as these it is beneficial for a few reasons.

  • Selection criteria can be established in one place and updates can be propagated to all subsequent analyses
  • Things such as hard paths can be extracted from the logic and placed in one central location

In [27]:
FIGDIR = '/cellar/users/agross/figures/'
HDFS_DIR = '/cellar/users/agross/Data/SSD/'

path = '/cellar/users/agross/TCGA_Code/Methlation/'
ucsd_path = path + 'data/UCSD_Methylation/'
hannum_path = path + 'data/Hannum/'
path_italy = path + 'data/EPIC_ITALY/'