Global Imports



In [4]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
rcParams['axes.color_cycle']









    Out[2]:





['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00', '#ffff33', '#a65628']

External Package Imports



In [ ]:

    
from __future__ import division



In [2]:

    
import os as os
import pickle as pickle
import pandas as pd

Module Imports



In [7]:

    
from Helpers.LinAlg import *
from Helpers.Pandas import *
from Helpers.RPY2 import *
from Stats.Scipy import *
from Stats.Regression import *

from Figures.FigureHelpers import *
from Figures.Pandas import *
from Figures.Boxplots import *
from Figures.Regression import *

Tweaking Display Parameters



In [5]:

    
pd.set_option('precision', 3)
pd.set_option('display.width', 300)
plt.rcParams['font.size'] = 14



In [ ]:

    
import seaborn as sns

sns.set_context("paper", font_scale=1.7, rc={"lines.linewidth": 2.5})
sns.set_style("white")



In [6]:

    
'''Color schemes for paper taken from http://colorbrewer2.org/'''
colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00', 
          '#ffff33', '#a65628']
colors_st = ['#CA0020', '#F4A582', '#92C5DE', '#0571B0']
colors_th = ['#E66101', '#FDB863', '#B2ABD2', '#5E3C99']

Preform a logit transformation while constraining extreme values.



In [1]:

    
from scipy.special import logit

logit_adj = lambda df: logit(df.clip(.001, .999))

Model Fit/Assessment Helpers



In [1]:

    
def detrend(x,y):
    x, y = match_series(x, y)
    reg = linear_regression(x, y)
    adj = (y - reg['intercept']) / reg['slope']
    return adj



In [3]:

    
def get_error(x, y, groups=None, denominator='mean'):
    if groups is not None:
        r = pd.DataFrame({g: get_error(x.ix[ti(groups==g)], y, denominator=denominator) 
                          for g in groups.unique()})
        return r
    if denominator == 'x':
        d = x
    elif denominator == 'y':
        d = y
    else:
        d = (x + y) / 2.
    
    pct_error = ((x - y) / d).abs().dropna().mean() * 100.
    avg_error = (x- y).abs().dropna().mean()
    return pd.Series({'error (years)': avg_error, '% error': pct_error})



In [ ]:

    
def model_fit(m, age):
    reg = linear_regression(age, m)
    error = get_error(age, m)
    vec = pd.Series({'n': int(len(m.dropna())), 
                     'r': reg['r-value'],
                     'error (years)': error['error (years)'],
                     '% error': error['% error']})
    return vec.ix[['n','r','error (years)', '% error']]

Preform two step linear adjustement.

First on each data-source seperately
Then on all data-sources combined



In [4]:

    
def two_step_adjustment(m1, m2, age, groups):
    m1_adj = {}
    m2_adj = {}
    k = m1.index.intersection(m2.index)
    for s in groups.unique():
        pts = ti(groups == s).intersection(k)
        m1_adj[s] = detrend(age, m1.ix[pts])
        m2_adj[s] = detrend(age, m2.ix[pts])
    m1_adj = pd.concat(m1_adj.values())
    m2_adj = pd.concat(m2_adj.values())
    mc_adj = (m1_adj + m2_adj) / 2.
    mc_adj = detrend(age, mc_adj)
    mc_adj.name = 'Predicted Age (Combined)'
    return m1_adj, m2_adj, mc_adj

Manhattan plot



In [ ]:

    
def manhattan(vec, chrom, coords, ax=None, ybound=None,
              flow='up', ticks=True, gap=3e7):
    fig, ax = init_ax(ax, figsize=(9,3))
    x = 0
    chr_coords = []
    for i,c in enumerate(map(str, range(1,23))):
        v = vec.ix[ti(chrom == c)].dropna()
        series_scatter(coords + x, v, s=10, ax=ax, 
                       color=colors[i % 5], 
                       ann=None, alpha=1, rasterized=True) 
        chr_len = coords.ix[v.index].max()
        x = x + chr_len + gap
        chr_coords += [x - (chr_len / 2.)]
    ax.set_xbound(gap, x + gap)
    if ybound is not None:
        ax.set_ybound(ybound[0],ybound[1])
    ax.set_xlabel('Chromosome')
    
    if ticks:
        ax.set_xticks(chr_coords)
        ax.set_xticklabels(map(str, range(1,23)))
    else:
        ax.set_xticks([])
    top = flow == 'down'
    prettify_ax(ax, top)

Distribution plot with inset option



In [ ]:

    
def draw_dist(vec, split=None, ax=None, legend=True, colors=None,
              lim=None, tail='right', inset=False):
    """
    Draw a smooth distribution from data with an optional splitting factor.
    """
    _, ax = init_ax(ax)
    if split is None:
        split = pd.Series('s', index=vec.index)
        colors = {'s': colors} if colors is not None else None
        
    def get_dist(v):
        dist = smooth_dist(v)
        if tail is not None:
            dist = dist[lim:] if tail is 'right' else dist[:lim]
        return dist
    
    for l,v in vec.groupby(split):
        if colors is None:
            get_dist(v).plot(label=l, lw=2, ax=ax)
        else:
            get_dist(v).plot(label=l, lw=2, ax=ax, color=colors[l])
    if legend and len(split.unique()) > 1:
        ax.legend(loc='upper left', frameon=False)
    ax.set_yticks([])
    ax.set_xlabel(vec.name)
    ax.set_ylabel('probe density')

    
    if inset:
        a = axes([.4, .4, .5, .5])
        max_y = 0
        for l,v in vec.groupby(split):
            v = smooth_dist(v)
            if colors is None:

                a.plot(v.index, v, label=l, lw=2)
            else:
                a.plot(v.index, v, label=l, lw=2, color=colors[l])
            max_y = max(max_y, v.max())
        setp(a, yticks=[])
        a.set_xlabel(vec.name)
        a.set_ylabel('probe density')
        a.set_xlim(vec.min(), vec.max())
        a.set_ylim(0, max_y*1.03)
        a.axvspan(lim, 1, color='grey', alpha=.1)
    #ax.axvspan(0, 1, color='grey', alpha=.05)
    
def odds_ratio(a,b):
    ct = pd.crosstab(a,b).astype(float)
    r = (ct.ix[0,0] * ct.ix[1,1]) / (ct.ix[0,1] * ct.ix[1,0])
    return r

Global Parameters

While it is not always the best practice to set global parameters, I find that for analyses such as these it is beneficial for a few reasons.

Selection criteria can be established in one place and updates can be propagated to all subsequent analyses
Things such as hard paths can be extracted from the logic and placed in one central location



In [27]:

    
FIGDIR = '/cellar/users/agross/figures/'
HDFS_DIR = '/cellar/users/agross/Data/SSD/'

path = '/cellar/users/agross/TCGA_Code/Methlation/'
ucsd_path = path + 'data/UCSD_Methylation/'
hannum_path = path + 'data/Hannum/'
path_italy = path + 'data/EPIC_ITALY/'