In [4]:
%pylab inline
In [2]:
rcParams['axes.color_cycle']
Out[2]:
In [ ]:
from __future__ import division
In [2]:
import os as os
import pickle as pickle
import pandas as pd
In [7]:
from Helpers.LinAlg import *
from Helpers.Pandas import *
from Helpers.RPY2 import *
from Stats.Scipy import *
from Stats.Regression import *
from Figures.FigureHelpers import *
from Figures.Pandas import *
from Figures.Boxplots import *
from Figures.Regression import *
In [5]:
pd.set_option('precision', 3)
pd.set_option('display.width', 300)
plt.rcParams['font.size'] = 14
In [ ]:
import seaborn as sns
sns.set_context("paper", font_scale=1.7, rc={"lines.linewidth": 2.5})
sns.set_style("white")
In [6]:
'''Color schemes for paper taken from http://colorbrewer2.org/'''
colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00',
'#ffff33', '#a65628']
colors_st = ['#CA0020', '#F4A582', '#92C5DE', '#0571B0']
colors_th = ['#E66101', '#FDB863', '#B2ABD2', '#5E3C99']
Preform a logit transformation while constraining extreme values.
In [1]:
from scipy.special import logit
logit_adj = lambda df: logit(df.clip(.001, .999))
In [1]:
def detrend(x,y):
x, y = match_series(x, y)
reg = linear_regression(x, y)
adj = (y - reg['intercept']) / reg['slope']
return adj
In [3]:
def get_error(x, y, groups=None, denominator='mean'):
if groups is not None:
r = pd.DataFrame({g: get_error(x.ix[ti(groups==g)], y, denominator=denominator)
for g in groups.unique()})
return r
if denominator == 'x':
d = x
elif denominator == 'y':
d = y
else:
d = (x + y) / 2.
pct_error = ((x - y) / d).abs().dropna().mean() * 100.
avg_error = (x- y).abs().dropna().mean()
return pd.Series({'error (years)': avg_error, '% error': pct_error})
In [ ]:
def model_fit(m, age):
reg = linear_regression(age, m)
error = get_error(age, m)
vec = pd.Series({'n': int(len(m.dropna())),
'r': reg['r-value'],
'error (years)': error['error (years)'],
'% error': error['% error']})
return vec.ix[['n','r','error (years)', '% error']]
Preform two step linear adjustement.
In [4]:
def two_step_adjustment(m1, m2, age, groups):
m1_adj = {}
m2_adj = {}
k = m1.index.intersection(m2.index)
for s in groups.unique():
pts = ti(groups == s).intersection(k)
m1_adj[s] = detrend(age, m1.ix[pts])
m2_adj[s] = detrend(age, m2.ix[pts])
m1_adj = pd.concat(m1_adj.values())
m2_adj = pd.concat(m2_adj.values())
mc_adj = (m1_adj + m2_adj) / 2.
mc_adj = detrend(age, mc_adj)
mc_adj.name = 'Predicted Age (Combined)'
return m1_adj, m2_adj, mc_adj
Manhattan plot
In [ ]:
def manhattan(vec, chrom, coords, ax=None, ybound=None,
flow='up', ticks=True, gap=3e7):
fig, ax = init_ax(ax, figsize=(9,3))
x = 0
chr_coords = []
for i,c in enumerate(map(str, range(1,23))):
v = vec.ix[ti(chrom == c)].dropna()
series_scatter(coords + x, v, s=10, ax=ax,
color=colors[i % 5],
ann=None, alpha=1, rasterized=True)
chr_len = coords.ix[v.index].max()
x = x + chr_len + gap
chr_coords += [x - (chr_len / 2.)]
ax.set_xbound(gap, x + gap)
if ybound is not None:
ax.set_ybound(ybound[0],ybound[1])
ax.set_xlabel('Chromosome')
if ticks:
ax.set_xticks(chr_coords)
ax.set_xticklabels(map(str, range(1,23)))
else:
ax.set_xticks([])
top = flow == 'down'
prettify_ax(ax, top)
Distribution plot with inset option
In [ ]:
def draw_dist(vec, split=None, ax=None, legend=True, colors=None,
lim=None, tail='right', inset=False):
"""
Draw a smooth distribution from data with an optional splitting factor.
"""
_, ax = init_ax(ax)
if split is None:
split = pd.Series('s', index=vec.index)
colors = {'s': colors} if colors is not None else None
def get_dist(v):
dist = smooth_dist(v)
if tail is not None:
dist = dist[lim:] if tail is 'right' else dist[:lim]
return dist
for l,v in vec.groupby(split):
if colors is None:
get_dist(v).plot(label=l, lw=2, ax=ax)
else:
get_dist(v).plot(label=l, lw=2, ax=ax, color=colors[l])
if legend and len(split.unique()) > 1:
ax.legend(loc='upper left', frameon=False)
ax.set_yticks([])
ax.set_xlabel(vec.name)
ax.set_ylabel('probe density')
if inset:
a = axes([.4, .4, .5, .5])
max_y = 0
for l,v in vec.groupby(split):
v = smooth_dist(v)
if colors is None:
a.plot(v.index, v, label=l, lw=2)
else:
a.plot(v.index, v, label=l, lw=2, color=colors[l])
max_y = max(max_y, v.max())
setp(a, yticks=[])
a.set_xlabel(vec.name)
a.set_ylabel('probe density')
a.set_xlim(vec.min(), vec.max())
a.set_ylim(0, max_y*1.03)
a.axvspan(lim, 1, color='grey', alpha=.1)
#ax.axvspan(0, 1, color='grey', alpha=.05)
def odds_ratio(a,b):
ct = pd.crosstab(a,b).astype(float)
r = (ct.ix[0,0] * ct.ix[1,1]) / (ct.ix[0,1] * ct.ix[1,0])
return r
While it is not always the best practice to set global parameters, I find that for analyses such as these it is beneficial for a few reasons.
In [27]:
FIGDIR = '/cellar/users/agross/figures/'
HDFS_DIR = '/cellar/users/agross/Data/SSD/'
path = '/cellar/users/agross/TCGA_Code/Methlation/'
ucsd_path = path + 'data/UCSD_Methylation/'
hannum_path = path + 'data/Hannum/'
path_italy = path + 'data/EPIC_ITALY/'