In [1]:
import itertools
# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy.optimize
import scipy.stats as st
# Numerical differentiation packages
import numdifftools as ndt
# Import pyplot for plotting
import matplotlib.pyplot as plt
# Seaborn, useful for graphics
import seaborn as sns
# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline
# This enables high res graphics inline (only use with static plots (non-Bokeh))
# SVG is preferred, but there is a bug in Jupyter with vertical lines
%config InlineBackend.figure_formats = {'png', 'retina'}
# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2,
'axes.labelsize': 18,
'axes.titlesize': 18,
'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)
In [4]:
# Load DataFrame
df = pd.read_csv('../input/singer_transcript_counts.csv',
comment='#')
# Take a look
df.head()
Out[4]:
In [5]:
_ = df.hist(normed=True, bins=30)
In [6]:
#plot ecdfs
fig, ax= plt.subplots(2,2)
#below makes all possible tuples of (0, 1)*(0,1)
sp_inds= list(itertools.product([0, 1], [0,1]))
for i, gene in enumerate(df.columns):
y= np.arange(len(df[gene]))/len(df[gene])
x= np.sort(df[gene].values)
#plot
ax[sp_inds[i]].plot(x, y, '.')
ax[sp_inds[i]].text(0.7, 0.25, gene, \
transform= ax[sp_inds[i]].transAxes, fontsize= 18)
ax[sp_inds[i]].margins(0.02)
#clean up
for i in [0,1]:
ax[1,i].set_xlabel('mRNA count')
ax[i,0].set_ylabel('ECDF')
plt.tight_layout()
In [25]:
def log_posterior(params, n):
"""
Log posterior for MLE of Singer data
"""
r, p= params
#zero probability of p <0 or p>1
if p<0 or p>1:
return -np.inf
#zero probability of burst parameter r being less than o
if r < 0:
return -np.inf
return st.nbinom.logpmf(n, r, p).sum()
def neg_log_posterior(params, n):
"""
Negative log posterior for MLE of Singer data.
"""
return -log_posterior(params, n)
In [26]:
#guess:
p0= np.array([5, .5])
#define hessian with small step size to avoid discontinuity
hes_fun= ndt.Hessian(log_posterior, step= 0.001)
df_MAP= pd.DataFrame(columns= ['gene', 'r', 'r_err', 'p', 'p_err', 'b'])
#mle
for i, gene in enumerate(df.columns):
#minimize!
res = scipy.optimize.minimize(neg_log_posterior, p0, args=(df[gene],),
method='powell')
#error bars
hes= hes_fun(res.x, df[gene])
cov= -np.linalg.inv(hes)
#unpack
df_out = pd.DataFrame([[gene,
res.x[0],
np.sqrt(cov[0,0]),
res.x[1],
np.sqrt(cov[1,1]),
(1 - res.x[1]) / res.x[1]]],
columns=df_MAP.columns, index=[i])
df_MAP= df_MAP.append(df_out)
df_MAP
Out[26]:
In [27]:
def log_posterior_bimodal(params, n):
"""
Log of posterior for lin comb of neg. binomials
"""
r_1, r_2, p_1, p_2, f= params
if (f<0) or (f>1):
return -np.inf
if (r_1 < 0) or (r_2 < 0) or (p_1 < p_2) or (p_2 < 0) or (p_1 > 1):
return -np.inf
return np.log(f*st.nbinom.pmf(n, r_1, p_1)\
+ (1-f)*st.nbinom.pmf(n, r_2, p_2)).sum()
def neg_log_posterior_bimodal(params, n):
"""
Neg. log posterior for lin comb of neg binom
"""
return -log_posterior_bimodal(params, n)
In [102]:
# Give inital guesses
p0 = np.array([5, 5, 0.3, 0.1, 0.5])
# Define Hessian function with small initial step size to avoid discontinuity
hes_fun = ndt.Hessian(log_posterior_bimodal, step= 0.0001)
# Perform the optimization
res = scipy.optimize.minimize(neg_log_posterior_bimodal, p0,
args=(df['Rex1'],), method='powell')
# Compute error bars
hes = hes_fun(res.x, df['Rex1'])
cov = -np.linalg.inv(hes)
# Store results in DataFrame
columns = ['r1', 'r1_err', 'r2', 'r2_err',
'p1', 'p1_err', 'p2', 'p2_err',
'f', 'f_err', 'b1', 'b2']
#the one-liner below is super smart!
data = [x[j] for x in list(zip(res.x, np.sqrt(np.diag(cov))))
for j in range(len(x))]
data += [(1 - res.x[2]) / res.x[2], (1 - res.x[3]) / res.x[3]]
df_Rex1_bimodal = pd.DataFrame([data], columns=columns)
# Have a look
df_Rex1_bimodal
Out[102]:
In [103]:
fig, ax = plt.subplots(2, 2)
sp_inds = list(itertools.product([0,1], [0,1]))
for i, gene in enumerate(df.columns):
# What data to take from MAP DataFrame
inds = (df_MAP['gene']==gene)
# Build theroetical PMF
n_plot = np.arange(0, df[gene].max()+1)
r, p = df_MAP[inds][['r', 'p']].values.flatten()
# Plot histogram and PMF
_ = ax[sp_inds[i]].hist(df[gene], bins=50, normed=True)
ax[sp_inds[i]].plot(n_plot, st.nbinom.pmf(n_plot, r, p), '-',
color=sns.color_palette()[2])
# If it's Rex1, also show the bimodal PMF
if gene == 'Rex1':
# Build theroetical PMF
cols = ['r1', 'r2', 'p1', 'p2', 'f']
r_1, r_2, p_1, p_2, f = df_Rex1_bimodal[inds][cols].values.flatten()
pmf = f * st.nbinom.pmf(n_plot,r_1, p_1) +\
(1-f) * st.nbinom.pmf(n_plot,r_2, p_2)
ax[sp_inds[i]].plot(n_plot, pmf, '-', color=sns.color_palette()[5])
# Clean up plot
ax[sp_inds[i]].text(0.7, 0.75, gene, transform=ax[sp_inds[i]].transAxes,
fontsize=18)
ax[sp_inds[i]].margins(y=0.02)
# Clean up
for i in [0,1]:
ax[1,i].set_xlabel('mRNA count')
ax[i,0].set_ylabel('probability')
plt.tight_layout()
In [104]:
fig, ax = plt.subplots(2, 2)
sp_inds = list(itertools.product([0,1], [0,1]))
for i, gene in enumerate(df.columns):
# What data to take from MAP DataFrame
inds = (df_MAP['gene']==gene)
# Build ECDF
y = np.arange(len(df[gene])) / len(df[gene])
x = np.sort(df[gene].values)
# Plot
ax[sp_inds[i]].plot(x, y, '.')
ax[sp_inds[i]].text(0.7, 0.25, gene, transform=ax[sp_inds[i]].transAxes,
fontsize=18)
ax[sp_inds[i]].margins(0.02)
# Overlay theoretical CDF
n_plot = np.arange(0, df[gene].max()+1)
r, p = df_MAP[inds][['r', 'p']].values.flatten()
ax[sp_inds[i]].plot(n_plot, st.nbinom.cdf(n_plot, r, p), '-',
color=sns.color_palette()[2])
# If it is Rex1, do bimodal as well
if gene == 'Rex1':
# Build theroetical PMF
cols = ['r1', 'r2', 'p1', 'p2', 'f']
r_1, r_2, p_1, p_2, f = df_Rex1_bimodal[inds][cols].values.flatten()
cdf = f * st.nbinom.cdf(n_plot,r_1, p_1) +\
(1-f) * st.nbinom.cdf(n_plot,r_2, p_2)
ax[sp_inds[i]].plot(n_plot, cdf, '-', color=sns.color_palette()[5])
# Clean up
for i in [0,1]:
ax[1,i].set_xlabel('mRNA count')
ax[i,0].set_ylabel('ECDF')
plt.tight_layout()
In [105]:
fig, ax = plt.subplots(2, 2)
sp_inds = list(itertools.product([0,1], [0,1]))
for i, gene in enumerate(df.columns):
# What data to take from MAP DataFrame
inds = (df_MAP['gene']==gene)
# Parameters
r, p = df_MAP[inds][['r', 'p']].values.flatten()
# x-values
x = np.sort(df[gene].values)
# Make draws
theor_x = np.array(
[np.sort(st.nbinom.rvs(r, p, size=len(x))) for _ in range(1000)])
# Upper and lower bounds
low_theor, up_theor = np.percentile(theor_x, (2.5, 97.5), axis=0)
# Plot Q-Q plots with 95% conf.
ax[sp_inds[i]].fill_between(x, up_theor, low_theor, alpha=0.7)
# Plot 45 degree line
x_lim = ax[sp_inds[i]].get_xlim()
ax[sp_inds[i]].plot(x_lim, x_lim, 'k-')
# Label plot
ax[sp_inds[i]].text(0.25, 0.75, gene, transform=ax[sp_inds[i]].transAxes,
fontsize=18)
# Clean up
for i in [0,1]:
ax[1,i].set_xlabel('mRNA transcripts')
ax[i,0].set_ylabel('mRNA transcripts')
plt.tight_layout()
In [106]:
# Pull out parameters
cols = ['r1', 'r2', 'p1', 'p2', 'f']
r1, r2, p1, p2, f = df_Rex1_bimodal[cols].values.flatten()
# x-values
x = np.sort(df['Rex1'].values)
# Make draws
def draw_double_neg_binom(r1, r2, p1, p2, f, size=1):
"""
Draw from double negative binomial
"""
fr = st.bernoulli.rvs(f, size=size)
return fr * st.nbinom.rvs(r1, p1, size=size) + \
(1-fr) * st.nbinom.rvs(r2, p2, size=size)
theor_x = np.array(
[np.sort(draw_double_neg_binom(r1, r2, p1, p2, f, size=len(x)))
for _ in range(1000)])
# Upper and lower bounds
low_theor, up_theor = np.percentile(theor_x, (2.5, 97.5), axis=0)
# Plot Q-Q plots with 95% conf.
plt.fill_between(x, up_theor, low_theor, alpha=0.7)
# Plot 45 degree line
x_lim = plt.gca().get_xlim()
plt.plot(x_lim, x_lim, 'k-')
# Label axes
plt.xlabel('mRNA transcripts')
plt.ylabel('mRNA transcripts')
plt.title('Rex1 Double Bimodal Model QQ Plot')
Out[106]:
In [ ]:
In [ ]: