In [2]:
# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy.optimize

# Numerical differentiation packages
import numdifftools as ndt

# Import pyplot for plotting
import matplotlib.pyplot as plt

# Seaborn, useful for graphics
import seaborn as sns

# Bokeh stuff
import bokeh.charts
import bokeh.io

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables high res graphics inline (only use with static plots (non-Bokeh))
# SVG is preferred, but there is a bug in Jupyter with vertical lines
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)

# Display graphics in this notebook
bokeh.io.output_notebook()


/Users/davidangeles/anaconda/envs/py35/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
BokehJS successfully loaded.

In [4]:
#load data
df= pd.read_csv('../input/invitro_droplet_data.csv', comment= '#')
df.head()


Out[4]:
Droplet Diameter (um) Droplet Volume (uL) Spindle Length (um) Spindle Width (um) Spindle Area (um2)
0 27.1 0.000010 28.9 10.8 155.8
1 28.2 0.000012 22.7 7.2 81.5
2 29.4 0.000013 26.2 10.5 138.3
3 31.0 0.000016 19.2 9.4 90.5
4 31.0 0.000016 28.4 12.1 172.4

In [7]:
#compute spindle volume
spindle_volume= np.pi*df['Spindle Length (um)']\
            *df['Spindle Width (um)']**2/6.0*1e-9

#compute the ratio v_s/v_0
vol_ratio= spindle_volume/ df['Droplet Volume (uL)']
    
n, b, p= plt.hist(vol_ratio, bins= int(np.sqrt(len(vol_ratio))))
plt.xlabel(r'$V_\mathrm{s} / V_0$')
plt.ylabel('count')


Out[7]:
<matplotlib.text.Text at 0x1102ce9e8>

In [8]:
#compute aspect ratio, k
k= df['Spindle Width (um)']/ df['Spindle Length (um)']

#Plot histogram of results
n, b, p= plt.hist(k, bins= int(np.sqrt(len(k))))
plt.xlabel(r'$k')
plt.ylabel('count')


Out[8]:
<matplotlib.text.Text at 0x10f677160>

In [10]:
#take a quick look at the data
p= bokeh.charts.Scatter(df, x= 'Droplet Diameter (um)', \
                       y= 'Spindle Length (um)')
bokeh.io.show(p)



In [11]:
#estimate the spindle length assuming constant spindle length
#i.e., each spindle is of length theta plus/minus gaussian error
theta= df['Spindle Length (um)'].mean()
r= df['Spindle Length (um)'].std()
n= df['Spindle Length (um)'].count()


# Print mean plus minus a stanrdard error of the mean
print("""
Model a results (≈68% of total posterior probability)
-----------------------------------------------------
θ = {0:.1f} ± {1:.1f} µm
""".format(theta, r / np.sqrt(n)))


Model a results (≈68% of total posterior probability)
-----------------------------------------------------
θ = 32.9 ± 0.2 µm


In [14]:
plt.plot(df['Droplet Diameter (um)'], df['Spindle Length (um)'], '.'\
        , markersize= 10, alpha= 0.25)

plt.plot([0,250], [theta, theta], '-', color= sns.color_palette()[2])

plt.xlabel(u'droplet diamete (µm)')
plt.ylabel(u'spindle length (µm)')


/Users/davidangeles/anaconda/envs/py35/lib/python3.5/site-packages/matplotlib/__init__.py:892: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
Out[14]:
<matplotlib.text.Text at 0x111964c50>

Model b

Model b assumes that spindle size is dependent on total tubulin concentration $$l(d) = \frac{\gamma d}{(1+(\frac{\gamma d}{\theta})^3)^\frac{1}{3}}$$

Use a uniform prior for $$\gamma$$ and $$\theta$$ and a jeffrey prior for the data variance $$\sigma$$


In [35]:
def spindle_length(p, d):
    """
    Theoretical model for spindle length
    """
    
    theta, gamma= p
    return gamma*d/np.cbrt(1+(gamma*d/theta)**3)

#post marginalization over the variance sigma
def log_post(p, d, ell):
    """
    Compute log of posterior for single set of parameters
    p[0]= theta
    p[1]= gamma
    """
    #unpack parameters
    theta, gamma= p
    
    #theoretical length
    ell_theor= spindle_length(p, d)
    
    return -len(d)/2*np.log(np.sum((ell - ell_theor)**2))

#parameter values to plot
th= np.linspace(37, 40, 100)
gamma= np.linspace(0.8, 0.95, 100)

#make a grid
tt, gg= np.meshgrid(th, gamma)

#compute log posterior
log_posterior= np.empty_like(tt)
for j in range(len(th)):
    for i in range(len(gamma)):
        p= np.array([tt[i, j], gg[i, j]]) #parameters to unpack
        log_posterior[i, j]= log_post(p, df['Droplet Diameter (um)'],\
                                      df['Spindle Length (um)'])
#scale
log_posterior -= log_posterior.max()

plt.contourf(tt, gg, np.exp(log_posterior), cmap= plt.cm.Blues,\
             alpha= 0.7)
plt.xlabel(r'$\theta$ (µm)')
plt.ylabel(r'$\gamma$')


Out[35]:
<matplotlib.text.Text at 0x1103720f0>

Find the Maximal A Posteriori Estimate (MAPE)

The a posteriori estimate is the set of parameter values that maximize the posterior probability. This can be done by minimizing the residuals for the model, since the sum of the residuals are inversely proportional to the posterior probability of the parameters


In [37]:
def resid(p, d, ell):
    """
    Residuals for spindle length
    """
    return ell - spindle_length(p,d)

In [38]:
#Initial guess
p0= np.array([40, 0.6])

In [41]:
#place extra args as tuple
args= (df['Droplet Diameter (um)'], df['Spindle Length (um)'])

#compute map
popt, _= scipy.optimize.leastsq(resid, p0, args= args)

#extract vals
theta, gamma= popt

# Print results
print("""
Model 2b, most probable parameters
----------------------------------
θ = {0:.1f} µm
γ = {1:.2f}
""".format(theta, gamma))


Model 2b, most probable parameters
----------------------------------
θ = 38.2 µm
γ = 0.86


In [43]:
#plot

#values of droplet diameter
d_plot= np.linspace(0, 250, 200)

#theoretical curve
spindle_theor= spindle_length(popt, d_plot)

#plot
plt.plot(df['Droplet Diameter (um)'], df['Spindle Length (um)'],\
        '.', markersize= 10, alpha= 0.25)

plt.plot(d_plot, spindle_theor, '-', color= sns.color_palette()[2])

# Use unicode to do plot label with microns with nicely formatted mu
plt.xlabel(u'droplet diameter (µm)')
plt.ylabel(u'spindle length (µm)')


/Users/davidangeles/anaconda/envs/py35/lib/python3.5/site-packages/matplotlib/__init__.py:892: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
Out[43]:
<matplotlib.text.Text at 0x10f9444e0>

In [46]:
# Instantiate Hessian for log posterior
hes_fun = ndt.Hessian(log_post)

# Compute Hessian at MAP
hes = hes_fun(popt, df['Droplet Diameter (um)'],\
              df['Spindle Length (um)'])

# Compute the covariance matrix
cov = -np.linalg.inv(hes)

# Look at it
cov

# Report results
print("""
Results for Model b (≈ 68% of total probability)
------------------------------------------------
θ = {0:.1f} ± {1:.1f} µm
γ = {2:.2f} ± {3:.2f}
""".format(theta, np.sqrt(cov[0,0]), gamma, np.sqrt(cov[1,1])))


Results for Model b (≈ 68% of total probability)
------------------------------------------------
θ = 38.2 ± 0.4 µm
γ = 0.86 ± 0.02