In [1]:
import numpy as np
import numpy.random as npr
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# DREMI - "Density resampled estimate of mutual information"
#   - to quantify the strength of influence of one protein on another
# DREVI - "Conditional-Density rescaled visualization"
#   - "to visualize and characterize the edge-response function underlying their molecular interaction

In [18]:
def f(x,lam=10):
    ''' test "function" that stochastically maps an input value to
    one of two ranges of output values'''
    r = npr.randn()
    if npr.rand()*x < (lam/3):
        return (2*r)+5
    return r

In [19]:
x = npr.rand(10000)*10

In [20]:
y = np.array([f(xi) for xi in x])

In [43]:
plt.scatter(x,y,linewidths=0,alpha=0.1)


Out[43]:
<matplotlib.collections.PathCollection at 0x10faea470>

In [47]:
plt.hist(x,bins=50);
plt.figure()
plt.hist(y,bins=50);



In [29]:
ax = sns.kdeplot(x,y,shade=True)
plt.xlabel(r'$X$')
plt.ylabel(r'$f(X)$')


Out[29]:
<matplotlib.text.Text at 0x10f58c278>

In [48]:
# problem the paper tries to address: what if X isn't uniformly distributed?

In [30]:
# idea: just use Gaussian process regression

In [31]:
from sklearn.neighbors import KernelDensity

In [32]:
kde = KernelDensity()

In [38]:
XY = np.vstack((x,y)).T

In [39]:
kde.fit(XY)


Out[39]:
KernelDensity(algorithm='auto', atol=0, bandwidth=1.0, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [ ]:
grid = np.zeros((100,100))

The data-discard step is unclear, and it's not obvious why it would work if the tails of the distribution were in fact heavy:

"Most markers (such as phosphoproteins) are distributed among the measured cells as heavy-tailed multimodal distributions (fig. S7C). For any pair of markers X and Y, we remove 1% of both tails of these distributions as outliers, removing ~200 cells from a population of ~10,000 cells (per condition and time point). The remaining range is generally well populated by cells and is amenable for a robust density estimate, which results in robust visualization and scoring, as shown in fig. S7."

It's also unclear why they use a heat-equation (diffusion) KDE instead of a standard Gaussian KDE, and I'm not sure what this means: "the evolution of the heat equation decides the shape of the kernel rather than having a fixed Gaussian estimate." They claim one benefit of the method is that it lets the user impose that the PDF is zero in some regions, but it's not clear how you would have that knowledge.

The relevant bit of the kde2d implementation:

transformed_data=(data-repmat(MIN_XY,N,1))./repmat(scaling,N,1);
%bin the data uniformly using regular grid;
initial_data=ndhist(transformed_data,n);
% discrete cosine transform of initial data
a= dct2d(initial_data);
% now compute the optimal bandwidth^2
  I=(0:n-1).^2; A2=a.^2;

 t_star=fzero( @(t)(t-evolve(t)),[0,0.1]);

p_02=func([0,2],t_star);p_20=func([2,0],t_star); p_11=func([1,1],t_star);
t_y=(p_02^(3/4)/(4*pi*N*p_20^(3/4)*(p_11+sqrt(p_20*p_02))))^(1/3);
t_x=(p_20^(3/4)/(4*pi*N*p_02^(3/4)*(p_11+sqrt(p_20*p_02))))^(1/3);
% smooth the discrete cosine transform of initial data using t_star
a_t=exp(-(0:n-1)'.^2*pi^2*t_x/2)*exp(-(0:n-1).^2*pi^2*t_y/2).*a;
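
For what it's worth, the core of that snippet (bin the data, take a DCT, damp each frequency by the heat-equation factor, invert) is easy to reproduce. A minimal 1-D version, with the squared bandwidth t fixed by hand rather than selected by the fixed-point step above:

In [ ]:
from scipy.fftpack import dct, idct

def diffusion_kde_1d(data, n=256, t=1e-3):
    '''minimal 1-D diffusion KDE: bin the data, take the DCT, damp mode k
    by exp(-(pi*k)^2 * t / 2), then invert; t plays the role of the
    squared bandwidth (fixed here, not selected automatically)'''
    hist, edges = np.histogram(data, bins=n, density=True)
    a = dct(hist, norm='ortho')
    k = np.arange(n)
    a_t = a * np.exp(-(np.pi * k)**2 * t / 2)
    dens = np.clip(idct(a_t, norm='ortho'), 0, None)
    centers = 0.5 * (edges[:-1] + edges[1:])
    return centers, dens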

The section on "Fitting edge-response functions" is not very helpful: repeats 3 seperate times almost verbatim that they fit a line, sigmoid, or double-sigmoid, and if that didn't work, then a "free-form curve" but it's not explained how they do the non-parametric fit


In [ ]:
# the exact problem this paper is trying to solve is tackled by:
# http://www.cc.gatech.edu/~isbell/papers/isbell-density-2007.pdf

# use dual-trees to compute conditional KDE, and they come up with a
# super-fast way to do maximum-likelihood cross-validation of the
# KDE bandwidth

In [ ]:
# cosma's lecture notes on estimating conditional densities:
# http://www.stat.cmu.edu/~cshalizi/402/lectures/06-density/lecture-06.pdf
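
The textbook estimator those notes describe is just a ratio of two KDEs, p_hat(y | x) = p_hat(x, y) / p_hat(x). A minimal version using scipy's gaussian_kde (rule-of-thumb bandwidths, none of the cross-validation discussed above):

In [ ]:
from scipy.stats import gaussian_kde

def conditional_density(x_obs, y_obs, x0, y_grid):
    '''estimate p(y | x = x0) on y_grid as joint KDE / marginal KDE'''
    joint = gaussian_kde(np.vstack((x_obs, y_obs)))
    marginal = gaussian_kde(x_obs)
    pts = np.vstack((np.full_like(y_grid, x0, dtype=float), y_grid))
    return joint(pts) / marginal(x0)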

In [66]:
def noisy_line(x,noise_size=10):
    ''' despite the name, a noisy quadratic: x**2 plus Gaussian noise '''
    return x**2 + noise_size*npr.randn()

In [96]:
n = 5000
x = 1.5*npr.randn(n)+ (npr.randint(0,2,n))*7+2
y = np.array([noisy_line(xi) for xi in x])

In [97]:
plt.scatter(x,y,linewidths=0,alpha=0.1)


Out[97]:
<matplotlib.collections.PathCollection at 0x110b45d30>

In [98]:
plt.hist(x,bins=50);
plt.figure()
plt.hist(y,bins=50);



In [99]:
# idea: just do a fit as usual, but with each point
# weighted according to (1 / its x-density)
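
A more literal version of that idea than the GP route taken below: plain weighted least squares, with each point weighted by 1 over its estimated x-density (the KDE bandwidth below is an arbitrary choice, and np.polyfit squares its weight argument internally, so the weights are passed as 1/sqrt(density)):

In [ ]:
from sklearn.neighbors import KernelDensity

kde_x = KernelDensity(bandwidth=0.5).fit(x.reshape(-1, 1))
density = np.exp(kde_x.score_samples(x.reshape(-1, 1)))  # score_samples is the log-density
w = 1.0 / np.sqrt(density + 1e-12)
coeffs = np.polyfit(x, y, deg=2, w=w)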

In [100]:
from sklearn.neighbors import KernelDensity

In [101]:
# note: in newer scikit-learn this import is from sklearn.model_selection
from sklearn.grid_search import GridSearchCV

# this snippet from documentation: 
# http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
params = {'bandwidth': np.logspace(-1, 1, 10)}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(x.reshape((len(x),1)))

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_


best bandwidth: 0.2782559402207124

In [127]:
kde = KernelDensity(bandwidth=1.0)

In [128]:
x.shape


Out[128]:
(5000,)

In [129]:
# NOTE: this is a bug; KernelDensity.fit expects a 2-D array of shape
# (n_samples, n_features), so this should be kde.fit(x.reshape(-1, 1)).
# As written, the 5000 values are treated as a single 5000-dimensional
# sample, which is why the "densities" below all come out identical.
kde.fit(x)


Out[129]:
KernelDensity(algorithm='auto', atol=0, bandwidth=1.0, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [130]:
x_density = kde.score_samples(x)

In [131]:
np.min(x_density),np.max(x_density)


Out[131]:
(-4594.692666023363, -4594.692666023363)

In [104]:
# (score_samples returns the *log*-density, so this isn't literally
# 1/density; and given the degenerate fit above, every value is the
# same constant anyway)
yerr = 1/x_density

In [106]:
np.min(yerr),np.max(yerr)


Out[106]:
(0.0, 0.0)

In [132]:
def analytic_x_density(x):
    ''' unnormalized analytic stand-in for the density of x: modes at 2 and 9,
    matching how x was generated above (the widths don't match sigma=1.5,
    but only the relative density matters for the weighting) '''
    mu1=2
    mu2=9
    return np.exp(-(x-mu1)**2) + np.exp(-(x-mu2)**2)

In [135]:
x_density = analytic_x_density(x)
x_density.shape,np.min(x_density),np.max(x_density)


Out[135]:
((5000,), 1.5801959452908288e-15, 0.99999944424490972)

In [146]:
# larger assumed observation noise where the x-density is high, so the
# densely sampled regions get downweighted (the "1 / x-density" idea above)
yerr=(x_density+1)*10

In [155]:
plt.scatter(x,y,linewidths=0,c=x_density,alpha=0.1,cmap='rainbow')


Out[155]:
<matplotlib.collections.PathCollection at 0x110dd1f60>

In [175]:
import george
from george.kernels import ExpSquaredKernel

# Set up the Gaussian process.
kernel = ExpSquaredKernel(10.0)
gp = george.GP(kernel)

# Pre-compute the factorization of the matrix.
# yerr is treated as per-point observation noise, so points with large yerr
# (dense regions of x, given how yerr was built above) are downweighted.
gp.compute(x, yerr)

# Compute the log likelihood.
print(gp.lnlikelihood(y))


-26897.0968746

In [176]:
t = np.linspace(-5, 15, 500)
mu, cov = gp.predict(y, t)
std = np.sqrt(np.diag(cov))

In [177]:
mu.shape


Out[177]:
(500,)

In [178]:
std.shape,np.min(std),np.max(std)


Out[178]:
((500,), 0.31292239627432838, 0.91868252553595564)

In [179]:
plt.fill_between(t,mu+std,mu-std)


Out[179]:
<matplotlib.collections.PolyCollection at 0x111c55e80>

In [ ]: