In [1]:
%matplotlib inline 
import numpy as np 
import matplotlib.pyplot as plt

In [41]:
N = 1000  # number of sampling points  
# h = 0.337 # bandwidth  
NP = 100 # number of grid points for plotting
q = np.random.randn(N)  # samples from a standard normal

x = np.linspace(-4,4,NP)
p = np.zeros(NP)   # estimated pdf on the plotting grid
dp = np.zeros(NP)  # estimated derivative of the pdf on the plotting grid
#kde = stats.gaussian_kde(q, bw_method='silverman')


d = 1  # dimensionality of the data
# h = N**(-1./(d+4))  # Scott's rule
h = (N * (d + 2) / 4.)**(-1. / (d + 4))  # Silverman's rule

class gaussian_kde:

    """
    Kernel density estimation with a Gaussian kernel:
        p(z) = (1/(N*h)) * sum_i exp(-(z - x_i)**2 / (2*h**2)) / sqrt(2*pi)
    """
    
    def __init__(self,x,h):
        self.x = x    # sampling points 
        self.bw = h   # bandwidth 
    
    def pdf(self, z):

        x = self.x
        h = self.bw
        N = len(x)

        # sum of normalised Gaussian kernels centred at the sampling points
        pdf = sum(np.exp(-(z - x)**2 / 2. / h**2) / np.sqrt(2. * np.pi) / (N * h))

        return pdf
    
    def logpdf(self, z):

        return np.log(self.pdf(z))
    
    def dpdf(self, z, M=10):
        """
        direct estimate of the derivative of the probability density function
        M : number of basis centres used in the fit
        """
        h = 0.5  # fixed bandwidth for this fit (overrides self.bw)
        N = len(self.x)

        # One could choose M datapoints at random as basis centres, e.g.
        #     x = self.x[np.random.randint(0, N, size=M)]
        # but an evenly spaced grid is used instead:
        x = np.linspace(-3, 3, M)
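        # Reading of the fit below: approximate p'(z) by sum_i c[i]*phi_i(z),
        # where phi_i is a normalised Gaussian of width h centred at the grid
        # point x[i].  G[i,j] is the sample average of phi_i(X)*phi_j(X) over
        # the data (a Gram matrix under the empirical distribution), b[i] is
        # the derivative of the bandwidth-h KDE evaluated at x[i], and the
        # coefficients c come from a single linear solve, G c = -b/2.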

        G = np.zeros((M, M))
        for i in range(M):
            for j in range(M):
                xc = (x[j] + x[i]) / 2.0
                dx = x[i] - x[j]
                # closed form for the product of two normalised Gaussian kernels
                G[i, j] = sum(np.exp(-dx**2 / 4.0 / h**2)
                              * np.exp(-(self.x - xc)**2 / h**2)
                              / (2. * np.pi) / h**2 / N)

        # optional ridge term on the diagonal (currently disabled)
        g = 0.0
        for i in range(M):
            G[i, i] = G[i, i] + g
        
        b = np.zeros(M)
        for i in range(M):
            b[i] = sum(-(x[i] - self.x) / h**2
                       * np.exp(-(x[i] - self.x)**2 / 2. / h**2)
                       / np.sqrt(2. * np.pi) / h) / N

        c = np.linalg.solve(G, -b / 2.0)

        # evaluate the fitted expansion at z
        dpdf = np.dot(c, np.exp(-(z - x)**2 / 2. / h**2) / np.sqrt(2. * np.pi) / h)

        return dpdf
            
kde = gaussian_kde(q,h)

for j in range(NP):
    p[j] = kde.pdf(x[j])
    dp[j] = kde.dpdf(x[j])

plt.subplot(211)
plt.plot(x, p, label='KDE')
plt.plot(x, np.exp(-x**2 / 2.) / np.sqrt(2. * np.pi), '--', label='normal')
plt.legend()

plt.subplot(212)
plt.plot(x, -dp)
#plt.plot(x, -x)
plt.show()


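In [ ]:
# Cross-check (a minimal sketch): compare the hand-rolled estimator with
# scipy's gaussian_kde.  scipy multiplies the Silverman factor by the sample
# standard deviation, so for the roughly unit-variance data q the two curves
# should agree closely but not exactly.
from scipy import stats

skde = stats.gaussian_kde(q, bw_method='silverman')
plt.plot(x, p, label='hand-rolled KDE')
plt.plot(x, skde(x), '--', label='scipy gaussian_kde')
plt.legend()
plt.show()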

In [59]:
# Generate some random two-dimensional data:

from scipy import stats
def measure(n):
    "Measurement model, return two coupled measurements."
    m1 = np.random.normal(size=n)
    m2 = np.random.normal(scale=0.5, size=n)
    return m1+m2, m1-m2

m1, m2 = measure(2000)
xmin = m1.min()
xmax = m1.max()
ymin = m2.min()
ymax = m2.max()

# Perform a kernel density estimate on the data:

X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = stats.gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)

# Plot the results:

import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
          extent=[xmin, xmax, ymin, ymax])
ax.plot(m1, m2, 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
plt.show()
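
In [ ]:
# A short usage sketch: the fitted kernel object also exposes integration
# helpers (see the help output below).  Integrating the estimated density
# over the plotting box should give a value close to (slightly below) 1.
print(kernel.integrate_box([xmin, ymin], [xmax, ymax]))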



In [74]:
help(stats.gaussian_kde)


Help on class gaussian_kde in module scipy.stats.kde:

class gaussian_kde(__builtin__.object)
 |  Representation of a kernel-density estimate using Gaussian kernels.
 |  
 |  Kernel density estimation is a way to estimate the probability density
 |  function (PDF) of a random variable in a non-parametric way.
 |  `gaussian_kde` works for both uni-variate and multi-variate data.   It
 |  includes automatic bandwidth determination.  The estimation works best for
 |  a unimodal distribution; bimodal or multi-modal distributions tend to be
 |  oversmoothed.
 |  
 |  Parameters
 |  ----------
 |  dataset : array_like
 |      Datapoints to estimate from. In case of univariate data this is a 1-D
 |      array, otherwise a 2-D array with shape (# of dims, # of data).
 |  bw_method : str, scalar or callable, optional
 |      The method used to calculate the estimator bandwidth.  This can be
 |      'scott', 'silverman', a scalar constant or a callable.  If a scalar,
 |      this will be used directly as `kde.factor`.  If a callable, it should
 |      take a `gaussian_kde` instance as only parameter and return a scalar.
 |      If None (default), 'scott' is used.  See Notes for more details.
 |  
 |  Attributes
 |  ----------
 |  dataset : ndarray
 |      The dataset with which `gaussian_kde` was initialized.
 |  d : int
 |      Number of dimensions.
 |  n : int
 |      Number of datapoints.
 |  factor : float
 |      The bandwidth factor, obtained from `kde.covariance_factor`, with which
 |      the covariance matrix is multiplied.
 |  covariance : ndarray
 |      The covariance matrix of `dataset`, scaled by the calculated bandwidth
 |      (`kde.factor`).
 |  inv_cov : ndarray
 |      The inverse of `covariance`.
 |  
 |  Methods
 |  -------
 |  evaluate
 |  __call__
 |  integrate_gaussian
 |  integrate_box_1d
 |  integrate_box
 |  integrate_kde
 |  pdf
 |  logpdf
 |  resample
 |  set_bandwidth
 |  covariance_factor
 |  
 |  Notes
 |  -----
 |  Bandwidth selection strongly influences the estimate obtained from the KDE
 |  (much more so than the actual shape of the kernel).  Bandwidth selection
 |  can be done by a "rule of thumb", by cross-validation, by "plug-in
 |  methods" or by other means; see [3]_, [4]_ for reviews.  `gaussian_kde`
 |  uses a rule of thumb, the default is Scott's Rule.
 |  
 |  Scott's Rule [1]_, implemented as `scotts_factor`, is::
 |  
 |      n**(-1./(d+4)),
 |  
 |  with ``n`` the number of data points and ``d`` the number of dimensions.
 |  Silverman's Rule [2]_, implemented as `silverman_factor`, is::
 |  
 |      (n * (d + 2) / 4.)**(-1. / (d + 4)).
 |  
 |  Good general descriptions of kernel density estimation can be found in [1]_
 |  and [2]_, the mathematics for this multi-dimensional implementation can be
 |  found in [1]_.
 |  
 |  References
 |  ----------
 |  .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
 |         Visualization", John Wiley & Sons, New York, Chicester, 1992.
 |  .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
 |         Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
 |         Chapman and Hall, London, 1986.
 |  .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
 |         Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
 |  .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
 |         conditional density estimation", Computational Statistics & Data
 |         Analysis, Vol. 36, pp. 279-298, 2001.
 |  
 |  Examples
 |  --------
 |  Generate some random two-dimensional data:
 |  
 |  >>> from scipy import stats
 |  >>> def measure(n):
 |  ...     "Measurement model, return two coupled measurements."
 |  ...     m1 = np.random.normal(size=n)
 |  ...     m2 = np.random.normal(scale=0.5, size=n)
 |  ...     return m1+m2, m1-m2
 |  
 |  >>> m1, m2 = measure(2000)
 |  >>> xmin = m1.min()
 |  >>> xmax = m1.max()
 |  >>> ymin = m2.min()
 |  >>> ymax = m2.max()
 |  
 |  Perform a kernel density estimate on the data:
 |  
 |  >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
 |  >>> positions = np.vstack([X.ravel(), Y.ravel()])
 |  >>> values = np.vstack([m1, m2])
 |  >>> kernel = stats.gaussian_kde(values)
 |  >>> Z = np.reshape(kernel(positions).T, X.shape)
 |  
 |  Plot the results:
 |  
 |  >>> import matplotlib.pyplot as plt
 |  >>> fig, ax = plt.subplots()
 |  >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
 |  ...           extent=[xmin, xmax, ymin, ymax])
 |  >>> ax.plot(m1, m2, 'k.', markersize=2)
 |  >>> ax.set_xlim([xmin, xmax])
 |  >>> ax.set_ylim([ymin, ymax])
 |  >>> plt.show()
 |  
 |  Methods defined here:
 |  
 |  __call__ = evaluate(self, points)
 |  
 |  __init__(self, dataset, bw_method=None)
 |  
 |  covariance_factor = scotts_factor(self)
 |  
 |  evaluate(self, points)
 |      Evaluate the estimated pdf on a set of points.
 |      
 |      Parameters
 |      ----------
 |      points : (# of dimensions, # of points)-array
 |          Alternatively, a (# of dimensions,) vector can be passed in and
 |          treated as a single point.
 |      
 |      Returns
 |      -------
 |      values : (# of points,)-array
 |          The values at each point.
 |      
 |      Raises
 |      ------
 |      ValueError : if the dimensionality of the input points is different than
 |                   the dimensionality of the KDE.
 |  
 |  integrate_box(self, low_bounds, high_bounds, maxpts=None)
 |      Computes the integral of a pdf over a rectangular interval.
 |      
 |      Parameters
 |      ----------
 |      low_bounds : array_like
 |          A 1-D array containing the lower bounds of integration.
 |      high_bounds : array_like
 |          A 1-D array containing the upper bounds of integration.
 |      maxpts : int, optional
 |          The maximum number of points to use for integration.
 |      
 |      Returns
 |      -------
 |      value : scalar
 |          The result of the integral.
 |  
 |  integrate_box_1d(self, low, high)
 |      Computes the integral of a 1D pdf between two bounds.
 |      
 |      Parameters
 |      ----------
 |      low : scalar
 |          Lower bound of integration.
 |      high : scalar
 |          Upper bound of integration.
 |      
 |      Returns
 |      -------
 |      value : scalar
 |          The result of the integral.
 |      
 |      Raises
 |      ------
 |      ValueError
 |          If the KDE is over more than one dimension.
 |  
 |  integrate_gaussian(self, mean, cov)
 |      Multiply estimated density by a multivariate Gaussian and integrate
 |      over the whole space.
 |      
 |      Parameters
 |      ----------
 |      mean : array_like
 |          A 1-D array, specifying the mean of the Gaussian.
 |      cov : array_like
 |          A 2-D array, specifying the covariance matrix of the Gaussian.
 |      
 |      Returns
 |      -------
 |      result : scalar
 |          The value of the integral.
 |      
 |      Raises
 |      ------
 |      ValueError :
 |          If the mean or covariance of the input Gaussian differs from
 |          the KDE's dimensionality.
 |  
 |  integrate_kde(self, other)
 |      Computes the integral of the product of this  kernel density estimate
 |      with another.
 |      
 |      Parameters
 |      ----------
 |      other : gaussian_kde instance
 |          The other kde.
 |      
 |      Returns
 |      -------
 |      value : scalar
 |          The result of the integral.
 |      
 |      Raises
 |      ------
 |      ValueError
 |          If the KDEs have different dimensionality.
 |  
 |  logpdf(self, x)
 |      Evaluate the log of the estimated pdf on a provided set of points.
 |      
 |      Notes
 |      -----
 |      See `gaussian_kde.evaluate` for more details; this method simply
 |      returns ``np.log(gaussian_kde.evaluate(x))``.
 |  
 |  pdf(self, x)
 |      Evaluate the estimated pdf on a provided set of points.
 |      
 |      Notes
 |      -----
 |      This is an alias for `gaussian_kde.evaluate`.  See the ``evaluate``
 |      docstring for more details.
 |  
 |  resample(self, size=None)
 |      Randomly sample a dataset from the estimated pdf.
 |      
 |      Parameters
 |      ----------
 |      size : int, optional
 |          The number of samples to draw.  If not provided, then the size is
 |          the same as the underlying dataset.
 |      
 |      Returns
 |      -------
 |      resample : (self.d, `size`) ndarray
 |          The sampled dataset.
 |  
 |  scotts_factor(self)
 |      Computes the coefficient (`kde.factor`) that
 |      multiplies the data covariance matrix to obtain the kernel covariance
 |      matrix. The default is `scotts_factor`.  A subclass can overwrite this
 |      method to provide a different method, or set it through a call to
 |      `kde.set_bandwidth`.
 |  
 |  set_bandwidth(self, bw_method=None)
 |      Compute the estimator bandwidth with given method.
 |      
 |      The new bandwidth calculated after a call to `set_bandwidth` is used
 |      for subsequent evaluations of the estimated density.
 |      
 |      Parameters
 |      ----------
 |      bw_method : str, scalar or callable, optional
 |          The method used to calculate the estimator bandwidth.  This can be
 |          'scott', 'silverman', a scalar constant or a callable.  If a
 |          scalar, this will be used directly as `kde.factor`.  If a callable,
 |          it should take a `gaussian_kde` instance as only parameter and
 |          return a scalar.  If None (default), nothing happens; the current
 |          `kde.covariance_factor` method is kept.
 |      
 |      Notes
 |      -----
 |      .. versionadded:: 0.11
 |      
 |      Examples
 |      --------
 |      >>> import scipy.stats as stats
 |      >>> x1 = np.array([-7, -5, 1, 4, 5.])
 |      >>> kde = stats.gaussian_kde(x1)
 |      >>> xs = np.linspace(-10, 10, num=50)
 |      >>> y1 = kde(xs)
 |      >>> kde.set_bandwidth(bw_method='silverman')
 |      >>> y2 = kde(xs)
 |      >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
 |      >>> y3 = kde(xs)
 |      
 |      >>> import matplotlib.pyplot as plt
 |      >>> fig, ax = plt.subplots()
 |      >>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo',
 |      ...         label='Data points (rescaled)')
 |      >>> ax.plot(xs, y1, label='Scott (default)')
 |      >>> ax.plot(xs, y2, label='Silverman')
 |      >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
 |      >>> ax.legend()
 |      >>> plt.show()
 |  
 |  silverman_factor(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)


In [ ]:
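# A small usage sketch of the API documented above (assumes q, d, and N from
# the first cell are still in scope): the 'silverman' factor reported by
# scipy matches the hand-coded rule, and resample() draws new points from
# the estimated density.
k1 = stats.gaussian_kde(q, bw_method='silverman')
print(k1.factor, (N * (d + 2) / 4.)**(-1. / (d + 4)))  # should agree
print(k1.integrate_box_1d(-1., 1.))  # mass in [-1, 1]; ~0.68 for N(0,1)
new_q = k1.resample(size=500)        # shape (d, size) = (1, 500)
print(new_q.shape)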