In [1]:
%matplotlib inline

In [2]:
# Using KMeans to find outliers in a cluster of points.
# Finding outliers means finding the centroids and then looking
# for elements by their distance from the centroids

In [9]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
X, label = make_blobs(100, centers = 1)

In [10]:
import numpy as np

In [11]:
# a cluster with one center is similar to an SVM with one class.

In [12]:
from sklearn.cluster import KMeans

In [13]:
kmeans = KMeans(n_clusters=1)

In [14]:
kmeans.fit(X)


Out[14]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=1, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [17]:
# looking at the plot
f, ax = plt.subplots(figsize=(7,5))
ax.set_title('Blob')
ax.scatter(X[:, 0], X[:, 1], label='Points')
ax.scatter(kmeans.cluster_centers_[:, 0],
           kmeans.cluster_centers_[:, 1], label='Centroid',
           color='r')
ax.legend(loc='best')


Out[17]:
<matplotlib.legend.Legend at 0x108b4cb10>

In [20]:
# identify the 5 closest points
distances = kmeans.transform(X)

In [21]:
# argsort returns an array of indexes which will sort the array
# in ascending order. Reverse it with [::-1]
sorted_idx = np.argsort(distances.ravel())[::-1][:5]

In [22]:
f, ax = plt.subplots(figsize=(7,5))
ax.set_title('Single Cluster')
ax.scatter(X[:, 0], X[:, 1], label='Points')
ax.scatter(kmeans.cluster_centers_[:, 0],
           kmeans.cluster_centers_[:, 1],
           label='Centroid', color='r')
ax.scatter(X[sorted_idx][:, 0],
           X[sorted_idx][:, 1],
           label='Extreme Value', edgecolors='g',
           facecolors='none', s=100)
ax.legend(loc='best')


Out[22]:
<matplotlib.legend.Legend at 0x108d3da10>

In [23]:
# simulating removing these outliers
new_X = np.delete(X, sorted_idx, axis=0)

In [24]:
# this causes the centroids to move slightly
new_kmeans = KMeans(n_clusters=1)
new_kmeans.fit(new_X)


Out[24]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=1, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [26]:
f, ax = plt.subplots(figsize=(7,5))
ax.set_title("Extreme Values Removed")
ax.scatter(new_X[:, 0], new_X[:, 1], label='Pruned Points')
ax.scatter(kmeans.cluster_centers_[:, 0],
           kmeans.cluster_centers_[:, 1],
           label='Old Centroid',
           color='r', s=80, alpha=.5)
ax.scatter(new_kmeans.cluster_centers_[:, 0],
           new_kmeans.cluster_centers_[:, 1],
           label='New Centroid',
           color='m', s=80, alpha=.5)
ax.legend(loc='best')


Out[26]:
<matplotlib.legend.Legend at 0x108fce790>

In [27]:
# There is a fundamental connection between Gaussian distribution
# and the KMeans clustering. Let's create an empirical Gaussian
# based off the centroid and sample covariance matrix and look
# at the probability of each point that we removed.
# this will show that the points we removed were the least likely
# to occur.

In [28]:
from scipy import stats
emp_dist = stats.multivariate_normal(kmeans.cluster_centers_.ravel())
lowest_prob_idx = np.argsort(emp_dist.pdf(X))[:5]
np.all(X[sorted_idx] == X[lowest_prob_idx])


Out[28]:
True

In [31]:
print kmeans.cluster_centers_
print kmeans.cluster_centers_.ravel()


[[ 5.92830283  6.32759144]]
[ 5.92830283  6.32759144]

In [32]:
help(stats.multivariate_normal)


Help on multivariate_normal_gen in module scipy.stats._multivariate object:

class multivariate_normal_gen(__builtin__.object)
 |  A multivariate normal random variable.
 |  
 |  The `mean` keyword specifies the mean. The `cov` keyword specifies the
 |  covariance matrix.
 |  
 |  Methods
 |  -------
 |  pdf(x, mean=None, cov=1, allow_singular=False)
 |      Probability density function.
 |  logpdf(x, mean=None, cov=1, allow_singular=False)
 |      Log of the probability density function.
 |  rvs(mean=None, cov=1, allow_singular=False, size=1)
 |      Draw random samples from a multivariate normal distribution.
 |  entropy()
 |      Compute the differential entropy of the multivariate normal.
 |  
 |  Parameters
 |  ----------
 |  x : array_like
 |      Quantiles, with the last axis of `x` denoting the components.
 |  %(_doc_default_callparams)s
 |  
 |  Alternatively, the object may be called (as a function) to fix the mean
 |  and covariance parameters, returning a "frozen" multivariate normal
 |  random variable:
 |  
 |  rv = multivariate_normal(mean=None, cov=1, allow_singular=False)
 |      - Frozen object with the same methods but holding the given
 |        mean and covariance fixed.
 |  
 |  Notes
 |  -----
 |  %(_doc_callparams_note)s
 |  
 |  The covariance matrix `cov` must be a (symmetric) positive
 |  semi-definite matrix. The determinant and inverse of `cov` are computed
 |  as the pseudo-determinant and pseudo-inverse, respectively, so
 |  that `cov` does not need to have full rank.
 |  
 |  The probability density function for `multivariate_normal` is
 |  
 |  .. math::
 |  
 |      f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}} \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right),
 |  
 |  where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix,
 |  and :math:`k` is the dimension of the space where :math:`x` takes values.
 |  
 |  .. versionadded:: 0.14.0
 |  
 |  Examples
 |  --------
 |  >>> import matplotlib.pyplot as plt
 |  >>> from scipy.stats import multivariate_normal
 |  >>> x = np.linspace(0, 5, 10, endpoint=False)
 |  >>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y
 |  array([ 0.00108914,  0.01033349,  0.05946514,  0.20755375,  0.43939129,
 |          0.56418958,  0.43939129,  0.20755375,  0.05946514,  0.01033349])
 |  >>> plt.plot(x, y)
 |  
 |  The input quantiles can be any shape of array, as long as the last
 |  axis labels the components.  This allows us for instance to
 |  display the frozen pdf for a non-isotropic random variable in 2D as
 |  follows:
 |  
 |  >>> x, y = np.mgrid[-1:1:.01, -1:1:.01]
 |  >>> pos = np.empty(x.shape + (2,))
 |  >>> pos[:, :, 0] = x; pos[:, :, 1] = y
 |  >>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]])
 |  >>> plt.contourf(x, y, rv.pdf(pos))
 |  
 |  Methods defined here:
 |  
 |  __call__(self, mean=None, cov=1, allow_singular=False)
 |      Create a frozen multivariate normal distribution.
 |      
 |      See `multivariate_normal_frozen` for more information.
 |  
 |  __init__(self)
 |  
 |  entropy(self, mean=None, cov=1)
 |      Compute the differential entropy of the multivariate normal.
 |      
 |      Parameters
 |      ----------
 |      %(_doc_default_callparams)s
 |      
 |      Notes
 |      -----
 |      %(_doc_callparams_note)s
 |      
 |      Returns
 |      -------
 |      h : scalar
 |          Entropy of the multivariate normal distribution
 |  
 |  logpdf(self, x, mean, cov, allow_singular=False)
 |      Log of the multivariate normal probability density function.
 |      
 |      Parameters
 |      ----------
 |      x : array_like
 |          Quantiles, with the last axis of `x` denoting the components.
 |      mean : array_like, optional
 |          Mean of the distribution (default zero)
 |      cov : array_like, optional
 |          Covariance matrix of the distribution (default one)
 |      allow_singular : bool, optional
 |          Whether to allow a singular covariance matrix.  (Default: False)
 |      
 |      Notes
 |      -----
 |      Setting the parameter `mean` to `None` is equivalent to having `mean`
 |          be the zero-vector. The parameter `cov` can be a scalar, in which case
 |          the covariance matrix is the identity times that value, a vector of
 |          diagonal entries for the covariance matrix, or a two-dimensional
 |          array_like.
 |          
 |      
 |      Returns
 |      -------
 |      pdf : ndarray
 |          Log of the probability density function evaluated at `x`
 |  
 |  pdf(self, x, mean, cov, allow_singular=False)
 |      Multivariate normal probability density function.
 |      
 |      Parameters
 |      ----------
 |      x : array_like
 |          Quantiles, with the last axis of `x` denoting the components.
 |      mean : array_like, optional
 |          Mean of the distribution (default zero)
 |      cov : array_like, optional
 |          Covariance matrix of the distribution (default one)
 |      allow_singular : bool, optional
 |          Whether to allow a singular covariance matrix.  (Default: False)
 |      
 |      Notes
 |      -----
 |      Setting the parameter `mean` to `None` is equivalent to having `mean`
 |          be the zero-vector. The parameter `cov` can be a scalar, in which case
 |          the covariance matrix is the identity times that value, a vector of
 |          diagonal entries for the covariance matrix, or a two-dimensional
 |          array_like.
 |          
 |      
 |      Returns
 |      -------
 |      pdf : ndarray
 |          Probability density function evaluated at `x`
 |  
 |  rvs(self, mean=None, cov=1, size=1)
 |      Draw random samples from a multivariate normal distribution.
 |      
 |      Parameters
 |      ----------
 |      mean : array_like, optional
 |          Mean of the distribution (default zero)
 |      cov : array_like, optional
 |          Covariance matrix of the distribution (default one)
 |      allow_singular : bool, optional
 |          Whether to allow a singular covariance matrix.  (Default: False)
 |      size : integer, optional
 |          Number of samples to draw (default 1).
 |      
 |      Notes
 |      -----
 |      Setting the parameter `mean` to `None` is equivalent to having `mean`
 |          be the zero-vector. The parameter `cov` can be a scalar, in which case
 |          the covariance matrix is the identity times that value, a vector of
 |          diagonal entries for the covariance matrix, or a two-dimensional
 |          array_like.
 |          
 |      
 |      Returns
 |      -------
 |      rvs : ndarray or scalar
 |          Random variates of size (`size`, `N`), where `N` is the
 |          dimension of the random variable.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)


In [ ]: