Gaussian Mixture

One can see this unsupervised learning model as a "soft" version of K-Means. In K-Means, each datapoint either belongs to a cluster or it does not. Here, instead, we want a probability of belonging: given a datapoint, we compute the probability that it belongs to each cluster. For instance, if we look for two clusters, a datapoint could have probability 0.2 of belonging to cluster 1 and probability 0.8 of belonging to cluster 2.
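
As a small, hypothetical illustration (the means, covariances and priors below are made up, not taken from the notebook), the membership probabilities of a single datapoint can be obtained with Bayes' rule:

# Illustrative sketch: membership probabilities of one datapoint under two
# hypothetical Gaussian components.
import numpy as np
from scipy.stats import multivariate_normal

x = np.array([1.2, 1.2])                 # one datapoint
priors = np.array([0.5, 0.5])            # mixing weights Pi_k
components = [multivariate_normal(mean=[0.0, 0.0], cov=np.eye(2)),
              multivariate_normal(mean=[1.0, 1.0], cov=np.eye(2))]

# Bayes' rule: gamma_k = Pi_k * N(x | mu_k, Sigma_k) / sum_j Pi_j * N(x | mu_j, Sigma_j)
weighted = priors * np.array([c.pdf(x) for c in components])
responsibilities = weighted / weighted.sum()
print(responsibilities)                  # roughly [0.2, 0.8]; the two values sum to 1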

As in K-Means, one needs to provide the number of clusters to look for. Some extensions are able, in certain cases, to detect whether the given number of clusters is too low or too high, but things will be kept simple for now.
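
As an aside, one common way to sanity-check the choice of the number of clusters (not used in this notebook) is an information criterion such as the BIC. A minimal sketch, assuming scikit-learn is available and using synthetic data:

# Hypothetical aside: selecting the number of components with the BIC,
# using scikit-learn instead of the hand-written GMM below.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(loc=0.0, size=(100, 2)),
               rng.normal(loc=3.0, size=(100, 2))])

bics = {k: GaussianMixture(n_components=k, random_state=0).fit(X).bic(X)
        for k in range(1, 6)}
best_k = min(bics, key=bics.get)  # lowest BIC wins; should come out around 2 here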

In order to find these probabilities, the clusters are modeled by probability distributions. For a Gaussian Mixture, the distributions used are, as the name indicates, multivariate Gaussians. Each cluster is thus assumed to follow a distribution $\mathcal{N}(\mu, \Sigma)$, and the whole idea is to find those $\mu$ and $\Sigma$.
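
Putting the pieces together, the priors $\pi_k$ act as mixing weights and the density of the whole mixture is a weighted sum of the $K$ Gaussians:

$$p(x) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x \mid \mu_k, \Sigma_k), \qquad \sum_{k=1}^{K} \pi_k = 1.$$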


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import random

import numpy.matlib

from scipy.stats import multivariate_normal
from scipy.special import logsumexp

from fct import normalize_min_max, plot_2d, plot_multivariate_ellipse

In [2]:
## -------------------- Gaussian Multivariate  -------------------

def initiate_var(data, K, sigma):
    """Returns the priors, means and covariance matrices intialized.

    prior: all equal
    means: randomly generated within the limits of the data
    covariance matrices: identity multiplied by a same sigma
    """

    # all priors are equal at first
    priors = np.ones(K) / K

    # define limits of the data
    minis = [np.amin(data[:,j]) for j in range(data.shape[1])]
    maxis = [np.amax(data[:,j]) for j in range(data.shape[1])]
    # Generate random centers
    means = np.array([[minis[j] + random.random() * (maxis[j] - minis[j])
                        for j in range(data.shape[1])]
                        for k in range(K)])

    # covariance matrices are identity matrices at first
    covs = [sigma * np.matlib.identity(data.shape[1]) for k in range(K)]

    return priors, means, covs


def update_gammas(data, K, means, covs, priors):
    """Build and returns the gammas matrix and the current multivariates."""

    # Definition of the multivariates using the means and covs matrices
    mlvts = [multivariate_normal(mean=means[k], cov=covs[k]) for k in range(K)]
    # calculate the density of each datapoint under each component using the
    # Probability Density Function (pdf)
    mlvts_pdf = [mlvts[k].pdf(data) for k in range(K)]

    # Matrix (K, N); with the notation of the course, each element is
    # log(Pi_k * N(X_i | mu_k, Sigma_k))
    log_gammas = [np.log(priors[k] * mlvts_pdf[k]) for k in range(K)]

    # for each of the N datapoints, sum over the K components (in log space)
    sum_log_gamma = [logsumexp(np.asmatrix(log_gammas).T[n])
                     for n in range(data.shape[0])]

    # gammas is actually the matrix of the log of the gammas
    gammas = np.asmatrix([log_gammas[k] - sum_log_gamma for k in range(K)])

    return gammas, mlvts


def update_mean(data, K, gammas):
    """Returns the new means calculated using the gammas."""

    ones = np.ones(data.shape)
    up = np.dot(np.exp(gammas), data)
    down = np.dot(np.exp(gammas), ones)
    return np.array([(up[k] / down[k]).A1 for k in range(K)])


def update_cov(data, K, means, gammas):
    """Returns the new covariance matrices as a list of matrices."""

    covs = []
    ones = np.ones(data.shape)
    sum_gammas = np.dot(np.exp(gammas), ones)
    for k in range(K):
        # matrix to multiply with the (data - mu) matrix element by element
        # (gamma_k_1 gamma_k_1 .. gamma_k_1)
        # (gamma_k_2 gamma_k_2 .. gamma_k_2)
        # (  ...       ...     ..    ...   )
        # (gamma_k_N gamma_k_N .. gamma_k_N)
        gammas_k = np.dot(np.exp(gammas[k]).T,
                          np.matlib.ones((1, data.shape[1])))
        # matrix to be subtracted from the data matrix
        # (mu_k_1 mu_k_2 .. mu_k_d)
        # (mu_k_1 mu_k_2 .. mu_k_d)
        # (  ...    ...  ..  ...  )
        # (mu_k_1 mu_k_2 .. mu_k_d)
        mu = np.dot(np.matlib.ones((data.shape[0], 1)),
                    np.asmatrix(means[k]))
        cov = np.dot((np.asmatrix(data) - mu).T,
                      np.multiply(gammas_k, data - mu))
        cov /= sum_gammas[k]
        covs.append(cov)
    return covs


def update_prior(data, K, gammas):
    """Returns a numpy array with the prior of each multivariate."""

    # total responsibility (in linear space) gathered by each component
    sum_gammas = np.array([np.sum(np.exp(gammas[k])) for k in range(K)])
    return sum_gammas / data.shape[0]


def gmm(data, K):
    """The Gaussian Mixture Model function."""

    # Initial variables
    sigma = 0.1
    priors, means, covs = initiate_var(data, K, sigma)
    priors_old = np.ones(K)

    # First step
    gammas, m = update_gammas(data, K, means, covs, priors)

    iteration = 0
    while not np.array_equal(np.around(priors, decimals=6),
                             np.around(priors_old, decimals=6)):
        means = update_mean(data, K, gammas)
        covs = update_cov(data, K, means, gammas)
        priors_old = priors
        priors = update_prior(data, K, gammas)
        gammas, m = update_gammas(data, K, means, covs, priors)

        plt.clf()
        title = 'Iteration n°' + str(iteration)
        iteration += 1
        plot_2d(data, color='b', title=title)
        plot_multivariate_ellipse(m, K)
        plt.show(block=False)
        plt.pause(0.0001)
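
The loop above alternates between the two standard EM steps implemented by the helper functions: the E-step recomputes the responsibilities (the `gammas`, kept in log space for numerical stability), and the M-step re-estimates the parameters from them:

$$\gamma_{nk} = \frac{\pi_k \, \mathcal{N}(x_n \mid \mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(x_n \mid \mu_j, \Sigma_j)}, \qquad N_k = \sum_{n=1}^{N} \gamma_{nk},$$

$$\mu_k = \frac{1}{N_k} \sum_{n=1}^{N} \gamma_{nk}\, x_n, \qquad \Sigma_k = \frac{1}{N_k} \sum_{n=1}^{N} \gamma_{nk}\, (x_n - \mu_k)(x_n - \mu_k)^T, \qquad \pi_k = \frac{N_k}{N}.$$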

In [3]:
## -------------------- Data  -------------------

k = 2
data = pd.read_csv('datasets/data_clustering.csv')
data = np.array(data)


## -------------------- GMM in use  -------------------

normalize_min_max(data, k)
K = 4
gmm(data, K)

plt.close()