Visualising clusters on the Iris dataset

It can sometimes be a bit difficult to understand what's going on without a good plot, so I wanted to start trying to visualise structures vs cluster definition on the Iris dataset. This will involve building the network structure and then querying the joint distribution of petal width, petal length, sepal width and sepal length given a discrete variable; the joint distribution will be a Gaussian mixture model.

The Python SDK is currently a bit limited for 'custom' queries, so currently only supports a Gaussian mixture query, with multiple continuous variables as the head variables, and a single discrete variable for the tail, e.g. P(petallength, sepalwidth, sepallength, petalwidth | irisclass). Bayes Server obviously supports a lot more.

First off, imports plus some code for plotting ellipses based upon a covariance matrix.


In [1]:
%matplotlib notebook

import pandas as pd

import sys
sys.path.append("../../../bayespy")

import bayespy
from bayespy.network import Builder as builder

import logging
import os

import numpy as np

import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse

# Using the latent variable to cluster data points. Based upon the Iris dataset which has 3 distinct clusters
# (not all of which are linearly separable). Using a joint probability distribution, first based upon the class
# variable 'iris_class' and subsequently the cluster variable as a tail variable. Custom query currently only supports
# a single discrete tail variable and multiple continuous head variables.

# http://stackoverflow.com/questions/12301071/multidimensional-confidence-intervals
def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
    """
    Draw an `nstd`-sigma error ellipse for the 2x2 covariance matrix
    `cov`, centred at `pos`, and add it to an axis.

    Parameters
    ----------
        cov : The 2x2 covariance matrix defining the ellipse shape.
        pos : Centre of the ellipse as a 2-element sequence [x0, y0].
        nstd : Radius of the ellipse in standard deviations (default 2).
        ax : Axis to draw on; defaults to the current axis.
        Remaining keyword arguments are forwarded to the Ellipse patch.

    Returns
    -------
        The matplotlib Ellipse artist that was added.
    """
    # Eigen-decompose the (symmetric) covariance and order the
    # eigenpairs from largest to smallest eigenvalue, so the first
    # eigenvector is the ellipse's major axis.
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    descending = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[descending]
    eigenvectors = eigenvectors[:, descending]

    target_axis = plt.gca() if ax is None else ax

    # Rotation of the major axis relative to the x-axis, in degrees.
    major_x = eigenvectors[0, 0]
    major_y = eigenvectors[1, 0]
    angle = np.degrees(np.arctan2(major_y, major_x))

    # Ellipse expects full widths (diameters), not radii.
    diameters = 2 * nstd * np.sqrt(eigenvalues)
    artist = Ellipse(xy=pos, width=diameters[0], height=diameters[1],
                     angle=angle, **kwargs)

    target_axis.add_artist(artist)
    return artist

def plot(head_variables, results):
    """
    Scatter-plot every pair of head variables from the module-level
    `iris` dataframe, overlaying a 3-sigma covariance ellipse for each
    mixture component in `results`.

    Parameters
    ----------
        head_variables : list of continuous variable (column) names;
            indices into `results` means/covariances follow this order.
        results : mapping of component key -> dict with 'mean' (vector)
            and 'covariance' (matrix), both indexed consistently with
            `head_variables`.
    """
    fig = plt.figure(figsize=(10, 10))
    current_plot = 1
    for i, hv in enumerate(head_variables):
        for j in range(i + 1, len(head_variables)):
            hv1 = head_variables[j]
            ax = fig.add_subplot(3, 2, current_plot)

            ax.set_title("{} vs {}".format(hv, hv1))
            ax.plot(iris[hv].tolist(), iris[hv1].tolist(), 'bo')

            # Iterate the component dicts directly (the original looped over
            # .items() but discarded the value and re-looked-up results[k]).
            for component in results.values():
                mean = component['mean']
                cov = component['covariance']

                # Reduce the full covariance matrix to the 2x2 block for
                # the two variables currently being plotted.
                c = np.array([[cov[i, i], cov[i, j]],
                              [cov[j, i], cov[j, j]]], np.float64)

                plot_cov_ellipse(c, pos=(mean[i], mean[j]), nstd=3,
                                 alpha=0.5, color='green')

            ax.set_xlim([iris[hv].min() - 1, iris[hv].max() + 1])
            ax.set_ylim([iris[hv1].min() - 1, iris[hv1].max() + 1])
            current_plot += 1

    plt.show()

Next, just a bit of setup code to load the data and set up the Jpype instance.


In [2]:
# Send INFO-level log output from bayespy to the console via the root logger.
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

# Start the JVM bridge (Jpype) that the bayespy SDK uses to talk to Bayes Server.
bayespy.jni.attach(logger)

# Resolve the working folder and load the Iris dataset from CSV.
db_folder = bayespy.utils.get_path_to_parent_dir("")
iris = pd.read_csv(os.path.join(db_folder, "data/iris.csv"), index_col=False)

Naive Bayes

The next step is creating the model by hand. The Python SDK also supports multivariate nodes, but for clarity each node is individually created here. This is not a fully connected model, as the continuous nodes are only connected through the class variable.


In [3]:
network = bayespy.network.create_network()

# One continuous node per measured feature.
petal_length = builder.create_continuous_variable(network, "petal_length")
petal_width = builder.create_continuous_variable(network, "petal_width")
sepal_length = builder.create_continuous_variable(network, "sepal_length")
sepal_width = builder.create_continuous_variable(network, "sepal_width")
nodes = [petal_length, petal_width, sepal_length, sepal_width]

# Discrete class node with one state per unique iris_class value.
class_variable = builder.create_discrete_variable(network, iris, 'iris_class', iris['iris_class'].unique())

# Naive Bayes structure: the class variable is the sole parent of every
# continuous node. (The original loop used enumerate but never used the index.)
for node in nodes:
    builder.create_link(network, class_variable, node)

# Draw the resulting network structure.
plt.figure()
layout = bayespy.visual.NetworkLayout(network)
graph = layout.build_graph()
pos = layout.fruchterman_reingold_layout(graph)
layout.visualise(graph, pos)


The network still needs training, so kick that off.


In [4]:
# Write the iris dataframe to backing storage and train the network parameters.
with bayespy.data.DataSet(iris, db_folder, logger) as dataset:
    model = bayespy.model.NetworkModel(network, logger)
    model.train(dataset)


INFO:root:Writing 150 rows to storage
Writing 150 rows to storage
INFO:root:Finished writing 150 rows to storage
Finished writing 150 rows to storage
INFO:root:Training model...
Training model...
INFO:root:Finished training model
Finished training model

Now it gets interesting, as we can query the priors that the network has set up. GaussianMixtureQuery returns the same format of covariance matrix that would be output by numpy.cov.


In [5]:
with bayespy.data.DataSet(iris, db_folder, logger) as dataset:
    head_variables = ['sepal_length','sepal_width','petal_length','petal_width']

    # Joint query P(sepal_length, sepal_width, petal_length, petal_width | iris_class):
    # one Gaussian component per state of iris_class.
    query_type_class = bayespy.model.QueryMixtureOfGaussians(
            head_variables=head_variables,
                tail_variables=['iris_class'])

    # NOTE(review): `dataset` is opened but never referenced below — presumably
    # the context manager is needed only for its setup/teardown side effects; confirm.
    (engine, _, _) = bayespy.model.InferenceEngine(network).create()
    
    # pass in an inference engine so that multiple queries can be performed, or evidence can be set.
    query = bayespy.model.SingleQuery(network, engine, logger)
    results_class = query.query([query_type_class])

    plot(head_variables, results_class)


INFO:root:Writing 150 rows to storage
Writing 150 rows to storage
INFO:root:Finished writing 150 rows to storage
Finished writing 150 rows to storage

Performance doesn't seem too bad with the naive Bayes model; however, note that the ellipses are axis-aligned because the off-diagonal covariance terms are zero — each continuous variable is independent of the others given iris_class. To improve performance, the continuous variables could be fully connected.

Fully observed mixture model


In [6]:
network = bayespy.network.create_network()

# One continuous node per measured feature.
petal_length = builder.create_continuous_variable(network, "petal_length")
petal_width = builder.create_continuous_variable(network, "petal_width")
sepal_length = builder.create_continuous_variable(network, "sepal_length")
sepal_width = builder.create_continuous_variable(network, "sepal_width")
nodes = [petal_length, petal_width, sepal_length, sepal_width]

# Discrete class node with one state per unique iris_class value.
class_variable = builder.create_discrete_variable(network, iris, 'iris_class', iris['iris_class'].unique())

# Fully observed mixture model: class -> every continuous node, plus a link
# from each continuous node to every later node in the list, so the
# continuous variables are fully connected among themselves.
for i, node in enumerate(nodes):
    builder.create_link(network, class_variable, node)
    for j in range(i+1, len(nodes)):
        builder.create_link(network, node, nodes[j])

# Draw the resulting network structure.
plt.figure()
layout = bayespy.visual.NetworkLayout(network)
graph = layout.build_graph()
pos = layout.fruchterman_reingold_layout(graph)
layout.visualise(graph, pos)



In [7]:
# Train the fully connected model and run the same mixture-of-Gaussians query
# as before, this time expecting non-zero off-diagonal covariance terms.
with bayespy.data.DataSet(iris, db_folder, logger) as dataset:
    model = bayespy.model.NetworkModel(network, logger)
    model.train(dataset)
    
    head_variables = ['sepal_length','sepal_width','petal_length','petal_width']

    # Joint query P(sepal_length, sepal_width, petal_length, petal_width | iris_class).
    query_type_class = bayespy.model.QueryMixtureOfGaussians(
            head_variables=head_variables,
                tail_variables=['iris_class'])

    (engine, _, _) = bayespy.model.InferenceEngine(network).create()
    
    # pass in an inference engine so that multiple queries can be performed, or evidence can be set.
    query = bayespy.model.SingleQuery(network, engine, logger)
    results_class = query.query([query_type_class])

    plot(head_variables, results_class)


INFO:root:Writing 150 rows to storage
Writing 150 rows to storage
INFO:root:Finished writing 150 rows to storage
Finished writing 150 rows to storage
INFO:root:Training model...
Training model...
INFO:root:Finished training model
Finished training model

As can be seen, the ellipses are no longer axis aligned and now covary accordingly. If there are more than three clusters, and only three classes, then a latent variable can be used.