It can sometimes be a bit difficult to understand what's going on without a good plot, so I wanted to start trying to visualise structures vs cluster definition on the Iris dataset. This will involve building the network structure and then querying the joint distribution of petal width, petal length, sepal width and sepal length given a discrete variable; the joint distribution will be a Gaussian mixture model.
The Python SDK is currently a bit limited for 'custom' queries: it only supports a Gaussian mixture query, with multiple continuous variables as the head variables and a single discrete variable for the tail, e.g. P(petallength, sepalwidth, sepallength, petalwidth | irisclass). Bayes Server itself obviously supports a lot more.
First off, imports plus some code for plotting ellipses based upon a covariance matrix.
In [1]:
%matplotlib notebook
import pandas as pd
import sys
sys.path.append("../../../bayespy")
import bayespy
from bayespy.network import Builder as builder
import logging
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
# Using the latent variable to cluster data points. Based upon the Iris dataset which has 3 distinct clusters
# (not all of which are linearly separable). Using a joint probability distribution, first based upon the class
# variable 'iris_class' and subsequently the cluster variable as a tail variable. Custom query currently only supports
# a single discrete tail variable and multiple continuous head variables.
# http://stackoverflow.com/questions/12301071/multidimensional-confidence-intervals
def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
    """
    Draw an `nstd`-sigma error ellipse for a 2x2 covariance matrix.

    Parameters
    ----------
    cov : The 2x2 covariance matrix the ellipse is based on.
    pos : Centre of the ellipse, a 2-element sequence [x0, y0].
    nstd : Radius of the ellipse in standard deviations (default 2).
    ax : Axis to draw on; defaults to the current axis.
    Remaining keyword arguments are forwarded to the ellipse patch.

    Returns
    -------
    The matplotlib Ellipse artist that was added to the axis.
    """
    # Eigen-decompose the (symmetric) covariance and order components
    # largest-first, so the first eigenvector is the ellipse's major axis.
    eigvals, eigvecs = np.linalg.eigh(cov)
    descending = np.argsort(eigvals)[::-1]
    eigvals = eigvals[descending]
    eigvecs = eigvecs[:, descending]

    if ax is None:
        ax = plt.gca()

    # Rotation of the major axis, in degrees, from its eigenvector.
    angle = np.degrees(np.arctan2(eigvecs[1, 0], eigvecs[0, 0]))

    # Ellipse expects full width/height, not radii.
    full_width, full_height = 2 * nstd * np.sqrt(eigvals)

    patch = Ellipse(xy=pos, width=full_width, height=full_height,
                    angle=angle, **kwargs)
    ax.add_artist(patch)
    return patch
def plot(head_variables, results, data=None):
    """
    Scatter every pairwise combination of head variables and overlay a
    3-sigma covariance ellipse for each mixture component in `results`.

    Parameters
    ----------
    head_variables : ordered list of continuous column names; indices into
        each component's mean vector / covariance matrix follow this order.
    results : mapping of component -> {'mean': ..., 'covariance': ...}.
    data : DataFrame holding the columns to scatter. Defaults to the
        module-level `iris` frame for backward compatibility.
    """
    if data is None:
        data = iris
    fig = plt.figure(figsize=(10, 10))
    current_plot = 1
    for i, hv_x in enumerate(head_variables):
        for j in range(i + 1, len(head_variables)):
            hv_y = head_variables[j]
            ax = fig.add_subplot(3, 2, current_plot)
            ax.set_title("{} vs {}".format(hv_x, hv_y))
            ax.plot(data[hv_x].tolist(), data[hv_y].tolist(), 'bo')
            for component in results.values():
                mean = component['mean']
                cov = component['covariance']
                # Reduce the full covariance matrix to the 2x2 block for the
                # pair of variables currently being plotted.
                c = np.array([[cov[i, i], cov[i, j]],
                              [cov[j, i], cov[j, j]]], np.float64)
                # Pass ax explicitly rather than relying on plt.gca().
                plot_cov_ellipse(c, pos=(mean[i], mean[j]), nstd=3, ax=ax,
                                 alpha=0.5, color='green')
            ax.set_xlim([data[hv_x].min() - 1, data[hv_x].max() + 1])
            ax.set_ylim([data[hv_y].min() - 1, data[hv_y].max() + 1])
            current_plot += 1
    plt.show()
Next, just a bit of setup code to load the data and set up the JPype instance.
In [2]:
# Configure the root logger to echo INFO-level messages to stderr; the same
# logger is handed to the Java bridge and the data-loading helpers below.
logger = logging.getLogger()
stream_handler = logging.StreamHandler()
logger.addHandler(stream_handler)
logger.setLevel(logging.INFO)

# Attach the JVM (Bayes Server is a Java library accessed via JPype).
bayespy.jni.attach(logger)

# Load the Iris dataset from the project's data folder.
db_folder = bayespy.utils.get_path_to_parent_dir("")
iris = pd.read_csv(os.path.join(db_folder, "data/iris.csv"), index_col=False)
The next step is creating the model by hand. The Python SDK also supports multivariate nodes, but for clarity each node is individually stored here. This is not a fully connected model, as the continuous nodes are only connected through the cluster.
In [3]:
# Build a naive-Bayes-style structure: each continuous measurement node is a
# child of the discrete class variable only, so the continuous variables are
# conditionally independent given the class.
network = bayespy.network.create_network()

petal_length = builder.create_continuous_variable(network, "petal_length")
petal_width = builder.create_continuous_variable(network, "petal_width")
sepal_length = builder.create_continuous_variable(network, "sepal_length")
sepal_width = builder.create_continuous_variable(network, "sepal_width")
nodes = [petal_length, petal_width, sepal_length, sepal_width]

# One discrete state per distinct value of 'iris_class' in the data.
class_variable = builder.create_discrete_variable(network, iris, 'iris_class', iris['iris_class'].unique())

# Link class -> each measurement. (The enumerate index was unused, so a
# plain loop over the nodes is the idiomatic form.)
for node in nodes:
    builder.create_link(network, class_variable, node)

# Visualise the resulting structure.
plt.figure()
layout = bayespy.visual.NetworkLayout(network)
graph = layout.build_graph()
pos = layout.fruchterman_reingold_layout(graph)
layout.visualise(graph, pos)
The network still needs training, so kick that off.
In [4]:
# Parameter learning: fit the network's distributions to the Iris data.
model = bayespy.model.NetworkModel(network, logger)
with bayespy.data.DataSet(iris, db_folder, logger) as dataset:
    model.train(dataset)
Now it gets interesting, as we can query the priors that the network has learned. GaussianMixtureQuery returns the same format of covariance matrix that would be output by numpy.cov.
In [5]:
# Query the joint distribution of the four measurements conditioned on the
# class variable — one Gaussian mixture component per class state.
with bayespy.data.DataSet(iris, db_folder, logger) as dataset:
    # NOTE(review): `dataset` is opened but never referenced in this cell —
    # presumably kept for parity with the training cell; confirm it is needed.
    head_variables = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    gm_query = bayespy.model.QueryMixtureOfGaussians(
        head_variables=head_variables,
        tail_variables=['iris_class'])

    # Create the inference engine separately so that multiple queries can be
    # performed on it, or evidence can be set.
    (engine, _, _) = bayespy.model.InferenceEngine(network).create()
    single_query = bayespy.model.SingleQuery(network, engine, logger)
    results_class = single_query.query([gm_query])
    plot(head_variables, results_class)
Performance doesn't seem too bad with the naive Bayes model; however, notice that the ellipses have zero off-diagonal covariance, as each variable is independent of the others (apart from through iris_class). To improve performance, the variables could be fully connected.
In [6]:
# Rebuild the network, this time fully connecting the continuous nodes so
# that the learned mixture components can covary.
network = bayespy.network.create_network()

petal_length = builder.create_continuous_variable(network, "petal_length")
petal_width = builder.create_continuous_variable(network, "petal_width")
sepal_length = builder.create_continuous_variable(network, "sepal_length")
sepal_width = builder.create_continuous_variable(network, "sepal_width")
nodes = [petal_length, petal_width, sepal_length, sepal_width]

class_variable = builder.create_discrete_variable(network, iris, 'iris_class', iris['iris_class'].unique())

for i, node in enumerate(nodes):
    # Class -> measurement, as in the naive model.
    builder.create_link(network, class_variable, node)
    # Additionally link this measurement to every later one, fully
    # connecting the continuous nodes without duplicating edges.
    for successor in nodes[i + 1:]:
        builder.create_link(network, node, successor)

# Visualise the denser structure.
plt.figure()
network_layout = bayespy.visual.NetworkLayout(network)
graph = network_layout.build_graph()
pos = network_layout.fruchterman_reingold_layout(graph)
network_layout.visualise(graph, pos)
In [7]:
# Train the fully-connected model and repeat the mixture-of-Gaussians query.
with bayespy.data.DataSet(iris, db_folder, logger) as dataset:
    model = bayespy.model.NetworkModel(network, logger)
    model.train(dataset)

    head_variables = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    query_type_class = bayespy.model.QueryMixtureOfGaussians(
        head_variables=head_variables,
        tail_variables=['iris_class'])

    # Create the engine separately so that further queries can be run on it,
    # or evidence can be set.
    (engine, _, _) = bayespy.model.InferenceEngine(network).create()
    single = bayespy.model.SingleQuery(network, engine, logger)
    results_class = single.query([query_type_class])
    plot(head_variables, results_class)
As can be seen, the ellipses are no longer axis aligned and now covary accordingly. If there are more than three clusters, and only three classes, then a latent variable can be used.