In [1]:
%matplotlib inline
# Set up Notebook
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white")
# Load the Iris Data
iris = sns.load_dataset("iris")
In [2]:
# Convenience function to plot a confusion matrix
import numpy as np
import pandas as pd

# This method produces a colored heatmap that displays the relationship
# between predicted and actual types from a machine learning method.
def confusion(test, predict, title):
    # Define names for the three Iris types
    names = ['setosa', 'versicolor', 'virginica']

    # Make a 2D histogram from the test and result arrays
    pts, xe, ye = np.histogram2d(test, predict, bins=3)

    # For simplicity we create a new DataFrame
    pd_pts = pd.DataFrame(pts.astype(int), index=names, columns=names)

    # Display heatmap and add decorations
    hm = sns.heatmap(pd_pts, annot=True, fmt="d")
    hm.axes.set_title(title)

    return None
# This method produces a colored scatter plot that displays the intrinsic
# clustering of a particular data set. The different types are colored
# uniquely.
def splot_data(col1, col2, data, hue_col, label1, label2, xls, yls, sz=8):
    # Make the scatter plot on the DataFrame
    # (height was called size in seaborn releases before 0.9)
    jp = sns.lmplot(x=col1, y=col2, data=data, fit_reg=False,
                    hue=hue_col, height=sz, scatter_kws={'s': 60})

    # Decorate the plot and set limits
    jp.set_axis_labels(label1, label2)
    jp.axes[0,0].set_xlim(xls)
    jp.axes[0,0].set_ylim(yls)

    sns.despine(offset=0, trim=True)
    sns.set(style="ticks", font_scale=2.0)
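As a quick illustration of this helper (not in the original analysis), we can plot the raw sepal measurements colored by species; the axis limits below are simply reasonable choices for the Iris value ranges.

# Scatter plot of the raw sepal measurements, colored by species
splot_data('sepal_length', 'sepal_width', iris, 'species',
           'Sepal Length (cm)', 'Sepal Width (cm)', (4.0, 8.2), (1.8, 4.6), 6)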
In [3]:
# Now let's get the data and labels
data = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].values

# Make integer class labels (0, 1, 2); this relies on the Iris data
# being ordered in three blocks of fifty rows, one per species.
labels = np.array([i//50 for i in range(iris.shape[0])])
In [4]:
# Principal Component Analysis
from sklearn.decomposition import PCA
# First create our PCA model
# For now we assume two components, to make plotting easier.
pca = PCA(n_components=2)
# Fit model to the data
pca.fit(data)
# Compute the transformed data (rotation to PCA space)
data_reduced = pca.transform(data)
# The column names must match the number of PCA components
cols = ['PCA1', 'PCA2', 'Species']
# For example, if n_components = 3
# cols = ['PCA1', 'PCA2', 'PCA3', 'Species']
# Now create a new DataFrame to hold the results
# First build a temporary np.array; note that we take .values so we can
# reshape, since a pandas Series has no reshape method.
tmp_d = np.concatenate((data_reduced, iris['species'].values.reshape(150, 1)), axis=1)
iris_pca = pd.DataFrame(tmp_d, columns=cols)
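As a quick sanity check (not part of the original notebook), we can ask how much of the total variance the two retained components capture; the fitted model exposes this through scikit-learn's standard explained_variance_ratio_ attribute.

# Fraction of the total variance captured by each retained component;
# for Iris the first two components together capture well over ninety percent.
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())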
The last machine learning technique we will explore is cluster finding. Clustering seeks first to find $N$ clusters in a data set and subsequently to identify to which cluster each datum belongs. While there are a number of different approaches to clustering, one of the easiest to understand is the k-means algorithm. In this algorithm we start with a guess for the number of clusters (again, this can be based on a priori information or iteratively quantified). We randomly place cluster centers in the data and determine how well the data cluster to these centers. This information is used to pick new cluster centers, and the process continues until a solution converges (or we reach a predefined number of iterations). This process is displayed in the following figure from Wikipedia.
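To make the two alternating steps concrete, here is a minimal NumPy sketch of the algorithm just described; the function name, the fixed iteration count, and the random initialization scheme are illustrative choices, not scikit-learn's actual implementation.

import numpy as np

def simple_kmeans(X, k, n_iter=10, seed=0):
    # Randomly choose k data points as the initial cluster centers
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=k, replace=False)]

    for _ in range(n_iter):
        # Assignment step: attach each point to its nearest center
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        assign = dists.argmin(axis=1)

        # Update step: move each center to the mean of its assigned points
        # (a production implementation would also handle empty clusters)
        centers = np.array([X[assign == j].mean(axis=0) for j in range(k)])

    return centers, assign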
We can employ k-means clustering with scikit-learn by using the KMeans estimator within the cluster module. This requires the number of clusters to find as an input parameter. After the model is created, we once again fit the model to the data and subsequently obtain our model predictions. Note that this process is unsupervised: we do not use the label array at any point. After we find our clusters, we plot both the original data and the newly clustered data in our principal component axes to visually assess how well the algorithm performed.
In [5]:
# We will use k-means from scikit-learn
from sklearn.cluster import KMeans
# We build our model assuming three clusters
k_means = KMeans(n_clusters=3, random_state=0)
# We fit our data to assign classes
k_means.fit(data)
# Obtain the predictions
y_pred = k_means.predict(data)
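One caveat before comparing: k-means assigns arbitrary integer IDs to the clusters it finds, so cluster 0 need not line up with label 0. A permutation-invariant measure such as the adjusted Rand index (adjusted_rand_score in sklearn.metrics) summarizes the agreement in a single number; the check below is optional and not part of the original notebook.

from sklearn.metrics import adjusted_rand_score

# 1.0 means perfect agreement up to relabeling; near 0.0 means chance level
print(adjusted_rand_score(labels, y_pred))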
In [6]:
# Now we compare the cluster assignments to the real classes.

# Plot original data by using the principal components
splot_data('PCA1', 'PCA2', iris_pca, 'Species',
           'First PCA', 'Second PCA', (-4.2, 4.6), (-1.8, 1.6), 6)

# Now we create the DataFrame from our predictions and the PCA components.
tmp_d = np.concatenate((data_reduced, y_pred.reshape((150, 1))), axis=1)
iris_clust = pd.DataFrame(tmp_d, columns=['PCA1', 'PCA2', 'Cluster'])

# Plot k-means clusters by using the principal components
splot_data('PCA1', 'PCA2', iris_clust, 'Cluster',
           'First PCA', 'Second PCA', (-4.2, 4.6), (-1.8, 1.6), 6)
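Finally, we can reuse the confusion helper defined at the top of this notebook to cross-tabulate the cluster assignments against the true labels. Bear in mind the caveat above: because k-means IDs are arbitrary, a good clustering shows up as one dominant count per row, which need not lie on the diagonal.

# Heatmap of cluster assignments versus the true Iris types
confusion(labels, y_pred, 'K-means Clustering')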