Applied Data Science

Data Exploration and Visualisation

Tom Diethe

In [1]:
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython import display
from import loadmat
from tsne import bh_sne
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
from mpl_toolkits.mplot3d import Axes3D

sns.set_context("talk", font_scale=2, rc={"lines.linewidth": 4})

/usr/local/lib/python2.7/site-packages/matplotlib/ UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

Plotting functions

In [13]:
def scatter2d(X, y):
    """Scatter-plot a 2-D embedding and write each point's label next to it.

    Parameters
    ----------
    X : array indexed as ``X[:, 0]`` / ``X[:, 1]``
        Point coordinates; only the first two columns are used.
    y : sequence of labels, one per row of ``X``
        Determines the colour of each marker and the text drawn at each point.

    Returns
    -------
    None
        The figure is created through ``plt.subplots`` and left to the active
        matplotlib backend to display.
    """
    labels = np.asarray(y).ravel()
    # Map every label to its position among the sorted distinct labels.  The
    # previous ``palette[y[i]-1]`` lookup only worked for contiguous integer
    # labels starting at 1; any other labelling could index past the palette.
    classes, class_idx = np.unique(labels, return_inverse=True)
    palette = sns.color_palette("husl", len(classes))

    fig, ax = plt.subplots(figsize=(15,10))
    # For contiguous integer labels the class index colours the markers
    # exactly as the raw label values did.
    ax.scatter(X[:,0], X[:,1], c=class_idx)

    for i, label in enumerate(labels):
        ax.text(X[i,0], X[i,1], str(label), color=palette[class_idx[i]], fontsize='xx-small')
def scatter3d(X, y):
    """Scatter-plot a 3-D embedding and write each point's label next to it.

    Parameters
    ----------
    X : array indexed as ``X[:, 0]`` / ``X[:, 1]`` / ``X[:, 2]``
        Point coordinates; only the first three columns are used.
    y : sequence of labels, one per row of ``X``
        Determines the colour of each marker and the text drawn at each point.

    Returns
    -------
    None
        The figure is created through ``plt.figure`` and left to the active
        matplotlib backend to display.
    """
    labels = np.asarray(y).ravel()
    # Map every label to its position among the sorted distinct labels.  The
    # previous ``palette[y[i]-1]`` lookup only worked for contiguous integer
    # labels starting at 1; any other labelling could index past the palette.
    classes, class_idx = np.unique(labels, return_inverse=True)
    palette = sns.color_palette("husl", len(classes))

    fig = plt.figure(figsize=(15,10))
    # Keep the axes returned by add_subplot.  The old code discarded it and
    # built a second ``Axes3D(fig)``, which recent matplotlib releases no
    # longer attach to the figure automatically, leaving the plot empty.
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(X[:,0], X[:,1], X[:,2], c=class_idx)
    for i, label in enumerate(labels):
        ax.text(X[i,0], X[i,1], X[i,2], str(label), color=palette[class_idx[i]], fontsize='xx-small')

Load MNIST data

In [3]:
# Read the MNIST splits from MATLAB .mat files.
mnist_train = loadmat('mnist/mnist_train.mat')
# The test split was previously read from the training file (copy-paste slip).
# NOTE(review): assumes the test file sits beside it as mnist_test.mat - confirm.
mnist_test = loadmat('mnist/mnist_test.mat')
X_train = mnist_train['train_X']
# Flatten the label array to 1-D so it can be iterated label by label.
y_train = mnist_train['train_labels'].ravel()
# Draw a seeded random subset of 1000 (sample, label) pairs; the embeddings
# in this notebook are computed on this subset.
Xs, ys = shuffle(X_train, y_train, n_samples=1000, random_state=0)

2D version

In [7]:
# Embed the subsample Xs into 2 components with scikit-learn's TSNE (seeded).
model = TSNE(n_components=2, random_state=0)
# %time prints the CPU and wall-clock time taken by the fit.
%time P = model.fit_transform(Xs)

CPU times: user 5.88 s, sys: 323 ms, total: 6.2 s
Wall time: 6.77 s

In [8]:
# Plot the 2-D TSNE embedding P, with each point annotated by its label in ys.
scatter2d(P, ys)

Run Barnes-Hut t-SNE (see the `tsne` package, which provides `bh_sne`)

In [9]:
# Embed the same subsample Xs with bh_sne from the tsne package, timed with %time.
# NOTE(review): no random seed is passed here - confirm whether this result is reproducible.
%time B = bh_sne(Xs)

CPU times: user 5.11 s, sys: 63.1 ms, total: 5.17 s
Wall time: 5.74 s

In [10]:
# Plot the bh_sne embedding B, with each point annotated by its label in ys.
scatter2d(B, ys)

In [ ]:

3D version

In [11]:
# Embed the subsample Xs into 3 components with scikit-learn's TSNE (seeded).
model3 = TSNE(n_components=3, random_state=0)
# %time prints the CPU and wall-clock time taken by the fit.
%time P3 = model3.fit_transform(Xs)

CPU times: user 8.12 s, sys: 350 ms, total: 8.47 s
Wall time: 9.35 s

In [14]:
# Plot the 3-D TSNE embedding P3, with each point annotated by its label in ys.
scatter3d(P3, ys)