Applied Data Science

Data Exploration and Visualisation

Tom Diethe


In [1]:
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython import display
from scipy.io import loadmat
from tsne import bh_sne
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
from mpl_toolkits.mplot3d import Axes3D

sns.set_context("talk", font_scale=2, rc={"lines.linewidth": 4})



Plotting functions


In [13]:
def scatter2d(X, y):
    """Scatter plot of a 2D embedding, annotating each point with its class label."""
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.scatter(X[:, 0], X[:, 1], c=y)
    # One colour per class; the .mat labels run 1-10, hence the -1 offset into the palette
    palette = sns.color_palette("husl", len(np.unique(y)))
    for i in range(len(y)):
        ax.text(X[i, 0], X[i, 1], y[i], color=palette[y[i] - 1], fontsize='xx-small')


def scatter3d(X, y):
    """Scatter plot of a 3D embedding, annotating each point with its class label."""
    fig = plt.figure(figsize=(15, 10))
    # Create the 3D axes in a single call; this also enables interactive mouse rotation
    ax = fig.add_subplot(111, projection='3d')
    palette = sns.color_palette("husl", len(np.unique(y)))
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y)
    for i in range(len(y)):
        ax.text(X[i, 0], X[i, 1], X[i, 2], y[i], color=palette[y[i] - 1], fontsize='xx-small')

Load MNIST data


In [3]:
mnist_train = loadmat('mnist/mnist_train.mat')
mnist_test = loadmat('mnist/mnist_test.mat')
X_train = mnist_train['train_X']
y_train = mnist_train['train_labels'].ravel()
# Subsample 1,000 points so that the embeddings below run in a few seconds
Xs, ys = shuffle(X_train, y_train, n_samples=1000, random_state=0)
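
If the .mat files are not available, a comparable subsample can be drawn from OpenML instead. This is a sketch rather than part of the original workflow: it assumes an internet connection and scikit-learn >= 0.22 (for the as_frame argument), and note that the OpenML labels are the strings '0'-'9' rather than 1-10.


In [ ]:
from sklearn.datasets import fetch_openml

# Alternative loader (assumption, not the course data): 70,000 x 784 pixel matrix from OpenML
mnist_alt = fetch_openml('mnist_784', version=1, as_frame=False)
X_alt = mnist_alt.data / 255.0              # scale pixel intensities to [0, 1]
y_alt = mnist_alt.target.astype(int)        # string labels -> integers 0-9
Xs_alt, ys_alt = shuffle(X_alt, y_alt, n_samples=1000, random_state=0)
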

2D version


In [7]:
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
%time P = model.fit_transform(Xs)


CPU times: user 5.88 s, sys: 323 ms, total: 6.2 s
Wall time: 6.77 s

In [8]:
scatter2d(P, ys)
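
t-SNE is sensitive to its perplexity parameter, which roughly sets the effective number of neighbours each point considers. A quick way to see this is to re-fit with a different value; this is a sketch, and perplexity=50 is an arbitrary choice (the scikit-learn default is 30).


In [ ]:
# Re-fit with a larger perplexity to see how the cluster layout changes
model_p50 = TSNE(n_components=2, perplexity=50, random_state=0)
P50 = model_p50.fit_transform(Xs)
scatter2d(P50, ys)
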


Run Barnes-Hut t-SNE (see https://github.com/danielfrg/tsne)


In [9]:
%time B = bh_sne(Xs)


CPU times: user 5.11 s, sys: 63.1 ms, total: 5.17 s
Wall time: 5.74 s

In [10]:
scatter2d(B, ys)
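
Recent versions of scikit-learn implement the Barnes-Hut approximation directly in TSNE (method='barnes_hut', only for n_components up to 3), so the external package is optional. A minimal sketch, assuming scikit-learn >= 0.17:


In [ ]:
# Barnes-Hut t-SNE via scikit-learn; angle trades accuracy for speed (smaller = more accurate, slower)
model_bh = TSNE(n_components=2, method='barnes_hut', angle=0.5, random_state=0)
B_sk = model_bh.fit_transform(Xs)
scatter2d(B_sk, ys)
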



3D Version


In [11]:
model3 = TSNE(n_components=3, random_state=0)
np.set_printoptions(suppress=True)
%time P3 = model3.fit_transform(Xs)


CPU times: user 8.12 s, sys: 350 ms, total: 8.47 s
Wall time: 9.35 s

In [14]:
scatter3d(P3, ys)



PCA on same data


In [4]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
Xp = pca.fit_transform(Xs)
scatter2d(Xp, ys)
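
PCA also reports how much of the total variance the two components retain, which goes some way to explaining why the digit clusters overlap more here than in the t-SNE plots. A quick check using the fitted pca object above:


In [ ]:
# Fraction of the variance captured by each of the two principal components, and their total
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())
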



Random projections on same data


In [5]:
from sklearn import random_projection

# Johnson-Lindenstrauss lower bound: the number of dimensions needed to preserve
# pairwise distances among all training points to within a factor of (1 +/- eps)
min_dim = random_projection.johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.9)

# Project the 1,000-point subsample down to 2 dimensions with a Gaussian random matrix
grp = random_projection.GaussianRandomProjection(n_components=2)
X_new = grp.fit_transform(Xs)
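
The Johnson-Lindenstrauss bound above gives the projection dimension needed to preserve pairwise distances to within (1 +/- eps); two dimensions fall far short of it, so the plots below are for visualisation only, not distance-preserving embeddings. A quick look at how the bound varies with the allowed distortion (a sketch using the same scikit-learn helper on the 1,000-point subsample):


In [ ]:
# JL lower bound on the projection dimension at several distortion levels
for eps in (0.1, 0.5, 0.9):
    d = random_projection.johnson_lindenstrauss_min_dim(n_samples=Xs.shape[0], eps=eps)
    print("eps=%.1f -> min_dim=%d" % (eps, d))
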

In [6]:
scatter2d(X_new, ys)



In [6]:
srp = random_projection.SparseRandomProjection(n_components=2)
X_new = srp.fit_transform(Xs)
scatter2d(X_new, ys)
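
SparseRandomProjection uses a sparse sign matrix whose density defaults to 1/sqrt(n_features), following Li et al.; the density can also be set explicitly to trade memory and speed against projection quality. A sketch with the denser Achlioptas-style choice of 1/3:


In [ ]:
# Denser sparse projection: roughly one third of the entries are non-zero
srp_dense = random_projection.SparseRandomProjection(n_components=2, density=1/3.0, random_state=0)
X_srp = srp_dense.fit_transform(Xs)
scatter2d(X_srp, ys)
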


