We'll now look at ALL the dimensionality reduction algorithms we've looked at so far and compare them using several datasets.
In [10]:
    
# Authors: Fabian Pedregosa <fabian.pedregosa@inria.fr>
#          Olivier Grisel <olivier.grisel@ensta.org>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Gael Varoquaux
# License: BSD 3 clause (C) INRIA 2011
plt.close('all')
digits = datasets.load_digits() #n_class=n_class)
X = digits.data
y = digits.target
n_samples, n_features = X.shape
n_neighbors = 30
PALETTE = sns.color_palette('husl', n_colors=len(np.unique(digits.target)))
# PALETTE = sns.diverging_palette(220, 20, n=10, center='dark')
digits_row_colors = [PALETTE[i] for i in digits.target]
#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    plt.figure()
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(digits.target[i]),
                 color=PALETTE[int(y[i])],
                 fontdict={'weight': 'bold', 'size': 9})
    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for i in range(digits.data.shape[0]):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            shown_images = np.r_[shown_images, [X[i]]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
                X[i])
            ax.add_artist(imagebox)
    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)
#----------------------------------------------------------------------
# Plot images of the digits
n_img_per_row = 20
img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
for i in range(n_img_per_row):
    ix = 10 * i + 1
    for j in range(n_img_per_row):
        iy = 10 * j + 1
        img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8))
plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([])
plt.yticks([])
plt.title('A selection from the 64-dimensional digits dataset');
sns.despine(bottom=True, left=True)
    
    
    
In [4]:
    
from time import time
import ipywidgets
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import offsetbox
import pandas as pd
from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)
import seaborn as sns
sns.set(context='notebook', style='white')
%matplotlib inline
np.random.seed(2016)
n_samples = 10
n_genes = 20
half_genes = int(n_genes/2)
half_samples = int(n_samples/2)
size = n_samples * n_genes
genes = ['Gene_{}'.format(str(i+1).zfill(2)) for i in range(n_genes)]
samples = ['Sample_{}'.format(str(i+1).zfill(2)) for i in range(n_samples)]
mouse_data = pd.DataFrame(np.random.randn(size).reshape(n_samples, n_genes), index=samples, columns=genes)
# Add biological variance
mouse_data.iloc[:half_samples, :half_genes] += 1
mouse_data.iloc[:half_samples, half_genes:] += -1
mouse_data.iloc[half_samples:, half_genes:] += 1
mouse_data.iloc[half_samples:, :half_genes] += -1
# Z_score within genes
mouse_data = (mouse_data - mouse_data.mean())/mouse_data.std()
# Biological samples
mouse_groups = pd.Series(dict(zip(mouse_data.index, (['Mouse_01'] * int(n_samples/2)) + (['Mouse_02'] * int(n_samples/2)))), 
                         name="Mouse")
mouse_to_color = dict(zip(['Mouse_01', 'Mouse_02'], ['lightgrey', 'black']))
mouse_colors = [mouse_to_color[mouse_groups[x]] for x in samples]
# Gene colors
gene_colors = (['SeaGreen'] * half_genes) + (['MediumPurple'] * half_genes)
mouse_row_colors = mouse_colors
mouse_col_colors = gene_colors 
g = sns.clustermap(mouse_data, row_colors=mouse_row_colors, col_cluster=False, row_cluster=False, 
                   linewidth=0.5, col_colors=mouse_col_colors,
                   cbar_kws=dict(label='Normalized Expression'))
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0);
g.fig.suptitle('Mouse data')
np.random.seed(2016)
n_samples = 10
n_genes = 20
half_genes = int(n_genes/2)
half_samples = int(n_samples/2)
size = n_samples * n_genes
genes = ['Gene_{}'.format(str(i+1).zfill(2)) for i in range(n_genes)]
samples = ['Sample_{}'.format(str(i+1).zfill(2)) for i in range(n_samples)]
pseudotime_data = pd.DataFrame(np.random.randn(size).reshape(n_samples, n_genes), index=samples, columns=genes)
# Add "psueodotime"
pseudotime_data.iloc[:, :half_genes] = pseudotime_data.iloc[:, :half_genes].add(np.square(np.arange(n_samples)/2), axis=0)
pseudotime_data.iloc[:, half_genes:] = pseudotime_data.iloc[:, half_genes:].add(np.square(np.arange(n_samples)[::-1]/2), axis=0)
# Normalize genes using z-scores
pseudotime_data = (pseudotime_data - pseudotime_data.mean())/pseudotime_data.std()
pseudotime_row_colors = sns.color_palette('BrBG', n_colors=n_samples)
pseudotime_col_colors = sns.color_palette("PRGn", n_colors=n_genes)
tidy = pseudotime_data.unstack().reset_index()
tidy = tidy.rename(columns={'level_0': 'Gene', 'level_1': "Sample", 0:'Normalized Expression'})
tidy.head()
g = sns.factorplot(data=tidy, hue='Gene', palette=pseudotime_col_colors, x='Sample', 
                   y='Normalized Expression', aspect=2)
# g.map(plt.plot, x='Sample', y='Normalized Expression')
g = sns.clustermap(pseudotime_data, row_colors=pseudotime_row_colors, col_cluster=False, row_cluster=False, 
                   linewidth=0.5, col_colors=pseudotime_col_colors,
                   cbar_kws=dict(label='Normalized Expression'))
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0);
g.fig.suptitle('Pseudotime data')
    
In [ ]:
    
shalek2013_expression = pd.read_csv('https://github.com/YeoLab/shalek2013/raw/master/expression.csv')
shalek2013_expression_feature = pd.read_csv('https://github.com/YeoLab/shalek2013/raw/master/expression_feature.csv')
shalek2013_metadata = pd.read_csv('https://github.com/YeoLab/shalek2013/raw/master/metadata.csv')
    
In [18]:
    
methods = [
#     'Clustering: complete', 
#            'Clustering: single', 
#            'Clustering: average', 
#            'Clustering: ward', 
#            'Clustering: centroid',
           'Matrix decomposition: PCA',
           'Matrix decomposition: ICA',
           'Manifold learning: MDS',
           'Manifold learning: t-SNE'
          ]
def explore_clustering(dataset, method):
    if dataset == 'Mouse':
        data = mouse_data
        row_colors = mouse_row_colors
        col_colors = mouse_col_colors
    elif dataset == 'Pseudotime':
        data = pseudotime_data
        row_colors = pseudotime_row_colors
        col_colors = pseudotime_col_colors
    elif dataset == 'Digits':
        data = digits.data
        row_colors = digits_row_colors
        col_colors = None
    # Copy the full name of the method
    fullname = str(method)
    if method.startswith('Clustering'):
        method = method.split()[-1]
        t0 = time()
        g = sns.clustermap(data, row_colors=row_colors, method=method,
                           xticklabels=[], yticklabels=[])
        g.fig.suptitle('{} linkage of the digits (time {:.2f}s)'.format(fullname, time()-t0))
        
    else:
        n_components = 2
        max_iter = 100
        random_state = 0
        n_init = 1
        if method.endswith('PCA'):
            estimator = decomposition.PCA(n_components=n_components)
        elif method.endswith('ICA'):
            estimator = decomposition.FastICA(max_iter=max_iter, n_components=n_components, 
                                              random_state=random_state)
        elif method.endswith('MDS'):
            estimator =  manifold.MDS(n_init=n_init, max_iter=max_iter, random_state=random_state)
        elif method.endswith('t-SNE'):
            estimator = manifold.TSNE(n_components, init='pca', random_state=random_state)
        
        t0 = time()
        smushed = estimator.fit_transform(data)
        title = "{} embedding of the {} (time {:.2f}s)".format(fullname, dataset, time() - t0)
        if dataset == 'Digits':
            plot_embedding(smushed, title)
            # Plot a legend by hand
            fig, ax = plt.subplots(figsize=(1, 1))
            for digit, color in zip(digits.target, PALETTE):
                ax.bar(0, 0, color=color, label=digit)
            ax.patches = []
            ax.legend(loc='center')
            ax.axis('off')
        else:
            fig, ax = plt.subplots()
            
            ax.scatter(smushed[:, 0], smushed[:, 1], color=row_colors, s=40)
            ax.set(title=title)
            sns.despine()
ipywidgets.interact(explore_clustering,
                    dataset=ipywidgets.Dropdown(options=['Mouse', 'Pseudotime', 'Digits'], value='Mouse', 
                                               description='Dataset'),
                    metric=ipywidgets.Dropdown(options=['euclidean', 'cityblock', ], value='euclidean', 
                                               description='Distance metric'),
                    method=ipywidgets.Dropdown(options=methods, value='Matrix decomposition: PCA', 
                                               description='Unsupervised learning method'),);