In [2]:
# Baked-in within python modules
from collections import defaultdict
# Alphabetical order for nonstandard python modules is conventional
# We're doing "import superlongname as abbrev" for our laziness - this way we don't have to type out the whole thing each time.
# Python plotting library
import matplotlib as mpl
import matplotlib.pyplot as plt
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# T-test of independent samples
from scipy.stats import ttest_ind
# Statistical plotting library we'll use
import seaborn as sns
sns.set(style='whitegrid')
# Matrix decomposition
from sklearn.decomposition import PCA, FastICA
# Manifold learning
from sklearn.manifold import MDS, TSNE
# Clustering
from sklearn.cluster import KMeans, MiniBatchKMeans
# Plotting dendrograms
from scipy.cluster import hierarchy
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
In [3]:
shalek2013_metadata = pd.read_csv('../data/shalek2013/metadata.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
shalek2013_expression = pd.read_csv('../data/shalek2013/expression.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
shalek2013_expression_feature = pd.read_csv('../data/shalek2013/expression_feature.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
In [4]:
shalek2013_metadata['color'] = shalek2013_metadata['maturity'].map(
lambda x: 'MediumTurquoise' if x == 'immature' else 'Teal')
shalek2013_metadata.loc[shalek2013_metadata['pooled'], 'color'] = 'black'
shalek2013_metadata
Out[4]:
If you specify "n_components=None
", then the program will find as many components as there are samples.
In [5]:
shalek2013_smusher = PCA(n_components=None)
shalek2013_smushed = pd.DataFrame(shalek2013_smusher.fit_transform(shalek2013_expression),
index=shalek2013_expression.index)
print(shalek2013_smushed.shape)
shalek2013_smushed.head()
Out[5]:
In [6]:
g = sns.clustermap(shalek2013_smushed)
We can add the colors of the samples as the rows, the "MediumTurqouise" (lighter) for the immature cells, "Teal" (darker) for the mature cells, and black for the pooled samples.
In [7]:
g = sns.clustermap(shalek2013_smushed, row_colors=shalek2013_metadata['color'])
We can use different distance metrics, too. Here we're using "cityblock"
(instead of the default "euclidean"
)
In [8]:
g = sns.clustermap(shalek2013_smushed, metric='cityblock', row_colors=shalek2013_metadata['color'])
Or different distance linkage methods. Here we're using "ward"
(instead of the default "average"
)
In [9]:
g = sns.clustermap(shalek2013_smushed, method='ward', row_colors=shalek2013_metadata['color'])
You can combine methods and metrics as well:
In [10]:
g = sns.clustermap(shalek2013_smushed, method='ward', metric='cityblock', row_colors=shalek2013_metadata['color'])
In [11]:
shalek2013_smusher = PCA(n_components=5)
shalek2013_smushed = pd.DataFrame(shalek2013_smusher.fit_transform(shalek2013_expression),
index=shalek2013_expression.index)
print(shalek2013_smushed.shape)
g = sns.clustermap(shalek2013_smushed, method='ward', metric='euclidean',
row_colors=shalek2013_metadata['color'])
In [12]:
figwidth, figheight = 8, 3
fig, ax = plt.subplots(figsize=(figwidth, figheight))
# Make the clustering dendrogram colors not suck
hierarchy.set_link_color_palette(list(map(mpl.colors.rgb2hex, sns.color_palette('Dark2', n_colors=12))))
cluster_threshold = 50
cden = hierarchy.dendrogram(g.dendrogram_row.linkage,
color_threshold=cluster_threshold,
labels=shalek2013_expression.index,
above_threshold_color='k')
plt.axhline(cluster_threshold, color='Crimson', linestyle='--');
plt.xticks(rotation=90, fontsize=4);
Here's a utility function to get the cluster class assignments after thresholding
In [13]:
"""
Cluster assignment and coloring functions from Macaulay supplemental notebooks
"""
class Clusters(dict):
def _repr_html_(self):
html = '<table style="border: 0;">'
for c in self:
hx = mpl.colors.rgb2hex(mpl.colors.colorConverter.to_rgb(c))
html += '<tr style="border: 0;">' \
'<td style="background-color: {0}; ' \
'border: 0;">' \
'<code style="background-color: {0};">'.format(hx)
html += c + '</code></td>'
html += '<td style="border: 0"><code>'
html += repr(self[c]) + '</code>'
html += '</td></tr>'
html += '</table>'
return html
def get_cluster_classes(den, label='ivl'):
cluster_idxs = defaultdict(list)
for c, pi in zip(den['color_list'], den['icoord']):
for leg in pi[1:3]:
i = (leg - 5.0) / 10.0
if abs(i - int(i)) < 1e-5:
cluster_idxs[c].append(int(i))
cluster_classes = Clusters()
for c, l in cluster_idxs.items():
i_l = list(sorted([den[label][i] for i in l]))
cluster_classes[c] = i_l
return cluster_classes
def get_cluster_limits(den):
cluster_idxs = defaultdict(list)
for c, pi in zip(den['color_list'], den['icoord']):
for leg in pi[1:3]:
i = (leg - 5.0) / 10.0
if abs(i - int(i)) < 1e-5:
cluster_idxs[c].append(int(i))
cluster_limits = Clusters()
for c in cluster_idxs:
cluster_limits[c] = (min(cluster_idxs[c]), max(cluster_idxs[c]))
return cluster_limits
In [14]:
# --- Get cluster-defined colors for each cell, for this threshold --- #
clusters = get_cluster_classes(cden)
cluster_cell_colors = []
for cell in shalek2013_expression.index:
for color in clusters:
if cell in clusters[color]:
cluster_cell_colors.append(color)
break
cluster_cell_colors
Out[14]:
Re-plot the clustered heatmap with the samples colored by clusters
In [15]:
g = sns.clustermap(shalek2013_smushed, method='ward', metric='euclidean',
row_colors=cluster_cell_colors)
Plot the first two components of the PCA with the cells colored by the cluster classes
In [16]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(shalek2013_smushed[0], shalek2013_smushed[1], color=cluster_cell_colors,
s=100, edgecolor='white', linewidth=1)
xlabel = 'PC1 explains {:.1f}% variance'.format(100*shalek2013_smusher.explained_variance_ratio_[0])
ylabel = 'PC2 explains {:.1f}% variance'.format(100*shalek2013_smusher.explained_variance_ratio_[1])
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
# Force legend location to be upper left
ax.legend(loc='upper left')
Out[16]:
In [17]:
from sklearn.cluster import KMeans
n_clusters = 5
estimator = KMeans(n_clusters)
estimator.fit(shalek2013_expression)
kmeans_clusters = estimator.predict(shalek2013_expression)
kmeans_clusters
Out[17]:
In [18]:
kmeans_palette = sns.color_palette('Set2', n_colors=n_clusters)
kmeans_colors = [kmeans_palette[i] for i in kmeans_clusters]
Project the cluster centers into reduced dimensionality space
In [19]:
estimator.cluster_centers_
Out[19]:
In [20]:
cluster_centers = pd.DataFrame(shalek2013_smusher.fit_transform(estimator.cluster_centers_))
cluster_centers
Out[20]:
Plot the PCA-reduced data with the cluster columns, and the cluster centers as a black "X", using the "
In [21]:
plt.scatter(shalek2013_smushed[0], shalek2013_smushed[1], color=kmeans_colors,
s=100, edgecolor='white', linewidth=1);
plt.scatter(cluster_centers[0], cluster_centers[1], color='k', marker='x', s=100, linewidth=3)
Out[21]:
In [22]:
macaulay2016_expression = pd.read_csv('../data/macaulay2016/gene_expression_s.csv', index_col=0)
# Set maximum columns to display as 50 because the dataframe has 49 columns
pd.options.display.max_columns = 50
macaulay2016_metadata = pd.read_csv('../data/macaulay2016/sample_info_qc.csv', index_col=0)
# Add column for gfp
macaulay2016_metadata['gfp_color'] = ['#31a354' if c == 'HIGH' else '#e5f5e0' for c in macaulay2016_metadata['condition']]
# Necessary step for converting the parsed cluster color to be usable with matplotlib
macaulay2016_metadata['cluster_color'] = macaulay2016_metadata['cluster_color'].map(eval)
# --- Filter macaulay2016 data --- #
ensembl_genes = [x for x in macaulay2016_expression.index if x.startswith('ENS')]
cells_pass_qc = macaulay2016_metadata["Pass QC"].index[macaulay2016_metadata["Pass QC"]]
macaulay2016_expression_filtered = macaulay2016_expression.loc[ensembl_genes, cells_pass_qc]
# Recalculate TPM
macaulay2016_expression_filtered = 1e6 * macaulay2016_expression_filtered / macaulay2016_expression_filtered.sum()
# Transpose so it's machine learning format
macaulay2016_expression_filtered = macaulay2016_expression_filtered.T
# Take only "expressed genes" with expression greater than 1 in at least 3 cells
mask = (macaulay2016_expression_filtered > 1).sum() >= 3
macaulay2016_expression_filtered = macaulay2016_expression_filtered.loc[:, mask]
print('macaulay2016_expression_filtered.shape', macaulay2016_expression_filtered.shape)
# Add 1 and log10
macaulay2016_expression_log10 = np.log10(macaulay2016_expression_filtered + 1)
# Macaulay2016 plotting colors
macaulay2016_gfp_colors = macaulay2016_metadata.loc[macaulay2016_expression_log10.index, 'gfp_color']
# Get cluster colors from the paper
macaulay2016_cluster_colors_from_paper = macaulay2016_metadata.loc[macaulay2016_expression_log10.index, 'cluster_color']
macaulay2016_clusters_from_paper = macaulay2016_metadata.loc[macaulay2016_expression_log10.index, 'cluster']
macaulay2016_cluster_to_color_from_paper = dict(zip(macaulay2016_clusters_from_paper, macaulay2016_cluster_colors_from_paper))
Plot the Macaulay data. We'll call the PCA or ICA reduced data macaulay2016_decomposer
because we'll use t-SNE later and we want to distinguish from when we're using matrix decomposition vs manifold learning.
In [23]:
macaulay2016_decomposer = PCA(n_components=10)
macaulay2016_decomposed = pd.DataFrame(macaulay2016_decomposer.fit_transform(macaulay2016_expression_log10),
index=macaulay2016_expression_log10.index)
print(macaulay2016_decomposed.shape)
g = sns.clustermap(macaulay2016_decomposed, method='ward', metric='euclidean',
row_colors=macaulay2016_gfp_colors)
We can also color the rows by the actual clusters from the paper.
In [24]:
macaulay2016_decomposer = PCA(n_components=10)
macaulay2016_decomposed = pd.DataFrame(macaulay2016_decomposer.fit_transform(macaulay2016_expression_log10),
index=macaulay2016_expression_log10.index)
print(macaulay2016_decomposed.shape)
g = sns.clustermap(macaulay2016_decomposed, method='ward', metric='euclidean',
row_colors=macaulay2016_cluster_colors_from_paper)
In [25]:
# YOUR CODE HERE
In [26]:
# YOUR CODE HERE
In [27]:
# YOUR CODE HERE
In [28]:
# YOUR CODE HERE
In [29]:
# YOUR CODE HERE
In [30]:
# YOUR CODE HERE
In [31]:
macaulay2016_decomposer = FastICA(n_components=10, random_state=3984)
macaulay2016_decomposed = pd.DataFrame(macaulay2016_decomposer.fit_transform(macaulay2016_expression_log10),
index=macaulay2016_expression_log10.index)
macaulay2016_tsne_smusher = TSNE(n_components=2, random_state=254)
macaulay2016_tsne_smushed = pd.DataFrame(macaulay2016_tsne_smusher.fit_transform(macaulay2016_decomposed),
index=macaulay2016_expression_log10.index)
fig, ax = plt.subplots()
ax.scatter(macaulay2016_tsne_smushed[0], macaulay2016_tsne_smushed[1],
color=macaulay2016_gfp_colors)
Out[31]:
In [32]:
# YOUR CODE HERE
In [33]:
# YOUR CODE HERE
In [34]:
# YOUR CODE HERE