In [ ]:
# Alphabetical order is standard
# We're doing "import superlongname as abbrev" for our laziness - this way we don't have to type out the whole thing each time.
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# T-test of independent samples
from scipy.stats import ttest_ind
# Statistical plotting library we'll use
import seaborn as sns
sns.set(style='whitegrid')
# Matrix decomposition
from sklearn.decomposition import PCA, FastICA
# Manifold learning
from sklearn.manifold import MDS, TSNE
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
# Read the data table
shalek2013_metadata = pd.read_csv('../data/shalek2013/metadata.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
shalek2013_metadata.head()
In [ ]:
shalek2013_expression = pd.read_csv('../data/shalek2013/expression.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
shalek2013_expression.head()
In [ ]:
shalek2013_expression_feature = pd.read_csv('../data/shalek2013/expression_feature.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
shalek2013_expression_feature.head()
In [ ]:
smusher = PCA(n_components=4)
# Turn the matrix-decomposed data
smushed = pd.DataFrame(smusher.fit_transform(shalek2013_expression), index=shalek2013_expression.index)
smushed.head()
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components (column names)
ax.scatter(smushed[0], smushed[1])
We could plot other components if we felt like it:
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the second ("1") and fourth ("3") components
ax.scatter(smushed[1], smushed[3])
Let's use a different color for mature and immature cells using a list comprehension. All X11 color names are valid here. I also like this list for looking up color names
In [ ]:
maturity_color = ['mediumturquoise' if x == 'immature' else 'teal' for x in shalek2013_metadata['maturity']]
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(smushed[0], smushed[1], color=maturity_color)
Hmm those points way to the left look strange .. is it because they are pooled? Let's add a black outline to the pooled samples. That means we'll have to plot them separately.
In [ ]:
singles = shalek2013_metadata.query('pooled == False').index
singles
In [ ]:
pooled = shalek2013_metadata.query('pooled == True').index
pooled
We'll use .loc notation to access the pooled and single rows separately.
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(smushed.loc[singles, 0], smushed.loc[singles, 1], color=maturity_color)
ax.scatter(smushed.loc[pooled, 0], smushed.loc[pooled, 1], color=maturity_color, edgecolor='black', linewidth=1)
Hmmm it's hard to tell which ones have the outline so lets make the markers bigger with the argument s=100. "s" is short for "size." The default value is 20.
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(smushed.loc[singles, 0], smushed.loc[singles, 1], color=maturity_color, s=100)
ax.scatter(smushed.loc[pooled, 0], smushed.loc[pooled, 1], color=maturity_color, edgecolor='black', linewidth=1, s=100)
Let's add a nice legend too.
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(smushed.loc[singles, 0], smushed.loc[singles, 1], color=maturity_color, s=100, label='singles')
ax.scatter(smushed.loc[pooled, 0], smushed.loc[pooled, 1], color=maturity_color,
edgecolor='black', linewidth=1, s=100, label='pooled')
ax.legend(loc='best')
Oh hmm that only made a legend for the light blue. We'll have to plot the different colors separately. To do that, we'll have to get a subset of the data using just the mature and immature cells.
In [ ]:
immature = shalek2013_metadata.query('maturity == "immature"').index
immature
In [ ]:
mature = shalek2013_metadata.query('maturity == "mature"').index
mature
Now plot the different subsets separately so they have different labels in the legend.
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(smushed.loc[singles & immature, 0], smushed.loc[singles & immature, 1], color='MediumTurquoise',
s=100, label='single, immature')
ax.scatter(smushed.loc[singles & mature, 0], smushed.loc[singles & mature, 1], color='Teal',
s=100, label='single, mature')
ax.scatter(smushed.loc[pooled, 0], smushed.loc[pooled, 1], color='black',
edgecolor='black', linewidth=1, s=100, label='pooled')
ax.legend()
Hmm the legend overlaps with the cells so let's put the legend in the upper left.
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(smushed.loc[singles & immature, 0], smushed.loc[singles & immature, 1], color='MediumTurquoise',
s=100, label='single, immature')
ax.scatter(smushed.loc[singles & mature, 0], smushed.loc[singles & mature, 1], color='Teal',
s=100, label='single, mature')
ax.scatter(smushed.loc[pooled, 0], smushed.loc[pooled, 1], color='black',
edgecolor='black', linewidth=1, s=100, label='pooled')
# Force legend location to be upper left
ax.legend(loc='upper left')
Now that we don't have the pooled samples outlined in black, let's also add a white outline so it's easier to tell the individual cells apart, with adding the following to ax.scatter:
edgecolor='white', linewidth=1
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(smushed.loc[singles & immature, 0], smushed.loc[singles & immature, 1], color='MediumTurquoise',
s=100, label='single, immature', edgecolor='white', linewidth=1)
ax.scatter(smushed.loc[singles & mature, 0], smushed.loc[singles & mature, 1], color='Teal',
s=100, label='single, mature', edgecolor='white', linewidth=1)
ax.scatter(smushed.loc[pooled, 0], smushed.loc[pooled, 1], color='black',
s=100, label='pooled', edgecolor='white', linewidth=1)
# Force legend location to be upper left
ax.legend(loc='upper left')
And label the x and y axes with the percentage explained variance.
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(smushed.loc[singles & immature, 0], smushed.loc[singles & immature, 1], color='MediumTurquoise',
s=100, label='single, immature', edgecolor='white', linewidth=1)
ax.scatter(smushed.loc[singles & mature, 0], smushed.loc[singles & mature, 1], color='Teal',
s=100, label='single, mature', edgecolor='white', linewidth=1)
ax.scatter(smushed.loc[pooled, 0], smushed.loc[pooled, 1], color='black', edgecolor='white', linewidth=1,
s=100, label='pooled')
xlabel = 'PC1 explains {:.1f}% variance'.format(100*smusher.explained_variance_ratio_[0])
ylabel = 'PC2 explains {:.1f}% variance'.format(100*smusher.explained_variance_ratio_[1])
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
# Force legend location to be upper left
ax.legend(loc='upper left')
If we like, we can save the figure with fig.savefig("shalek2013_pca.pdf"). The format of the file to save is auto-detected
In [ ]:
# Initialize a figure with a single subpanel (axes, or ax) to plot on
fig, ax = plt.subplots()
# Plot the first ("0") and second ("1") components
ax.scatter(smushed.loc[singles & immature, 0], smushed.loc[singles & immature, 1], color='MediumTurquoise',
s=100, label='single, immature', edgecolor='white', linewidth=1)
ax.scatter(smushed.loc[singles & mature, 0], smushed.loc[singles & mature, 1], color='Teal',
s=100, label='single, mature', edgecolor='white', linewidth=1)
ax.scatter(smushed.loc[pooled, 0], smushed.loc[pooled, 1], color='black',
edgecolor='white', linewidth=1, s=100, label='pooled')
xlabel = 'PC1 explains {:.1f}% variance'.format(100*smusher.explained_variance_ratio_[0])
ylabel = 'PC2 explains {:.1f}% variance'.format(100*smusher.explained_variance_ratio_[1])
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
# Force legend location to be upper left
ax.legend(loc='upper left')
fig.savefig('shalek2013_pca.pdf')
To do this exercise, you don't have to use the cells above, you can press "+" to add a cell and work in there instead.
FastICA), t-SNE (TSNE) and MDS (MDS)FastICA(random_state=0) for example
In [ ]:
macaulay2016_expression = pd.read_csv('../data/macaulay2016/gene_expression_s.csv', index_col=0)
macaulay2016_expression.head()
Notice that there's ERCCs and GFP expression in this matrix as well.
In [ ]:
macaulay2016_expression.tail()
Read metadata
In [ ]:
# Set maximum columns to display as 50 because the dataframe has 49 columns
pd.options.display.max_columns = 50
macaulay2016_metadata = pd.read_csv('../data/macaulay2016/sample_info_qc.csv', index_col=0)
# Necessary step for converting the parsed cluster color to be usable with matplotlib
macaulay2016_metadata['cluster_color'] = macaulay2016_metadata['cluster_color'].map(eval)
macaulay2016_metadata.head()
Filter the gene expression data to use only ensembl genes (no ERCCs or GFP), only use cells that passed QC, and recalculate transcripts per million without the ERCCS.
In [ ]:
ensembl_genes = [x for x in macaulay2016_expression.index if x.startswith('ENS')]
cells_pass_qc = macaulay2016_metadata["Pass QC"].index[macaulay2016_metadata["Pass QC"]]
macaulay2016_expression_filtered = macaulay2016_expression.loc[ensembl_genes, cells_pass_qc]
# Recalculate TPM
macaulay2016_expression_filtered = 1e6 * macaulay2016_expression_filtered / macaulay2016_expression_filtered.sum()
# Transpose so it's machine learning format
macaulay2016_expression_filtered = macaulay2016_expression_filtered.T
# Take only "expressed genes" with expression greater than 1 in at least 3 cells
mask = (macaulay2016_expression_filtered > 1).sum() >= 3
macaulay2016_expression_filtered = macaulay2016_expression_filtered.loc[:, mask]
print(macaulay2016_expression_filtered.shape)
macaulay2016_expression_filtered.head()
Add 1 and take the log10
In [ ]:
macaulay2016_expression_log10 = np.log10(macaulay2016_expression_filtered + 1)
macaulay2016_expression_log10.head()
To help you with plotting, here is a dictionary which maps the name of the cluster to the (r,g,b) color values, where each value is from 0 to 1.
In [ ]:
macaulay2016_colors = macaulay2016_metadata.loc[macaulay2016_expression_log10.index, 'cluster_color']
macaulay2016_clusters = macaulay2016_metadata.loc[macaulay2016_expression_log10.index, 'cluster']
macaulay2016_cluster_to_color = dict(zip(macaulay2016_clusters, macaulay2016_colors))
macaulay2016_cluster_to_color
Here's an example plot using a groupby, which is a super-duper convenient function (on pandas DataFrames only - that's why we're making the PCA and FastICA results into dataframes) to take (non-overlapping) subsets of your data. Here we're taking all cells that correspond to each cluster, and plotting them here.
In [ ]:
fig, ax = plt.subplots()
for cluster, df in macaulay2016_expression_log10.groupby(macaulay2016_clusters):
print(cluster)
color = macaulay2016_cluster_to_color[cluster]
ax.scatter(df['ENSDARG00000000001'],
df['ENSDARG00000000002'], color=color, label=cluster)
ax.legend()
In [ ]:
smusher = PCA(n_components=4)
macaulay2016_reduced = pd.DataFrame(smusher.fit_transform(macaulay2016_expression_log10),
index=macaulay2016_expression_log10.index)
macaulay2016_reduced.head()
Add a column indicating the color
In [ ]:
macaulay2016_reduced = macaulay2016_reduced.join(macaulay2016_clusters)
macaulay2016_reduced.head()
We can use groupby on the "cluster" column to plot one pair of axes:
In [ ]:
fig, ax = plt.subplots()
for cluster, df in macaulay2016_reduced.groupby('cluster'):
color = macaulay2016_cluster_to_color[cluster]
ax.scatter(df[0], df[1], color=color, label=cluster)
Or we can use seaborn's pairplot to plot all the axes of the data:
In [ ]:
sns.pairplot(macaulay2016_reduced, hue='cluster', palette='Set2')
To do this exercise, you don't have to use the cells above, you can press "+" to add a cell and work in there instead.