In [ ]:
# Alphabetical order is standard
# We're doing "import superlongname as abbrev" for our laziness - this way we don't have to type out the whole thing each time.
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
sns.set(style='whitegrid')
# Label processing
from sklearn import preprocessing
# Matrix decomposition
from sklearn.decomposition import PCA, FastICA
# Classification
from sklearn.svm import SVC
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
In [ ]:
metadata = pd.read_csv('../data/shalek2013/metadata.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
expression = pd.read_csv('../data/shalek2013/expression.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
expression_feature = pd.read_csv('../data/shalek2013/expression_feature.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
# creating new column indicating color
metadata['color'] = metadata['maturity'].map(
lambda x: 'MediumTurquoise' if x == 'immature' else 'Teal')
metadata.loc[metadata['pooled'], 'color'] = 'black'
# Create a column indicating both maturity and pooled for coloring with seaborn, e.g. sns.pairplot
metadata['group'] = metadata['maturity']
metadata.loc[metadata['pooled'], 'group'] = 'pooled'
# Create a palette and ordering for using with sns.pairplot
palette = ['MediumTurquoise', 'Teal', 'black']
order = ['immature', 'mature', 'pooled']
singles_ids = [x for x in expression.index if x.startswith('S')]
singles = expression.loc[singles_ids]
# Use only the genes that are substantially expressed in single cells
singles = singles.loc[:, (singles > 1).sum() >= 3]
singles.shape
# Now because computers only understand numbers, we'll convert the
# category label of "mature" and "immature" into integers to a using a
# `LabelEncoder`. Let's look at that column again, only for mature cells:
singles_maturity = metadata.loc[singles.index, 'maturity']
# Instantiate the encoder
encoder = preprocessing.LabelEncoder()
# Get number of categories and transform "mature"/"immature" to numbers
target = encoder.fit_transform(singles_maturity)
## Run the classifier!!
# Yay so now we can run a classifier!
classifier = SVC(kernel='linear')
classifier.fit(singles, target)
Let's create that coefficients
series again which we'll use extensively.
In [ ]:
coefficients = pd.Series(classifier.coef_.flat, index=singles.columns)
coefficients.head()
In [ ]:
below_stringent_cutoff = coefficients[coefficients < (coefficients.mean() - 3*coefficients.std())]
print(below_stringent_cutoff.shape)
below_stringent_cutoff.head()
And get just the genes from this cutoff
In [ ]:
singles_below_stringent_cutoff = singles[below_stringent_cutoff.index]
print(singles_below_stringent_cutoff.shape)
singles_below_stringent_cutoff.head()
We could plot all of these genes right now but that doesn't tell us about how these genes are different between the groups. We'll need to make this into tidy data for that.
In [ ]:
sns.violinplot(data=singles_below_stringent_cutoff)
Now we need to make this data into a tidy dataframe, where every row is a single observation of a gene and a sample. We will do this with two commands .unstack()
and .reset_index()
:
In [ ]:
singles_below_stringent_cutoff_unstacked = singles_below_stringent_cutoff.unstack()
print(singles_below_stringent_cutoff_unstacked.shape)
singles_below_stringent_cutoff_unstacked.head()
Now we'll "reset" the index, meaning, move all the row names into the dataframe instead of as the row labels:
In [ ]:
singles_below_stringent_cutoff_reset_index = singles_below_stringent_cutoff_unstacked.reset_index()
print(singles_below_stringent_cutoff_reset_index.shape)
singles_below_stringent_cutoff_reset_index.head()
Let's rename the columns to something more meaningful
In [ ]:
singles_below_stringent_cutoff_tidy = singles_below_stringent_cutoff_reset_index.rename(
columns={'level_0': 'gene_id', 'level_1': 'sample_id', 0:'log2(TPM+1)'})
print(singles_below_stringent_cutoff_tidy.shape)
singles_below_stringent_cutoff_tidy.head()
Now we'll want to add our metadata so we have the information for plotting
In [ ]:
singles_below_stringent_cutoff_tidy_metadata = singles_below_stringent_cutoff_tidy.join(metadata, on='sample_id')
print(singles_below_stringent_cutoff_tidy_metadata.shape)
singles_below_stringent_cutoff_tidy_metadata.head()
Now we can plot! We'll specify that the hue
should indicate the mature/immature group so they'll get compared side by side:
In [ ]:
fig, ax = plt.subplots()
sns.violinplot(hue='group', y='log2(TPM+1)', x='gene_id',
data=singles_below_stringent_cutoff_tidy_metadata, kind='violin')
We can make it bigger and wider:
In [ ]:
figwidth = 16
figheight = 4
fig, ax = plt.subplots(figsize=(figwidth, figheight))
sns.violinplot(hue='group', y='log2(TPM+1)', x='gene_id',
data=singles_below_stringent_cutoff_tidy_metadata, kind='violin')
Hmm the genes are still hard to read .. maybe we can orient this horizontally? Notice that we have to swap the figwidth
and figheight
:
figheight = 16
figwidth = 4
And change the orientation to "h" (horizontal):
orient='h'
And swap the x
and y
:
x='log2(TPM+1)', y='gene_id'
In [ ]:
figheight = 10
figwidth = 4
fig, ax = plt.subplots(figsize=(figwidth, figheight))
sns.violinplot(hue='group', x='log2(TPM+1)', y='gene_id', orient='h',
data=singles_below_stringent_cutoff_tidy_metadata)
My personal preference is I like ...
inner=None
: the inside to not have the mean and std dev info, but rather be emptyscale='width'
: the violinplot to be maximal width, even if it's very long. The default is for the whole area to be integrated to 1 but I think it's harder to read.palette=palette
: Create a palette
variable that we created earlierhue_order=order
: make sure the sample are in the order that we want
In [ ]:
singles_below_stringent_cutoff_tidy_metadata.head()
In [ ]:
figheight = 10
figwidth = 4
# Create a palette and ordering for using with seaborn
palette = ['MediumTurquoise', 'Teal']
order = ['immature', 'mature']
fig, ax = plt.subplots(figsize=(figwidth, figheight))
sns.violinplot(hue='group', x='log2(TPM+1)', y='gene_id', orient='h',
data=singles_below_stringent_cutoff_tidy_metadata,
inner=None, scale='width', hue_order=order, palette=palette, ax=ax)
Since we just have two categories, we could also do split=True
to show two separate distributions (which looks cool and is useful - a rare treat!)
Note: this only works if you have exactly two hue
categories!
In [ ]:
figheight = 10
figwidth = 4
# Create a palette and ordering for using with seaborn
palette = ['MediumTurquoise', 'Teal',]
order = ['immature', 'mature',]
fig, ax = plt.subplots(figsize=(figwidth, figheight))
sns.violinplot(hue='group', x='log2(TPM+1)', y='gene_id', orient='h',
data=singles_below_stringent_cutoff_tidy_metadata,
inner=None, scale='width', hue_order=order, palette=palette, ax=ax, split=True)
I also like to add a stripplot
which adds the individual cell observations
In [ ]:
figheight = 10
figwidth = 4
# Create a palette and ordering for using with seaborn
palette = ['MediumTurquoise', 'Teal',]
order = ['immature', 'mature',]
fig, ax = plt.subplots(figsize=(figwidth, figheight))
sns.violinplot(hue='group', x='log2(TPM+1)', y='gene_id', orient='h',
data=singles_below_stringent_cutoff_tidy_metadata,
inner=None, scale='width', hue_order=order, palette=palette, ax=ax, split=True)
sns.stripplot(hue='group', x='log2(TPM+1)', y='gene_id', orient='h',
data=singles_below_stringent_cutoff_tidy_metadata,
hue_order=order, palette=palette, ax=ax, split=True,
edgecolor='white', linewidth=1, jitter=True)
In [ ]:
facetgrid = sns.factorplot(x='group', y='log2(TPM+1)', col='gene_id',
data=singles_below_stringent_cutoff_tidy_metadata, kind='violin')
facetgrid.savefig('shalek2013_singles_below_stringent_cutoff_violins.pdf')
Yikes, that's so wide! Let's wrap the columns around after they reach length 4 with:
col_wrap=4
In [ ]:
facetgrid = sns.factorplot(x='group', y='log2(TPM+1)', col='gene_id', col_wrap=4,
data=singles_below_stringent_cutoff_tidy_metadata, kind='violin')
facetgrid.savefig('shalek2013_singles_below_stringent_cutoff_violins_col_wrapped.pdf')
Let's pass the same parameters that we used for the violins before with:
inner=None, scale='width', palette=palette, order=order
In [ ]:
facetgrid = sns.factorplot(x='group', y='log2(TPM+1)', col='gene_id', col_wrap=4,
data=singles_below_stringent_cutoff_tidy_metadata, kind='violin',
inner=None, scale='width', palette=palette, order=order)
facetgrid.savefig('shalek2013_singles_below_stringent_cutoff_violins_palette.pdf')
We can also add stripplot
!! We'll send each facet (individual axes) the data it needs with map_dataframe
In [ ]:
facetgrid = sns.factorplot(x='group', y='log2(TPM+1)', col='gene_id', col_wrap=4,
data=singles_below_stringent_cutoff_tidy_metadata, kind='violin',
inner=None, scale='width', palette=palette, order=order,)
facetgrid.map_dataframe(sns.stripplot, x='group', y='log2(TPM+1)', palette=palette,
order=order, jitter=True, edgecolor='white', linewidth=1)
facetgrid.savefig('shalek2013_singles_below_stringent_cutoff_violins_stripplot.pdf')
violinplot
+stripplot
of the top-enriched genes ABOVE the upper cutoff.violinplot
+stripplot
of the top-enriched genes ABOVE the upper cutoff.Use as many code cells as you need.
In [ ]:
# YOUR CODE HERE
In [ ]:
# YOUR CODE HERE
In [ ]:
# YOUR CODE HERE