In [ ]:
# Alphabetical order is standard
# We use "import superlongname as abbrev" for convenience - no need to type the full name each time.
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
sns.set(style='whitegrid')
# Label processing
from sklearn import preprocessing
# Matrix decomposition
from sklearn.decomposition import PCA, FastICA
# Classification
from sklearn.svm import SVC
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
Utility function for plotting the SVM decision boundary
In [ ]:
def plot_svc_decision_function(clf, ax=None):
    """Plot the decision function for a 2D SVC"""
    if ax is None:
        ax = plt.gca()
    x = np.linspace(plt.xlim()[0], plt.xlim()[1], 30)
    y = np.linspace(plt.ylim()[0], plt.ylim()[1], 30)
    Y, X = np.meshgrid(y, x)
    P = np.zeros_like(X)
    for i, xi in enumerate(x):
        for j, yj in enumerate(y):
            # decision_function returns a length-1 array; take the scalar
            P[i, j] = clf.decision_function([[xi, yj]])[0]
    # Plot the decision boundary (solid) and the margins (dashed)
    ax.contour(X, Y, P, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])
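The nested loop above calls decision_function once per grid point, which is slow. As a sketch (reusing the same local names, and assuming clf is a fitted 2D SVC), you could stack the whole grid into a single call instead:
In [ ]:
# Vectorized alternative to the nested loop: evaluate the decision
# function on every grid point in one call, then reshape to the grid
xy = np.column_stack([X.ravel(), Y.ravel()])
P = clf.decision_function(xy).reshape(X.shape)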
In [ ]:
metadata = pd.read_csv('../data/shalek2013/metadata.csv',
                       # Sets the first (Python starts counting from 0 not 1) column as the row names
                       index_col=0)
expression = pd.read_csv('../data/shalek2013/expression.csv',
                         # Sets the first (Python starts counting from 0 not 1) column as the row names
                         index_col=0)
expression_feature = pd.read_csv('../data/shalek2013/expression_feature.csv',
                                 # Sets the first (Python starts counting from 0 not 1) column as the row names
                                 index_col=0)
# creating new column indicating color
metadata['color'] = metadata['maturity'].map(
    lambda x: 'MediumTurquoise' if x == 'immature' else 'Teal')
metadata.loc[metadata['pooled'], 'color'] = 'black'
# Create a column indicating both maturity and pooled for coloring with seaborn, e.g. sns.pairplot
metadata['group'] = metadata['maturity']
metadata.loc[metadata['pooled'], 'group'] = 'pooled'
metadata
In [ ]:
subset = expression_feature.query('gene_category == "LPS Response"')
subset.head()
Assign the variable lps_response_genes based on the gene ids pulled out from this subset:
In [ ]:
lps_response_genes = subset.index
lps_response_genes
For this analysis, we want to compare the "mature" and "immature" cells in the Shalek2013 data.
In [ ]:
singles_ids = [x for x in expression.index if x.startswith('S')]
singles = expression.loc[singles_ids]
singles.shape
Use only the genes that are substantially expressed in single cells - here, expression greater than 1 in at least three cells:
In [ ]:
singles = singles.loc[:, (singles > 1).sum() >= 3]
singles.shape
Now, because computers only understand numbers, we'll convert the category labels "mature" and "immature" into integers using a LabelEncoder. Let's look at that column again, only for the single cells:
In [ ]:
singles_maturity = metadata.loc[singles.index, 'maturity']
singles_maturity
In [ ]:
# Instantiate the encoder
encoder = preprocessing.LabelEncoder()
# Get number of categories and transform "mature"/"immature" to numbers
target = encoder.fit_transform(singles_maturity)
target
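The integer codes are just positions in the encoder's sorted list of class names, so you can always map back to the original labels. A quick check, using the encoder and target we just made:
In [ ]:
# The sorted class names; their positions are the integer codes (0, 1, ...)
print(encoder.classes_)
# Map the integers back to the original string labels
encoder.inverse_transform(target[:5])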
In [ ]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')
classifier.fit(singles, target)
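Fitting on all the cells tells us nothing about how well the classifier generalizes. As a rough sanity check (a sketch, assuming a scikit-learn version where cross_val_score lives in sklearn.model_selection), you could cross-validate:
In [ ]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy; with this few cells, treat it as a rough
# sanity check rather than a reliable performance estimate
scores = cross_val_score(SVC(kernel='linear'), singles, target, cv=3)
print(scores, scores.mean())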
We'll use PCA (or ICA) to reduce our data for visualizing the SVM decision boundary. We'll use only two components for visualization, so please don't change this to more components unless you like broken code!
In [ ]:
n_components = 2
smusher = PCA(n_components=n_components)
smushed = pd.DataFrame(smusher.fit_transform(singles), index=singles.index)
print(smushed.shape)
smushed.head()
# Let's add the group identifier here for plotting:
smushed_with_group = smushed.join(metadata['group'])
print(smushed_with_group.shape)
smushed_with_group.head()
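How much of the variation do these two components actually capture? For PCA you can check explained_variance_ratio_ (note this attribute doesn't exist if you swap smusher for FastICA):
In [ ]:
# Fraction of the total variance captured by each component
print(smusher.explained_variance_ratio_)
print('Total:', smusher.explained_variance_ratio_.sum())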
In [ ]:
# Make a grid of the 2d PCA space
n_intervals = 50
x_min, x_max = smushed[0].min() - 1, smushed[0].max() + 1
y_min, y_max = smushed[1].min() - 1, smushed[1].max() + 1
X = np.linspace(x_min, x_max, n_intervals)
Y = np.linspace(y_min, y_max, n_intervals)
xx, yy = np.meshgrid(X, Y)
two_d_space = np.c_[xx.ravel(), yy.ravel()]
# Transform the 2D PCA space back to gene space
gene_space = smusher.inverse_transform(two_d_space)
# Get the decision boundary
Z = classifier.decision_function(gene_space)
Z = Z.reshape(xx.shape)
support_vectors_smushed = pd.DataFrame(smusher.transform(classifier.support_vectors_))
fig, ax = plt.subplots()
ax.scatter(smushed[0], smushed[1], color=metadata.loc[smushed.index, 'color'], s=50)
ax.scatter(support_vectors_smushed[0], support_vectors_smushed[1], s=200, facecolors='none')
# Shade the margin region (between the -1 and +1 decision-function levels)
ax.contourf(X, Y, Z, colors='grey',
            levels=[-1, 0, 1], alpha=0.5)
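For comparison, here's a sketch that fits a second linear SVC directly in the 2D PCA space and draws its margins with the plot_svc_decision_function utility defined above. Note this classifier is trained on the smushed coordinates, not the original gene space, so its boundary can differ from the one above:
In [ ]:
# Fit a linear SVC on the 2D PCA coordinates (not the gene space!)
clf_2d = SVC(kernel='linear')
clf_2d.fit(smushed, target)

fig, ax = plt.subplots()
ax.scatter(smushed[0], smushed[1], color=metadata.loc[smushed.index, 'color'], s=50)
plot_svc_decision_function(clf_2d, ax)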
That looks really cool! Almost ALL the data points are support vectors (except that one cell). Why do you think so?
Try a different kernel (e.g. kernel="rbf") for SVC.
This was nice, but how do we biologically assess our classifiers? We want to do that using our biological knowledge of the data. Let's first look at the distribution of the coefficients, i.e. the weight the classifier assigned to each feature. We'll make it a Series so it's easy to work with (it's currently a numpy array).
In [ ]:
coefficients = pd.Series(classifier.coef_.flat, index=singles.columns)
coefficients.head()
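Before plotting the full distribution, a quick way to peek at the most discriminative genes is to sort by absolute coefficient:
In [ ]:
# Genes with the largest absolute weights contribute most to the decision function
coefficients.abs().sort_values(ascending=False).head(10)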
Now let's plot the distribution of the coefficients across all features:
In [ ]:
sns.distplot(coefficients)
This is semi-Gaussian... what does that mean? It means that most of the features in the dataset don't actually help you tell the two classes apart.
Let's get the features whose coefficients are more than 2 standard deviations from the mean. First, let's plot where those cutoffs would fall:
In [ ]:
mean = coefficients.mean()
std = coefficients.std()
multiplier = 2
lower_cutoff = mean - multiplier * std
upper_cutoff = mean + multiplier * std
fig, ax = plt.subplots()
sns.distplot(coefficients)
# Add vertical lines
ymin, ymax = ax.get_ylim()
ax.vlines([lower_cutoff, upper_cutoff], ymin, ymax, linestyle='--', color='Crimson')
In [ ]:
below_cutoff = coefficients[coefficients < lower_cutoff]
print(below_cutoff.shape)
below_cutoff.head()
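The same trick works on the other tail. As a sketch, grab the features above upper_cutoff and check how many of them are annotated LPS response genes:
In [ ]:
# Features more than 2 standard deviations above the mean coefficient
above_cutoff = coefficients[coefficients > upper_cutoff]
print(above_cutoff.shape)

# How many of these discriminative genes are LPS response genes?
len(above_cutoff.index.intersection(lps_response_genes))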
In [ ]:
sns.clustermap(singles[below_cutoff.index])
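To see whether these genes actually separate the maturity groups, you could annotate each row (cell) with its group color, e.g. via clustermap's row_colors argument:
In [ ]:
# Color each row by the cell's maturity/pooled color from the metadata
sns.clustermap(singles[below_cutoff.index],
               row_colors=metadata.loc[singles.index, 'color'])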
In [ ]:
# YOUR CODE HERE
In [ ]:
# YOUR CODE HERE
In [ ]:
# YOUR CODE HERE