We're going to use the classifier knowledge that we've learned so far and apply it to the shalek2013 and macaulay2016 datasets.
For the GO analysis, we'll need a few other packages:
- mygene, for looking up the gene ontology categories of genes
- goatools, for performing gene ontology enrichment analysis
- fishers_exact_test, for goatools
Use the following commands at your terminal to install the packages. Some of them are on GitHub, so it's important to get the whole command right.
$ pip install mygene
$ pip install git+git://github.com/olgabot/goatools.git
$ pip install git+https://github.com/brentp/fishers_exact_test.git
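If the installation worked, all three packages should import without errors. (Note: the fishers_exact_test package installs a module named fisher.)
In [ ]:
# Quick sanity check that the installs worked
import fisher
import goatools
import mygene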
In [1]:
# Alphabetical order is standard
# We're doing "import superlongname as abbrev" for our laziness - this way we don't have to type out the whole thing each time.
# From python standard library
import collections
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
sns.set(style='whitegrid')
# Label processing
from sklearn import preprocessing
# Matrix decomposition
from sklearn.decomposition import PCA, FastICA
# Tree-based classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# Manifold learning
from sklearn.manifold import MDS, TSNE
# Gene ontology
import goatools
import mygene
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
Utility functions for gene ontology and SVM decision boundary plotting
In [2]:
def plot_svc_decision_function(clf, ax=None):
    """Plot the decision function for a 2D SVC"""
    if ax is None:
        ax = plt.gca()
    x = np.linspace(plt.xlim()[0], plt.xlim()[1], 30)
    y = np.linspace(plt.ylim()[0], plt.ylim()[1], 30)
    Y, X = np.meshgrid(y, x)
    P = np.zeros_like(X)
    for i, xi in enumerate(x):
        for j, yj in enumerate(y):
            P[i, j] = clf.decision_function([[xi, yj]])
    # plot the margins
    ax.contour(X, Y, P, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])
GO_KEYS = 'go.BP', 'go.MF', 'go.CC'


def parse_mygene_output(mygene_output):
    """Convert mygene.querymany output to a gene id to GO term mapping (dictionary)

    Parameters
    ----------
    mygene_output : dict or list
        Dictionary (returnall=True) or list (returnall=False) of
        output from mygene.querymany

    Returns
    -------
    gene_name_to_go : dict
        Mapping of gene name to a set of GO ids
    """
    # if "returnall=True" was specified, need to get just the "out" key
    if isinstance(mygene_output, dict):
        mygene_output = mygene_output['out']

    gene_name_to_go = collections.defaultdict(set)

    for line in mygene_output:
        gene_name = line['query']
        for go_key in GO_KEYS:
            try:
                go_terms = line[go_key]
            except KeyError:
                continue
            if isinstance(go_terms, dict):
                go_ids = set([go_terms['id']])
            else:
                go_ids = set(x['id'] for x in go_terms)
            gene_name_to_go[gene_name] |= go_ids
    return gene_name_to_go
In [3]:
metadata = pd.read_csv('../data/shalek2013/metadata.csv',
                       # Sets the first (Python starts counting from 0 not 1) column as the row names
                       index_col=0)
expression = pd.read_csv('../data/shalek2013/expression.csv',
                         # Sets the first (Python starts counting from 0 not 1) column as the row names
                         index_col=0)
expression_feature = pd.read_csv('../data/shalek2013/expression_feature.csv',
                                 # Sets the first (Python starts counting from 0 not 1) column as the row names
                                 index_col=0)

# Create a new column indicating color
metadata['color'] = metadata['maturity'].map(
    lambda x: 'MediumTurquoise' if x == 'immature' else 'Teal')
metadata.loc[metadata['pooled'], 'color'] = 'black'

# Create a column indicating both maturity and pooled for coloring with seaborn, e.g. sns.pairplot
metadata['group'] = metadata['maturity']
metadata.loc[metadata['pooled'], 'group'] = 'pooled'

# Create a palette and ordering for using with sns.pairplot
palette = ['MediumTurquoise', 'Teal', 'black']
order = ['immature', 'mature', 'pooled']
metadata
Out[3]:
In [4]:
subset = expression_feature.query('gene_category == "LPS Response"')
subset.head()
Out[4]:
Assign the variable lps_response_genes based on the gene ids pulled out from this subset:
In [5]:
lps_response_genes = subset.index
lps_response_genes
Out[5]:
For this analysis, we want to compare the difference between the "mature" and "immature" cells in the Shalek 2013 data.
In [6]:
singles_ids = [x for x in expression.index if x.startswith('S')]
singles = expression.loc[singles_ids]
singles.shape
Out[6]:
Use only the genes that are substantially expressed in single cells, i.e. expressed at a level greater than 1 in at least three single cells:
In [7]:
singles = singles.loc[:, (singles > 1).sum() >= 3]
singles.shape
Out[7]:
Now, because computers only understand numbers, we'll convert the category labels "mature" and "immature" into integers using a LabelEncoder. Let's look at that column again, only for the single cells:
In [8]:
singles_maturity = metadata.loc[singles.index, 'maturity']
singles_maturity
Out[8]:
In [9]:
# Instantiate the encoder
encoder = preprocessing.LabelEncoder()
# Get number of categories and transform "mature"/"immature" to numbers
target = encoder.fit_transform(singles_maturity)
target
Out[9]:
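If you ever need to get back from the integers to the original labels, the fitted encoder keeps that mapping around. A quick sketch using the encoder fit above:
In [ ]:
# The original labels, in the order of their integer codes (0, 1, ...)
print(encoder.classes_)
# Map the first few integer codes back to their string labels
print(encoder.inverse_transform(target[:5]))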
In [10]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')
classifier.fit(singles, target)
Out[10]:
We'll use PCA or ICA to reduce our data for visualizing the SVM decision boundary. Stick to 32 or fewer components, because the next steps will die if you use more than 32. Also, this n_components variable will be used later, so pay attention :)
In [174]:
n_components = 2
smusher = PCA(n_components=n_components)
smushed = pd.DataFrame(smusher.fit_transform(singles), index=singles.index)
print(smushed.shape)
smushed.head()
# Let's add the group identifier here for plotting:
smushed_with_group = smushed.join(metadata['group'])
print(smushed_with_group.shape)
smushed_with_group.head()
Out[174]:
In [185]:
n_intervals = 50
x_min, x_max = smushed[0].min() - 1, smushed[0].max() + 1
y_min, y_max = smushed[1].min() - 1, smushed[1].max() + 1
X = np.linspace(x_min, x_max, n_intervals)
Y = np.linspace(y_min, y_max, n_intervals)
xx, yy = np.meshgrid(X, Y)

# Get the decision boundary: build a grid of points in the reduced 2D space,
# project each point back up to gene expression space, and evaluate the
# classifier (which was trained on all genes) at each grid point
# Z = classifier.decision_function(np.c_[xx.ravel(), yy.ravel()])
two_d_space = np.c_[xx.ravel(), yy.ravel()]
gene_space = smusher.inverse_transform(two_d_space)
Z = classifier.decision_function(gene_space)
Z = Z.reshape(xx.shape)

# Project the support vectors down into the reduced 2D space for plotting
support_vectors_smushed = pd.DataFrame(smusher.transform(classifier.support_vectors_))

fig, ax = plt.subplots()
# ax.scatter(smushed_intervals[0], smushed_intervals[1], color='pink')
ax.scatter(smushed[0], smushed[1], color=metadata.loc[singles.index, 'color'], s=50)
ax.scatter(support_vectors_smushed[0], support_vectors_smushed[1], s=200, facecolors='none')
ax.contourf(X, Y, Z, colors='grey',
            levels=[-1, 0, 1], alpha=0.5,
            linestyles=['--', '-', '--'])
Out[185]:
That looks really cool! Almost ALL of the data points are support vectors (except that one cell). Why do you think that is?
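You can check this directly, since the fitted classifier stores the indices of its support vectors. A quick sketch using the classifier and singles data from above:
In [ ]:
# support_ holds the indices of the training samples that became support vectors
print('{} support vectors out of {} cells'.format(
    len(classifier.support_), len(singles)))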
This was nice, but how do we biologically assess our classifiers? We want to do that using our biological knowledge of the data. Let's first look at the distribution of the coefficients, i.e. the weight the classifier assigned to each feature. We'll first make it a Series so it's easy to work with (it's currently a numpy array).
In [192]:
coefficients = pd.Series(classifier.coef_.flat, index=singles.columns)
coefficients.head()
Out[192]:
Now let's plot the distribution of the coefficients across all features:
In [186]:
sns.distplot(coefficients)
Out[186]:
This is semi-Gaussian... what does that mean? It means that most of the features in the dataset don't actually help you tell the difference between the two groups.
Let's get the features whose coefficients are more than 2 standard deviations away from the mean. First, let's plot where those cutoffs fall:
In [195]:
mean = coefficients.mean()
std = coefficients.std()
multiplier = 2
lower_cutoff = mean - multiplier * std
upper_cutoff = mean + multiplier * std
fig, ax = plt.subplots()
sns.distplot(coefficients)
# Add vertical lines
ymin, ymax = ax.get_ylim()
ax.vlines([lower_cutoff, upper_cutoff], ymin, ymax, linestyle='--', color='Crimson')
Out[195]:
In [197]:
below_cutoff = coefficients[coefficients < lower_cutoff]
print(below_cutoff.shape)
below_cutoff.head()
Out[197]:
In [198]:
sns.clustermap(singles[below_cutoff.index])
Out[198]:
Gene Ontology is a directed acyclic graph ("DAG", similar to a tree) of gene annotations. The topmost node is the most general, and the bottommost nodes are the most specific. Here is an example GO graph.
The three GO domains are:
- Biological Process (BP)
- Molecular Function (MF)
- Cellular Component (CC)
In [ ]:
from goatools.base import download_go_basic_obo
obo_fname = download_go_basic_obo()
# Show the filename
obo_fname
In [ ]:
obo_dag = goatools.obo_parser.GODag(obo_file='go-basic.obo')
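The parsed DAG acts like a dictionary mapping GO ids to term objects, so you can inspect the graph structure directly. A minimal sketch, using the root biological process term GO:0008150:
In [ ]:
# Look up one term and inspect its name, domain, and depth in the DAG
term = obo_dag['GO:0008150']
print(term.name, term.namespace, term.level)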
Here we are establishing the background for our GOEA. Defining your background is very important: for example, there are lots of neural genes, so if you use all human genes as the background in your study of which genes are upregulated in Neuron Type X vs Neuron Type Y, you'll get a bunch of neuron genes (which is true, but not informative) rather than the smaller differences between X and Y. Typically, you use all expressed genes as the background.
For our data, we can access all expressed genes very simply by getting the column names in the dataframe: expression.columns.
In [ ]:
# Initialize the "mygene.info" (http://mygene.info/) interface
mg = mygene.MyGeneInfo()

mygene_output = mg.querymany(expression.columns,
                             scopes='symbol', fields=['go.BP', 'go.MF', 'go.CC'],
                             species='mouse', returnall=True)
gene_name_to_go = parse_mygene_output(mygene_output)
In [ ]:
go_enricher = goatools.GOEnrichmentStudy(expression.columns, gene_name_to_go, obo_dag)
In [ ]:
# As an example set of genes of interest, use the genes whose classifier
# coefficients fell below the cutoff above (this choice is an assumption;
# any list of gene names from the background works here)
genes_of_interest = below_cutoff.index
results = go_enricher.run_study(list(genes_of_interest))
go_enrichment = pd.DataFrame([r.__dict__ for r in results])
go_enrichment.head()
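To pull out just the significant categories, filter on the Bonferroni-corrected p-value column (a sketch; the p_bonferroni attribute comes from the goatools result records, as also used in the print loop further below):
In [ ]:
# Keep only categories significant after Bonferroni correction, most significant first
significant = go_enrichment.query('p_bonferroni < 0.05')
significant.sort_values('p_bonferroni').head()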
In [ ]:
two_d_space.shape
In [ ]:
plt.scatter(two_d_space[:, 0], two_d_space[:, 1], color='black')
In [ ]:
expression.index[:10]
In [ ]:
clf = ExtraTreesClassifier(n_estimators=100000, n_jobs=-1, verbose=1)
In [ ]:
expression.index.duplicated()
In [ ]:
expression.drop_duplicates()
In [ ]:
# assoc = pd.read_table('danio-rerio-gene-ontology.txt').dropna()
# assoc_df = assoc.groupby('Ensembl Gene ID').agg(lambda s: ';'.join(s))
# assoc_s = assoc_df['GO Term Accession'].apply(lambda s: set(s.split(';')))
# assoc_dict = assoc_s.to_dict()
In [ ]:
import goatools
# Leftover example of running a GO enrichment study with a precomputed
# association dict (assoc_dict) and gene list (cl) from the commented-out
# cell above; those variables are not defined in this notebook
# cl = gene_annotation.sort(col, ascending=False)[gene_annotation[col] > 5e-4].index
g = goatools.GOEnrichmentStudy(list(gene_annotation.index), assoc_dict, obo_dag,
                               study=list(cl))
In [ ]:
for r in g.results[:25]:
    print(r.goterm.id, '{:.2}'.format(r.p_bonferroni), r.ratio_in_study,
          r.goterm.name, r.goterm.namespace)
In [ ]:
unsmushed = smusher.inverse_transform(two_d_space)
Z = classifier.decision_function(unsmushed)
Z = Z.reshape(xx.shape)

fig, ax = plt.subplots()
# Plot the PCA-reduced single cells, colored by maturity
ax.scatter(smushed[0], smushed[1], c=target, cmap='Dark2')
ax.contour(X, Y, Z, colors='k',
           levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])
Try a different kernel (e.g. kernel="rbf") for SVC.
In [ ]:
def visualize_tree(estimator, X, y, smusher, boundaries=True,
                   xlim=None, ylim=None):
    estimator.fit(X, y)
    smushed = smusher.fit_transform(X)
    if xlim is None:
        xlim = (smushed[:, 0].min() - 0.1, smushed[:, 0].max() + 0.1)
    if ylim is None:
        ylim = (smushed[:, 1].min() - 0.1, smushed[:, 1].max() + 0.1)
    x_min, x_max = xlim
    y_min, y_max = ylim
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    two_d_space = np.c_[xx.ravel(), yy.ravel()]
    unsmushed = smusher.inverse_transform(two_d_space)
    Z = estimator.predict(unsmushed)

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, alpha=0.2, cmap='Paired')
    plt.clim(y.min(), y.max())

    # Plot also the training points
    plt.scatter(smushed[:, 0], smushed[:, 1], c=y, s=50, cmap='Paired')
    plt.axis('off')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.clim(y.min(), y.max())

    # Plot the decision boundaries (only meaningful for a single decision
    # tree, which has a tree_ attribute; pass boundaries=False for ensembles)
    def plot_boundaries(i, xlim, ylim):
        if i < 0:
            return
        tree = estimator.tree_
        if tree.feature[i] == 0:
            plt.plot([tree.threshold[i], tree.threshold[i]], ylim, '-k')
            plot_boundaries(tree.children_left[i],
                            [xlim[0], tree.threshold[i]], ylim)
            plot_boundaries(tree.children_right[i],
                            [tree.threshold[i], xlim[1]], ylim)
        elif tree.feature[i] == 1:
            plt.plot(xlim, [tree.threshold[i], tree.threshold[i]], '-k')
            plot_boundaries(tree.children_left[i], xlim,
                            [ylim[0], tree.threshold[i]])
            plot_boundaries(tree.children_right[i], xlim,
                            [tree.threshold[i], ylim[1]])

    if boundaries:
        plot_boundaries(0, plt.xlim(), plt.ylim())
In [ ]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import TSNE, MDS
smusher = PCA(n_components=2)
# reduced_data = smusher.fit_transform(singles+1)
visualize_tree(classifier, singles, np.array(target), smusher)
In [ ]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
classifier = RandomForestClassifier()
smusher = PCA(n_components=2)
# reduced_data = smusher.fit_transform(singles+1)
visualize_tree(classifier, singles, np.array(target), smusher, boundaries=False)
In [ ]:
classifier = ExtraTreesClassifier()
smusher = PCA(n_components=2)
# reduced_data = smusher.fit_transform(singles+1)
visualize_tree(classifier, singles, np.array(target), smusher, boundaries=False)
In [ ]:
pd.options.display.max_columns = 50
In [ ]:
macaulay2016_metadata = pd.read_csv('../4._Case_Study/macaulay2016/sample_info_qc.csv', index_col=0)
macaulay2016_metadata.head()
In [ ]:
macaulay2016_cluster_names = tuple(sorted(macaulay2016_metadata['cluster'].unique()))
macaulay2016_cluster_names
In [ ]:
macaulay2016_target = macaulay2016_metadata['cluster'].map(lambda x: macaulay2016_cluster_names.index(x))
macaulay2016_target
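This manual mapping is the same trick the LabelEncoder did for us earlier. Since LabelEncoder also sorts the unique class names, the two encodings should agree; a quick check (encoder2 and macaulay2016_target_v2 are hypothetical names for this sketch):
In [ ]:
# LabelEncoder sorts unique labels, matching tuple(sorted(...)) above
encoder2 = preprocessing.LabelEncoder()
macaulay2016_target_v2 = encoder2.fit_transform(macaulay2016_metadata['cluster'])
print((macaulay2016_target_v2 == macaulay2016_target.values).all())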
In [ ]:
macaulay2016_expression = pd.read_csv('../4._Case_Study/macaulay2016/gene_expression_s.csv', index_col=0).T
In [ ]:
macaulay2016_expression.head()
In [ ]:
macaulay2016_expression_filtered = macaulay2016_expression[[x for x in macaulay2016_expression if x.startswith("ENS")]]
macaulay2016_expression_filtered.shape
In [ ]:
macaulay2016_expression_filtered = macaulay2016_expression_filtered.loc[macaulay2016_metadata.index]
In [ ]:
# Normalize each cell by its total expression, scaled to counts per million
macaulay2016_expression_filtered = 1e6 * macaulay2016_expression_filtered.divide(
    macaulay2016_expression_filtered.sum(axis=1), axis=0)
macaulay2016_expression_filtered.head()
In [ ]:
# Log-transform, with a pseudocount of 1 so zeros stay zero
macaulay2016_expression_filtered = np.log10(macaulay2016_expression_filtered + 1)
macaulay2016_expression_filtered.head()
In [ ]:
# As before, keep only genes expressed (>1) in at least 3 cells
macaulay2016_expression_filtered = macaulay2016_expression_filtered.loc[:, (macaulay2016_expression_filtered > 1).sum() >= 3]
macaulay2016_expression_filtered.shape
In [ ]:
# classifier = SVC(kernel='linear')
# classifier = DecisionTreeClassifier(max_depth=10)
classifier = ExtraTreesClassifier(n_estimators=1000)
classifier.fit(macaulay2016_expression_filtered, macaulay2016_target)
In [ ]:
smusher = FastICA(n_components=2, random_state=0)
smushed_data = smusher.fit_transform(macaulay2016_expression_filtered)
x_min, x_max = smushed_data[:, 0].min(), smushed_data[:, 0].max()
y_min, y_max = smushed_data[:, 1].min(), smushed_data[:, 1].max()
delta_x = 0.05 * abs(x_max - x_min)
delta_y = 0.05 * abs(y_max - y_min)
x_min -= delta_x
x_max += delta_x
y_min -= delta_y
y_max += delta_y
X = np.linspace(x_min, x_max, 100)
Y = np.linspace(y_min, y_max, 100)
xx, yy = np.meshgrid(X, Y)
two_d_space = np.c_[xx.ravel(), yy.ravel()]
two_d_space
In [ ]:
high_dimensional_space = smusher.inverse_transform(two_d_space)
In [ ]:
# Get the class boundaries
Z = classifier.predict(high_dimensional_space)
In [ ]:
import matplotlib as mpl
macaulay2016_metadata['cluster_color_hex'] = macaulay2016_metadata['cluster_color'].map(lambda x: mpl.colors.rgb2hex(eval(x)))
In [ ]:
int_to_cluster_name = dict(zip(range(len(macaulay2016_cluster_names)), macaulay2016_cluster_names))
int_to_cluster_name
In [ ]:
cluster_name_to_color = dict(zip(macaulay2016_metadata['cluster'], macaulay2016_metadata['cluster_color_hex']))
cluster_name_to_color
In [ ]:
macaulay2016_palette = [mpl.colors.hex2color(cluster_name_to_color[int_to_cluster_name[i]])
                        for i in range(len(macaulay2016_cluster_names))]
macaulay2016_palette
In [ ]:
cmap = mpl.colors.ListedColormap(macaulay2016_palette)
cmap
In [ ]:
x_min, x_max
In [ ]:
y = macaulay2016_target

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, alpha=0.2, cmap=cmap)
plt.clim(y.min(), y.max())

# Plot also the training points
plt.scatter(smushed_data[:, 0], smushed_data[:, 1], s=50,
            color=macaulay2016_metadata['cluster_color_hex'],
            edgecolor='k')  # c=macaulay2016_target, s=50, cmap='Set2')
plt.axis('off')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.clim(y.min(), y.max())
In [ ]:
smusher = FastICA(n_components=4, random_state=354)
smushed_data = pd.DataFrame(smusher.fit_transform(macaulay2016_expression_filtered))
# x_min, x_max = smushed_data[:, 0].min(), smushed_data[:, 0].max()
# y_min, y_max = smushed_data[:, 1].min(), smushed_data[:, 1].max()
# delta_x = 0.05 * abs(x_max - x_min)
# delta_y = 0.05 * abs(x_max - x_min)
# x_min -= delta_x
# x_max += delta_x
# y_min -= delta_y
# y_max += delta_y
# X = np.linspace(x_min, x_max, 100)
# Y = np.linspace(y_min, y_max, 100)
# xx, yy = np.meshgrid(X, Y)
# low_dimensional_space = np.c_[xx.ravel(), yy.ravel()]
# low_dimensional_space
In [ ]:
smushed_data.max() - smushed_data.min()
In [ ]:
grid = smushed_data.apply(lambda x: pd.Series(np.linspace(x.min(), x.max(), 50)))
grid.head()
# grid = [x.ravel() for x in grid]
# grid
# low_dimensional_space = np.concatenate(grid, axis=0)
# low_dimensional_space.shape
# # low_dimensional_space = low_dimensional_space.reshape(shape)
In [ ]:
x1, x2, x3, x4 = np.meshgrid(*[grid[col] for col in grid])
low_dimensional_space = np.c_[x1.ravel(), x2.ravel(), x3.ravel(), x4.ravel()]
In [ ]:
high_dimensional_space = smusher.inverse_transform(low_dimensional_space)
In [ ]:
# Assumption: the truncated line intended the cluster labels as the pairplot hue
smushed_data['hue'] = macaulay2016_metadata['cluster'].values
In [ ]:
sns.pairplot(smushed_data, hue='hue')