In [15]:
# Alphabetical order is standard
# We're doing "import superlongname as abbrev" for our laziness - this way we don't have to type out the whole thing each time.
# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# T-test of independent samples
from scipy.stats import ttest_ind
# Statistical plotting library we'll use
import seaborn as sns
sns.set(style='whitegrid')
# Matrix decomposition
from sklearn.decomposition import PCA, FastICA
# Matrix decomposition
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# Manifold learning
from sklearn.manifold import MDS, TSNE
# Gene ontology
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
In [28]:
shalek2013_metadata = pd.read_csv('../data/shalek2013/metadata.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
shalek2013_expression = pd.read_csv('../data/shalek2013/expression.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
shalek2013_expression_feature = pd.read_csv('../data/shalek2013/expression_feature.csv',
# Sets the first (Python starts counting from 0 not 1) column as the row names
index_col=0)
In [29]:
from sklearn.svm import SVC
In [30]:
classifier = SVC(kernel='linear')
In [31]:
def plot_svc_decision_function(clf, ax=None):
"""Plot the decision function for a 2D SVC"""
if ax is None:
ax = plt.gca()
x = np.linspace(plt.xlim()[0], plt.xlim()[1], 30)
y = np.linspace(plt.ylim()[0], plt.ylim()[1], 30)
Y, X = np.meshgrid(y, x)
P = np.zeros_like(X)
for i, xi in enumerate(x):
for j, yj in enumerate(y):
P[i, j] = clf.decision_function([[xi, yj]])
# plot the margins
ax.contour(X, Y, P, colors='k',
levels=[-1, 0, 1], alpha=0.5,
linestyles=['--', '-', '--'])
In [32]:
shalek2013_singles = shalek2013_expression.loc[[x for x in shalek2013_expression.index if x.startswith('S')]]
shalek2013_singles.shape
Out[32]:
In [33]:
shalek2013_singles = shalek2013_singles.loc[:, (shalek2013_singles > 1).sum() >= 3]
shalek2013_singles.shape
Out[33]:
In [34]:
mature = 'S12', 'S13', 'S16'
target = [1 if x in mature else 0 for x in shalek2013_singles.index]
target
Out[34]:
In [35]:
classifier.fit(shalek2013_singles, target)
Out[35]:
In [39]:
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import TSNE, MDS
smusher = FastICA(n_components=4)
reduced_data = smusher.fit_transform(shalek2013_singles+1)
In [40]:
reduced_data_df = pd.DataFrame(reduced_data)
In [41]:
reduced_data_df.min()
Out[41]:
In [42]:
reduced_data_df.max()
Out[42]:
In [43]:
reduced_data_df.quantile(1)
Out[43]:
In [51]:
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
X = np.linspace(x_min, x_max, 50)
Y = np.linspace(y_min, y_max, 50)
xx, yy = np.meshgrid(X, Y)
# Get the decision boundary
# Z = classifier.decision_function(np.c_[xx.ravel(), yy.ravel()])
two_d_space = np.c_[xx.ravel(), yy.ravel()]
two_d_space
Out[51]:
In [61]:
xx.shape
Out[61]:
In [62]:
np.concatenate([xx.ravel(), yy.ravel()]).reshape(50**2, 2)
Out[62]:
In [52]:
two_d_space.shape
Out[52]:
In [53]:
plt.scatter(two_d_space[:, 0], two_d_space[:, 1], color='black')
Out[53]:
In [56]:
unsmushed = smusher.inverse_transform(two_d_space)
In [57]:
Z = classifier.decision_function(unsmushed)
Z = Z.reshape(xx.shape)
In [ ]:
fig, ax = plt.subplots()
ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=target, cmap='Dark2')
ax.contour(X, Y, Z, colors='k',
levels=[-1, 0, 1], alpha=0.5,
linestyles=['--', '-', '--'])
In [ ]:
def visualize_tree(estimator, X, y, smusher, boundaries=True,
xlim=None, ylim=None):
estimator.fit(X, y)
smushed = smusher.fit_transform(X)
if xlim is None:
xlim = (smushed[:, 0].min() - 0.1, smushed[:, 0].max() + 0.1)
if ylim is None:
ylim = (smushed[:, 1].min() - 0.1, smushed[:, 1].max() + 0.1)
x_min, x_max = xlim
y_min, y_max = ylim
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
two_d_space = np.c_[xx.ravel(), yy.ravel()]
unsmushed = smusher.inverse_transform(two_d_space)
Z = estimator.predict(unsmushed)
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, alpha=0.2, cmap='Paired')
plt.clim(y.min(), y.max())
# Plot also the training points
plt.scatter(smushed[:, 0], smushed[:, 1], c=y, s=50, cmap='Paired')
plt.axis('off')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.clim(y.min(), y.max())
# Plot the decision boundaries
def plot_boundaries(i, xlim, ylim):
if i < 0:
return
tree = estimator.tree_
if tree.feature[i] == 0:
plt.plot([tree.threshold[i], tree.threshold[i]], ylim, '-k')
plot_boundaries(tree.children_left[i],
[xlim[0], tree.threshold[i]], ylim)
plot_boundaries(tree.children_right[i],
[tree.threshold[i], xlim[1]], ylim)
elif tree.feature[i] == 1:
plt.plot(xlim, [tree.threshold[i], tree.threshold[i]], '-k')
plot_boundaries(tree.children_left[i], xlim,
[ylim[0], tree.threshold[i]])
plot_boundaries(tree.children_right[i], xlim,
[tree.threshold[i], ylim[1]])
if boundaries:
plot_boundaries(0, plt.xlim(), plt.ylim())
In [ ]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import TSNE, MDS
smusher = PCA(n_components=2)
# reduced_data = smusher.fit_transform(shalek2013_singles+1)
visualize_tree(classifier, shalek2013_singles, np.array(target), smusher)
In [ ]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
classifier = RandomForestClassifier()
smusher = PCA(n_components=2)
# reduced_data = smusher.fit_transform(shalek2013_singles+1)
visualize_tree(classifier, shalek2013_singles, np.array(target), smusher, boundaries=False)
In [ ]:
classifier = ExtraTreesClassifier()
smusher = PCA(n_components=2)
# reduced_data = smusher.fit_transform(shalek2013_singles+1)
visualize_tree(classifier, shalek2013_singles, np.array(target), smusher, boundaries=False)
In [ ]:
pd.options.display.max_columns = 50
In [ ]:
macaulay2016_metadata = pd.read_csv('../4._Case_Study/macaulay2016/sample_info_qc.csv', index_col=0)
macaulay2016_metadata.head()
In [ ]:
macaulay2016_cluster_names = tuple(sorted(macaulay2016_metadata['cluster'].unique()))
macaulay2016_cluster_names
In [ ]:
macaulay2016_target = macaulay2016_metadata['cluster'].map(lambda x: macaulay2016_cluster_names.index(x))
macaulay2016_target
In [ ]:
macaulay2016_expression = pd.read_csv('../4._Case_Study/macaulay2016/gene_expression_s.csv', index_col=0).T
In [ ]:
macaulay2016_expression.head()
In [ ]:
macaulay2016_expression_filtered = macaulay2016_expression[[x for x in macaulay2016_expression if x.startswith("ENS")]]
macaulay2016_expression_filtered.shape
In [ ]:
macaulay2016_expression_filtered = macaulay2016_expression_filtered.loc[macaulay2016_metadata.index]
In [ ]:
macaulay2016_expression_filtered = 1e6*macaulay2016_expression_filtered.divide(macaulay2016_expression_filtered.sum(axis=1), axis=0)
macaulay2016_expression_filtered.head()
In [ ]:
macaulay2016_expression_filtered = np.log10(macaulay2016_expression_filtered+1)
macaulay2016_expression_filtered.head()
In [ ]:
macaulay2016_expression_filtered = macaulay2016_expression_filtered.loc[:, (macaulay2016_expression_filtered > 1).sum() >=3]
macaulay2016_expression_filtered.shape
In [ ]:
# classifier = SVC(kernel='linear')
# classifier = DecisionTreeClassifier(max_depth=10)
classifier = ExtraTreesClassifier(n_estimators=1000)
classifier.fit(macaulay2016_expression_filtered, macaulay2016_target)
In [ ]:
smusher = FastICA(n_components=2, random_state=0)
smushed_data = smusher.fit_transform(macaulay2016_expression_filtered)
x_min, x_max = smushed_data[:, 0].min(), smushed_data[:, 0].max()
y_min, y_max = smushed_data[:, 1].min(), smushed_data[:, 1].max()
delta_x = 0.05 * abs(x_max - x_min)
delta_y = 0.05 * abs(x_max - x_min)
x_min -= delta_x
x_max += delta_x
y_min -= delta_y
y_max += delta_y
X = np.linspace(x_min, x_max, 100)
Y = np.linspace(y_min, y_max, 100)
xx, yy = np.meshgrid(X, Y)
two_d_space = np.c_[xx.ravel(), yy.ravel()]
two_d_space
In [ ]:
high_dimensional_space = smusher.inverse_transform(two_d_space)
In [ ]:
# Get the class boundaries
Z = classifier.predict(high_dimensional_space)
In [ ]:
import matplotlib as mpl
macaulay2016_metadata['cluster_color_hex'] = macaulay2016_metadata['cluster_color'].map(lambda x: mpl.colors.rgb2hex(eval(x)))
In [ ]:
int_to_cluster_name = dict(zip(range(len(macaulay2016_cluster_names)), macaulay2016_cluster_names))
int_to_cluster_name
In [ ]:
cluster_name_to_color = dict(zip(macaulay2016_metadata['cluster'], macaulay2016_metadata['cluster_color_hex']))
cluster_name_to_color
In [ ]:
macaulay2016_palette = [mpl.colors.hex2color(cluster_name_to_color[int_to_cluster_name[i]])
for i in range(len(macaulay2016_cluster_names))]
macaulay2016_palette
In [ ]:
cmap = mpl.colors.ListedColormap(macaulay2016_palette)
cmap
In [ ]:
x_min, x_max
In [ ]:
y = macaulay2016_target
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, alpha=0.2, cmap=cmap)
plt.clim(y.min(), y.max())
# Plot also the training points
plt.scatter(smushed_data[:, 0], smushed_data[:, 1], s=50, color=macaulay2016_metadata['cluster_color_hex'],
edgecolor='k') #c=macaulay2016_target, s=50, cmap='Set2')
plt.axis('off')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.clim(y.min(), y.max())
In [ ]:
smusher = FastICA(n_components=4, random_state=354)
smushed_data = pd.DataFrame(smusher.fit_transform(macaulay2016_expression_filtered))
# x_min, x_max = smushed_data[:, 0].min(), smushed_data[:, 0].max()
# y_min, y_max = smushed_data[:, 1].min(), smushed_data[:, 1].max()
# delta_x = 0.05 * abs(x_max - x_min)
# delta_y = 0.05 * abs(x_max - x_min)
# x_min -= delta_x
# x_max += delta_x
# y_min -= delta_y
# y_max += delta_y
# X = np.linspace(x_min, x_max, 100)
# Y = np.linspace(y_min, y_max, 100)
# xx, yy = np.meshgrid(X, Y)
# low_dimensional_space = np.c_[xx.ravel(), yy.ravel()]
# low_dimensional_space
In [ ]:
smushed_data.max() - smushed_data.min()
In [ ]:
grid = smushed_data.apply(lambda x: pd.Series(np.linspace(x.min(), x.max(), 50)))
grid.head()
# grid = [x.ravel() for x in grid]
# grid
# low_dimensional_space = np.concatenate(grid, axis=0)
# low_dimensional_space.shape
# # low_dimensional_space = low_dimensional_space.reshape(shape)
In [ ]:
x1, x2, x3, x4 = np.meshgrid(*[grid[col] for col in grid])
low_dimensional_space = np.c_[x1.ravel(), x2.ravel(), x3.ravel(), x4.ravel()]
In [ ]:
high_dimensional_space = smusher.inverse_transform(low_dimensional_space)
In [ ]:
smushed_data['hue'] = macau
In [ ]:
sns.pairplot(smushed_data)