In [1]:
import os
import common
# Assign notebook and folder names
notebook_name = '05_pca_vs_ica'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder
In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
In [3]:
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')
csv = os.path.join(input_folder, 'expression_table1_subset.csv')
table1 = pd.read_csv(csv, index_col=0)
print(table1.shape)
table1.head()
Out[3]:
In [4]:
input_folder = os.path.join(common.DATA_FOLDER, '002_robust_pca')
csv = os.path.join(input_folder, 'lowrank.csv')
lowrank = pd.read_csv(csv, index_col=0)
print(lowrank.shape)
lowrank.head()
Out[4]:
In [5]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()
Out[5]:
In [6]:
cluster_identities_lowrank = cluster_identities.loc[lowrank.index]
cluster_identities_lowrank.head()
Out[6]:
In [7]:
cluster_ids = cluster_identities_lowrank.unique()
cluster_ids
Out[7]:
In [8]:
cluster_id_to_name = {24: 'Rods', 25: 'Cones', 26: 'Bipolar cells (group1)', 27: 'Bipolar cells (group2)',
33: 'Bipolar cells (group3)', 34: 'Muller glia'}
In [9]:
cluster_names_lowrank = cluster_identities_lowrank.map(cluster_id_to_name)
cluster_names_lowrank.head()
Out[9]:
In [11]:
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = dict(zip(cluster_ids, colors))
color_labels = [id_to_color[i] for i in cluster_identities_lowrank]
color_labels[:4]
Out[11]:
In [32]:
cluster_names_to_color = dict((cluster_id_to_name[i], id_to_color[i]) for i in cluster_ids)
cluster_names_to_color
Out[32]:
In [12]:
sns.set(style='whitegrid')
In [84]:
fig, ax = plt.subplots()
sns.heatmap(lowrank, xticklabels=[], yticklabels=[])
ax.set(xlabel='Genes', ylabel='Cells')
Out[84]:
In [16]:
clustergrid = sns.clustermap(lowrank, mask=mask, xticklabels=[], yticklabels=[],
row_colors=color_labels)
In [13]:
from sklearn.decomposition import FastICA
ica = FastICA(n_components=6)
reduced = pd.DataFrame(ica.fit_transform(lowrank), index=lowrank.index)
reduced.head()
Out[13]:
In [14]:
component_norms = reduced.apply(np.linalg.norm).sort_values(ascending=False)
component_norms
Out[14]:
In [15]:
plt.plot(np.arange(len(component_norms)), component_norms, 'o')
Out[15]:
In [16]:
reduced = reduced[component_norms.index]
reduced.head()
Out[16]:
In [17]:
reduced_names = reduced.join(cluster_names_lowrank)
reduced_names.head()
Out[17]:
In [19]:
sns.heatmap(reduced_names.groupby('cluster_id').median())
Out[19]:
In [101]:
sns.pairplot(reduced_names, hue='cluster_id', palette=cluster_names_to_color)
Out[101]:
In [89]:
ica_components = pd.DataFrame(ica.components_, columns=lowrank.columns)
print(ica_components.shape)
ica_components.head()
Out[89]:
In [91]:
sns.distplot(ica_components.values.flat)
Out[91]:
In [22]:
from sklearn.decomposition import PCA
pca = PCA()
pca_reduced = pd.DataFrame(pca.fit_transform(lowrank), index=lowrank.index)
pca_reduced.head()
Out[22]:
In [23]:
plt.plot(pca.explained_variance_ratio_[:10], 'o-')
Out[23]:
In [24]:
pca_reduced_subset = pca_reduced.loc[:, :5]
In [25]:
pca_reduced_names = pca_reduced_subset.join(cluster_names_lowrank)
pca_reduced_names.head()
Out[25]:
In [26]:
sns.heatmap(pca_reduced_names.groupby('cluster_id').mean())
Out[26]:
In [27]:
sns.pairplot(pca_reduced_names, hue='cluster_id',
palette=cluster_names_to_color)
In [53]:
matrix = np.array([[1, 1], [0, 2]]).astype(float).T
matrix
Out[53]:
In [54]:
np.linalg.svd(matrix)
Out[54]:
In [55]:
pca0 = PCA()
ica0 = FastICA()
In [59]:
pcad = pca0.fit_transform(matrix)
pcad
Out[59]:
In [47]:
pca0.explained_variance_ratio_
Out[47]:
In [48]:
pca0.components_
Out[48]:
In [49]:
matrix
Out[49]:
In [50]:
%pdb
In [51]:
icad = ica0.fit_transform(matrix)
icad
Out[51]:
In [128]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0, perplexity=120)
smushed = pd.DataFrame(tsne.fit_transform(reduced), index=lowrank.index)
# smushed = pd.DataFrame(tsne.fit_transform(pca_reduced.loc[:, :2]), index=lowrank.index)
smushed.head()
Out[128]:
In [129]:
smushed_names = smushed.join(cluster_names_lowrank)
smushed_names.head()
Out[129]:
In [130]:
sns.pairplot(smushed_names, hue='cluster_id', palette=cluster_names_to_color)
Out[130]:
In [ ]:
In [14]:
rpca_alm.lmbda
Out[14]:
In [15]:
U, s, V = np.linalg.svd(rpca_alm.L)
In [16]:
U
Out[16]:
In [17]:
sns.distplot(s[s > 0.1], kde=False)
Out[17]:
In [59]:
diff = rpca_alm.L - lowrank
In [60]:
datasets = {'Original': lowrank, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S,
'Difference: Original - Low-Rank': diff}
common.heatmaps(datasets)
In [61]:
L = pd.DataFrame(rpca_alm.L, index=lowrank.index, columns=lowrank.columns)
L.head()
Out[61]:
In [63]:
sns.distplot(lowrank.values.flat)
Out[63]:
In [62]:
sns.distplot(L.values.flat)
Out[62]:
In [74]:
lowrank_tidy = lowrank.unstack().reset_index()
lowrank_tidy['dataset'] = 'Original'
L_tidy = L.unstack().reset_index()
L_tidy['dataset'] = 'Low-Rank'
tidy = pd.concat([lowrank_tidy, L_tidy])
tidy = tidy.rename(columns={0: 'molecules'})
tidy.head()
Out[74]:
In [75]:
sns.violinplot(x='dataset', y='molecules', data=tidy)
Out[75]:
In [76]:
sns.boxplot(x='dataset', y='molecules', data=tidy)
Out[76]:
In [37]:
S = pd.DataFrame(rpca_alm.S, index=lowrank.index, columns=lowrank.columns)
S.head()
Out[37]:
In [21]:
diff.head()
Out[21]:
In [22]:
gr0 = rpca_alm.L > 0
diff_gr0 = lowrank - gr0
datasets = {'Original': lowrank, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S,
'Difference: Original - Low-Rank': diff_gr0}
common.heatmaps(datasets)
In [23]:
clustergrid = sns.clustermap(L, xticklabels=[], yticklabels=[],
row_colors=color_labels)
In [24]:
g_original = sns.clustermap(lowrank.T.corr(method='spearman'), xticklabels=[], yticklabels=[],
col_colors=color_labels)
In [39]:
S.head()
Out[39]:
In [49]:
sns.distplot(S.values.flat)
Out[49]:
In [52]:
np.median(S.values)
Out[52]:
In [56]:
high_in_sparse = (S > 10).any()
print(high_in_sparse.sum())
S.loc[:, high_in_sparse]
Out[56]:
In [47]:
data = S[S > 0]
data = data.fillna(0)
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[], row_colors=color_labels)
In [38]:
data = S.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
col_colors=color_labels, row_colors=color_labels)
In [44]:
data = S.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[])
In [25]:
data = L.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
col_colors=color_labels, row_colors=color_labels)
So this seemed to have flipped some of the cells into different types, and made the within-cluster distances smaller
In [45]:
reconstructed = L + S
data = reconstructed.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
col_colors=color_labels, row_colors=color_labels)
In [81]:
csv = os.path.join(data_folder, 'sparse.csv')
S.to_csv(csv)
In [82]:
csv = os.path.join(data_folder, 'lowrank.csv')
L.to_csv(csv)
In [79]:
L.shape
Out[79]:
In [ ]:
from sklearn.decomposition import ICA
ica = ICA(n_components=)
In [36]:
reduced = rpcaADMM.rpcaADMM(lowrank)
# print(reduced.shape)
# reduced.head()
In [62]:
rpcaADMM.rpcaADMM()
In [38]:
reduced.keys()
Out[38]:
In [76]:
ncols = 4
nrows = 1
axsize = 3
width = ncols * axsize
height = nrows * axsize
fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat
x_keys = [key for key in reduced if key.startswith('X')]
ax = next(axes_iter)
data = lowrank
mask = data == 0
sns.heatmap(lowrank, mask=mask, ax=ax, xticklabels=[], yticklabels=[])
ax.set(title='Original')
for ax, key in zip(axes_iter, x_keys):
data = reduced[key]
mask = data == 0
vmin = data.min().min()
vmax = data.max().max()
center = 0
sns.heatmap(reduced[key], mask=mask, ax=ax, xticklabels=[], yticklabels=[])
ax.set(title=key)
In [79]:
ncols = 4
nrows = 1
axsize = 3
width = ncols * axsize * 1.25
height = nrows * axsize
fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat
x_keys = [key for key in reduced if key.startswith('X')]
ax = next(axes_iter)
common.heatmap(lowrank, ax=ax)
ax.set(title='Original')
for ax, key in zip(axes_iter, x_keys):
common.heatmap(reduced[key], ax=ax)
ax.set(title=key)
In [55]:
U, s, V = np.linalg.svd(reduced['X3_admm'])
In [61]:
reduced['X2_admm'][reduced['X2_admm'].nonzero()]
Out[61]:
In [80]:
ax
Out[80]:
In [81]:
sns.heatmap??
In [ ]: