In [2]:
import os
import common
# Assign notebook and folder names
notebook_name = '02_robust_pca'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder
In [3]:
%pdb
In [4]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
In [5]:
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')
csv = os.path.join(input_folder, 'expression_table1_subset.csv')
table1 = pd.read_csv(csv, index_col=0)
print(table1.shape)
table1.head()
Out[5]:
In [6]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()
Out[6]:
In [7]:
cluster_identities_table1 = cluster_identities.loc[table1.index]
cluster_identities_table1.head()
Out[7]:
In [8]:
cluster_ids = cluster_identities_table1.unique()
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = dict(zip(cluster_ids, colors))
color_labels = [id_to_color[i] for i in cluster_identities_table1]
color_labels[:4]
Out[8]:
In [9]:
sns.set(style='whitegrid')
In [10]:
mask = table1 == 0
fig, ax = plt.subplots()
sns.heatmap(table1, mask=mask, xticklabels=[], yticklabels=[])
ax.set(xlabel='Genes', ylabel='Cells')
Out[10]:
In [11]:
clustergrid = sns.clustermap(table1, mask=mask, xticklabels=[], yticklabels=[],
row_colors=color_labels)
In [12]:
import sys
sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])
import r_pca
import rpcaADMM
In [104]:
r_pca.R_pca??
In [13]:
%%time
rpca_alm = r_pca.R_pca(table1.as_matrix())
rpca_alm.fit()
In [14]:
rpca_alm.lmbda
Out[14]:
In [15]:
U, s, V = np.linalg.svd(rpca_alm.L)
In [16]:
U
Out[16]:
In [17]:
sns.distplot(s[s > 0.1], kde=False)
Out[17]:
In [59]:
diff = rpca_alm.L - table1
In [60]:
datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S,
'Difference: Original - Low-Rank': diff}
common.heatmaps(datasets)
In [61]:
L = pd.DataFrame(rpca_alm.L, index=table1.index, columns=table1.columns)
L.head()
Out[61]:
In [63]:
sns.distplot(table1.values.flat)
Out[63]:
In [62]:
sns.distplot(L.values.flat)
Out[62]:
In [101]:
diff = table1 - L
diff_tidy = diff.unstack().reset_index()
diff_tidy['dataset'] = 'Difference'
table1_tidy = table1.unstack().reset_index()
table1_tidy['dataset'] = 'Original'
L_tidy = L.unstack().reset_index()
L_tidy['dataset'] = 'Low-Rank'
tidy = pd.concat([table1_tidy, L_tidy, diff_tidy])
tidy = tidy.rename(columns={0: 'molecules'})
tidy.head()
sns.violinplot(x='dataset', y='molecules', data=tidy)
Out[101]:
In [103]:
sns.boxplot(x='dataset', y='molecules', data=tidy)
Out[103]:
In [37]:
S = pd.DataFrame(rpca_alm.S, index=table1.index, columns=table1.columns)
S.head()
Out[37]:
In [21]:
diff.head()
Out[21]:
In [22]:
gr0 = rpca_alm.L > 0
diff_gr0 = table1 - gr0
datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S,
'Difference: Original - Low-Rank': diff_gr0}
common.heatmaps(datasets)
In [23]:
clustergrid = sns.clustermap(L, xticklabels=[], yticklabels=[],
row_colors=color_labels)
In [24]:
g_original = sns.clustermap(table1.T.corr(method='spearman'), xticklabels=[], yticklabels=[],
col_colors=color_labels)
In [88]:
import fastcluster
In [89]:
fastcluster.pdist?
In [95]:
table1_clustergrid = common.clustermap(table1.T.corr(method='spearman'), col_colors=color_labels)
table1_clustergrid.savefig(os.path.join(figure_folder, 'expression_table1_clustermap.pdf'))
In [39]:
S.head()
Out[39]:
In [49]:
sns.distplot(S.values.flat)
Out[49]:
In [52]:
np.median(S.values)
Out[52]:
In [56]:
high_in_sparse = (S > 10).any()
print(high_in_sparse.sum())
S.loc[:, high_in_sparse]
Out[56]:
In [47]:
data = S[S > 0]
data = data.fillna(0)
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[], row_colors=color_labels)
In [38]:
data = S.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
col_colors=color_labels, row_colors=color_labels)
In [44]:
data = S.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[])
In [96]:
data = L.T.corr(method='spearman')
g_rpca = common.clustermap(data, col_colors=color_labels)
g_rpca.savefig(os.path.join(figure_folder, 'low_rank_clustermap.pdf'))
In [99]:
U, s, V = np.linalg.svd(L)
plt.plot(s[:10], 'o-')
Out[99]:
In [100]:
U, s, V = np.linalg.svd(table1)
plt.plot(s[:10], 'o-')
Out[100]:
So this seemed to have flipped some of the cells into different types, and made the within-cluster distances smaller
In [45]:
reconstructed = L + S
data = reconstructed.T.corr(method='spearman')
g_rpca = sns.clustermap(data, xticklabels=[], yticklabels=[],
col_colors=color_labels, row_colors=color_labels)
In [81]:
csv = os.path.join(data_folder, 'sparse.csv')
S.to_csv(csv)
In [83]:
data_folder
Out[83]:
In [82]:
csv = os.path.join(data_folder, 'lowrank.csv')
L.to_csv(csv)
In [79]:
L.shape
Out[79]:
In [ ]:
from sklearn.decomposition import ICA
ica = ICA(n_components=)
In [36]:
reduced = rpcaADMM.rpcaADMM(table1)
# print(reduced.shape)
# reduced.head()
In [62]:
rpcaADMM.rpcaADMM()
In [38]:
reduced.keys()
Out[38]:
In [76]:
ncols = 4
nrows = 1
axsize = 3
width = ncols * axsize
height = nrows * axsize
fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat
x_keys = [key for key in reduced if key.startswith('X')]
ax = next(axes_iter)
data = table1
mask = data == 0
sns.heatmap(table1, mask=mask, ax=ax, xticklabels=[], yticklabels=[])
ax.set(title='Original')
for ax, key in zip(axes_iter, x_keys):
data = reduced[key]
mask = data == 0
vmin = data.min().min()
vmax = data.max().max()
center = 0
sns.heatmap(reduced[key], mask=mask, ax=ax, xticklabels=[], yticklabels=[])
ax.set(title=key)
In [79]:
ncols = 4
nrows = 1
axsize = 3
width = ncols * axsize * 1.25
height = nrows * axsize
fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat
x_keys = [key for key in reduced if key.startswith('X')]
ax = next(axes_iter)
common.heatmap(table1, ax=ax)
ax.set(title='Original')
for ax, key in zip(axes_iter, x_keys):
common.heatmap(reduced[key], ax=ax)
ax.set(title=key)
In [55]:
U, s, V = np.linalg.svd(reduced['X3_admm'])
In [61]:
reduced['X2_admm'][reduced['X2_admm'].nonzero()]
Out[61]:
In [80]:
ax
Out[80]:
In [81]:
sns.heatmap??
In [ ]: