In [1]:
import os
import common
# Assign notebook and folder names
notebook_name = '03_robust_pca_range_of_lambdas'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder
In [11]:
import fastcluster
In [15]:
fastcluster.linkage()
Out[15]:
%pdb
In [3]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
In [4]:
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')
csv = os.path.join(input_folder, 'expression_table1_subset.csv')
table1 = pd.read_csv(csv, index_col=0)
print(table1.shape)
table1.head()
Out[4]:
In [22]:
polo.optimal_leaf_ordering
In [17]:
fastcluster.linkage(table1).shape
Out[17]:
In [5]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()
Out[5]:
In [6]:
cluster_identities_table1 = cluster_identities.loc[table1.index]
cluster_identities_table1.head()
Out[6]:
In [7]:
cluster_ids = cluster_identities_table1.unique()
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = dict(zip(cluster_ids, colors))
color_labels = [id_to_color[i] for i in cluster_identities_table1]
color_labels[:4]
Out[7]:
In [9]:
import sys
sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])
import r_pca
import rpcaADMM
In [30]:
def tidifier(data, col, group):
tidy = data.unstack().reset_index()
tidy[col] = group
return tidy
In [31]:
%%time
lmbdas = np.arange(0.05, 0.35, 0.05)
rpcas = {}
for lmbda in lmbdas:
prefix = 'lambda{:.2f}'.format(lmbda)
print('--- {} ---'.format(prefix))
rpca = r_pca.R_pca(table1.as_matrix(), lmbda=lmbda)
rpca.fit()
rpcas[lmbda] = rpca
U, s, V = np.linalg.svd(rpca.L)
if (s == 0).all():
continue
fig, ax = plt.subplots()
sns.distplot(s[s > 1e-2], kde=False)
fig.savefig(os.path.join(figure_folder, '{}_low_rank_singular_values_distplot.pdf'.format(
prefix)))
fig.suptitle(prefix)
L = pd.DataFrame(rpca.L, index=table1.index, columns=table1.columns)
diff = table1 - L
datasets = {'Original': table1, 'Low-Rank': L, 'Sparse': rpca.S,
'Difference: Original - Low-Rank': diff}
diff_tidy = tidifier(diff, 'dataset', 'Difference')
table1_tidy = tidifier(table1, 'dataset', 'Difference')
L_tidy = tidifier(L, 'dataset', 'Difference')
tidy = pd.concat([table1_tidy, L_tidy, diff_tidy])
tidy = tidy.rename(columns={0: 'molecules'})
fig, ax = plt.subplots()
sns.violinplot(x='dataset', y='molecules', data=tidy)
fig.savefig(os.path.join(figure_folder, '{}_magnitude_violinplots.pdf'))
common.heatmaps(datasets)
fig = plt.gcf()
fig.suptitle(prefix)
fig.savefig(os.path.join(figure_folder, '{}_matrix_heatmaps.pdf'.format(prefix)))
data = L.T.corr(method='spearman')
g_rpca = common.clustermap(data, col_colors=color_labels, row_colors=color_labels)
g_rpca.fig.suptitle(prefix)
g_rpca.savefig(os.path.join(figure_folder, '{}_clustermap.pdf'.format(prefix)))
In [36]:
reduced = rpcaADMM.rpcaADMM(table1)
# print(reduced.shape)
# reduced.head()
In [62]:
rpcaADMM.rpcaADMM()
In [38]:
reduced.keys()
Out[38]:
In [76]:
ncols = 4
nrows = 1
axsize = 3
width = ncols * axsize
height = nrows * axsize
fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat
x_keys = [key for key in reduced if key.startswith('X')]
ax = next(axes_iter)
data = table1
mask = data == 0
sns.heatmap(table1, mask=mask, ax=ax, xticklabels=[], yticklabels=[])
ax.set(title='Original')
for ax, key in zip(axes_iter, x_keys):
data = reduced[key]
mask = data == 0
vmin = data.min().min()
vmax = data.max().max()
center = 0
sns.heatmap(reduced[key], mask=mask, ax=ax, xticklabels=[], yticklabels=[])
ax.set(title=key)
In [79]:
ncols = 4
nrows = 1
axsize = 3
width = ncols * axsize * 1.25
height = nrows * axsize
fig, axes = plt.subplots(ncols=ncols, figsize=(width, height))
axes_iter = axes.flat
x_keys = [key for key in reduced if key.startswith('X')]
ax = next(axes_iter)
common.heatmap(table1, ax=ax)
ax.set(title='Original')
for ax, key in zip(axes_iter, x_keys):
common.heatmap(reduced[key], ax=ax)
ax.set(title=key)
In [55]:
U, s, V = np.linalg.svd(reduced['X3_admm'])
In [61]:
reduced['X2_admm'][reduced['X2_admm'].nonzero()]
Out[61]:
In [80]:
ax
Out[80]:
In [81]:
sns.heatmap??
In [ ]: