In [2]:
import os
import common
# Assign notebook and folder names
notebook_name = '02_robust_pca'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder
In [3]:
%pdb
In [4]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
In [5]:
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')
csv = os.path.join(input_folder, 'expression_table1_subset.csv')
table1 = pd.read_csv(csv, index_col=0)
print(table1.shape)
table1.head()
Out[5]:
In [6]:
cluster_identities = pd.read_table('macosko2015/retina_clusteridentities.txt', header=None,
names=['barcode', 'cluster_id'], index_col=0, squeeze=True)
print(cluster_identities.shape)
cluster_identities.head()
Out[6]:
In [7]:
cluster_identities_table1 = cluster_identities.loc[table1.index]
cluster_identities_table1.head()
Out[7]:
In [8]:
cluster_ids = cluster_identities_table1.unique()
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = dict(zip(cluster_ids, colors))
color_labels = [id_to_color[i] for i in cluster_identities_table1]
color_labels[:4]
Out[8]:
In [9]:
sns.set(style='whitegrid')
In [10]:
mask = table1 == 0
fig, ax = plt.subplots()
sns.heatmap(table1, mask=mask, xticklabels=[], yticklabels=[])
ax.set(xlabel='Genes', ylabel='Cells')
Out[10]:
In [11]:
clustergrid = sns.clustermap(table1, mask=mask, xticklabels=[], yticklabels=[],
row_colors=color_labels)
In [12]:
import sys
sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])
import r_pca
import rpcaADMM
In [104]:
r_pca.R_pca??
In [13]:
%%time
rpca_alm = r_pca.R_pca(table1.as_matrix())
rpca_alm.fit()
In [14]:
rpca_alm.lmbda
Out[14]:
In [15]:
U, s, V = np.linalg.svd(rpca_alm.L)
In [16]:
U
Out[16]:
In [17]:
sns.distplot(s[s > 0.1], kde=False)
Out[17]:
In [59]:
diff = rpca_alm.L - table1
In [60]:
datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S,
'Difference: Original - Low-Rank': diff}
common.heatmaps(datasets)
In [61]:
L = pd.DataFrame(rpca_alm.L, index=table1.index, columns=table1.columns)
L.head()
Out[61]:
In [63]:
sns.distplot(table1.values.flat)
Out[63]:
In [62]:
sns.distplot(L.values.flat)
Out[62]:
In [101]:
diff = table1 - L
diff_tidy = diff.unstack().reset_index()
diff_tidy['dataset'] = 'Difference'
table1_tidy = table1.unstack().reset_index()
table1_tidy['dataset'] = 'Original'
L_tidy = L.unstack().reset_index()
L_tidy['dataset'] = 'Low-Rank'
tidy = pd.concat([table1_tidy, L_tidy, diff_tidy])
tidy = tidy.rename(columns={0: 'molecules'})
tidy.head()
sns.violinplot(x='dataset', y='molecules', data=tidy)
Out[101]:
In [103]:
sns.boxplot(x='dataset', y='molecules', data=tidy)
Out[103]:
In [37]:
S = pd.DataFrame(rpca_alm.S, index=table1.index, columns=table1.columns)
S.head()
Out[37]:
In [21]:
diff.head()
Out[21]:
In [22]:
gr0 = rpca_alm.L > 0
diff_gr0 = table1 - gr0
datasets = {'Original': table1, 'Low-Rank':rpca_alm.L, 'Sparse': rpca_alm.S,
'Difference: Original - Low-Rank': diff_gr0}
common.heatmaps(datasets)
In [23]:
clustergrid = sns.clustermap(L, xticklabels=[], yticklabels=[],
row_colors=color_labels)
In [24]:
g_original = sns.clustermap(table1.T.corr(method='spearman'), xticklabels=[], yticklabels=[],
col_colors=color_labels)
In [88]:
import fastcluster
In [89]:
fastcluster.pdist?
In [95]:
table1_clustergrid = common.clustermap(table1.T.corr(method='spearman'), col_colors=color_labels)
table1_clustergrid.savefig(os.path.join(figure_folder, 'expression_table1_clustermap.pdf'))