In [16]:
import os
import common
# Name of this notebook; used to namespace its figure and data output folders
notebook_name = '06_exploring_with_josh'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)

# Create the folders in pure Python instead of shelling out to `mkdir -p`:
# this works on any OS and with paths that contain spaces. `exist_ok=True`
# keeps the cell safe to re-run.
os.makedirs(figure_folder, exist_ok=True)
os.makedirs(data_folder, exist_ok=True)
In [17]:
%pdb
In [39]:
# Automatically reload imported modules before each cell executes, so edits to
# local modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Render matplotlib figures inline in the notebook output
%matplotlib inline
In [24]:
# Locate the expression-table CSV inside the '001_downsample_macosko_data' data folder
input_folder = os.path.join(common.DATA_FOLDER, '001_downsample_macosko_data')
csv = os.path.join(
    common.DATA_FOLDER, '001_downsample_macosko_data', 'expression_table1_subset.csv')

# The first CSV column is used as the row index
table1 = pd.read_csv(filepath_or_buffer=csv, index_col=0)

# Report the dimensions, then preview the first rows
print(table1.shape)
table1.head()
Out[24]:
In [ ]:
# Map each cell-type name to the cluster id(s) that belong to it.
# Ranges use np.arange, so the upper bound is exclusive
# (e.g. amacrine cells are clusters 3..23).
cluster_name_to_id = {
    'Horizontal cells': [1],
    'Retinal Ganglion cells': [2],
    'Amacrine cells': np.arange(3, 24),
    'Rods': [24],
    'Cones': [25],
    'Bipolar cells': np.arange(26, 34),
    # TODO: the original cell ended with an unfinished '' entry; add the
    # remaining cell types here once their cluster ids are confirmed.
}
In [5]:
# Read the headerless two-column table (barcode, cluster_id) with the barcode
# as the index, then collapse the single remaining column into a Series named
# 'cluster_id'. `DataFrame.squeeze('columns')` replaces the `squeeze=True`
# reader argument, which newer pandas no longer accepts.
cluster_identities = pd.read_table(
    'macosko2015/retina_clusteridentities.txt', header=None,
    names=['barcode', 'cluster_id'], index_col=0).squeeze('columns')
print(cluster_identities.shape)
cluster_identities.head()
Out[5]:
In [6]:
# Restrict the cluster assignments to the rows of table1, in table1's row order
cells_in_table1 = table1.index
cluster_identities_table1 = cluster_identities.loc[cells_in_table1]
cluster_identities_table1.head()
Out[6]:
In [15]:
# Distinct cluster ids present among the table1 cells
cluster_ids = pd.unique(cluster_identities_table1)
cluster_ids
Out[15]:
In [32]:
# Translate every cell's cluster id through the lookup exposed by `common`
id_to_name = common.cluster_id_to_name
cluster_names = cluster_identities_table1.map(id_to_name)
cluster_names.head()
Out[32]:
In [7]:
# One palette entry per distinct cluster id.
# NOTE(review): Set2 is an 8-color palette; with more than 8 cluster ids the
# colors presumably repeat — confirm this is acceptable for the row/col colors.
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids))
id_to_color = {cluster_id: color for cluster_id, color in zip(cluster_ids, colors)}

# Color for every cell, in the same order as cluster_identities_table1
color_labels = list(map(id_to_color.__getitem__, cluster_identities_table1))
color_labels[:4]
Out[7]:
In [25]:
# Columns (genes) of the expression table that the plots below focus on
genes_of_interest = [
    'RHO',
    'PAX6',
    'GNAT1',
    'SLC24A1',
]
In [56]:
# Keep all rows of table1 but only the genes-of-interest columns
subset = table1.loc[:, genes_of_interest]
subset.head()
Out[56]:
In [52]:
# subset_log = np.log(subset+1)
# subset_log.head()
Out[52]:
In [57]:
# Attach each cell's cluster name (index-aligned join) as an extra column
subset_names = subset.join(other=cluster_names)
subset_names.head()
Out[57]:
In [58]:
# Pairwise scatter/histogram grid of the genes of interest, colored by the 'cluster_id' column
sns.pairplot(data=subset_names, hue='cluster_id')
Out[58]:
In [46]:
# NOTE(review): this cell only echoes the numpy module object; it looks like a
# leftover sanity check and can probably be deleted.
np
Out[46]:
In [47]:
# Log-transform the counts with log(x + 1) so that zero counts map to 0 instead
# of -inf, then join the cluster names: `subset` alone has no 'cluster_id'
# column, so `hue='cluster_id'` could not be resolved on it.
subset_log = np.log1p(subset)
sns.pairplot(subset_log.join(cluster_names), hue='cluster_id')
In [8]:
# Switch seaborn's global plot styling to the 'whitegrid' style for the figures that follow
sns.set(style='whitegrid')
In [9]:
# Boolean frame marking zero entries; masked cells are not drawn in the heatmap
mask = table1.eq(0)

# Draw onto an explicitly created axes so it can be labelled afterwards
fig, ax = plt.subplots()
sns.heatmap(table1, mask=mask, xticklabels=[], yticklabels=[], ax=ax)
ax.set(xlabel='Genes', ylabel='Cells')
Out[9]:
In [11]:
# Hierarchically clustered heatmap of table1 with zero entries masked and one
# color per row taken from color_labels; tick labels are suppressed
clustergrid = sns.clustermap(
    data=table1,
    mask=mask,
    xticklabels=[],
    yticklabels=[],
    row_colors=color_labels,
)
In [62]:
import sys
# NOTE(review): these are absolute, machine-specific checkout paths, so the two
# imports below only resolve on a machine with this exact layout. Consider
# installing the packages or reading the locations from configuration.
sys.path.extend(['/Users/olgabot/code/robust-pca/', '/Users/olgabot/code/rpcaADMM/'])
import r_pca
# NOTE(review): rpcaADMM is not referenced by any cell visible in this section — confirm it is still needed
import rpcaADMM
In [63]:
# NOTE(review): IPython `??` introspection shows the source of r_pca.R_pca; this
# is an exploration leftover and can probably be removed from the final notebook.
r_pca.R_pca??
In [64]:
%%time
rpca_alm = r_pca.R_pca(table1.as_matrix())
rpca_alm.fit()
In [71]:
# Histogram (no KDE) of the entries of `s` that exceed 0.1.
# NOTE(review): `s` is not defined in any cell visible in this section, so this
# cell relies on hidden kernel state — define `s` explicitly before this cell.
sns.distplot(s[s > 0.1], kde=False)
Out[71]:
In [72]:
# Original minus low-rank. The operands were previously reversed, which
# contradicted the 'Difference: Original - Low-Rank' label used for this frame
# and the `table1 - L` definition used later in the notebook.
diff = table1 - rpca_alm.L
In [73]:
# Label -> matrix pairs handed to the shared heatmap helper in `common`
datasets = {
    'Original': table1,
    'Low-Rank': rpca_alm.L,
    'Sparse': rpca_alm.S,
    'Difference: Original - Low-Rank': diff,
}
common.heatmaps(datasets)
In [74]:
# Wrap the low-rank component in a DataFrame that reuses table1's row and column labels
L = pd.DataFrame(
    data=rpca_alm.L,
    index=table1.index,
    columns=table1.columns,
)
L.head()
Out[74]:
In [75]:
# Same genes-of-interest pair plot as for the original data, on the low-rank component
L_subset = L.loc[:, genes_of_interest]
L_names = L_subset.join(other=cluster_names)
sns.pairplot(data=L_names, hue='cluster_id')
Out[75]:
In [76]:
# Distribution of every entry of table1, flattened to one dimension
sns.distplot(a=table1.values.flat)
Out[76]:
In [78]:
# Distribution of every entry of the low-rank frame, flattened to one dimension
sns.distplot(a=L.values.flat)
Out[78]:
In [79]:
# Original minus low-rank, as a labelled DataFrame
diff = table1 - L

# Long-form version of each matrix, tagged with the dataset it came from
diff_tidy = diff.unstack().reset_index().assign(dataset='Difference')
table1_tidy = table1.unstack().reset_index().assign(dataset='Original')
L_tidy = L.unstack().reset_index().assign(dataset='Low-Rank')

# Stack the three long-form frames; the value column produced by unstack() is
# named 0, so give it a meaningful name
tidy = pd.concat([table1_tidy, L_tidy, diff_tidy]).rename(columns={0: 'molecules'})
tidy.head()

# Compare the value distributions of the three datasets
sns.violinplot(data=tidy, x='dataset', y='molecules')
Out[79]:
In [80]:
# Box plot of the same long-form data: one box per dataset
sns.boxplot(data=tidy, x='dataset', y='molecules')
Out[80]:
In [81]:
# Wrap the sparse component in a DataFrame that reuses table1's row and column labels
S = pd.DataFrame(
    data=rpca_alm.S,
    index=table1.index,
    columns=table1.columns,
)
S.head()
Out[81]:
In [88]:
# One box per gene of interest in the original data. The wide-form frame is
# passed via `data=` because the first positional argument of `sns.boxplot`
# is `x` in older seaborn releases, where a DataFrame there is not a supported
# input.
sns.boxplot(data=table1[genes_of_interest])
Out[88]:
In [86]:
# One box per gene of interest in the low-rank component. The wide-form frame
# is passed via `data=` because the first positional argument of `sns.boxplot`
# is `x` in older seaborn releases, where a DataFrame there is not a supported
# input.
sns.boxplot(data=L[genes_of_interest])
Out[86]:
In [87]:
# One box per gene of interest in the sparse component. The wide-form frame is
# passed via `data=` because the first positional argument of `sns.boxplot`
# is `x` in older seaborn releases, where a DataFrame there is not a supported
# input.
sns.boxplot(data=S[genes_of_interest])
Out[87]:
In [21]:
# Preview the difference frame.
# NOTE(review): `diff` is assigned in more than one earlier cell with opposite
# operand order, so what is shown here depends on which cell ran last.
diff.head()
Out[21]:
In [22]:
# Boolean mask of the strictly positive entries of the low-rank component
gr0 = rpca_alm.L > 0

# Low-rank component with non-positive entries zeroed out. The original code
# subtracted the boolean mask itself (i.e. 0/1) from table1, which does not
# match the 'Original - Low-Rank' label below.
# NOTE(review): assumes the intent was "original minus the positive part of
# the low-rank matrix" — confirm with the author.
L_gr0 = np.where(gr0, rpca_alm.L, 0)
diff_gr0 = table1 - L_gr0

datasets = {'Original': table1, 'Low-Rank': rpca_alm.L, 'Sparse': rpca_alm.S,
            'Difference: Original - Low-Rank': diff_gr0}
common.heatmaps(datasets)
In [23]:
# Clustered heatmap of the low-rank frame (no mask), rows colored via color_labels
clustergrid = sns.clustermap(
    data=L,
    xticklabels=[],
    yticklabels=[],
    row_colors=color_labels,
)
In [24]:
# Spearman correlation between the rows of table1 (corr() works column-wise,
# hence the transpose), shown as a clustered heatmap with colored columns
row_by_row_corr = table1.T.corr(method='spearman')
g_original = sns.clustermap(
    row_by_row_corr,
    xticklabels=[],
    yticklabels=[],
    col_colors=color_labels,
)