In [96]:
    
import os
import common
# Assign notebook and folder names
notebook_name = '01_downsample_macosko_data'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
# Make the folders in pure Python instead of shelling out to `mkdir -p`:
# the unquoted `$variable` interpolation breaks on paths containing spaces
# and the shell command is not portable. `exist_ok=True` keeps the cell
# safe to re-run when the folders already exist.
os.makedirs(figure_folder, exist_ok=True)
os.makedirs(data_folder, exist_ok=True)
    
In [2]:
    
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
    
In [4]:
    
# Load the expression table from CSV, using the first column as the row index,
# then show its dimensions and first rows as a sanity check.
table1 = pd.read_csv('expression_table1.csv', index_col=0)
print(table1.shape)
table1.head()
    
    
    Out[4]:
In [16]:
    
# Read the headerless two-column (barcode, cluster_id) table, indexed by barcode.
# The `squeeze=True` reader keyword was deprecated in pandas 1.4 and removed in
# 2.0, so the one-column frame is collapsed to a Series with
# `DataFrame.squeeze('columns')` instead.
cluster_identities = pd.read_table(
    'macosko2015/retina_clusteridentities.txt', header=None,
    names=['barcode', 'cluster_id'], index_col=0).squeeze('columns')
print(cluster_identities.shape)
cluster_identities.head()
    
    
    Out[16]:
In [17]:
    
# Group the series by its own values to count how many entries carry each cluster id.
cluster_identities.groupby(cluster_identities).size()
    
    Out[17]:
In [19]:
    
# Count the rows of `table1` that fall in each cluster; the grouping key is
# aligned to `table1` by index label. Rows are grouped by default, and the
# explicit `axis=` keyword is deprecated in recent pandas, so it is omitted.
cluster_sizes_table1 = table1.groupby(cluster_identities).size()
cluster_sizes_table1
    
    Out[19]:
In [21]:
    
# `cluster_markers_dropna` is never created in this notebook, so an unconditional
# `del` raises NameError on a fresh kernel (Restart & Run All). Only drop the
# name when a stale copy is still present in the kernel namespace.
if 'cluster_markers_dropna' in globals():
    del cluster_markers_dropna
    
In [22]:
    
# Keep only the clusters whose size in `table1` is strictly greater than 100.
big_clusters = cluster_sizes_table1.loc[cluster_sizes_table1.gt(100)]
big_clusters
    
    Out[22]:
In [26]:
    
# Cast the cluster-id index to int so it can be matched against integer cluster
# ids with `isin` in later cells.
# NOTE(review): presumably the index was not integer-typed after the groupby - confirm.
big_clusters.index = big_clusters.index.astype(int)
big_clusters
    
    Out[26]:
In [28]:
    
# Flag each barcode whose cluster id is one of the big clusters, then keep only
# the flagged (True) entries; summing the result gives the number of such cells.
cells_in_big_clusters = (
    cluster_identities
    .isin(big_clusters.index)
    .loc[lambda flags: flags]
)
cells_in_big_clusters.sum()
    
    Out[28]:
In [29]:
    
# Peek at the first entries of the filtered boolean series.
cells_in_big_clusters.head()
    
    Out[29]:
In [107]:
    
# Inner-join the expression table with the big-cluster cell flags on the row
# index, so both outputs share exactly the same row labels.
table1_big_clusters, y = table1.align(
    cells_in_big_clusters, join='inner', axis=0)
print(table1_big_clusters.shape, y.shape, sep='\n')
    
    
Grab random cells and make sure the sample has decent cluster representation
In [103]:
    
# NOTE: disabled code, kept for reference only. It draws a single random sample
# of 200 cells from `x` (without replacement) and counts them per cluster.
# `x` is not defined anywhere in the visible notebook, so this would not run
# as-is if uncommented.
# np.random.seed(2017)
# n_cells = 200
# random_cells = np.random.choice(x.index, size=n_cells, replace=False)
# #  Perform the subset
# table1_subset_cells = x.loc[random_cells, :]
# print(table1_subset_cells.shape)
# table1_subset_cells.groupby(cluster_identities).size()
    
In [112]:
    
# Seed NumPy's global RNG so the per-cluster draw below is reproducible.
np.random.seed(2017)
n_cells = 50


def _sample_cluster_cells(cluster_rows):
    """Return `n_cells` rows of one cluster, drawn at random without replacement."""
    picked = np.random.choice(cluster_rows.index, size=n_cells, replace=False)
    return cluster_rows.loc[picked]


# Draw the same number of cells from every cluster.
table1_subset_cells = (
    table1_big_clusters
    .groupby(cluster_identities, as_index=False, group_keys=False)
    .apply(_sample_cluster_cells)
)
print(table1_subset_cells.shape)
table1_subset_cells.head()
    
    
    Out[112]:
In [113]:
    
# Load the per-cluster marker sheet from the Excel workbook, skipping the first
# three rows. The keyword is `sheet_name`; the old `sheetname` spelling was
# removed from pandas and raises a TypeError on current versions.
cluster_markers = pd.read_excel('macosko2015/mmc4.xlsx',
                                sheet_name='FINAL_MARKERS_FOR_EACH_CLUSTER.',
                                skiprows=3)
print(cluster_markers.shape)
cluster_markers.head()
    
    
    Out[113]:
In [114]:
    
# Check which dtype pandas inferred for each column.
cluster_markers.dtypes
    
    Out[114]:
In [115]:
    
# Inspect the last rows of the marker table.
cluster_markers.tail()
    
    Out[115]:
In [116]:
    
# Drop every row that contains at least one missing value, then re-check the
# shape and the last rows.
cluster_markers = cluster_markers.dropna()
print(cluster_markers.shape)
cluster_markers.tail()
    
    
    Out[116]:
In [117]:
    
# Check the dtype of the big-cluster ids before comparing them with the marker table.
big_clusters.index.values.dtype
    
    Out[117]:
In [118]:
    
# Count the marker rows for each value of the 'cluster #' column.
cluster_markers.groupby('cluster #').size()
    
    Out[118]:
Okay, this file is corrupted... I have to go in and manually edit it so that the cluster numbers are in the right column.
In [119]:
    
# Read the v2 workbook (skipping the first three rows) and give the unnamed
# first column a proper name.
cluster_markers = (
    pd.read_excel('macosko2015/mmc4_v2.xlsx', skiprows=3)
    .rename(columns={"Unnamed: 0": 'gene_symbol'})
)
print(cluster_markers.shape)
# Remove any rows with NA because those are all header rows, then store the
# cluster numbers as integers.
cluster_markers = cluster_markers.dropna().astype({'cluster #': int})
print(cluster_markers.shape)
cluster_markers.head()
    
    
    Out[119]:
In [120]:
    
# Re-check the column dtypes of the reloaded marker table.
cluster_markers.dtypes
    
    Out[120]:
In [1]:
    
# Count the marker rows for each value of the 'cluster #' column.
cluster_markers.groupby('cluster #').size()
    
    
In [ ]:
    
# FIXME: unfinished scratch cell. `DataFrame.groupby()` called with neither `by`
# nor `level` raises a TypeError, which would abort a Restart & Run All, so the
# call is disabled until the intended grouping key is filled in.
# table1.groupby()
    
In [122]:
    
# Number of distinct values in the 'gene_symbol' column (missing values, if any,
# count as one distinct value).
cluster_markers['gene_symbol'].nunique(dropna=False)
    
    Out[122]:
In [123]:
    
# Keep only the first whitespace-delimited token of each gene symbol, then count
# how many distinct cleaned symbols remain.
cleaned_symbols = cluster_markers['gene_symbol'].str.split().str.get(0)
cleaned_symbols.nunique(dropna=False)
    
    Out[123]:
Assign the cleaned symbols back to the `gene_symbol` column
In [124]:
    
# Overwrite the 'gene_symbol' column with the cleaned symbols.
cluster_markers['gene_symbol'] = cleaned_symbols
    
In [125]:
    
# Write the cleaned marker table (including its index) to this notebook's data folder.
cluster_markers.to_csv(os.path.join(data_folder, 'cluster_markers_v2.csv'))
    
In [126]:
    
# Boolean mask of the marker rows whose cluster number is one of the big clusters.
rows = cluster_markers['cluster #'].isin(big_clusters.index.values)
print(rows.sum())
# Keep only the masked rows; the row count should equal the sum printed above.
cluster_markers_big_clusters = cluster_markers[rows]
print(cluster_markers_big_clusters.shape)
cluster_markers_big_clusters.head()
    
    
    Out[126]:
In [127]:
    
# Distinct gene symbols among the big-cluster markers, and how many there are.
gene_subset = cluster_markers_big_clusters['gene_symbol'].unique()
len(gene_subset)
    
    Out[127]:
In [128]:
    
# Restrict the sampled cells to the marker-gene columns.
# NOTE(review): on current pandas, `.loc` with a list of labels raises KeyError if
# any symbol in `gene_subset` is not a column of the table - confirm all are present.
table1_subset_cells_genes = table1_subset_cells.loc[:, gene_subset]
print(table1_subset_cells_genes.shape)
table1_subset_cells_genes.head()
    
    
    Out[128]:
In [129]:
    
# Write the downsampled cells-by-marker-genes table to this notebook's data folder.
table1_subset_cells_genes.to_csv(
    os.path.join(data_folder, 'expression_table1_subset.csv'))
    
In [2]:
    
    
    
In [ ]: