In [96]:
import os
import common
# Assign notebook and folder names
notebook_name = '01_downsample_macosko_data'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
# Make the folders (and any missing parents). `os.makedirs(..., exist_ok=True)`
# replaces the `! mkdir -p $var` shell escapes: it needs no shell, is safe for
# paths containing spaces, and is a no-op when the folder already exists.
os.makedirs(figure_folder, exist_ok=True)
os.makedirs(data_folder, exist_ok=True)
In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
In [4]:
# Read the expression table, using the first CSV column as the row index,
# then report its dimensions and preview the first rows.
table1 = pd.read_csv(filepath_or_buffer='expression_table1.csv', index_col=0)
print(table1.shape)
table1.head()
Out[4]:
In [16]:
# Load the cluster assignments as a Series of `cluster_id` indexed by `barcode`.
# The file has no header row, so the two column names are supplied explicitly.
# The reader's `squeeze=True` keyword was deprecated in pandas 1.4 and removed
# in 2.0; squeezing the one-column frame afterwards yields the same Series.
cluster_identities = pd.read_table(
    'macosko2015/retina_clusteridentities.txt', header=None,
    names=['barcode', 'cluster_id'], index_col=0).squeeze('columns')
print(cluster_identities.shape)
cluster_identities.head()
Out[16]:
In [17]:
# Group the Series by its own values to count how many barcodes carry each cluster id
cluster_identities.groupby(cluster_identities).size()
Out[17]:
In [19]:
# Count how many rows of table1 fall in each cluster. The grouping key is the
# `cluster_identities` Series, which pandas aligns to table1 on the row index.
# `axis=0` was dropped: it is the default, and the `axis` keyword of
# `DataFrame.groupby` is deprecated in pandas 2.1+ (emits a FutureWarning).
cluster_sizes_table1 = table1.groupby(cluster_identities).size()
cluster_sizes_table1
Out[19]:
In [21]:
# Release the scratch variable `cluster_markers_dropna` if an earlier session left
# it behind. No visible cell defines it, so a bare `del` raises NameError on a
# fresh kernel (Restart & Run All); popping with a default is a no-op in that case.
# The trailing `;` keeps the popped object from being displayed as cell output.
globals().pop('cluster_markers_dropna', None);
In [22]:
# Keep only the clusters whose table1 cell count is strictly greater than 100
big_clusters = cluster_sizes_table1.loc[lambda sizes: sizes > 100]
big_clusters
Out[22]:
In [26]:
# Cast the index labels of `big_clusters` (the cluster ids) to int, in place
big_clusters.index = big_clusters.index.astype(int)
big_clusters
Out[26]:
In [28]:
# Flag each barcode whose cluster id is one of the big clusters, then keep only
# the flagged (True) entries; the sum therefore equals the number of kept cells.
cells_in_big_clusters = (
    cluster_identities
    .isin(big_clusters.index)
    .loc[lambda flags: flags]
)
cells_in_big_clusters.sum()
Out[28]:
In [29]:
# Preview the first entries of the big-cluster membership Series
cells_in_big_clusters.head()
Out[29]:
In [107]:
# Inner-align table1 with the big-cluster membership Series along the rows, so
# both results keep only the row labels present in both objects.
table1_big_clusters, y = table1.align(
    other=cells_in_big_clusters, join='inner', axis=0)
print(table1_big_clusters.shape)
print(y.shape)
Grab random cells and make sure the subset has decent cluster representation
In [103]:
# np.random.seed(2017)
# n_cells = 200
# random_cells = np.random.choice(x.index, size=n_cells, replace=False)
# # Perform the subset
# table1_subset_cells = x.loc[random_cells, :]
# print(table1_subset_cells.shape)
# table1_subset_cells.groupby(cluster_identities).size()
In [112]:
# Fix the global NumPy seed so the per-cluster draw below is repeatable
np.random.seed(2017)
n_cells = 50


def _sample_cells(cluster_cells):
    """Return `n_cells` rows drawn without replacement from one cluster's rows."""
    chosen = np.random.choice(cluster_cells.index, size=n_cells, replace=False)
    return cluster_cells.loc[chosen]


# Group the rows by cluster identity and sample the same number from each group
table1_subset_cells = (
    table1_big_clusters
    .groupby(cluster_identities, as_index=False, group_keys=False)
    .apply(_sample_cells)
)
print(table1_subset_cells.shape)
table1_subset_cells.head()
Out[112]:
In [113]:
# Read the marker sheet from the original workbook, skipping the first three rows.
# The keyword is `sheet_name` (pandas >= 0.21); the old spelling `sheetname` was
# deprecated and later removed, so it raises TypeError on current pandas.
cluster_markers = pd.read_excel('macosko2015/mmc4.xlsx',
                                sheet_name='FINAL_MARKERS_FOR_EACH_CLUSTER.',
                                skiprows=3)
print(cluster_markers.shape)
cluster_markers.head()
Out[113]:
In [114]:
# Inspect the dtype pandas inferred for each column of the marker table
cluster_markers.dtypes
Out[114]:
In [115]:
# Look at the last rows of the marker table
cluster_markers.tail()
Out[115]:
In [116]:
# Discard every row that has at least one missing value, then re-check the
# shape and the last rows.
cluster_markers = cluster_markers.dropna(axis=0, how='any')
print(cluster_markers.shape)
cluster_markers.tail()
Out[116]:
In [117]:
# Check the NumPy dtype of the cluster ids stored in the `big_clusters` index
big_clusters.index.values.dtype
Out[117]:
In [118]:
# Count the marker rows for each value of the 'cluster #' column
cluster_markers.groupby('cluster #').size()
Out[118]:
Okay, this file is corrupted, so I had to go in and manually edit it so that the cluster numbers are in the right column.
In [119]:
# Load the `mmc4_v2.xlsx` workbook (first three rows skipped) and give the
# unnamed first column a meaningful name.
cluster_markers = (
    pd.read_excel('macosko2015/mmc4_v2.xlsx', skiprows=3)
    .rename(columns={"Unnamed: 0": 'gene_symbol'})
)
print(cluster_markers.shape)
# Rows containing any NA are all header rows, so drop them
cluster_markers = cluster_markers.dropna()
print(cluster_markers.shape)
# Store the cluster numbers as integers
cluster_markers = cluster_markers.assign(
    **{'cluster #': cluster_markers['cluster #'].astype(int)})
cluster_markers.head()
Out[119]:
In [120]:
# Confirm the column dtypes after the 'cluster #' column was cast to int
cluster_markers.dtypes
Out[120]:
In [1]:
# Count the marker rows for each value of the 'cluster #' column
cluster_markers.groupby('cluster #').size()
In [ ]:
# NOTE(review): unfinished scratch cell, disabled. `DataFrame.groupby()` called
# with neither `by` nor `level` always raises TypeError, which would stop a
# Restart & Run All at this point. Fill in the intended grouping key before
# re-enabling.
# table1.groupby()
In [122]:
# Number of distinct raw gene symbols in the marker table
len(pd.unique(cluster_markers['gene_symbol']))
Out[122]:
In [123]:
# Keep only the first whitespace-delimited token of each gene symbol, then
# count how many distinct cleaned symbols remain.
cleaned_symbols = cluster_markers['gene_symbol'].str.split().str.get(0)
len(pd.unique(cleaned_symbols))
Out[123]:
Assign the cleaned symbols back to the `gene_symbol` column
In [124]:
# Overwrite the 'gene_symbol' column in place with the cleaned symbols
cluster_markers['gene_symbol'] = cleaned_symbols
In [125]:
# Write the cleaned marker table into this notebook's data folder
markers_csv = os.path.join(data_folder, 'cluster_markers_v2.csv')
cluster_markers.to_csv(markers_csv)
In [126]:
# Boolean mask of marker rows whose cluster number is one of the big clusters
rows = cluster_markers.loc[:, 'cluster #'].isin(big_clusters.index.values)
print(rows.sum())
# Keep only the masked rows
cluster_markers_big_clusters = cluster_markers[rows]
print(cluster_markers_big_clusters.shape)
cluster_markers_big_clusters.head()
Out[126]:
In [127]:
# Distinct marker gene symbols across the big clusters, in order of first appearance
gene_subset = pd.unique(cluster_markers_big_clusters['gene_symbol'])
len(gene_subset)
Out[127]:
In [128]:
# Restrict the sampled cells to the marker-gene columns.
# `.loc[:, list]` raises KeyError in pandas >= 1.0 when any requested label is
# absent from the columns. `.reindex(columns=...)` is the documented replacement:
# it returns the same frame when every gene is present and yields an all-NaN
# column for a missing one, which is what older pandas did for `.loc`.
table1_subset_cells_genes = table1_subset_cells.reindex(columns=gene_subset)
print(table1_subset_cells_genes.shape)
table1_subset_cells_genes.head()
Out[128]:
In [129]:
# Write the downsampled cells-by-marker-genes table into this notebook's data folder
subset_csv = os.path.join(data_folder, 'expression_table1_subset.csv')
table1_subset_cells_genes.to_csv(subset_csv)
In [2]:
In [ ]: