In [1]:
import os

# Root of the current user's home directory; every project path hangs off it.
HOME = os.path.expanduser('~')

# Project-wide data and figure directories (absolute paths).
DATA_FOLDER = os.path.abspath(
    os.path.join(HOME, 'projects', 'cshl-singlecell-2017', 'data'))
FIGURE_FOLDER = os.path.abspath(
    os.path.join(HOME, 'projects', 'cshl-singlecell-2017', 'figures'))


# Per-notebook output locations, named after this notebook.
notebook_name = '50_Example_workflow_reanalyzing_macosko2015'
data_folder = os.path.join(DATA_FOLDER, notebook_name)
figure_folder = os.path.join(FIGURE_FOLDER, notebook_name)

# Sibling data folder; presumably holds the output of an upstream
# '91_filter_genes' notebook -- TODO confirm where it is consumed.
input_folder = os.path.join(DATA_FOLDER, '91_filter_genes')

# Create the output folders in pure Python rather than via `! mkdir -p $var`:
# the shell form breaks on paths containing spaces and is not portable to
# every platform. `exist_ok=True` keeps the cell safe to re-run.
os.makedirs(data_folder, exist_ok=True)
os.makedirs(figure_folder, exist_ok=True)

In [3]:
from sklearn.decomposition import PCA, FastICA, NMF
from sklearn.manifold import TSNE, MDS

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import phenograph

import macosko2015
%matplotlib inline

In [5]:
# Load the three tables returned by the project's macosko2015 helper.
counts, cells, genes = macosko2015.load_differential_clusters()

# Report the (rows, columns) shape of each table, one line per table.
for label, table in (('counts.shape', counts),
                     ('cells.shape', cells),
                     ('genes.shape', genes)):
    print(label, table.shape)


counts.shape (6020, 1339)
cells.shape (6020, 5)
genes.shape (1339, 39)

In [6]:
# Preview the first five rows of the counts table (one row per cell
# barcode, one column per gene symbol).
counts.head(n=5)


Out[6]:
1500015O10RIK 1500016L03RIK 1700025G04RIK 1810009A15RIK 1810037I17RIK 2010107E04RIK 2410066E13RIK 2610017I09RIK 2900011O08RIK 4632428N05RIK ... ZFHX4 ZFP36 ZFP365 ZFP36L1 ZFP804A ZIC1 ZIC4 ZMAT4 ZWINT ZYX
r1_GGCCGCAGTCCG 0 0 5 4 7 28 5 0 33 0 ... 3 0 7 0 0 0 0 7 53 0
r1_CTTGTGCGGGAA 0 0 9 4 4 33 4 0 43 0 ... 8 0 4 0 0 0 0 5 65 1
r1_GCGCAACTGCTC 0 0 11 1 4 26 2 0 30 0 ... 0 0 2 0 0 0 0 4 38 0
r1_GATTGGGAGGCA 0 0 8 2 5 14 6 0 20 0 ... 1 0 1 0 0 1 0 3 17 0
r1_GTGCCGCCTCTC 0 0 1 19 1 13 0 0 0 0 ... 0 0 3 0 0 0 0 0 3 0

5 rows × 1339 columns


In [8]:
# Preview the first five rows of the gene table (one row per gene symbol,
# one boolean column per cluster).
genes.head(n=5)


Out[8]:
cluster_01 cluster_02 cluster_03 cluster_04 cluster_05 cluster_06 cluster_07 cluster_08 cluster_09 cluster_10 ... cluster_30 cluster_31 cluster_32 cluster_33 cluster_34 cluster_35 cluster_36 cluster_37 cluster_38 cluster_39
1500015O10RIK False False False False False False False False False False ... False False False False False True False False False False
1500016L03RIK True False False False False False False False False False ... False False False False False False False False False False
1700025G04RIK False False True False False False False False False False ... False False False False False False False False False False
1810009A15RIK False False False False False False False False False False ... False False False False False False False False True False
1810037I17RIK False False False False False False False False False False ... False False False False False True False False False False

5 rows × 39 columns


In [9]:
# Preview the first five rows of the per-cell metadata (cluster id and
# cell type labels, indexed by cell barcode).
cells.head(n=5)


Out[9]:
cluster_id celltype cluster_n cluster_n_celltype cluster_celltype_with_id
r1_GGCCGCAGTCCG cluster_02 Retinal ganglion cells 2 #2 (Retinal ganglion cells) Retinal ganglion cells (cluster_02)
r1_CTTGTGCGGGAA cluster_02 Retinal ganglion cells 2 #2 (Retinal ganglion cells) Retinal ganglion cells (cluster_02)
r1_GCGCAACTGCTC cluster_02 Retinal ganglion cells 2 #2 (Retinal ganglion cells) Retinal ganglion cells (cluster_02)
r1_GATTGGGAGGCA cluster_02 Retinal ganglion cells 2 #2 (Retinal ganglion cells) Retinal ganglion cells (cluster_02)
r1_GTGCCGCCTCTC cluster_25 Cones 25 #25 (Cones) Cones (cluster_25)

New analysis starts here: reduce the counts with PCA, then embed the principal components with t-SNE.


In [10]:
# PCA model that keeps the top 15 principal components.
# `random_state` is pinned so results are reproducible across kernel
# restarts: scikit-learn's default `svd_solver='auto'` may select the
# randomized SVD solver for a matrix of this size, which is stochastic.
pcaer = PCA(n_components=15, random_state=0)

In [11]:
# Fit the PCA model on the counts table and project every row onto the
# fitted components in a single call.
pcad = pcaer.fit_transform(counts)
# Echo the projected array as the cell output.
pcad


Out[11]:
array([[  5.46385143e+02,   3.61032065e+02,   1.31796312e+02, ...,
          8.38057605e+00,  -4.03285725e-01,  -1.31660168e+01],
       [  4.86800651e+02,   3.27605853e+02,   1.75593152e+02, ...,
         -4.40190141e+01,   1.90795060e+01,  -1.68542990e+01],
       [  4.40246012e+02,   2.88133238e+02,   1.01042238e+02, ...,
          4.01767899e+01,  -3.15857548e+01,  -1.29648300e-01],
       ..., 
       [ -3.31366005e+00,  -1.46143131e+01,   4.10197139e+00, ...,
          5.96139569e-01,  -9.32569512e-02,  -3.44633097e-01],
       [ -5.11851428e+00,  -1.22843269e+01,   3.83111549e+00, ...,
         -8.87281469e-01,  -4.48869722e-01,   1.65518590e+00],
       [ -9.61988144e+00,  -9.88434613e-01,   3.20068012e+00, ...,
          3.51019094e-01,   1.99771531e+00,  -1.36430659e+00]])

In [12]:
# Shape of the PCA projection: (number of rows in counts, number of components).
pcad.shape


Out[12]:
(6020, 15)

In [13]:
# Wrap the PCA output in a DataFrame so each row keeps the index label of
# the corresponding row in `counts`; columns stay as default integers.
pcad_df = pd.DataFrame(data=pcad, index=counts.index)

# Report the (rows, columns) shape, then preview the first five rows.
print(pcad_df.shape)
pcad_df.head(n=5)


(6020, 15)
Out[13]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
r1_GGCCGCAGTCCG 546.385143 361.032065 131.796312 -64.649849 191.113302 30.551624 22.844912 124.082877 32.257848 21.242494 59.163781 -65.547015 8.380576 -0.403286 -13.166017
r1_CTTGTGCGGGAA 486.800651 327.605853 175.593152 -63.951792 89.795230 1.100060 0.223203 -73.949596 -48.630008 -16.423170 -4.778505 3.567647 -44.019014 19.079506 -16.854299
r1_GCGCAACTGCTC 440.246012 288.133238 101.042238 -48.536213 178.394218 24.691353 43.541390 48.282533 13.902831 5.559776 8.546234 -19.202391 40.176790 -31.585755 -0.129648
r1_GATTGGGAGGCA 315.786259 196.251963 82.756895 -38.942127 56.502276 8.040594 -5.991658 -66.179184 -11.675624 -8.598888 -12.141604 24.230639 -3.149860 10.407659 2.943108
r1_GTGCCGCCTCTC -217.350470 453.545334 -46.417528 46.860288 -19.148056 24.754838 3.674198 3.078475 -16.366093 28.454098 -1.383903 10.201579 26.241438 4.163889 -29.343265

In [16]:
%%time

smusher = TSNE()
tsned = smusher.fit_transform(pcad_df)
print(tsned.shape)
tsned


(6020, 2)
CPU times: user 1min 12s, sys: 20.8 s, total: 1min 32s
Wall time: 1min 32s

In [ ]: