In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as sklearnPCA
from plotnine import *

Read in expression matrix

mRNA-Seq from 10 individual C.elegans worms. Processed with CEL-Seq-pipeline (https://github.com/eco32i/CEL-Seq-pipeline)


In [3]:
!head ../data/CE_exp.umi.tab












In [4]:
!tail ../data/CE_exp.umi.tab











Expression matrix contains read counts in genes. Columns are worms rows are genes.


In [5]:
ce = pd.read_csv('../data/CE_exp.umi.tab', sep='\t', skipfooter=5)
ce


/home/ilya/.venv/pydata3/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  if __name__ == '__main__':
Out[5]:
#Sample: CE_1.genome CE_10.genome CE_2.genome CE_3.genome CE_4.genome CE_5.genome CE_6.genome CE_7.genome CE_8.genome CE_9.genome
0 2L52.1 0 0 1 0 0 0 1 0 0 1
1 2L52.2 0 0 0 0 0 0 0 0 0 0
2 2RSSE.1 0 0 0 0 0 0 0 0 0 0
3 2RSSE.2 0 0 0 0 0 0 0 0 0 0
4 2RSSE.3 0 0 0 0 0 0 0 0 0 0
5 2RSSE.4 0 0 0 0 0 0 0 0 0 0
6 2RSSE.5 0 0 0 0 0 0 0 0 0 0
7 2RSSE.6 0 0 0 0 0 0 0 0 0 0
8 2RSSE.7 0 0 0 0 0 0 0 0 0 0
9 2RSSE.8 0 0 0 0 0 0 0 0 0 0
10 3R5.1 15 23 22 34 23 20 21 22 37 26
11 3R5.2 0 0 0 0 0 0 1 0 0 0
12 4R79.1 0 0 0 0 0 0 0 0 0 0
13 4R79.2 0 0 2 0 0 0 0 0 0 0
14 4R79.4 0 0 0 0 0 0 0 0 0 0
15 4R79.5 0 0 0 0 0 0 0 0 0 0
16 6R55.2 0 0 0 0 0 0 0 0 0 0
17 AC3.1 0 0 0 1 0 1 0 0 0 0
18 AC3.10 0 0 1 0 0 0 0 0 0 1
19 AC3.11 0 0 0 0 0 0 0 0 0 0
20 AC3.12 0 0 0 0 0 0 0 0 0 0
21 AC3.13 0 1 0 0 0 0 0 0 0 0
22 AC3.14 0 0 0 0 0 0 0 0 0 0
23 AC3.15 0 0 0 0 0 0 0 0 0 0
24 AC3.16 0 0 0 0 0 0 0 0 0 0
25 AC3.2 8 36 8 11 12 10 25 11 14 34
26 AC3.3 0 0 0 0 0 0 0 0 0 0
27 AC3.4 0 0 0 0 0 0 0 0 0 0
28 AC3.5 18 65 27 33 23 18 54 23 40 52
29 AC3.6 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ...
45431 ZK973.9 108 264 200 260 129 168 293 138 266 283
45432 ZK993.1 0 2 1 1 0 1 3 0 2 0
45433 ZK993.2 0 0 0 0 0 0 0 0 0 0
45434 ZK993.4 0 0 0 0 0 0 0 0 0 0
45435 ZK994.1 1 0 0 0 0 0 0 0 0 0
45436 ZK994.10 0 0 0 0 0 0 0 0 0 0
45437 ZK994.11 0 0 0 0 0 0 0 0 0 0
45438 ZK994.3 0 2 1 4 0 1 3 0 1 1
45439 ZK994.6 1 1 1 0 0 0 2 0 2 3
45440 ZK994.7 0 0 0 0 0 0 0 0 0 0
45441 ZK994.8 0 0 0 0 0 0 0 0 0 0
45442 ZK994.9 0 0 0 0 0 0 0 0 0 0
45443 ZK994.t1 0 0 0 0 0 0 0 0 0 0
45444 ZK994.t2 0 0 0 0 0 0 0 0 0 0
45445 ZK994.t3 0 0 0 0 1 0 0 0 0 0
45446 cTel17.1 0 0 0 0 0 0 0 0 0 0
45447 cTel17.2 0 0 0 0 0 0 0 0 0 0
45448 cTel29B.1 0 0 0 0 0 0 0 0 0 0
45449 cTel29B.2 0 0 0 0 0 0 0 0 0 0
45450 cTel3X.1 0 0 0 0 0 0 0 0 0 0
45451 cTel3X.2 0 0 0 0 0 0 0 0 0 0
45452 cTel3X.3 0 0 0 0 0 0 0 0 0 0
45453 cTel52S.1 0 0 0 0 0 0 0 0 0 0
45454 cTel54X.1 2 2 2 4 6 2 4 3 2 0
45455 cTel55X.1 42 61 62 66 40 29 73 43 74 96
45456 cTel79B.1 0 0 0 0 0 0 0 0 0 0
45457 cTel79B.2 0 0 0 0 0 0 0 0 0 0
45458 cTel7X.1 0 0 0 0 0 0 0 0 0 0
45459 cTel7X.2 0 0 0 0 0 0 0 0 0 0
45460 cTel7X.3 0 0 0 0 0 0 0 0 0 0

45461 rows × 11 columns

PCA is sensitive to variable scaling. Therefore before performing the analysis we need to normalize the data. StandardScaler will transform every variable to unti space (mean 0, variance 1). Note also that sklearn expects columns to be genes (features) and rows to be worms (samples, or observations). Therefore we transpose the matrix before doing anything.


In [ ]:
#ce = ce.ix[ce.ix[:,1:].mean(axis=1)>500,:]

In [6]:
X_std = StandardScaler().fit_transform(ce.ix[:,1:].values.T)
X_std


/home/ilya/.venv/pydata3/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
/home/ilya/.venv/pydata3/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
Out[6]:
array([[-0.65465367,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.65465367,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.52752523,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [-0.65465367,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.65465367,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.52752523,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [7]:
sklearn_pca = sklearnPCA(n_components=10)
Y_sklearn = sklearn_pca.fit_transform(X_std)
Y_sklearn


Out[7]:
array([[ -1.07546376e+02,   4.63864340e+00,  -8.96436965e+00,
          1.74959571e+00,  -5.84279506e+00,   3.64185213e+01,
         -4.97689548e+01,   4.14305419e+01,   9.12143621e+00,
         -1.64410248e-13],
       [  7.77641744e+01,  -8.29901734e+01,  -2.03120362e+01,
          6.66481269e+01,   5.10038587e+00,   8.95508016e+00,
          3.46475822e+00,   1.66973996e+00,   7.01552270e-01,
         -1.64410248e-13],
       [ -3.36179150e+01,   9.16017706e+00,   1.63654260e+01,
          2.03123487e+01,   4.29966650e+00,  -7.75515474e+01,
         -3.25176584e+01,  -1.09572684e+01,   1.05471985e+01,
         -1.64410248e-13],
       [  3.76400268e+01,  -1.24952448e+01,  -2.99456479e+01,
         -5.82751506e+01,   7.81762978e+01,  -5.50576813e+00,
          4.93608408e+00,   6.96783833e+00,   1.97547133e+00,
         -1.64410248e-13],
       [ -8.43712970e+01,   7.85590075e+00,   2.77664314e+00,
          8.37739348e+00,  -3.92919991e-01,  -4.24752998e+00,
          1.90791736e+01,   1.57587810e+00,  -6.98659115e+01,
         -1.64410248e-13],
       [ -8.43210890e+01,   1.85271086e+01,   1.77668949e+01,
          1.73906779e+01,  -2.55728861e-01,  -4.09465479e+00,
          5.93272681e+01,   2.24907466e+01,   3.58779136e+01,
         -1.64410248e-13],
       [  7.42276449e+01,  -2.42989805e+01,   9.47912540e+01,
         -3.63997991e+01,  -1.42317994e+01,   1.42164758e+01,
         -4.62953882e+00,  -1.05048650e+00,  -2.07310201e+00,
         -1.64410248e-13],
       [ -8.12814638e+01,   3.42275602e+00,  -1.19913537e+01,
         -4.74455895e+00,  -2.38797453e+00,   3.11294041e+01,
         -3.56876769e+00,  -6.79489740e+01,   1.45785413e+01,
         -1.64410248e-13],
       [  5.42756272e+01,  -8.83970945e+00,  -5.29276284e+01,
         -4.93592923e+01,  -6.96660877e+01,  -1.60469460e+01,
          8.35345972e+00,   6.21332461e+00,   2.11770771e+00,
         -1.64410248e-13],
       [  1.47230668e+02,   8.50195222e+01,  -7.55918226e+00,
          3.43006583e+01,   5.20095538e+00,   1.67269649e+01,
         -4.67582402e+00,  -3.91340607e-01,  -2.98080745e+00,
         -1.64410248e-13]])

Y_sklearn is a numpy array of the shape (num_samples, n_components) where original X data is projected onto the number of extracted principal components

Plot explained variance


In [8]:
sklearn_pca.explained_variance_


Out[8]:
array([  7.11281203e+03,   1.54625101e+03,   1.39685295e+03,
         1.35865661e+03,   1.12790529e+03,   9.19430967e+02,
         7.58026841e+02,   7.05313255e+02,   6.59751045e+02,
         2.70307297e-26])

In [9]:
sklearn_pca.explained_variance_ratio_


Out[9]:
array([  4.56388324e-01,   9.92140529e-02,   8.96280367e-02,
         8.71771966e-02,   7.23712087e-02,   5.89946081e-02,
         4.86382317e-02,   4.52559034e-02,   4.23324379e-02,
         1.73440678e-30])

In [10]:
vdf = pd.DataFrame()
vdf['PC'] = [(i+1) for i,x in enumerate(sklearn_pca.explained_variance_ratio_)]
vdf['var'] = sklearn_pca.explained_variance_ratio_

(ggplot(vdf, aes(x='PC', y='var'))
    + geom_point(size=5)
    + ylab('Explained variance')
    + theme(figure_size=(12,10))
)


Out[10]:
<ggplot: (-9223363244110839622)>

In [11]:
pca_df = pd.DataFrame()
pca_df['sample'] = ['CE_%i' % (x+1) for x in range(10)]
pca_df['PC1'] = Y_sklearn[:,0]
pca_df['PC2'] = Y_sklearn[:,1]

(ggplot(pca_df, aes(x='PC1', y='PC2', color='sample'))
    + geom_point(size=5)
    + theme(figure_size=(12,10))
)


Out[11]:
<ggplot: (-9223363244110838654)>

In [13]:
pca_df = pd.DataFrame()
pca_df['sample'] = ['CE_%i' % (x+1) for x in range(10)]
pca_df['PC1'] = Y_sklearn[:,0]
pca_df['PC3'] = Y_sklearn[:,2]

(ggplot(pca_df, aes(x='PC1', y='PC3', color='sample'))
    + geom_point(size=5)
    + theme(figure_size=(12,10))
)


Out[13]:
<ggplot: (-9223363244110848796)>

In [20]:
pca_df = pd.DataFrame()
pca_df['sample'] = ['CE_%i' % (x+1) for x in range(10)]
pca_df['PCA2'] = Y_sklearn[:,1]
pca_df['PCA4'] = Y_sklearn[:,3]

g = ggplot(pca_df, aes(x='PCA2', y='PCA4', color='sample')) \
    + geom_point(size=10)
print(g)


<ggplot: (-9223363276523880974)>

In [ ]: