In [1]:

    
%matplotlib inline



In [2]:

    
import pandas as pd
import numpy as np
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as sklearnPCA
from plotnine import *

Read in expression matrix

mRNA-Seq from 10 individual C.elegans worms. Processed with CEL-Seq-pipeline (https://github.com/eco32i/CEL-Seq-pipeline)



In [3]:

    
!head ../data/CE_exp.umi.tab



In [4]:

    
!tail ../data/CE_exp.umi.tab

Expression matrix contains read counts in genes. Columns are worms rows are genes.



In [5]:

    
ce = pd.read_csv('../data/CE_exp.umi.tab', sep='\t', skipfooter=5)
ce









    



/home/ilya/.venv/pydata3/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  if __name__ == '__main__':






    Out[5]:






  
    
      
      #Sample:
      CE_1.genome
      CE_10.genome
      CE_2.genome
      CE_3.genome
      CE_4.genome
      CE_5.genome
      CE_6.genome
      CE_7.genome
      CE_8.genome
      CE_9.genome
    
  
  
    
      0
      2L52.1
      0
      0
      1
      0
      0
      0
      1
      0
      0
      1
    
    
      1
      2L52.2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      2RSSE.1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      2RSSE.2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      2RSSE.3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      5
      2RSSE.4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      6
      2RSSE.5
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      7
      2RSSE.6
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      8
      2RSSE.7
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      9
      2RSSE.8
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      10
      3R5.1
      15
      23
      22
      34
      23
      20
      21
      22
      37
      26
    
    
      11
      3R5.2
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
    
    
      12
      4R79.1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      13
      4R79.2
      0
      0
      2
      0
      0
      0
      0
      0
      0
      0
    
    
      14
      4R79.4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      15
      4R79.5
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      16
      6R55.2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      17
      AC3.1
      0
      0
      0
      1
      0
      1
      0
      0
      0
      0
    
    
      18
      AC3.10
      0
      0
      1
      0
      0
      0
      0
      0
      0
      1
    
    
      19
      AC3.11
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      20
      AC3.12
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      21
      AC3.13
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      22
      AC3.14
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      23
      AC3.15
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      24
      AC3.16
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      25
      AC3.2
      8
      36
      8
      11
      12
      10
      25
      11
      14
      34
    
    
      26
      AC3.3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      27
      AC3.4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      28
      AC3.5
      18
      65
      27
      33
      23
      18
      54
      23
      40
      52
    
    
      29
      AC3.6
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      45431
      ZK973.9
      108
      264
      200
      260
      129
      168
      293
      138
      266
      283
    
    
      45432
      ZK993.1
      0
      2
      1
      1
      0
      1
      3
      0
      2
      0
    
    
      45433
      ZK993.2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45434
      ZK993.4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45435
      ZK994.1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45436
      ZK994.10
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45437
      ZK994.11
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45438
      ZK994.3
      0
      2
      1
      4
      0
      1
      3
      0
      1
      1
    
    
      45439
      ZK994.6
      1
      1
      1
      0
      0
      0
      2
      0
      2
      3
    
    
      45440
      ZK994.7
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45441
      ZK994.8
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45442
      ZK994.9
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45443
      ZK994.t1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45444
      ZK994.t2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45445
      ZK994.t3
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
    
    
      45446
      cTel17.1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45447
      cTel17.2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45448
      cTel29B.1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45449
      cTel29B.2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45450
      cTel3X.1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45451
      cTel3X.2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45452
      cTel3X.3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45453
      cTel52S.1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45454
      cTel54X.1
      2
      2
      2
      4
      6
      2
      4
      3
      2
      0
    
    
      45455
      cTel55X.1
      42
      61
      62
      66
      40
      29
      73
      43
      74
      96
    
    
      45456
      cTel79B.1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45457
      cTel79B.2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45458
      cTel7X.1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45459
      cTel7X.2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      45460
      cTel7X.3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

45461 rows × 11 columns

PCA is sensitive to variable scaling. Therefore before performing the analysis we need to normalize the data. StandardScaler will transform every variable to unti space (mean 0, variance 1). Note also that sklearn expects columns to be genes (features) and rows to be worms (samples, or observations). Therefore we transpose the matrix before doing anything.



In [ ]:

    
#ce = ce.ix[ce.ix[:,1:].mean(axis=1)>500,:]



In [6]:

    
X_std = StandardScaler().fit_transform(ce.ix[:,1:].values.T)
X_std









    



/home/ilya/.venv/pydata3/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
/home/ilya/.venv/pydata3/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)






    Out[6]:





array([[-0.65465367,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.65465367,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.52752523,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [-0.65465367,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.65465367,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.52752523,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])



In [7]:

    
sklearn_pca = sklearnPCA(n_components=10)
Y_sklearn = sklearn_pca.fit_transform(X_std)
Y_sklearn









    Out[7]:





array([[ -1.07546376e+02,   4.63864340e+00,  -8.96436965e+00,
          1.74959571e+00,  -5.84279506e+00,   3.64185213e+01,
         -4.97689548e+01,   4.14305419e+01,   9.12143621e+00,
         -1.64410248e-13],
       [  7.77641744e+01,  -8.29901734e+01,  -2.03120362e+01,
          6.66481269e+01,   5.10038587e+00,   8.95508016e+00,
          3.46475822e+00,   1.66973996e+00,   7.01552270e-01,
         -1.64410248e-13],
       [ -3.36179150e+01,   9.16017706e+00,   1.63654260e+01,
          2.03123487e+01,   4.29966650e+00,  -7.75515474e+01,
         -3.25176584e+01,  -1.09572684e+01,   1.05471985e+01,
         -1.64410248e-13],
       [  3.76400268e+01,  -1.24952448e+01,  -2.99456479e+01,
         -5.82751506e+01,   7.81762978e+01,  -5.50576813e+00,
          4.93608408e+00,   6.96783833e+00,   1.97547133e+00,
         -1.64410248e-13],
       [ -8.43712970e+01,   7.85590075e+00,   2.77664314e+00,
          8.37739348e+00,  -3.92919991e-01,  -4.24752998e+00,
          1.90791736e+01,   1.57587810e+00,  -6.98659115e+01,
         -1.64410248e-13],
       [ -8.43210890e+01,   1.85271086e+01,   1.77668949e+01,
          1.73906779e+01,  -2.55728861e-01,  -4.09465479e+00,
          5.93272681e+01,   2.24907466e+01,   3.58779136e+01,
         -1.64410248e-13],
       [  7.42276449e+01,  -2.42989805e+01,   9.47912540e+01,
         -3.63997991e+01,  -1.42317994e+01,   1.42164758e+01,
         -4.62953882e+00,  -1.05048650e+00,  -2.07310201e+00,
         -1.64410248e-13],
       [ -8.12814638e+01,   3.42275602e+00,  -1.19913537e+01,
         -4.74455895e+00,  -2.38797453e+00,   3.11294041e+01,
         -3.56876769e+00,  -6.79489740e+01,   1.45785413e+01,
         -1.64410248e-13],
       [  5.42756272e+01,  -8.83970945e+00,  -5.29276284e+01,
         -4.93592923e+01,  -6.96660877e+01,  -1.60469460e+01,
          8.35345972e+00,   6.21332461e+00,   2.11770771e+00,
         -1.64410248e-13],
       [  1.47230668e+02,   8.50195222e+01,  -7.55918226e+00,
          3.43006583e+01,   5.20095538e+00,   1.67269649e+01,
         -4.67582402e+00,  -3.91340607e-01,  -2.98080745e+00,
         -1.64410248e-13]])

Y_sklearn is a numpy array of the shape (num_samples, n_components) where original X data is projected onto the number of extracted principal components

Plot explained variance



In [8]:

    
sklearn_pca.explained_variance_









    Out[8]:





array([  7.11281203e+03,   1.54625101e+03,   1.39685295e+03,
         1.35865661e+03,   1.12790529e+03,   9.19430967e+02,
         7.58026841e+02,   7.05313255e+02,   6.59751045e+02,
         2.70307297e-26])



In [9]:

    
sklearn_pca.explained_variance_ratio_









    Out[9]:





array([  4.56388324e-01,   9.92140529e-02,   8.96280367e-02,
         8.71771966e-02,   7.23712087e-02,   5.89946081e-02,
         4.86382317e-02,   4.52559034e-02,   4.23324379e-02,
         1.73440678e-30])



In [10]:

    
vdf = pd.DataFrame()
vdf['PC'] = [(i+1) for i,x in enumerate(sklearn_pca.explained_variance_ratio_)]
vdf['var'] = sklearn_pca.explained_variance_ratio_

(ggplot(vdf, aes(x='PC', y='var'))
    + geom_point(size=5)
    + ylab('Explained variance')
    + theme(figure_size=(12,10))
)









    












    Out[10]:





<ggplot: (-9223363244110839622)>



In [11]:

    
pca_df = pd.DataFrame()
pca_df['sample'] = ['CE_%i' % (x+1) for x in range(10)]
pca_df['PC1'] = Y_sklearn[:,0]
pca_df['PC2'] = Y_sklearn[:,1]

(ggplot(pca_df, aes(x='PC1', y='PC2', color='sample'))
    + geom_point(size=5)
    + theme(figure_size=(12,10))
)









    












    Out[11]:





<ggplot: (-9223363244110838654)>



In [13]:

    
pca_df = pd.DataFrame()
pca_df['sample'] = ['CE_%i' % (x+1) for x in range(10)]
pca_df['PC1'] = Y_sklearn[:,0]
pca_df['PC3'] = Y_sklearn[:,2]

(ggplot(pca_df, aes(x='PC1', y='PC3', color='sample'))
    + geom_point(size=5)
    + theme(figure_size=(12,10))
)









    












    Out[13]:





<ggplot: (-9223363244110848796)>



In [20]:

    
pca_df = pd.DataFrame()
pca_df['sample'] = ['CE_%i' % (x+1) for x in range(10)]
pca_df['PCA2'] = Y_sklearn[:,1]
pca_df['PCA4'] = Y_sklearn[:,3]

g = ggplot(pca_df, aes(x='PCA2', y='PCA4', color='sample')) \
    + geom_point(size=10)
print(g)









    












    



<ggplot: (-9223363276523880974)>



In [ ]:

	#Sample:	CE_1.genome	CE_10.genome	CE_2.genome	CE_3.genome	CE_4.genome	CE_5.genome	CE_6.genome	CE_7.genome	CE_8.genome	CE_9.genome
0	2L52.1	0	0	1	0	0	0	1	0	0	1
1	2L52.2	0	0	0	0	0	0	0	0	0	0
2	2RSSE.1	0	0	0	0	0	0	0	0	0	0
3	2RSSE.2	0	0	0	0	0	0	0	0	0	0
4	2RSSE.3	0	0	0	0	0	0	0	0	0	0
5	2RSSE.4	0	0	0	0	0	0	0	0	0	0
6	2RSSE.5	0	0	0	0	0	0	0	0	0	0
7	2RSSE.6	0	0	0	0	0	0	0	0	0	0
8	2RSSE.7	0	0	0	0	0	0	0	0	0	0
9	2RSSE.8	0	0	0	0	0	0	0	0	0	0
10	3R5.1	15	23	22	34	23	20	21	22	37	26
11	3R5.2	0	0	0	0	0	0	1	0	0	0
12	4R79.1	0	0	0	0	0	0	0	0	0	0
13	4R79.2	0	0	2	0	0	0	0	0	0	0
14	4R79.4	0	0	0	0	0	0	0	0	0	0
15	4R79.5	0	0	0	0	0	0	0	0	0	0
16	6R55.2	0	0	0	0	0	0	0	0	0	0
17	AC3.1	0	0	0	1	0	1	0	0	0	0
18	AC3.10	0	0	1	0	0	0	0	0	0	1
19	AC3.11	0	0	0	0	0	0	0	0	0	0
20	AC3.12	0	0	0	0	0	0	0	0	0	0
21	AC3.13	0	1	0	0	0	0	0	0	0	0
22	AC3.14	0	0	0	0	0	0	0	0	0	0
23	AC3.15	0	0	0	0	0	0	0	0	0	0
24	AC3.16	0	0	0	0	0	0	0	0	0	0
25	AC3.2	8	36	8	11	12	10	25	11	14	34
26	AC3.3	0	0	0	0	0	0	0	0	0	0
27	AC3.4	0	0	0	0	0	0	0	0	0	0
28	AC3.5	18	65	27	33	23	18	54	23	40	52
29	AC3.6	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...
45431	ZK973.9	108	264	200	260	129	168	293	138	266	283
45432	ZK993.1	0	2	1	1	0	1	3	0	2	0
45433	ZK993.2	0	0	0	0	0	0	0	0	0	0
45434	ZK993.4	0	0	0	0	0	0	0	0	0	0
45435	ZK994.1	1	0	0	0	0	0	0	0	0	0
45436	ZK994.10	0	0	0	0	0	0	0	0	0	0
45437	ZK994.11	0	0	0	0	0	0	0	0	0	0
45438	ZK994.3	0	2	1	4	0	1	3	0	1	1
45439	ZK994.6	1	1	1	0	0	0	2	0	2	3
45440	ZK994.7	0	0	0	0	0	0	0	0	0	0
45441	ZK994.8	0	0	0	0	0	0	0	0	0	0
45442	ZK994.9	0	0	0	0	0	0	0	0	0	0
45443	ZK994.t1	0	0	0	0	0	0	0	0	0	0
45444	ZK994.t2	0	0	0	0	0	0	0	0	0	0
45445	ZK994.t3	0	0	0	0	1	0	0	0	0	0
45446	cTel17.1	0	0	0	0	0	0	0	0	0	0
45447	cTel17.2	0	0	0	0	0	0	0	0	0	0
45448	cTel29B.1	0	0	0	0	0	0	0	0	0	0
45449	cTel29B.2	0	0	0	0	0	0	0	0	0	0
45450	cTel3X.1	0	0	0	0	0	0	0	0	0	0
45451	cTel3X.2	0	0	0	0	0	0	0	0	0	0
45452	cTel3X.3	0	0	0	0	0	0	0	0	0	0
45453	cTel52S.1	0	0	0	0	0	0	0	0	0	0
45454	cTel54X.1	2	2	2	4	6	2	4	3	2	0
45455	cTel55X.1	42	61	62	66	40	29	73	43	74	96
45456	cTel79B.1	0	0	0	0	0	0	0	0	0	0
45457	cTel79B.2	0	0	0	0	0	0	0	0	0	0
45458	cTel7X.1	0	0	0	0	0	0	0	0	0	0
45459	cTel7X.2	0	0	0	0	0	0	0	0	0	0
45460	cTel7X.3	0	0	0	0	0	0	0	0	0	0