This notebook gives an overview of the RNA-seq quality control data for the MyConnectome project.



In [25]:

    
import numpy
import pandas as pd
import glob
import seaborn as sns
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Use seaborn's "whitegrid" style for the figures drawn below.
sns.set(style="whitegrid")

Load the FastQC summary results (averaged across lanes/directions within each session).



In [52]:

    
# FastQC per-base summary tables. Each CSV is read with its first column as
# the row index; the plotting cells label the columns as bases and the rows
# as sessions.
PerBaseSeqQuality, PerBaseGCContent, PerBaseNContent = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'PerBaseSeqQuality.csv',
        'PerBaseGCContent.csv',
        'PerBaseNContent.csv',
    )
)

# rin.txt has no header row, so the single column is named 'rin' on load.
rin = pd.read_csv('rin.txt', names=['rin'], header=None)



In [54]:

    
# FastQC / RIN overview figure: one row per metric, with the distribution in
# the left column and the per-row (session) trace in the right column.
fig=plt.figure(figsize=(16,24))

# (table, title) pairs for the three per-base FastQC summaries.
fastqc_panels=[
    (PerBaseSeqQuality,'Per base sequence quality'),
    (PerBaseGCContent,'Per base GC content'),
    (PerBaseNContent,'Per base N content'),
]
for row,(table,title) in enumerate(fastqc_panels):
    # Left column: violin plot of the table, x axis labelled as bases.
    ax=fig.add_subplot(4,2,2*row+1)
    sns.violinplot(table,ax=ax)
    ax.set_title(title,fontsize=20)
    ax.set_xlabel('bases',fontsize=16)
    # Right column: row means (mean over axis 1) plotted by row position.
    # Drawn with matplotlib directly because sns.tsplot is no longer
    # available in current seaborn releases.
    ax=fig.add_subplot(4,2,2*row+2)
    ax.plot(table.mean(1).values)
    ax.set_xlabel('sessions',fontsize=16)

# Bottom row: RIN distribution and RIN by row position.
ax=fig.add_subplot(4,2,7)
sns.distplot(rin,ax=ax)
ax.set_title('RNA Integrity Number (RIN)',fontsize=20)
# The x axis of this panel is the RIN value itself, not a base position.
ax.set_xlabel('RIN',fontsize=16)
ax=fig.add_subplot(4,2,8)
ax.plot(rin.rin.values)
ax.set_xlabel('sessions',fontsize=16)









    Out[54]:





<matplotlib.text.Text at 0x12b717c10>

Load the alignment metrics from Picard and plot summaries.



In [28]:

    
# Picard alignment metrics, read from a tab-separated text file.
align_metrics = pd.read_csv(
    'alignment_metrics.txt',
    sep='\t',
)
# Last expression of the cell: display the summary-statistics table.
align_metrics.describe()









    Out[28]:






  
    
      
      TOTAL_READS
      PF_READS
      PCT_PF_READS
      PF_NOISE_READS
      PF_READS_ALIGNED
      PCT_PF_READS_ALIGNED
      PF_ALIGNED_BASES
      PF_HQ_ALIGNED_READS
      PF_HQ_ALIGNED_BASES
      PF_HQ_ALIGNED_Q20_BASES
      ...
      MEAN_READ_LENGTH
      READS_ALIGNED_IN_PAIRS
      PCT_READS_ALIGNED_IN_PAIRS
      BAD_CYCLES
      STRAND_BALANCE
      PCT_CHIMERAS
      PCT_ADAPTER
      SAMPLE
      LIBRARY
      READ_GROUP
    
  
  
    
      count
             48.000000
             48.000000
       48
       48.000000
             48.000000
       48
       4.800000e+01
             48.000000
       4.800000e+01
       4.800000e+01
      ...
        48
             48.000000
       48.000000
       48.000000
       48.000000
       48.000000
       48
        0
        0
        0
    
    
      mean
       33583646.958333
       33583646.958333
        1
        0.895833
       33583646.958333
        1
       3.391769e+09
       32646518.458333
       3.297123e+09
       3.240662e+09
      ...
       101
       29801006.541667
        0.882191
        0.125000
        0.582078
        0.325858
        0
      NaN
      NaN
      NaN
    
    
      std
        9332021.092923
        9332021.092923
        0
        1.171280
        9332021.092923
        0
       9.424790e+08
        9050897.665647
       9.140865e+08
       9.038253e+08
      ...
         0
        8703742.886927
        0.042092
        0.334219
        0.016064
        0.045042
        0
      NaN
      NaN
      NaN
    
    
      min
       14922784.000000
       14922784.000000
        1
        0.000000
       14922784.000000
        1
       1.507147e+09
       14540301.000000
       1.468518e+09
       1.419814e+09
      ...
       101
       11950623.000000
        0.777227
        0.000000
        0.551731
        0.196175
        0
      NaN
      NaN
      NaN
    
    
      25%
       27398773.500000
       27398773.500000
        1
        0.000000
       27398773.500000
        1
       2.767103e+09
       26613832.500000
       2.687827e+09
       2.657182e+09
      ...
       101
       24105165.000000
        0.858091
        0.000000
        0.569786
        0.302256
        0
      NaN
      NaN
      NaN
    
    
      50%
       33852478.500000
       33852478.500000
        1
        0.500000
       33852478.500000
        1
       3.418912e+09
       32922774.500000
       3.325016e+09
       3.264962e+09
      ...
       101
       29948581.500000
        0.897850
        0.000000
        0.581946
        0.325823
        0
      NaN
      NaN
      NaN
    
    
      75%
       39473116.750000
       39473116.750000
        1
        1.250000
       39473116.750000
        1
       3.986627e+09
       37946961.250000
       3.832464e+09
       3.766903e+09
      ...
       101
       35343390.000000
        0.909056
        0.000000
        0.593531
        0.343582
        0
      NaN
      NaN
      NaN
    
    
      max
       54034420.000000
       54034420.000000
        1
        5.000000
       54034420.000000
        1
       5.457142e+09
       52885082.000000
       5.341065e+09
       5.263497e+09
      ...
       101
       48866545.000000
        0.929025
        1.000000
        0.617913
        0.491049
        0
      NaN
      NaN
      NaN
    
  

8 rows × 24 columns



In [55]:

    
# Read-depth and alignment-quality figure from the Picard alignment metrics:
# distribution in the left column, trace by row position in the right column.

# Ratio of high-quality aligned reads to total reads, computed once and
# reused for both panels. It is plotted as a 0-1 fraction (not scaled by 100).
hq_fraction=align_metrics.PF_HQ_ALIGNED_READS/align_metrics.TOTAL_READS
align_panels=[
    (align_metrics.TOTAL_READS,'Total reads'),
    (hq_fraction,'Percentage of high quality alignments'),
]

# Two metrics -> 2x2 grid. A 16x12 in figure gives each panel roughly 8x6 in
# without leaving empty rows at the bottom of the figure.
fig=plt.figure(figsize=(16,12))
for row,(values,title) in enumerate(align_panels):
    ax=fig.add_subplot(2,2,2*row+1)
    sns.distplot(values,ax=ax)
    ax.set_title(title,fontsize=20)
    # Plain matplotlib line over row position; sns.tsplot is no longer
    # available in current seaborn releases.
    ax=fig.add_subplot(2,2,2*row+2)
    ax.plot(values.values)
    ax.set_xlabel('sessions',fontsize=16)









    Out[55]:





<matplotlib.axes._subplots.AxesSubplot at 0x1225c0390>

Load the RNA-seq metrics from Picard and plot summaries.



In [29]:

    
# Picard RNA-seq metrics, read from a tab-separated text file.
rnaseq_metrics = pd.read_csv(
    'rnaseq_metrics.txt',
    sep='\t',
)
# Last expression of the cell: display the summary-statistics table.
rnaseq_metrics.describe()









    Out[29]:






  
    
      
      PF_BASES
      PF_ALIGNED_BASES
      RIBOSOMAL_BASES
      CODING_BASES
      UTR_BASES
      INTRONIC_BASES
      INTERGENIC_BASES
      IGNORED_READS
      CORRECT_STRAND_READS
      INCORRECT_STRAND_READS
      ...
      PCT_MRNA_BASES
      PCT_USABLE_BASES
      PCT_CORRECT_STRAND_READS
      MEDIAN_CV_COVERAGE
      MEDIAN_5PRIME_BIAS
      MEDIAN_3PRIME_BIAS
      MEDIAN_5PRIME_TO_3PRIME_BIAS
      SAMPLE
      LIBRARY
      READ_GROUP
    
  
  
    
      count
       4.800000e+01
       4.800000e+01
       48
       4.800000e+01
       4.800000e+01
       4.800000e+01
       4.800000e+01
       48
       48
       48
      ...
       48.000000
       48.000000
       48
       48.000000
       48.000000
       48.000000
       48.000000
        0
        0
        0
    
    
      mean
       6.646251e+09
       6.645865e+09
        0
       2.202528e+09
       1.522187e+09
       1.112084e+09
       1.809065e+09
        0
        0
        0
      ...
        0.554635
        0.554603
        0
        0.783146
        0.023094
        0.296721
        0.091490
      NaN
      NaN
      NaN
    
    
      std
       1.811126e+09
       1.811020e+09
        0
       7.315647e+08
       4.624208e+08
       3.928677e+08
       4.869813e+08
        0
        0
        0
      ...
        0.059503
        0.059500
        0
        0.062576
        0.006401
        0.043320
        0.026964
      NaN
      NaN
      NaN
    
    
      min
       3.049706e+09
       3.049552e+09
        0
       8.932921e+08
       6.098608e+08
       5.491224e+08
       7.005939e+08
        0
        0
        0
      ...
        0.347725
        0.347702
        0
        0.685367
        0.011376
        0.216391
        0.033854
      NaN
      NaN
      NaN
    
    
      25%
       5.378171e+09
       5.377825e+09
        0
       1.727695e+09
       1.204495e+09
       8.238215e+08
       1.574658e+09
        0
        0
        0
      ...
        0.540673
        0.540644
        0
        0.741171
        0.018696
        0.266579
        0.075889
      NaN
      NaN
      NaN
    
    
      50%
       6.628100e+09
       6.627725e+09
        0
       2.246144e+09
       1.525553e+09
       1.042155e+09
       1.829488e+09
        0
        0
        0
      ...
        0.564226
        0.564189
        0
        0.766493
        0.022549
        0.292372
        0.092082
      NaN
      NaN
      NaN
    
    
      75%
       7.963288e+09
       7.962802e+09
        0
       2.726507e+09
       1.790935e+09
       1.353596e+09
       2.137919e+09
        0
        0
        0
      ...
        0.589496
        0.589465
        0
        0.817122
        0.026101
        0.318905
        0.104573
      NaN
      NaN
      NaN
    
    
      max
       1.064566e+10
       1.064496e+10
        0
       3.685544e+09
       2.430826e+09
       2.319200e+09
       3.004370e+09
        0
        0
        0
      ...
        0.631248
        0.631209
        0
        0.941701
        0.043929
        0.415660
        0.162498
      NaN
      NaN
      NaN
    
  

8 rows × 25 columns



In [63]:

    
# Base-composition figure: each base category as a share of PF_ALIGNED_BASES,
# with the distribution in the left column and the trace by row position in
# the right column.
fig=plt.figure(figsize=(16,24))

# (column in rnaseq_metrics, panel title) for each base category.
base_panels=[
    ('CODING_BASES','Percent coding bases'),
    ('UTR_BASES','Percent UTR bases'),
    ('INTRONIC_BASES','Percent intronic bases'),
    ('INTERGENIC_BASES','Percent intergenic bases'),
]
for row,(column,title) in enumerate(base_panels):
    # Computed once per category and reused for both panels. The values are
    # 0-1 fractions (the ratio is not scaled by 100).
    fraction=rnaseq_metrics[column]/rnaseq_metrics.PF_ALIGNED_BASES
    ax=fig.add_subplot(4,2,2*row+1)
    sns.distplot(fraction,ax=ax)
    ax.set_title(title,fontsize=20)
    # Plain matplotlib line over row position; sns.tsplot is no longer
    # available in current seaborn releases.
    ax=fig.add_subplot(4,2,2*row+2)
    ax.plot(fraction.values)
    ax.set_xlabel('sessions',fontsize=16)









    Out[63]:





<matplotlib.text.Text at 0x12d225f50>



In [64]:

    
# Coverage-bias figure from the Picard RNA-seq metrics: distribution in the
# left column, trace by row position in the right column.

# (column in rnaseq_metrics, panel title) for each metric.
bias_panels=[
    ('MEDIAN_3PRIME_BIAS',"Median 3' bias"),
    ('MEDIAN_5PRIME_BIAS',"Median 5' bias"),
    ('MEDIAN_CV_COVERAGE','Median CV coverage'),
]

# Three metrics -> 3x2 grid. A 16x18 in figure gives each panel roughly
# 8x6 in without leaving an empty row at the bottom of the figure.
fig=plt.figure(figsize=(16,18))
for row,(column,title) in enumerate(bias_panels):
    values=rnaseq_metrics[column]
    ax=fig.add_subplot(3,2,2*row+1)
    sns.distplot(values,ax=ax)
    ax.set_title(title,fontsize=20)
    # Plain matplotlib line over row position; sns.tsplot is no longer
    # available in current seaborn releases.
    ax=fig.add_subplot(3,2,2*row+2)
    ax.plot(values.values)
    ax.set_xlabel('sessions',fontsize=16)









    Out[64]:





<matplotlib.text.Text at 0x122d53290>

	TOTAL_READS	PF_READS	PCT_PF_READS	PF_NOISE_READS	PF_READS_ALIGNED	PCT_PF_READS_ALIGNED	PF_ALIGNED_BASES	PF_HQ_ALIGNED_READS	PF_HQ_ALIGNED_BASES	PF_HQ_ALIGNED_Q20_BASES	...	MEAN_READ_LENGTH	READS_ALIGNED_IN_PAIRS	PCT_READS_ALIGNED_IN_PAIRS	BAD_CYCLES	STRAND_BALANCE	PCT_CHIMERAS	PCT_ADAPTER	SAMPLE	LIBRARY	READ_GROUP
count	48.000000	48.000000	48	48.000000	48.000000	48	4.800000e+01	48.000000	4.800000e+01	4.800000e+01	...	48	48.000000	48.000000	48.000000	48.000000	48.000000	48	0	0	0
mean	33583646.958333	33583646.958333	1	0.895833	33583646.958333	1	3.391769e+09	32646518.458333	3.297123e+09	3.240662e+09	...	101	29801006.541667	0.882191	0.125000	0.582078	0.325858	0	NaN	NaN	NaN
std	9332021.092923	9332021.092923	0	1.171280	9332021.092923	0	9.424790e+08	9050897.665647	9.140865e+08	9.038253e+08	...	0	8703742.886927	0.042092	0.334219	0.016064	0.045042	0	NaN	NaN	NaN
min	14922784.000000	14922784.000000	1	0.000000	14922784.000000	1	1.507147e+09	14540301.000000	1.468518e+09	1.419814e+09	...	101	11950623.000000	0.777227	0.000000	0.551731	0.196175	0	NaN	NaN	NaN
25%	27398773.500000	27398773.500000	1	0.000000	27398773.500000	1	2.767103e+09	26613832.500000	2.687827e+09	2.657182e+09	...	101	24105165.000000	0.858091	0.000000	0.569786	0.302256	0	NaN	NaN	NaN
50%	33852478.500000	33852478.500000	1	0.500000	33852478.500000	1	3.418912e+09	32922774.500000	3.325016e+09	3.264962e+09	...	101	29948581.500000	0.897850	0.000000	0.581946	0.325823	0	NaN	NaN	NaN
75%	39473116.750000	39473116.750000	1	1.250000	39473116.750000	1	3.986627e+09	37946961.250000	3.832464e+09	3.766903e+09	...	101	35343390.000000	0.909056	0.000000	0.593531	0.343582	0	NaN	NaN	NaN
max	54034420.000000	54034420.000000	1	5.000000	54034420.000000	1	5.457142e+09	52885082.000000	5.341065e+09	5.263497e+09	...	101	48866545.000000	0.929025	1.000000	0.617913	0.491049	0	NaN	NaN	NaN