This notebook gives an overview of the RNA-seq quality control data for the MyConnectome project.


In [25]:
import numpy
import pandas as pd
import glob
import seaborn as sns
import matplotlib.pyplot as plt

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply seaborn's "whitegrid" style to every figure drawn after this point.
sns.set(style="whitegrid")

Load the FastQC summary results (already averaged across lanes/directions within each session).


In [52]:
# FastQC per-base summary tables. Each CSV is read with its first column as
# the row index.
# NOTE(review): the figure cell labels rows as sessions and columns as base
# positions -- confirm against whatever produced these files.
PerBaseSeqQuality, PerBaseGCContent, PerBaseNContent = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('PerBaseSeqQuality.csv', 'PerBaseGCContent.csv', 'PerBaseNContent.csv')
)

# rin.txt is read without a header row; its values go into a column named 'rin'.
rin = pd.read_csv('rin.txt', names=['rin'], header=None)

In [54]:
# FastQC / RIN overview on a 4x2 grid: one row per metric, with the
# distribution in the left column and a line plot (x axis labelled
# 'sessions') in the right column.
fig = plt.figure(figsize=(16, 24))

# (table, title) for the three FastQC rows; they share identical plotting code.
fastqc_panels = (
    (PerBaseSeqQuality, 'Per base sequence quality'),
    (PerBaseGCContent, 'Per base GC content'),
    (PerBaseNContent, 'Per base N content'),
)
for panel_row, (qc_table, panel_title) in enumerate(fastqc_panels):
    # Left panel: violin plot of the whole table.
    ax = fig.add_subplot(4, 2, 2 * panel_row + 1)
    sns.violinplot(qc_table, ax=ax)
    ax.set_title(panel_title, fontsize=20)
    ax.set_xlabel('bases', fontsize=16)

    # Right panel: mean over axis 1 (one value per table row), in row order.
    ax = fig.add_subplot(4, 2, 2 * panel_row + 2)
    sns.tsplot(qc_table.mean(1), ax=ax)
    ax.set_xlabel('sessions', fontsize=16)

# Bottom row: distribution of RIN values and RIN in file order.
ax = fig.add_subplot(4, 2, 7)
sns.distplot(rin, ax=ax)
ax.set_title('RNA Integrity Number (RIN)', fontsize=20)
# The x axis of this histogram is the RIN value itself, not a base position.
ax.set_xlabel('RIN', fontsize=16)

ax = fig.add_subplot(4, 2, 8)
sns.tsplot(rin.rin, ax=ax)
ax.set_xlabel('sessions', fontsize=16)


Out[54]:
<matplotlib.text.Text at 0x12b717c10>

Load the alignment metrics from Picard and plot summaries.


In [28]:
# Tab-separated alignment metrics table (one row per sample in the file).
align_metrics = pd.read_csv('alignment_metrics.txt', delimiter='\t')

# Last expression of the cell: summary statistics shown as the cell output.
align_metrics.describe()


Out[28]:
TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES ... MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP
count 48.000000 48.000000 48 48.000000 48.000000 48 4.800000e+01 48.000000 4.800000e+01 4.800000e+01 ... 48 48.000000 48.000000 48.000000 48.000000 48.000000 48 0 0 0
mean 33583646.958333 33583646.958333 1 0.895833 33583646.958333 1 3.391769e+09 32646518.458333 3.297123e+09 3.240662e+09 ... 101 29801006.541667 0.882191 0.125000 0.582078 0.325858 0 NaN NaN NaN
std 9332021.092923 9332021.092923 0 1.171280 9332021.092923 0 9.424790e+08 9050897.665647 9.140865e+08 9.038253e+08 ... 0 8703742.886927 0.042092 0.334219 0.016064 0.045042 0 NaN NaN NaN
min 14922784.000000 14922784.000000 1 0.000000 14922784.000000 1 1.507147e+09 14540301.000000 1.468518e+09 1.419814e+09 ... 101 11950623.000000 0.777227 0.000000 0.551731 0.196175 0 NaN NaN NaN
25% 27398773.500000 27398773.500000 1 0.000000 27398773.500000 1 2.767103e+09 26613832.500000 2.687827e+09 2.657182e+09 ... 101 24105165.000000 0.858091 0.000000 0.569786 0.302256 0 NaN NaN NaN
50% 33852478.500000 33852478.500000 1 0.500000 33852478.500000 1 3.418912e+09 32922774.500000 3.325016e+09 3.264962e+09 ... 101 29948581.500000 0.897850 0.000000 0.581946 0.325823 0 NaN NaN NaN
75% 39473116.750000 39473116.750000 1 1.250000 39473116.750000 1 3.986627e+09 37946961.250000 3.832464e+09 3.766903e+09 ... 101 35343390.000000 0.909056 0.000000 0.593531 0.343582 0 NaN NaN NaN
max 54034420.000000 54034420.000000 1 5.000000 54034420.000000 1 5.457142e+09 52885082.000000 5.341065e+09 5.263497e+09 ... 101 48866545.000000 0.929025 1.000000 0.617913 0.491049 0 NaN NaN NaN

8 rows × 24 columns


In [55]:
# Alignment overview: each row pairs a distribution plot (left) with a line
# plot whose x axis is labelled 'sessions' (right). Only the top two rows of
# the 4x2 grid are used.
fig = plt.figure(figsize=(16, 24))

# PF_HQ_ALIGNED_READS divided by TOTAL_READS; computed once, drawn in both panels.
hq_aligned_share = align_metrics.PF_HQ_ALIGNED_READS / align_metrics.TOTAL_READS

alignment_rows = (
    (align_metrics.TOTAL_READS, 'Total reads'),
    (hq_aligned_share, 'Percentage of high quality alignments'),
)
for row_idx, (row_values, row_title) in enumerate(alignment_rows):
    hist_ax = fig.add_subplot(4, 2, 2 * row_idx + 1)
    sns.distplot(row_values, ax=hist_ax)
    hist_ax.set_title(row_title, fontsize=20)

    trace_ax = fig.add_subplot(4, 2, 2 * row_idx + 2)
    sns.tsplot(row_values, ax=trace_ax)
    sessions_label = trace_ax.set_xlabel('sessions', fontsize=16)

# Echo the last x-label Text object so the cell still ends on that expression.
sessions_label


Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x1225c0390>

Load the RNA-seq metrics from Picard and plot summaries.


In [29]:
# Tab-separated RNA-seq metrics table (one row per sample in the file).
rnaseq_metrics = pd.read_csv('rnaseq_metrics.txt', delimiter='\t')

# Last expression of the cell: summary statistics shown as the cell output.
rnaseq_metrics.describe()


Out[29]:
PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS ... PCT_MRNA_BASES PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP
count 4.800000e+01 4.800000e+01 48 4.800000e+01 4.800000e+01 4.800000e+01 4.800000e+01 48 48 48 ... 48.000000 48.000000 48 48.000000 48.000000 48.000000 48.000000 0 0 0
mean 6.646251e+09 6.645865e+09 0 2.202528e+09 1.522187e+09 1.112084e+09 1.809065e+09 0 0 0 ... 0.554635 0.554603 0 0.783146 0.023094 0.296721 0.091490 NaN NaN NaN
std 1.811126e+09 1.811020e+09 0 7.315647e+08 4.624208e+08 3.928677e+08 4.869813e+08 0 0 0 ... 0.059503 0.059500 0 0.062576 0.006401 0.043320 0.026964 NaN NaN NaN
min 3.049706e+09 3.049552e+09 0 8.932921e+08 6.098608e+08 5.491224e+08 7.005939e+08 0 0 0 ... 0.347725 0.347702 0 0.685367 0.011376 0.216391 0.033854 NaN NaN NaN
25% 5.378171e+09 5.377825e+09 0 1.727695e+09 1.204495e+09 8.238215e+08 1.574658e+09 0 0 0 ... 0.540673 0.540644 0 0.741171 0.018696 0.266579 0.075889 NaN NaN NaN
50% 6.628100e+09 6.627725e+09 0 2.246144e+09 1.525553e+09 1.042155e+09 1.829488e+09 0 0 0 ... 0.564226 0.564189 0 0.766493 0.022549 0.292372 0.092082 NaN NaN NaN
75% 7.963288e+09 7.962802e+09 0 2.726507e+09 1.790935e+09 1.353596e+09 2.137919e+09 0 0 0 ... 0.589496 0.589465 0 0.817122 0.026101 0.318905 0.104573 NaN NaN NaN
max 1.064566e+10 1.064496e+10 0 3.685544e+09 2.430826e+09 2.319200e+09 3.004370e+09 0 0 0 ... 0.631248 0.631209 0 0.941701 0.043929 0.415660 0.162498 NaN NaN NaN

8 rows × 25 columns


In [63]:
# Base-category overview on a 4x2 grid: for each category, the distribution
# of its share of PF_ALIGNED_BASES (left) and the same values as a line plot
# with the x axis labelled 'sessions' (right).
fig = plt.figure(figsize=(16, 24))

# (column in rnaseq_metrics, panel title) for each grid row, top to bottom.
base_category_rows = (
    ('CODING_BASES', 'Percent coding bases'),
    ('UTR_BASES', 'Percent UTR bases'),
    ('INTRONIC_BASES', 'Percent intronic bases'),
    ('INTERGENIC_BASES', 'Percent intergenic bases'),
)
for row_idx, (bases_column, row_title) in enumerate(base_category_rows):
    # Plain ratio of the two columns; it is not multiplied by 100.
    category_share = rnaseq_metrics[bases_column] / rnaseq_metrics.PF_ALIGNED_BASES

    hist_ax = fig.add_subplot(4, 2, 2 * row_idx + 1)
    sns.distplot(category_share, ax=hist_ax)
    hist_ax.set_title(row_title, fontsize=20)

    trace_ax = fig.add_subplot(4, 2, 2 * row_idx + 2)
    sns.tsplot(category_share, ax=trace_ax)
    sessions_label = trace_ax.set_xlabel('sessions', fontsize=16)

# Echo the last x-label Text object so the cell still ends on that expression.
sessions_label


Out[63]:
<matplotlib.text.Text at 0x12d225f50>

In [64]:
# Bias / coverage overview: each row pairs a distribution plot (left) with a
# line plot whose x axis is labelled 'sessions' (right). Only the top three
# rows of the 4x2 grid are used.
fig = plt.figure(figsize=(16, 24))

# (column in rnaseq_metrics, panel title) for each grid row, top to bottom.
bias_rows = (
    ('MEDIAN_3PRIME_BIAS', "Median 3' bias"),
    ('MEDIAN_5PRIME_BIAS', "Median 5' bias"),
    ('MEDIAN_CV_COVERAGE', 'Median CV coverage'),
)
for row_idx, (metric_column, row_title) in enumerate(bias_rows):
    metric_values = rnaseq_metrics[metric_column]

    hist_ax = fig.add_subplot(4, 2, 2 * row_idx + 1)
    sns.distplot(metric_values, ax=hist_ax)
    hist_ax.set_title(row_title, fontsize=20)

    trace_ax = fig.add_subplot(4, 2, 2 * row_idx + 2)
    sns.tsplot(metric_values, ax=trace_ax)
    sessions_label = trace_ax.set_xlabel('sessions', fontsize=16)

# Echo the last x-label Text object so the cell still ends on that expression.
sessions_label


Out[64]:
<matplotlib.text.Text at 0x122d53290>