This notebook gives an overview of the RNA-seq quality control data for the MyConnectome project.
In [25]:
import numpy
import pandas as pd
import glob
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "whitegrid" style for every figure drawn below.
sns.set(style="whitegrid")
Load the FastQC summary results (already averaged across lanes/directions within each session).
In [52]:
# FastQC per-base summaries, one CSV per metric; the first column of each file
# is used as the row index.
fastqc_csvs = ('PerBaseSeqQuality.csv', 'PerBaseGCContent.csv', 'PerBaseNContent.csv')
PerBaseSeqQuality, PerBaseGCContent, PerBaseNContent = (
    pd.read_csv(csv_path, index_col=0) for csv_path in fastqc_csvs
)

# rin.txt has no header row, so the column name is supplied explicitly.
rin = pd.read_csv('rin.txt', names=['rin'], header=None)
In [54]:
# One row per QC measure: distribution on the left, per-session trace on the
# right. Explicit axes are used instead of the pyplot "current axes" state so
# each call says exactly which panel it draws on.
fastqc_panels = [
    ('Per base sequence quality', PerBaseSeqQuality),
    ('Per base GC content', PerBaseGCContent),
    ('Per base N content', PerBaseNContent),
]

fig, axes = plt.subplots(4, 2, figsize=(16, 24))

for (ax_dist, ax_sessions), (title, table) in zip(axes, fastqc_panels):
    # Wide-form input: one violin per column of the table. Passing the frame
    # as `data=` (not positionally) keeps this working across seaborn versions.
    sns.violinplot(data=table, ax=ax_dist)
    ax_dist.set_title(title, fontsize=20)
    ax_dist.set_xlabel('bases', fontsize=16)

    # Row-wise mean plotted against row position. A plain matplotlib line
    # replaces sns.tsplot, which is no longer available in current seaborn.
    ax_sessions.plot(table.mean(axis=1).values)
    ax_sessions.set_xlabel('sessions', fontsize=16)

# Last row: RIN distribution and RIN over sessions.
ax_dist, ax_sessions = axes[3]
# NOTE: distplot is deprecated in recent seaborn releases (histplot/displot
# are the replacements); kept here for compatibility with older installs.
sns.distplot(rin.rin, ax=ax_dist)
ax_dist.set_title('RNA Integrity Number (RIN)', fontsize=20)
# The x-axis of this panel is the RIN value itself (it was mislabelled 'bases').
ax_dist.set_xlabel('RIN', fontsize=16)
ax_sessions.plot(rin.rin.values)
ax_sessions.set_xlabel('sessions', fontsize=16);
Out[54]:
Load the alignment metrics from Picard and plot summaries.
In [28]:
# Alignment metrics table, read as tab-separated text.
alignment_metrics_path = 'alignment_metrics.txt'
align_metrics = pd.read_csv(alignment_metrics_path, sep='\t')

# Bare last expression so the notebook renders the summary statistics table.
align_metrics.describe()
Out[28]:
In [55]:
# One row per alignment measure: distribution on the left, per-session trace
# on the right.
total_reads = align_metrics.TOTAL_READS
alignment_panels = [
    ('Total reads', total_reads),
    # Scaled by 100 so the plotted values match the "Percentage" title
    # (the raw ratio is a fraction).
    ('Percentage of high quality alignments',
     100.0 * align_metrics.PF_HQ_ALIGNED_READS / total_reads),
]

# Size the grid to the rows actually drawn (6 in per row, as before) instead
# of allocating an 8-panel grid and leaving half of it empty.
fig, axes = plt.subplots(len(alignment_panels), 2, figsize=(16, 12))

for (ax_dist, ax_sessions), (title, values) in zip(axes, alignment_panels):
    # NOTE: distplot is deprecated in recent seaborn releases (histplot/displot
    # are the replacements); kept here for compatibility with older installs.
    sns.distplot(values, ax=ax_dist)
    ax_dist.set_title(title, fontsize=20)

    # Plain matplotlib line against row position; replaces sns.tsplot, which
    # is no longer available in current seaborn.
    ax_sessions.plot(values.values)
    ax_sessions.set_xlabel('sessions', fontsize=16)
Out[55]:
Load the RNA-seq metrics from Picard and plot summaries.
In [29]:
# RNA-seq metrics table, read as tab-separated text.
rnaseq_metrics_path = 'rnaseq_metrics.txt'
rnaseq_metrics = pd.read_csv(rnaseq_metrics_path, sep='\t')

# Bare last expression so the notebook renders the summary statistics table.
rnaseq_metrics.describe()
Out[29]:
In [63]:
# Share of aligned bases falling in each category. One row per category:
# distribution on the left, per-session trace on the right.
aligned_bases = rnaseq_metrics.PF_ALIGNED_BASES
base_categories = [
    ('Percent coding bases', rnaseq_metrics.CODING_BASES),
    ('Percent UTR bases', rnaseq_metrics.UTR_BASES),
    ('Percent intronic bases', rnaseq_metrics.INTRONIC_BASES),
    ('Percent intergenic bases', rnaseq_metrics.INTERGENIC_BASES),
]

fig, axes = plt.subplots(len(base_categories), 2, figsize=(16, 24))

for (ax_dist, ax_sessions), (title, category_bases) in zip(axes, base_categories):
    # Scaled by 100 so the plotted values match the "Percent ..." titles
    # (the raw ratio is a fraction).
    percent_of_aligned = 100.0 * category_bases / aligned_bases

    # NOTE: distplot is deprecated in recent seaborn releases (histplot/displot
    # are the replacements); kept here for compatibility with older installs.
    sns.distplot(percent_of_aligned, ax=ax_dist)
    ax_dist.set_title(title, fontsize=20)

    # Plain matplotlib line against row position; replaces sns.tsplot, which
    # is no longer available in current seaborn.
    ax_sessions.plot(percent_of_aligned.values)
    ax_sessions.set_xlabel('sessions', fontsize=16)
Out[63]:
In [64]:
# One row per metric column: distribution on the left, per-session trace on
# the right.
bias_panels = [
    ("Median 3' bias", rnaseq_metrics.MEDIAN_3PRIME_BIAS),
    ("Median 5' bias", rnaseq_metrics.MEDIAN_5PRIME_BIAS),
    ('Median CV coverage', rnaseq_metrics.MEDIAN_CV_COVERAGE),
]

# Size the grid to the rows actually drawn (6 in per row, as before) instead
# of allocating an 8-panel grid and leaving the last row empty.
fig, axes = plt.subplots(len(bias_panels), 2, figsize=(16, 18))

for (ax_dist, ax_sessions), (title, values) in zip(axes, bias_panels):
    # NOTE: distplot is deprecated in recent seaborn releases (histplot/displot
    # are the replacements); kept here for compatibility with older installs.
    sns.distplot(values, ax=ax_dist)
    ax_dist.set_title(title, fontsize=20)

    # Plain matplotlib line against row position; replaces sns.tsplot, which
    # is no longer available in current seaborn.
    ax_sessions.plot(values.values)
    ax_sessions.set_xlabel('sessions', fontsize=16)
Out[64]: