In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import seaborn
In [46]:
# Run make from the parent directory (up to 10 parallel jobs) for the two
# coverage tables that are read back from ../outputs/msu/coverage/ below.
!cd .. && make -j10 outputs/msu/coverage/galGal4.pd_df.csv
!cd .. && make -j10 outputs/msu/coverage/galGal5.pd_df.csv
In [47]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column becomes the index, and date parsing is attempted on it).
# NOTE(review): if the index is a plain read-length column, parse_dates is
# probably unnecessary -- confirm against the CSV before dropping it.
df_galGal4 = pd.read_csv('../outputs/msu/coverage/galGal4.pd_df.csv',
                         index_col=0, parse_dates=True)
# Passed to plots() as the denominator for the coverage fractions.
# Presumably the total length of the galGal4 assembly -- TODO confirm.
total_bases_in_galGal4 = 1046932099

df_galGal5 = pd.read_csv('../outputs/msu/coverage/galGal5.pd_df.csv',
                         index_col=0, parse_dates=True)
# Same role as above, for galGal5 -- TODO confirm the source of this number.
total_bases_in_galGal5 = 1004291883
In [48]:
def _new_axes(title=None):
    """Open a fresh 8x6 inch figure with a single Axes and return that Axes.

    When ``title`` is given it becomes the figure's suptitle (font size 15).
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    if title is not None:
        fig.suptitle(title, fontsize=15)
    return ax


def plots(df, total_bases_in_reference):
    """Draw the five read-length coverage figures for one reference.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'histogram', 'total ref bases covered',
        'total ref bases covered, alignment length > 90% read' and
        '# bases in reads >= minimum'. Its index is used as the x axis
        (presumably a read-length threshold -- confirm against whatever
        writes the CSV).
    total_bases_in_reference : number
        Denominator used to turn covered-base counts into fractions.

    Side effects
    ------------
    Adds the columns 'coverage' and 'coverage >= 90%' to ``df`` in place.
    This is kept deliberately: later cells display the same frames and so
    show these columns. Five matplotlib figures are created.

    Returns
    -------
    None
    """
    covered_col = 'total ref bases covered'
    covered_90_col = 'total ref bases covered, alignment length > 90% read'
    read_bases_col = '# bases in reads >= minimum'

    df['coverage'] = df[covered_col] / total_bases_in_reference
    df['coverage >= 90%'] = df[covered_90_col] / total_bases_in_reference

    # Axis labels are applied AFTER each pandas plot call: pandas may set the
    # x label from the index name while plotting, which would silently replace
    # a label that was set beforehand.

    # 1. Read-length histogram.
    ax = _new_axes('Histogram')
    df['histogram'].plot(kind='bar', ax=ax)
    ax.set_xlabel('Mapped read length')
    ax.set_ylabel('Read count')

    # 2. Fraction of the reference covered.
    ax = _new_axes('Fraction of the reference covered')
    df['coverage'].plot(kind='bar', ax=ax)
    ax.set_xlabel('Mapped read length')
    # The plotted value is covered / total, i.e. a fraction in [0, 1], so the
    # label says "Fraction" rather than the original (incorrect) "%".
    ax.set_ylabel('Fraction covered by reads longer than length')
    ax.set_ylim(0, 1)

    # 3. Response curve: absolute number of reference bases covered.
    ax = _new_axes('# of bases covered in reference')
    df[covered_col].plot(ax=ax, label='any length')
    df[covered_90_col].plot(ax=ax, label='alignment length > 90% read length')
    ax.set_xlabel('Read length')
    ax.set_ylabel('Base count')
    ax.legend(loc='lower left')

    # 4. Total read bases available at each threshold.
    ax = _new_axes('Total bases per read length')
    df[read_bases_col].plot(ax=ax)
    ax.set_xlabel('Read length')
    ax.set_ylabel('# of bases in reads longer than read length')

    # 5. Reference bases covered per read base. The original figure had no
    # title or y label and used the default size; both are now set so the
    # figure stands on its own and matches the other four.
    ax = _new_axes('Reference bases covered per base in reads')
    (df[covered_col] / df[read_bases_col]).plot(ax=ax, label="all alignments")
    (df[covered_90_col] / df[read_bases_col]).plot(ax=ax, label="alignment > 90% read")
    ax.set_xlabel('Read length')
    ax.set_ylabel('Ref bases covered / # bases in reads >= minimum')
    ax.legend(loc="upper left")
In [49]:
# Coverage figures for the galGal4 table, normalised by the galGal4 base count.
plots(df=df_galGal4, total_bases_in_reference=total_bases_in_galGal4)
In [50]:
# Coverage figures for the galGal5 table, normalised by the galGal5 base count.
plots(df=df_galGal5, total_bases_in_reference=total_bases_in_galGal5)
In [51]:
# Show the galGal4 table. When the cells are run in order, plots() has already
# added the 'coverage' and 'coverage >= 90%' columns to this frame.
df_galGal4
Out[51]:
In [52]:
# Show the galGal5 table. When the cells are run in order, plots() has already
# added the 'coverage' and 'coverage >= 90%' columns to this frame.
df_galGal5
Out[52]: