MSU seq mapping coverage


In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import seaborn

In [46]:
!cd .. && make -j10 outputs/msu/coverage/galGal4.pd_df.csv
!cd .. && make -j10 outputs/msu/coverage/galGal5.pd_df.csv


mkdir -p outputs/msu/coverage
python scripts/count_reads_pd.py outputs/msu/msu.fasta workdirs/msucov/galGal4/output workdirs/msucov/galGal4_90/output outputs/msu/coverage/galGal4.pd_df.csv
mkdir -p outputs/msu/coverage
python scripts/count_reads_pd.py outputs/msu/msu.fasta workdirs/msucov/galGal5/output workdirs/msucov/galGal5_90/output outputs/msu/coverage/galGal5.pd_df.csv

In [47]:
df_galGal4 = pd.DataFrame.from_csv('../outputs/msu/coverage/galGal4.pd_df.csv')
total_bases_in_galGal4 = 1046932099

df_galGal5 = pd.DataFrame.from_csv('../outputs/msu/coverage/galGal5.pd_df.csv')
total_bases_in_galGal5 = 1004291883

In [48]:
def plots(df, total_bases_in_reference):
    df['coverage'] = df['total ref bases covered'] / total_bases_in_reference
    df['coverage >= 90%'] = df['total ref bases covered, alignment length > 90% read'] / total_bases_in_reference

    ############################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Histogram', fontsize=15)
    plt.xlabel('Mapped read length')
    plt.ylabel('Read count')
    df['histogram'].plot(kind='bar')

    ############################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Fraction of the reference covered', fontsize=15)
    plt.xlabel('Mapped read length')
    plt.ylabel('% covered by reads longer than length')
    plt.ylim([0, 1])
    df['coverage'].plot(kind='bar')

    ############################################
    
    # Response curve
    f = plt.figure(figsize=(8, 6))
    f.suptitle('# of bases covered in reference', fontsize=15)
    plt.xlabel('Read length')
    plt.ylabel('Base count')

    df['total ref bases covered'].plot(label='any length')
    df['total ref bases covered, alignment length > 90% read'].plot(label='alignment length > 90% read length')

    plt.legend(loc='lower left')

    ###########################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Total bases per read length', fontsize=15)
    plt.xlabel('Read length')
    plt.ylabel('# of bases in reads longer than read length')

    df['# bases in reads >= minimum'].plot()

    ###########################################
    plt.figure()
    (df['total ref bases covered'] / df['# bases in reads >= minimum']).plot(label="all alignments")
    (df['total ref bases covered, alignment length > 90% read'] / df['# bases in reads >= minimum']).plot(label="alignment > 90% read")
    plt.legend(loc="upper left")
    plt.xlabel('Read length')

    #plt.figure()
    #(df_unmasked['# bases in reads >= minimum'].cumsum() / df_unmasked['total ref bases covered'].cumsum()).plot(label="all alignments")
    #(df_unmasked['# bases in reads >= minimum'].cumsum() / df_unmasked['total ref bases covered, alignment length > 90% read'].cumsum()).plot(label="alignment > 90% read")
    #plt.legend(loc="upper right")
    #plt.xlabel('Read length')

    ##########################################

In [49]:
plots(df_galGal4, total_bases_in_galGal4)



In [50]:
plots(df_galGal5, total_bases_in_galGal5)



In [51]:
df_galGal4


Out[51]:
# reads >= minimum # bases in reads >= minimum histogram total ref bases covered total ref bases covered, alignment length > 90% read coverage coverage >= 90%
500 290112 340089050 164768 267083551 264227145 0.255111 0.252382
1000 125344 227771559 49997 191297878 188899477 0.182722 0.180431
1500 75347 166686753 32009 145023394 143514865 0.138522 0.137081
2000 43338 111072122 23049 99631759 98872814 0.095165 0.094441
2500 20289 59625709 13236 54910690 54654383 0.052449 0.052204
3000 7053 23646963 5284 22113184 22051329 0.021122 0.021063
3500 1769 6723847 1441 6271437 6253187 0.005990 0.005973
4000 328 1414991 259 1254088 1254088 0.001198 0.001198
4500 69 330590 57 279633 279633 0.000267 0.000267
5000 12 63917 9 37592 37592 0.000036 0.000036
5500 3 17774 2 11283 11283 0.000011 0.000011
6000 1 6498 1 0 0 0.000000 0.000000

In [52]:
df_galGal5


Out[52]:
# reads >= minimum # bases in reads >= minimum histogram total ref bases covered total ref bases covered, alignment length > 90% read coverage coverage >= 90%
500 290112 340089050 164768 264145666 261863782 0.263017 0.260745
1000 125344 227771559 49997 189532229 187660123 0.188722 0.186858
1500 75347 166686753 32009 144082905 142845818 0.143467 0.142235
2000 43338 111072122 23049 99081457 98441474 0.098658 0.098021
2500 20289 59625709 13236 54634182 54415408 0.054401 0.054183
3000 7053 23646963 5284 22044660 21992782 0.021950 0.021899
3500 1769 6723847 1441 6252198 6237607 0.006225 0.006211
4000 328 1414991 259 1253984 1253984 0.001249 0.001249
4500 69 330590 57 279420 279420 0.000278 0.000278
5000 12 63917 9 37574 37574 0.000037 0.000037
5500 3 17774 2 11281 11281 0.000011 0.000011
6000 1 6498 1 0 0 0.000000 0.000000