Moleculo mapping coverage


In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import seaborn

In [2]:
#!make -j10 outputs/coverage/galGal4.pd_df.csv
#!make -j10 outputs/coverage/galGal5.pd_df.csv

In [3]:
# Load the per-assembly coverage tables. The first CSV column becomes the
# index (this is what the removed `DataFrame.from_csv` did by default);
# `pd.read_csv` is the supported replacement.
df_galGal4 = pd.read_csv('../outputs/coverage/galGal4.pd_df.csv', index_col=0)
# Reference size in bases; passed to plots() as the coverage denominator.
total_bases_in_galGal4 = 1046932099

df_galGal5 = pd.read_csv('../outputs/coverage/galGal5.pd_df.csv', index_col=0)
# Reference size in bases; passed to plots() as the coverage denominator.
total_bases_in_galGal5 = 1004291883

In [11]:
def plots(df, total_bases_in_reference):
    """Draw the coverage summary figures for one reference.

    Five figures are created, in this order:

    1. bar chart of the ``'histogram'`` column;
    2. bar chart of the fraction of the reference covered;
    3. line plot of reference bases covered (all alignments vs. alignments
       longer than 90% of the read);
    4. line plot of ``'# bases in reads >= minimum'``;
    5. line plot of reference bases covered divided by
       ``'# bases in reads >= minimum'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index supplies the x-axis (labelled as read length) and
        which contains the columns ``'histogram'``,
        ``'total ref bases covered'``,
        ``'total ref bases covered, alignment length > 90% read'`` and
        ``'# bases in reads >= minimum'``.
    total_bases_in_reference : int or float
        Size of the reference in bases; used as the denominator of the
        coverage fractions.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``total_bases_in_reference`` is not positive.
    KeyError
        If a required column is missing from ``df``.

    Notes
    -----
    ``df`` is modified in place: the columns ``'coverage'`` and
    ``'coverage >= 90%'`` are added (or overwritten). Later cells that
    display the frame rely on these columns being present.
    """
    if total_bases_in_reference <= 0:
        # A non-positive denominator would silently yield inf/NaN coverage.
        raise ValueError('total_bases_in_reference must be positive')

    covered = df['total ref bases covered']
    covered_90 = df['total ref bases covered, alignment length > 90% read']
    read_bases = df['# bases in reads >= minimum']

    # Deliberate in-place mutation; see Notes in the docstring.
    df['coverage'] = covered / total_bases_in_reference
    df['coverage >= 90%'] = covered_90 / total_bases_in_reference

    # Axis labels are applied AFTER each pandas plot call so that pandas
    # cannot overwrite them (it sets the x-label from a named index).

    # 1. Read-count histogram
    fig, ax = plt.subplots(figsize=(8, 6))
    fig.suptitle('Histogram', fontsize=15)
    df['histogram'].plot(kind='bar', ax=ax)
    ax.set_xlabel('Mapped read length')
    ax.set_ylabel('Read count')

    # 2. Fraction of the reference covered (values are fractions in [0, 1],
    #    not percentages, so the label says "Fraction").
    fig, ax = plt.subplots(figsize=(8, 6))
    fig.suptitle('Fraction of the reference covered', fontsize=15)
    df['coverage'].plot(kind='bar', ax=ax)
    ax.set_xlabel('Mapped read length')
    ax.set_ylabel('Fraction covered by reads longer than length')
    ax.set_ylim(0, 1)

    # 3. Response curve: absolute number of reference bases covered
    fig, ax = plt.subplots(figsize=(8, 6))
    fig.suptitle('# of bases covered in reference', fontsize=15)
    covered.plot(ax=ax, label='any length')
    covered_90.plot(ax=ax, label='alignment length > 90% read length')
    ax.set_xlabel('Read length')
    ax.set_ylabel('Base count')
    ax.legend(loc='lower left')

    # 4. Total sequenced bases per read-length threshold
    fig, ax = plt.subplots(figsize=(8, 6))
    fig.suptitle('Total bases per read length', fontsize=15)
    read_bases.plot(ax=ax)
    ax.set_xlabel('Read length')
    ax.set_ylabel('# of bases in reads longer than read length')

    # 5. Reference bases covered per sequenced base
    fig, ax = plt.subplots(figsize=(8, 6))
    fig.suptitle('Reference bases covered per base in reads', fontsize=15)
    (covered / read_bases).plot(ax=ax, label="all alignments")
    (covered_90 / read_bases).plot(ax=ax, label="alignment > 90% read")
    ax.set_xlabel('Read length')
    ax.set_ylabel('Ref bases covered / bases in reads')
    ax.legend(loc="upper left")

In [12]:
# Draw the coverage figures for galGal4. Note: plots() also adds the
# 'coverage' and 'coverage >= 90%' columns to df_galGal4 in place.
plots(df_galGal4, total_bases_in_galGal4)



In [6]:
# Draw the coverage figures for galGal5. Note: plots() also adds the
# 'coverage' and 'coverage >= 90%' columns to df_galGal5 in place.
plots(df_galGal5, total_bases_in_galGal5)



In [7]:
# Display the galGal4 table. The 'coverage' columns shown here only exist
# after plots(df_galGal4, ...) has been run, since plots() adds them in place.
df_galGal4


Out[7]:
# reads >= minimum # bases in reads >= minimum histogram total ref bases covered total ref bases covered, alignment length > 90% read coverage coverage >= 90%
500 1579060 5979144764 380562 927910674 925822633 0.886314 0.884320
1000 1198498 5706510537 177996 920383267 918020505 0.879124 0.876867
1500 1020502 5487974694 132078 912292972 909924455 0.871397 0.869134
2000 888424 5258736875 96460 904748098 902303609 0.864190 0.861855
2500 791964 5042941128 75044 897466659 894890488 0.857235 0.854774
3000 716920 4837330130 60860 889907023 887192266 0.850014 0.847421
3500 656060 4640156299 51165 882147637 879249554 0.842603 0.839834
4000 604895 4448667417 43718 873898090 870871350 0.834723 0.831832
4500 561177 4263131768 39481 865122253 862116741 0.826340 0.823470
5000 521696 4075829221 36230 854823136 851974516 0.816503 0.813782
5500 485466 3885765099 34768 844054080 841523399 0.806217 0.803799
6000 450698 3685910766 34063 831686019 829599786 0.794403 0.792410
6500 416635 3473016287 35228 816958760 815248603 0.780336 0.778702
7000 381407 3235103366 51526 799055139 797844080 0.763235 0.762078
7500 329881 2859908483 93426 771917788 771051427 0.737314 0.736487
8000 236455 2135422172 78923 710463468 709888394 0.678615 0.678065
8500 157532 1485325024 57626 622975796 622604544 0.595049 0.594694
9000 99906 981945384 40074 513196419 512986778 0.490191 0.489990
9500 59832 611940150 26485 386738420 386652648 0.369402 0.369320
10000 33347 354233242 16539 261704561 261661410 0.249973 0.249932
10500 16808 185063470 9709 154001806 153980637 0.147098 0.147078
11000 7099 80929655 4960 73223404 73223404 0.069941 0.069941
11500 2139 25304661 1749 23911868 23911868 0.022840 0.022840
12000 390 4846373 327 4518281 4518281 0.004316 0.004316
12500 63 869385 31 680764 680764 0.000650 0.000650
13000 32 476315 3 338814 338814 0.000324 0.000324
13500 29 436517 7 299009 299009 0.000286 0.000286
14000 22 340118 5 216579 216579 0.000207 0.000207
14500 17 268354 5 188081 188081 0.000180 0.000180
15000 12 194878 3 129280 129280 0.000123 0.000123

In [8]:
# Display the galGal5 table. The 'coverage' columns shown here only exist
# after plots(df_galGal5, ...) has been run, since plots() adds them in place.
df_galGal5


Out[8]:
# reads >= minimum # bases in reads >= minimum histogram total ref bases covered total ref bases covered, alignment length > 90% read coverage coverage >= 90%
500 1579060 5979144764 380562 913680116 913026335 0.909775 0.909124
1000 1198498 5706510537 177996 907748827 907001978 0.903870 0.903126
1500 1020502 5487974694 132078 902445388 901620029 0.898589 0.897767
2000 888424 5258736875 96460 896357037 895426322 0.892526 0.891600
2500 791964 5042941128 75044 889922540 888923456 0.886119 0.885125
3000 716920 4837330130 60860 882966200 881875628 0.879193 0.878107
3500 656060 4640156299 51165 875697893 874481504 0.871956 0.870744
4000 604895 4448667417 43718 867915876 866664167 0.864207 0.862960
4500 561177 4263131768 39481 859565770 858343729 0.855892 0.854676
5000 521696 4075829221 36230 849764306 848543802 0.846133 0.844918
5500 485466 3885765099 34768 839470351 838399317 0.835883 0.834816
6000 450698 3685910766 34063 827699134 826785018 0.824162 0.823252
6500 416635 3473016287 35228 813446213 812685110 0.809970 0.809212
7000 381407 3235103366 51526 796105654 795618546 0.792703 0.792218
7500 329881 2859908483 93426 769814533 769424401 0.766525 0.766136
8000 236455 2135422172 78923 709588256 709264920 0.706556 0.706234
8500 157532 1485325024 57626 622522079 622334878 0.619862 0.619675
9000 99906 981945384 40074 513274389 513134396 0.511081 0.510941
9500 59832 611940150 26485 387201043 387135139 0.385546 0.385481
10000 33347 354233242 16539 262037142 262014834 0.260917 0.260895
10500 16808 185063470 9709 154323774 154313138 0.153664 0.153654
11000 7099 80929655 4960 73264844 73264844 0.072952 0.072952
11500 2139 25304661 1749 23977977 23977977 0.023876 0.023876
12000 390 4846373 327 4518122 4518122 0.004499 0.004499
12500 63 869385 31 680833 680833 0.000678 0.000678
13000 32 476315 3 338861 338861 0.000337 0.000337
13500 29 436517 7 299005 299005 0.000298 0.000298
14000 22 340118 5 216576 216576 0.000216 0.000216
14500 17 268354 5 188061 188061 0.000187 0.000187
15000 12 194878 3 129262 129262 0.000129 0.000129