MSU seq mapping coverage


In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import seaborn

In [2]:
!cd .. && make -j10 outputs/msu/coverage/galGal4.latest.pd_df.csv


make: `outputs/msu/coverage/galGal4.latest.pd_df.csv' is up to date.

In [3]:
df_galGal4 = pd.DataFrame.from_csv('../outputs/msu/coverage/galGal4.latest.pd_df.csv')
total_bases_in_galGal4 = 1046932099

In [4]:
def plots(df, total_bases_in_reference):
    df['coverage'] = df['total ref bases covered'] / total_bases_in_reference
    df['coverage >= 90%'] = df['total ref bases covered, alignment length > 90% read'] / total_bases_in_reference

    ############################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Histogram', fontsize=15)
    plt.xlabel('Mapped read length')
    plt.ylabel('Read count')
    df['histogram'].plot(kind='bar')

    ############################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Fraction of the reference covered', fontsize=15)
    plt.xlabel('Mapped read length')
    plt.ylabel('% covered by reads longer than length')
    plt.ylim([0, 1])
    df['coverage'].plot(kind='bar')

    ############################################
    
    # Response curve
    f = plt.figure(figsize=(8, 6))
    f.suptitle('# of bases covered in reference', fontsize=15)
    plt.xlabel('Read length')
    plt.ylabel('Base count')

    df['total ref bases covered'].plot(label='any length')
    df['total ref bases covered, alignment length > 90% read'].plot(label='alignment length > 90% read length')

    plt.legend(loc='lower left')

    ###########################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Total bases per read length', fontsize=15)
    plt.xlabel('Read length')
    plt.ylabel('# of bases in reads longer than read length')

    df['# bases in reads >= minimum'].plot()

    ###########################################
    plt.figure()
    (df['total ref bases covered'] / df['# bases in reads >= minimum']).plot(label="all alignments")
    (df['total ref bases covered, alignment length > 90% read'] / df['# bases in reads >= minimum']).plot(label="alignment > 90% read")
    plt.legend(loc="upper left")
    plt.xlabel('Read length')

    #plt.figure()
    #(df_unmasked['# bases in reads >= minimum'].cumsum() / df_unmasked['total ref bases covered'].cumsum()).plot(label="all alignments")
    #(df_unmasked['# bases in reads >= minimum'].cumsum() / df_unmasked['total ref bases covered, alignment length > 90% read'].cumsum()).plot(label="alignment > 90% read")
    #plt.legend(loc="upper right")
    #plt.xlabel('Read length')

    ##########################################

In [5]:
plots(df_galGal4, total_bases_in_galGal4)



In [6]:
df_galGal4


Out[6]:
# reads >= minimum # bases in reads >= minimum histogram total ref bases covered total ref bases covered, alignment length > 90% read coverage coverage >= 90%
500 338389 404457155 191726 306702222 303552969 0.292953 0.289945
1000 146663 274228322 55425 224412923 221696743 0.214353 0.211758
1500 91238 206499561 36179 175492121 173701096 0.167625 0.165914
2000 55059 143562827 27416 126214739 125240786 0.120557 0.119626
2500 27643 82280725 17090 74738476 74377995 0.071388 0.071044
3000 10553 35715138 7565 33170484 33077558 0.031684 0.031595
3500 2988 11441217 2337 10673769 10651361 0.010195 0.010174
4000 651 2814972 511 2538702 2534683 0.002425 0.002421
4500 140 675974 112 573163 573163 0.000547 0.000547
5000 28 152152 19 101040 101040 0.000097 0.000097
5500 9 53730 5 27585 27585 0.000026 0.000026
6000 4 25231 3 12466 12466 0.000012 0.000012