MSU seq mapping coverage



In [1]:

    
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import seaborn



In [2]:

    
!cd .. && make -j10 outputs/msu/coverage/galGal4.latest.pd_df.csv









    



make: `outputs/msu/coverage/galGal4.latest.pd_df.csv' is up to date.



In [3]:

    
df_galGal4 = pd.DataFrame.from_csv('../outputs/msu/coverage/galGal4.latest.pd_df.csv')
total_bases_in_galGal4 = 1046932099



In [4]:

    
def plots(df, total_bases_in_reference):
    df['coverage'] = df['total ref bases covered'] / total_bases_in_reference
    df['coverage >= 90%'] = df['total ref bases covered, alignment length > 90% read'] / total_bases_in_reference

    ############################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Histogram', fontsize=15)
    plt.xlabel('Mapped read length')
    plt.ylabel('Read count')
    df['histogram'].plot(kind='bar')

    ############################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Fraction of the reference covered', fontsize=15)
    plt.xlabel('Mapped read length')
    plt.ylabel('% covered by reads longer than length')
    plt.ylim([0, 1])
    df['coverage'].plot(kind='bar')

    ############################################
    
    # Response curve
    f = plt.figure(figsize=(8, 6))
    f.suptitle('# of bases covered in reference', fontsize=15)
    plt.xlabel('Read length')
    plt.ylabel('Base count')

    df['total ref bases covered'].plot(label='any length')
    df['total ref bases covered, alignment length > 90% read'].plot(label='alignment length > 90% read length')

    plt.legend(loc='lower left')

    ###########################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Total bases per read length', fontsize=15)
    plt.xlabel('Read length')
    plt.ylabel('# of bases in reads longer than read length')

    df['# bases in reads >= minimum'].plot()

    ###########################################
    plt.figure()
    (df['total ref bases covered'] / df['# bases in reads >= minimum']).plot(label="all alignments")
    (df['total ref bases covered, alignment length > 90% read'] / df['# bases in reads >= minimum']).plot(label="alignment > 90% read")
    plt.legend(loc="upper left")
    plt.xlabel('Read length')

    #plt.figure()
    #(df_unmasked['# bases in reads >= minimum'].cumsum() / df_unmasked['total ref bases covered'].cumsum()).plot(label="all alignments")
    #(df_unmasked['# bases in reads >= minimum'].cumsum() / df_unmasked['total ref bases covered, alignment length > 90% read'].cumsum()).plot(label="alignment > 90% read")
    #plt.legend(loc="upper right")
    #plt.xlabel('Read length')

    ##########################################



In [5]:

    
plots(df_galGal4, total_bases_in_galGal4)



In [6]:

    
df_galGal4









    Out[6]:






  
    
      
      # reads >= minimum
      # bases in reads >= minimum
      histogram
      total ref bases covered
      total ref bases covered, alignment length > 90% read
      coverage
      coverage >= 90%
    
  
  
    
      500 
       338389
       404457155
       191726
       306702222
       303552969
       0.292953
       0.289945
    
    
      1000
       146663
       274228322
        55425
       224412923
       221696743
       0.214353
       0.211758
    
    
      1500
        91238
       206499561
        36179
       175492121
       173701096
       0.167625
       0.165914
    
    
      2000
        55059
       143562827
        27416
       126214739
       125240786
       0.120557
       0.119626
    
    
      2500
        27643
        82280725
        17090
        74738476
        74377995
       0.071388
       0.071044
    
    
      3000
        10553
        35715138
         7565
        33170484
        33077558
       0.031684
       0.031595
    
    
      3500
         2988
        11441217
         2337
        10673769
        10651361
       0.010195
       0.010174
    
    
      4000
          651
         2814972
          511
         2538702
         2534683
       0.002425
       0.002421
    
    
      4500
          140
          675974
          112
          573163
          573163
       0.000547
       0.000547
    
    
      5000
           28
          152152
           19
          101040
          101040
       0.000097
       0.000097
    
    
      5500
            9
           53730
            5
           27585
           27585
       0.000026
       0.000026
    
    
      6000
            4
           25231
            3
           12466
           12466
       0.000012
       0.000012

	# reads >= minimum	# bases in reads >= minimum	histogram	total ref bases covered	total ref bases covered, alignment length > 90% read	coverage	coverage >= 90%
500	338389	404457155	191726	306702222	303552969	0.292953	0.289945
1000	146663	274228322	55425	224412923	221696743	0.214353	0.211758
1500	91238	206499561	36179	175492121	173701096	0.167625	0.165914
2000	55059	143562827	27416	126214739	125240786	0.120557	0.119626
2500	27643	82280725	17090	74738476	74377995	0.071388	0.071044
3000	10553	35715138	7565	33170484	33077558	0.031684	0.031595
3500	2988	11441217	2337	10673769	10651361	0.010195	0.010174
4000	651	2814972	511	2538702	2534683	0.002425	0.002421
4500	140	675974	112	573163	573163	0.000547	0.000547
5000	28	152152	19	101040	101040	0.000097	0.000097
5500	9	53730	5	27585	27585	0.000026	0.000026
6000	4	25231	3	12466	12466	0.000012	0.000012