MSU seq mapping coverage



In [1]:

    
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import seaborn



In [46]:

    
!cd .. && make -j10 outputs/msu/coverage/galGal4.pd_df.csv
!cd .. && make -j10 outputs/msu/coverage/galGal5.pd_df.csv









    



mkdir -p outputs/msu/coverage
python scripts/count_reads_pd.py outputs/msu/msu.fasta workdirs/msucov/galGal4/output workdirs/msucov/galGal4_90/output outputs/msu/coverage/galGal4.pd_df.csv
mkdir -p outputs/msu/coverage
python scripts/count_reads_pd.py outputs/msu/msu.fasta workdirs/msucov/galGal5/output workdirs/msucov/galGal5_90/output outputs/msu/coverage/galGal5.pd_df.csv



In [47]:

    
df_galGal4 = pd.DataFrame.from_csv('../outputs/msu/coverage/galGal4.pd_df.csv')
total_bases_in_galGal4 = 1046932099

df_galGal5 = pd.DataFrame.from_csv('../outputs/msu/coverage/galGal5.pd_df.csv')
total_bases_in_galGal5 = 1004291883



In [48]:

    
def plots(df, total_bases_in_reference):
    df['coverage'] = df['total ref bases covered'] / total_bases_in_reference
    df['coverage >= 90%'] = df['total ref bases covered, alignment length > 90% read'] / total_bases_in_reference

    ############################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Histogram', fontsize=15)
    plt.xlabel('Mapped read length')
    plt.ylabel('Read count')
    df['histogram'].plot(kind='bar')

    ############################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Fraction of the reference covered', fontsize=15)
    plt.xlabel('Mapped read length')
    plt.ylabel('% covered by reads longer than length')
    plt.ylim([0, 1])
    df['coverage'].plot(kind='bar')

    ############################################
    
    # Response curve
    f = plt.figure(figsize=(8, 6))
    f.suptitle('# of bases covered in reference', fontsize=15)
    plt.xlabel('Read length')
    plt.ylabel('Base count')

    df['total ref bases covered'].plot(label='any length')
    df['total ref bases covered, alignment length > 90% read'].plot(label='alignment length > 90% read length')

    plt.legend(loc='lower left')

    ###########################################
    f = plt.figure(figsize=(8, 6))
    f.suptitle('Total bases per read length', fontsize=15)
    plt.xlabel('Read length')
    plt.ylabel('# of bases in reads longer than read length')

    df['# bases in reads >= minimum'].plot()

    ###########################################
    plt.figure()
    (df['total ref bases covered'] / df['# bases in reads >= minimum']).plot(label="all alignments")
    (df['total ref bases covered, alignment length > 90% read'] / df['# bases in reads >= minimum']).plot(label="alignment > 90% read")
    plt.legend(loc="upper left")
    plt.xlabel('Read length')

    #plt.figure()
    #(df_unmasked['# bases in reads >= minimum'].cumsum() / df_unmasked['total ref bases covered'].cumsum()).plot(label="all alignments")
    #(df_unmasked['# bases in reads >= minimum'].cumsum() / df_unmasked['total ref bases covered, alignment length > 90% read'].cumsum()).plot(label="alignment > 90% read")
    #plt.legend(loc="upper right")
    #plt.xlabel('Read length')

    ##########################################



In [49]:

    
plots(df_galGal4, total_bases_in_galGal4)



In [50]:

    
plots(df_galGal5, total_bases_in_galGal5)



In [51]:

    
df_galGal4









    Out[51]:






  
    
      
      # reads >= minimum
      # bases in reads >= minimum
      histogram
      total ref bases covered
      total ref bases covered, alignment length > 90% read
      coverage
      coverage >= 90%
    
  
  
    
      500 
       290112
       340089050
       164768
       267083551
       264227145
       0.255111
       0.252382
    
    
      1000
       125344
       227771559
        49997
       191297878
       188899477
       0.182722
       0.180431
    
    
      1500
        75347
       166686753
        32009
       145023394
       143514865
       0.138522
       0.137081
    
    
      2000
        43338
       111072122
        23049
        99631759
        98872814
       0.095165
       0.094441
    
    
      2500
        20289
        59625709
        13236
        54910690
        54654383
       0.052449
       0.052204
    
    
      3000
         7053
        23646963
         5284
        22113184
        22051329
       0.021122
       0.021063
    
    
      3500
         1769
         6723847
         1441
         6271437
         6253187
       0.005990
       0.005973
    
    
      4000
          328
         1414991
          259
         1254088
         1254088
       0.001198
       0.001198
    
    
      4500
           69
          330590
           57
          279633
          279633
       0.000267
       0.000267
    
    
      5000
           12
           63917
            9
           37592
           37592
       0.000036
       0.000036
    
    
      5500
            3
           17774
            2
           11283
           11283
       0.000011
       0.000011
    
    
      6000
            1
            6498
            1
               0
               0
       0.000000
       0.000000



In [52]:

    
df_galGal5









    Out[52]:






  
    
      
      # reads >= minimum
      # bases in reads >= minimum
      histogram
      total ref bases covered
      total ref bases covered, alignment length > 90% read
      coverage
      coverage >= 90%
    
  
  
    
      500 
       290112
       340089050
       164768
       264145666
       261863782
       0.263017
       0.260745
    
    
      1000
       125344
       227771559
        49997
       189532229
       187660123
       0.188722
       0.186858
    
    
      1500
        75347
       166686753
        32009
       144082905
       142845818
       0.143467
       0.142235
    
    
      2000
        43338
       111072122
        23049
        99081457
        98441474
       0.098658
       0.098021
    
    
      2500
        20289
        59625709
        13236
        54634182
        54415408
       0.054401
       0.054183
    
    
      3000
         7053
        23646963
         5284
        22044660
        21992782
       0.021950
       0.021899
    
    
      3500
         1769
         6723847
         1441
         6252198
         6237607
       0.006225
       0.006211
    
    
      4000
          328
         1414991
          259
         1253984
         1253984
       0.001249
       0.001249
    
    
      4500
           69
          330590
           57
          279420
          279420
       0.000278
       0.000278
    
    
      5000
           12
           63917
            9
           37574
           37574
       0.000037
       0.000037
    
    
      5500
            3
           17774
            2
           11281
           11281
       0.000011
       0.000011
    
    
      6000
            1
            6498
            1
               0
               0
       0.000000
       0.000000

	# reads >= minimum	# bases in reads >= minimum	histogram	total ref bases covered	total ref bases covered, alignment length > 90% read	coverage	coverage >= 90%
500	290112	340089050	164768	267083551	264227145	0.255111	0.252382
1000	125344	227771559	49997	191297878	188899477	0.182722	0.180431
1500	75347	166686753	32009	145023394	143514865	0.138522	0.137081
2000	43338	111072122	23049	99631759	98872814	0.095165	0.094441
2500	20289	59625709	13236	54910690	54654383	0.052449	0.052204
3000	7053	23646963	5284	22113184	22051329	0.021122	0.021063
3500	1769	6723847	1441	6271437	6253187	0.005990	0.005973
4000	328	1414991	259	1254088	1254088	0.001198	0.001198
4500	69	330590	57	279633	279633	0.000267	0.000267
5000	12	63917	9	37592	37592	0.000036	0.000036
5500	3	17774	2	11283	11283	0.000011	0.000011
6000	1	6498	1	0	0	0.000000	0.000000