In [1]:
import pandas as pd
import bitarray

1. Read the genome sequence and compute its length


In [2]:
# Read the assembly FASTA and compute the total sequence length.
# Header lines (starting with ">") are skipped wherever they occur, and each
# sequence line is stripped of surrounding whitespace, so multi-record files
# and CRLF line endings do not inflate the length.
with open("../../d9539_asm_v1.2.fa") as fh:
    genome = "".join(
        line.strip() for line in fh if not line.startswith(">")
    )

genome_length = len(genome)
print(genome_length)


5752

2. Read annotation


In [3]:
# Load only the CDS start/end columns of the filtered ORF table.
cds_table = pd.read_csv("filt_orf_stats.csv")[["q_cds_start", "q_cds_end"]]

# Shift both endpoints from 1-based to 0-based coordinates and collect them
# as (start, end) tuples. Columns are addressed by label: positional lookups
# such as row[0] on a label-indexed Series are deprecated in pandas.
cds_intervals = [
    (start - 1, end - 1)
    for start, end in zip(cds_table["q_cds_start"], cds_table["q_cds_end"])
]
cds_intervals


Out[3]:
[(24, 104),
 (129, 236),
 (214, 387),
 (392, 1654),
 (1602, 2123),
 (2178, 2387),
 (2350, 3432),
 (3428, 5368),
 (5373, 5663)]

3. Calculate the fraction of the genome covered by CDS


In [4]:
# One bit per genome position, all initially cleared (not covered).
genome_pos = bitarray.bitarray('0') * genome_length

In [5]:
# Set the bit of every position inside each CDS interval. The stored end
# coordinate is included in the interval, hence the +1 on the slice stop.
for cds_start, cds_end in cds_intervals:
    genome_pos[cds_start:cds_end + 1] = True

In [6]:
# Coverage: percentage of genome positions whose bit is set in genome_pos.
# bitarray.count() tallies the set bits natively instead of iterating over
# every bit in Python as sum() does; the resulting value is the same.
cov = 100.0 * genome_pos.count() / genome_length
print("Coverage: {}%".format( cov ) )


Coverage: 96.557719054242%

In [ ]: