In [2]:
import sys
import gzip
dirname_data = '../../data/AmphiBase/'
#filename_bg_gz = dirname_data + 'Kwon201710_HYNLE_7730gDNA.Hynobius_mitochondrion.bwa_mem.hit.bg.gz'
#filename_bg_gz = dirname_data + 'Kwon201710_HYNQU_x002gDNA.Hynobius_mitochondrion.bwa_mem.hit.bg.gz'
filename_bg_gz = dirname_data + 'Kwon201710_HYNYA_7701gDNA.Hynobius_mitochondrion.bwa_mem.hit.bg.gz'
filename_seqlen_gz = dirname_data + 'Hynobius_mitochondrion.seqlen.gz'
seqlen_list = dict()
f_seqlen_gz = gzip.open(filename_seqlen_gz,'rt')
for line in f_seqlen_gz:
tokens = line.strip().split()
seqlen_list[ tokens[0] ] = int(tokens[1])
f_seqlen_gz.close()
wig_list = dict()
f_gz = gzip.open(filename_bg_gz,'rt')
lines = f_gz.readlines()
#print("\n".join(lines[:3]))
for line in lines:
tokens = line.strip().split("\t")
t_id = tokens[0]
if not t_id in seqlen_list:
sys.stderr.write('SEQLEN not available: %s\n'%t_id)
continue
if not t_id in wig_list:
wig_list[t_id] = [0 for x in range(seqlen_list[t_id])]
for i in range(int(tokens[1]), int(tokens[2])):
wig_list[t_id][i] += int(tokens[3])
f_gz.close()
print(wig_list[t_id][:4])
sys.stderr.write('Done\n')
In [3]:
%matplotlib inline
#t_id = 'NC_008079.1|HYNLE'
#t_id = 'NC_010224.1|HYNQU'
t_id = 'NC_013825.1|HYNYA'
import matplotlib.pyplot as plt
print( wig_list[t_id][3000:3200])
fig = plt.figure(figsize=(12,4))
ax1 = fig.add_subplot(1,1,1)
ax1.plot(range(seqlen_list[t_id]), wig_list[t_id], 'b-')
ax1.grid()
ax1.set_ylabel('Read count', fontsize=14)
tmp_cov_pct = len([x for x in wig_list[t_id] if x > 0])/seqlen_list[t_id]*100.0
ax1.set_xlabel('%s (%s bp; Cov %.2f pct)'%(t_id, "{:,}".format(seqlen_list[t_id]), tmp_cov_pct), fontsize=14)
plt.show()
In [ ]: