In [29]:
import os
import sys
# Directory holding the input files, relative to this notebook.
dirname = '../../data/AmphiBase'
filename_fa = os.path.join(dirname, 'HYNYA_AB201711.prot.fa')

# Tally single-character COG codes over the FASTA header lines.
count_total = 0    # header lines seen (lines starting with '>')
count_NA = 0       # headers whose third whitespace-separated field is 'NA'
COG_freq = dict()  # COG code character -> number of headers carrying it

# `with` guarantees the handle is closed even if a malformed header raises
# inside the loop; a manual close() would be skipped and the handle would
# stay open for the lifetime of the kernel.
with open(filename_fa, 'r') as f_fa:
    for line in f_fa:
        if not line.startswith('>'):
            continue  # not a header line
        count_total += 1

        # Third whitespace-separated field of the header carries the COG info.
        tmp_COG = line.strip().split()[2]
        if tmp_COG == 'NA':
            count_NA += 1
            continue

        # Take the text after '=' in the first '|'-separated part and count
        # every character of it on its own, so a multi-character value
        # contributes once to each of its characters.
        # NOTE(review): presumably the field looks like 'COG=KL|...' - confirm.
        for tmp_COG_code in tmp_COG.split('|')[0].split('=')[1]:
            COG_freq[tmp_COG_code] = COG_freq.get(tmp_COG_code, 0) + 1

print(COG_freq)
# Parse the functional-category table into:
#   COG_annot     : code -> {'desc': description text, 'category': heading}
#   COG_code_list : codes in the order they appear in the file
COG_annot = dict()
COG_code_list = []

# Heading most recently seen. Initialised explicitly so a code line that
# appears before any heading fails loudly instead of raising NameError on a
# fresh kernel or silently reusing a value left over from an earlier run.
tmp_COG_category = None

# `with` closes the file even if a line fails to parse.
with open(os.path.join(dirname, 'eggnog4.functional_categories.txt'), 'r') as f_label:
    for line in f_label:
        if line.strip() == '':
            continue  # skip blank separator lines

        if line.startswith(' ['):
            # Code line: first token is the bracketed code, the rest is the
            # free-text description.
            if tmp_COG_category is None:
                raise ValueError(
                    'COG code line found before any category heading: %r' % line)
            tokens = line.strip().split()
            tmp_COG_code = tokens[0].replace('[', '').replace(']', '')
            COG_code_list.append(tmp_COG_code)
            COG_annot[tmp_COG_code] = {'desc': ' '.join(tokens[1:]),
                                       'category': tmp_COG_category}
        else:
            # Any other non-blank line is a heading that applies to the code
            # lines following it.
            tmp_COG_category = line.strip()
In [30]:
%matplotlib inline
import matplotlib.pyplot as plt

# Bar colour per category heading. Keys must match the 'category' strings
# stored in COG_annot, which are looked up below.
bar_color = dict()
bar_color['INFORMATION STORAGE AND PROCESSING'] = 'red'
bar_color['CELLULAR PROCESSES AND SIGNALING'] = 'blue'
bar_color['METABOLISM'] = 'green'
bar_color['POORLY CHARACTERIZED'] = 'grey'

fig = plt.figure(figsize=(8,10))
ax1 = fig.add_subplot(1,1,1)

# One row per entry of COG_code_list, in list order. The row index comes from
# the position in the list (not from a counter advanced only for plotted
# codes), so tick positions and tick labels always have the same length and
# every label sits on its own bar. Codes absent from COG_freq keep an empty row.
for i, tmp_COG_code in enumerate(COG_code_list):
    if tmp_COG_code in COG_freq:
        tmp_color = bar_color[COG_annot[tmp_COG_code]['category']]
        ax1.barh(i, COG_freq[tmp_COG_code], facecolor=tmp_color)
        # Print the count just past the end of the bar (10 data units right).
        ax1.text(COG_freq[tmp_COG_code]+10, i, '%d'%COG_freq[tmp_COG_code])

ax1.set_yticks(list(range(len(COG_code_list))))
ax1.set_yticklabels(['%s [%s]'%(COG_annot[x]['desc'], x) for x in COG_code_list])
ax1.grid()
plt.show()
In [ ]: