In [201]:
%matplotlib inline
In [202]:
from Bio import SeqIO
from collections import defaultdict
import matplotlib.pyplot as plt
In [203]:
# Running tally of residues per physico-chemical group; groups that have
# not been seen yet read as 0.
aa_groups = defaultdict(int)

# Members of each physico-chemical group, written as one-letter codes.
_members_by_group = {
    'hydrophobic': 'AILMFWYV',
    'positively charged': 'RHK',
    'negatively charged': 'DE',
    'uncharged polar': 'NQST',
    'special': 'CGP',
}

# One-letter amino-acid code -> group label, obtained by inverting the
# table above.
aa_dict = {
    letter: label
    for label, letters in _members_by_group.items()
    for letter in letters
}
In [204]:
# Tally the residues of every record in the monomer FASTA file by
# physico-chemical group.
#   aa_groups: group label -> number of residues counted in that group
#   total_aa:  number of residues that were recognised (present in aa_dict)
# The tally is reset here, as in the dimer/tetramer cells, so re-running
# this cell does not add a second pass on top of the first.
aa_groups = defaultdict(int)
total_aa = 0
for rec in SeqIO.parse('gfp-Monomer.faa', 'fasta'):
    # Upper-case once so the membership test and the lookup use the same
    # key; previously lowercase residues failed the test and were dropped
    # even though the lookup itself was already case-insensitive.
    for a in str(rec.seq).upper():
        if a in aa_dict:
            total_aa += 1
            aa_groups[aa_dict[a]] += 1
In [205]:
# Print each group's share of the counted residues as a percentage with
# two decimals, one "<group> <pct>" line per group.
# dict.items() and a single pre-formatted string are used so the cell runs
# on both Python 2 and Python 3 and prints exactly the same text.
for a, c in aa_groups.items():
    print('%s %.2f' % (a, float(c) / total_aa * 100.0))
In [206]:
# Alphabetical list of the group labels seen so far; this fixes the x-axis
# order that every plotted series below follows.
groups = sorted(aa_groups)
# Monomer: fraction (0-1) of the counted residues falling in each group.
_counted = float(total_aa)
percnt = [aa_groups[label] / _counted for label in groups]
In [209]:
# Per-group weight, stored as the reciprocal of a per-group divisor.
# The divisors look like the number of amino acids in each group -- TODO
# confirm. NOTE(review): 'special' uses 4 although aa_dict maps only C, G
# and P to that group; verify whether 4 is intended.
_divisors = (
    ('hydrophobic', 8),
    ('special', 4),
    ('uncharged polar', 4),
    ('positively charged', 3),
    ('negatively charged', 2),
)
weights = {label: 1.0 / divisor for label, divisor in _divisors}
In [210]:
# Weight-adjusted monomer series: each group's count times its weight,
# divided by (total counted residues * 0.2).
_scale = float(total_aa*0.2)
adj_percnt = [aa_groups[label]*weights[label]/_scale for label in groups]
In [211]:
# Monomer composition per group: weight-adjusted series (dashed) against
# the unadjusted one (solid), sharing the same x positions.
positions = range(len(groups))
plt.plot(positions, adj_percnt, 'o--', label='adjusted')
plt.plot(positions, percnt, 'o-', label='not adjusted')
# Label every x position with its group name.
plt.xticks(positions, groups, rotation=45)
plt.xlabel('Amino Acids Groups')
plt.ylabel('Percentage')
plt.title('Amino Acids Composition: Monomer')
plt.legend()
Out[211]:
In [212]:
# Tally the residues of every record in the dimer FASTA file by
# physico-chemical group, starting from a fresh counter.
aa_groups = defaultdict(int)
total_aa = 0
for rec in SeqIO.parse('gfp-Dimer.faa', 'fasta'):
    # Upper-case once so the membership test and the lookup use the same
    # key; previously lowercase residues failed the test and were dropped
    # even though the lookup itself was already case-insensitive.
    for a in str(rec.seq).upper():
        if a in aa_dict:
            total_aa += 1
            aa_groups[aa_dict[a]] += 1
# Dimer: fraction of counted residues per group, in the order of `groups`.
di_percnt = [aa_groups[g]/float(total_aa) for g in groups]
# Dimer: count * group weight, divided by (total counted residues * 0.2).
di_adj_percnt = [aa_groups[g]*weights[g]/float(total_aa*0.2) for g in groups]
In [213]:
# Overlay the dimer series on the monomer series.
# di_percnt / di_adj_percnt are taken as computed by the dimer counting
# cell. They are deliberately NOT re-derived from aa_groups here: aa_groups
# is rebound by every counting cell, so recomputing at plot time would draw
# whichever dataset was counted last under the "dimer" label.
positions = range(len(groups))
plt.plot(positions, adj_percnt, 'o-', label='monomer-adjusted')
plt.plot(positions, percnt, 'o--', label='monomer')
plt.plot(positions, di_adj_percnt, 'o-', label='dimer-adjusted')
plt.plot(positions, di_percnt, 'o--', label='dimer')
plt.xticks(positions, groups, rotation=45)
plt.title('Amino Acids Composition: Dimer')
plt.ylabel('Percentage')
plt.xlabel('Amino Acids Groups')
plt.legend()
Out[213]:
In [214]:
# Tally the residues of every record in the tetramer FASTA file by
# physico-chemical group, starting from a fresh counter.
aa_groups = defaultdict(int)
total_aa = 0
for rec in SeqIO.parse('gfp-tetramer.faa', 'fasta'):
    # Upper-case once so the membership test and the lookup use the same
    # key; previously lowercase residues failed the test and were dropped
    # even though the lookup itself was already case-insensitive.
    for a in str(rec.seq).upper():
        if a in aa_dict:
            total_aa += 1
            aa_groups[aa_dict[a]] += 1
# Tetramer: fraction of counted residues per group, in the order of `groups`.
tetra_percnt = [aa_groups[g]/float(total_aa) for g in groups]
# Tetramer: count * group weight, divided by (total counted residues * 0.2).
tetra_adj_percnt = [aa_groups[g]*weights[g]/float(total_aa*0.2) for g in groups]
In [215]:
# Combined figure: raw (dashed) and weight-adjusted (solid) composition for
# the tetramer, monomer and dimer datasets on one set of axes.
import string  # no longer used by this cell; kept in case other cells rely on it
fig = plt.figure(figsize=(10,8))
positions = range(len(groups))
plt.plot(positions, tetra_adj_percnt, 'o-', label='tetramer-adjusted')
plt.plot(positions, tetra_percnt, 'o--', label='tetramer')
plt.plot(positions, adj_percnt, 'o-', label='monomer-adjusted')
plt.plot(positions, percnt, 'o--', label='monomer')
plt.plot(positions, di_adj_percnt, 'o-', label='dimer-adjusted')
plt.plot(positions, di_percnt, 'o--', label='dimer')
# str.capitalize replaces string.capitalize, which exists only on Python 2;
# the resulting tick labels are the same.
plt.xticks(positions, [g.capitalize() for g in groups], rotation=45, size=16)
plt.title('Amino Acids Composition', size=16)
plt.ylabel('Percentage', size=16)
plt.xlabel('Amino Acids Groups', size=16)
plt.legend()
Out[215]:
In [ ]:
from propy import PyPro
In [ ]:
# propy descriptors for every monomer record.
#   monomers: record id -> list with one descriptor result per entry of
#             desc_sets, in the same order.
monomers = defaultdict(list)
#desc_sets = ['GetAAComp', 'GetDPComp'] # descriptors set
desc_sets = ['GetAAComp'] # descriptors set
for seq in SeqIO.parse('gfp-Monomer.faa', 'fasta'):
    # `seq` is a SeqRecord. str(seq) renders the whole record summary
    # (ID/Name/Description lines ...), so the descriptors were being computed
    # on header text; pass only the residue string instead.
    des = PyPro.GetProDes(str(seq.seq))
    for s in desc_sets:
        # Each entry of desc_sets names a zero-argument method on `des`.
        monomers[seq.id].append(getattr(des, s)())
In [ ]:
# Show one record id from `monomers` (whichever the dict yields first);
# prints nothing when the dict is empty.
# print(...) with a single argument behaves the same on Python 2 and 3.
for item in monomers:
    print(item)
    break
In [218]:
# Split the first descriptor result of record '216' into two parallel
# lists: its keys (aa) and the corresponding values (comp).
# Both lists are created here; previously they were never defined in the
# notebook, so the cell raised NameError on a fresh kernel.
aa = []
comp = []
# .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
for k, v in monomers['216'][0].items():
    aa.append(k)
    comp.append(v)
In [245]:
# Collapse the first descriptor result of record '216' (keyed by residue
# code) into physico-chemical groups by summing the values per group.
aa_groups = defaultdict(int)
total_aa = 0  # not used below; kept so the cell leaves the same names defined
# .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
for a, c in monomers['216'][0].items():
    # Normalise the key once so the membership test and the lookup agree.
    a = a.upper()
    if a in aa_dict:
        aa_groups[aa_dict[a]] += c
# Dividing by 100.0 treats the summed values as percentages -- presumably
# what GetAAComp returns; confirm against the propy documentation.
_percnt = [aa_groups[g]/100.0 for g in groups]
# NOTE(review): unlike the count-based cells, this adjusted series is not
# divided by 0.2 -- confirm whether the different scaling is intended.
_adj_percnt = [aa_groups[g]*weights[g]/100.0 for g in groups]
In [246]:
# Plot the propy-derived group composition (raw versus weight-adjusted)
# and write the figure to 'protpy.png'.
fig = plt.figure(figsize=(10,8))
positions = range(len(groups))
plt.plot(positions, _adj_percnt, 'o-', label='adjusted')
plt.plot(positions, _percnt, 'o--', label='raw')
# Label every x position with its group name.
plt.xticks(positions, groups, size=16, rotation=45)
plt.xlabel('Groups', size=16)
plt.ylabel('Percentage', size=16)
plt.title('ProtPy Amino Acid Composition', size=16)
plt.legend()
# Save last so the file contains the finished figure.
plt.savefig('protpy.png')
In [240]:
# Inspect the list of descriptor results stored for record id '216'.
monomers['216']
Out[240]:
In [239]:
# Number of distinct residue codes in the lookup table (dict keys are
# already unique, so no intermediate set is needed).
len(aa_dict)
Out[239]:
In [241]:
# Sanity check: build a propy descriptor object for a tiny hand-written
# sequence (seven A followed by two G).
des = PyPro.GetProDes('AAAAAAAGG')
In [242]:
# Display what GetAAComp reports for the toy sequence; presumably values
# are percentages (the grouping cell divides them by 100) -- TODO confirm.
des.GetAAComp()
Out[242]:
In [247]:
# Sequence length of every record in the monomer FASTA file, in file order.
lengths = [
    len(record.seq)
    for record in SeqIO.parse('gfp-Monomer.faa', 'fasta')
]
In [250]:
# Histogram of the monomer sequence lengths collected above, in 30 bins.
plt.hist(lengths, bins=30)
Out[250]:
In [ ]: