Amino Acids Composition


In [201]:
%matplotlib inline

In [202]:
from Bio import SeqIO
from collections import defaultdict
import matplotlib.pyplot as plt

In [203]:
aa_groups = defaultdict(int)
aa_dict = {
        'A': 'hydrophobic',
        'R': 'positively charged',
        'N': 'uncharged polar',
        'D': 'negatively charged',
        'C': 'special',
        'Q': 'uncharged polar',
        'E': 'negatively charged',
        'G': 'special',
        'H': 'positively charged',
        'I': 'hydrophobic',
        'L': 'hydrophobic',
        'K': 'positively charged',
        'M': 'hydrophobic',
        'F': 'hydrophobic',
        'P': 'special',
        'S': 'uncharged polar',
        'T': 'uncharged polar',
        'W': 'hydrophobic',
        'Y': 'hydrophobic',
        'V': 'hydrophobic',
}

In [204]:
total_aa = 0
for rec in SeqIO.parse('gfp-Monomer.faa', 'fasta'):
    for a in str(rec.seq):
        if a in aa_dict:
            total_aa += 1
            aa_groups[aa_dict[a.upper()]] += 1

In [205]:
for a, c in aa_groups.iteritems():
    print a, '%.2f' % (float(c)/total_aa*100.0)


uncharged polar 17.90
positively charged 16.06
negatively charged 13.96
hydrophobic 36.43
special 15.65

In [206]:
groups = sorted(aa_groups.keys())
percnt = [aa_groups[g]/float(total_aa) for g in groups]

In [209]:
weights = {'hydrophobic': 1.0/8.0,
           'special': 1.0/4.0,
           'uncharged polar': 1.0/4.0,
           'positively charged': 1.0/3.0,
           'negatively charged': 1.0/2.0,
           }

In [210]:
adj_percnt = [aa_groups[g]*weights[g]/float(total_aa*0.2) for g in groups]

In [211]:
plt.plot(range(len(groups)), adj_percnt, 'o--', label='adjusted')
plt.plot(range(len(groups)), percnt, 'o-', label='not adjusted')
plt.xticks(range(len(groups)), groups, rotation=45)
plt.title('Amino Acids Composition: Monomer')
plt.ylabel('Percentage')
plt.xlabel('Amino Acids Groups')
plt.legend()


Out[211]:
<matplotlib.legend.Legend at 0x10b673fd0>

In [212]:
aa_groups = defaultdict(int)
total_aa = 0
for rec in SeqIO.parse('gfp-Dimer.faa', 'fasta'):
    for a in str(rec.seq):
        if a in aa_dict:
            total_aa += 1
            aa_groups[aa_dict[a.upper()]] += 1
            
di_percnt = [aa_groups[g]/float(total_aa) for g in groups]
di_adj_percnt = [aa_groups[g]*weights[g]/float(total_aa*0.2) for g in groups]

In [213]:
di_percnt = [aa_groups[g]/float(total_aa) for g in groups]
di_adj_percnt = [aa_groups[g]*weights[g]/float(total_aa*0.2) for g in groups]
plt.plot(range(len(groups)), adj_percnt, 'o-', label='monomer-adjusted')
plt.plot(range(len(groups)), percnt, 'o--', label='monomer')
plt.plot(range(len(groups)), di_adj_percnt, 'o-', label='dimer-adjusted')
plt.plot(range(len(groups)), di_percnt, 'o--', label='dimer')
plt.xticks(range(len(groups)), groups, rotation=45)
plt.title('Amino Acids Composition: Dimer')
plt.ylabel('Percentage')
plt.xlabel('Amino Acids Groups')
plt.legend()


Out[213]:
<matplotlib.legend.Legend at 0x10d168e50>

In [214]:
aa_groups = defaultdict(int)
total_aa = 0
for rec in SeqIO.parse('gfp-tetramer.faa', 'fasta'):
    for a in str(rec.seq):
        if a in aa_dict:
            total_aa += 1
            aa_groups[aa_dict[a.upper()]] += 1
tetra_percnt = [aa_groups[g]/float(total_aa) for g in groups]
tetra_adj_percnt = [aa_groups[g]*weights[g]/float(total_aa*0.2) for g in groups]

In [215]:
import string
fig = plt.figure(figsize=(10,8))
plt.plot(range(len(groups)), tetra_adj_percnt, 'o-', label='tetramer-adjusted')
plt.plot(range(len(groups)), tetra_percnt, 'o--', label='tetramer')

plt.plot(range(len(groups)), adj_percnt, 'o-', label='monomer-adjusted')
plt.plot(range(len(groups)), percnt, 'o--', label='monomer')
plt.plot(range(len(groups)), di_adj_percnt, 'o-', label='dimer-adjusted')
plt.plot(range(len(groups)), di_percnt, 'o--', label='dimer')
plt.xticks(range(len(groups)), map(string.capitalize, groups), rotation=45, size=16)
plt.title('Amino Acids Composition', size=16)
plt.ylabel('Percentage', size=16)
plt.xlabel('Amino Acids Groups', size=16)
plt.legend()


Out[215]:
<matplotlib.legend.Legend at 0x10d751e50>

In [ ]:
from propy import PyPro

In [ ]:
monomers = defaultdict(list)
#desc_sets = ['GetAAComp', 'GetDPComp']  # descriptors set
desc_sets = ['GetAAComp']  # descriptors set

for seq in SeqIO.parse('gfp-Monomer.faa', 'fasta'):
    des = PyPro.GetProDes(str(seq))
    for s in desc_sets:
        monomers[seq.id].append(getattr(des, s)())

In [ ]:
for item in monomers.keys():
    print item
    break

In [218]:
for k,v in monomers['216'][0].iteritems():
    aa.append(k)
    comp.append(v)

In [245]:
aa_groups = defaultdict(int)
total_aa = 0
for a, c in monomers['216'][0].iteritems():
    if a in aa_dict:
        aa_groups[aa_dict[a.upper()]] += c
_percnt = [aa_groups[g]/100.0 for g in groups]
_adj_percnt = [aa_groups[g]*weights[g]/100.0 for g in groups]

In [246]:
fig = plt.figure(figsize=(10,8))
plt.plot(range(len(groups)), _adj_percnt, 'o-', label='adjusted')
plt.plot(range(len(groups)), _percnt, 'o--', label='raw')
plt.xticks(range(len(groups)), groups, size=16, rotation=45)
plt.title('ProtPy Amino Acid Composition', size=16)
plt.ylabel('Percentage', size=16)
plt.xlabel('Groups', size=16)
plt.legend()
plt.savefig('protpy.png')



In [240]:
monomers['216']


Out[240]:
[{'A': 0.575,
  'C': 0.575,
  'D': 1.149,
  'E': 3.448,
  'F': 0.575,
  'G': 3.448,
  'H': 5.172,
  'I': 1.149,
  'K': 2.874,
  'L': 2.874,
  'M': 4.598,
  'N': 3.448,
  'P': 0.575,
  'Q': 0.575,
  'R': 1.149,
  'S': 5.172,
  'T': 2.874,
  'V': 0.575,
  'W': 0.0,
  'Y': 1.149}]

In [239]:
len(set(aa_dict))


Out[239]:
20

In [241]:
des = PyPro.GetProDes('AAAAAAAGG')

In [242]:
des.GetAAComp()


Out[242]:
{'A': 77.778,
 'C': 0.0,
 'D': 0.0,
 'E': 0.0,
 'F': 0.0,
 'G': 22.222,
 'H': 0.0,
 'I': 0.0,
 'K': 0.0,
 'L': 0.0,
 'M': 0.0,
 'N': 0.0,
 'P': 0.0,
 'Q': 0.0,
 'R': 0.0,
 'S': 0.0,
 'T': 0.0,
 'V': 0.0,
 'W': 0.0,
 'Y': 0.0}

In [247]:
lengths = []
for seq in SeqIO.parse('gfp-Monomer.faa', 'fasta'):
    lengths.append(len(seq.seq))

In [250]:
plt.hist(lengths, bins=30)


Out[250]:
(array([   1.,    0.,    0.,    0.,    0.,    0.,    0.,   18.,   54.,
         135.,   22.,    4.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    3.,    1.]),
 array([ 134.        ,  145.06666667,  156.13333333,  167.2       ,
         178.26666667,  189.33333333,  200.4       ,  211.46666667,
         222.53333333,  233.6       ,  244.66666667,  255.73333333,
         266.8       ,  277.86666667,  288.93333333,  300.        ,
         311.06666667,  322.13333333,  333.2       ,  344.26666667,
         355.33333333,  366.4       ,  377.46666667,  388.53333333,
         399.6       ,  410.66666667,  421.73333333,  432.8       ,
         443.86666667,  454.93333333,  466.        ]),
 <a list of 30 Patch objects>)

In [ ]: