In [1]:
%pylab inline
import pylab as plt


Populating the interactive namespace from numpy and matplotlib

In [2]:
#pretty printing of tables
from ipy_table import * 
def prep_table(header, K): 
    mat = [header]
    for row in K:
        mat.append(list(row))
    return mat

In [3]:
#parse BLOSUM62 from NCBI
def MakeProteinSimilarityMatrix():
    url = "http://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt"
    import requests
    f = requests.get(url).text.split('\n')
    m=list()
    for line in f:
        l = line.strip()
        if l and l[0] != '#':
            m.append(l)
    aa = {i:str(c) for i,c in enumerate(m[0].split()[:-1])}
    dim = len(aa)
    import numpy as np
    B = np.zeros((dim, dim))
    for i,row in enumerate(m):
        if i > 0 and i < 24:
            vec = row.split()
            nvec = [int(x) for x in vec[1:-1]]
            for j,x in enumerate(nvec):
                B[i-1,j]=x
    return aa, B

In [4]:
#print BLOSUM62
print 'Blosum matrix'

aa_dict, B = MakeProteinSimilarityMatrix()
aa_list = [aa_dict[key] for key in aa_dict]
print 'Num aminoacids: ', len(aa_dict)
print aa_list

header = range(len(aa_dict))
header = aa_list
mat=prep_table(header, B)
make_table(mat)
apply_theme('basic')
set_global_style(float_format = '%d')


Blosum matrix
Num aminoacids:  23
['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X']
Out[4]:
ARNDCQEGHILKMFPSTWYVBZX
4-1-2-20-1-10-2-1-1-1-1-2-110-3-20-2-10
-150-2-310-20-3-22-1-3-2-1-1-3-2-3-10-1
-2061-30001-3-30-2-3-210-4-2-330-1
-2-216-302-1-1-3-4-1-3-3-10-1-4-3-341-1
0-3-3-39-3-4-3-3-1-1-3-1-2-3-1-1-2-2-1-3-3-2
-1100-352-20-3-210-3-10-1-2-1-203-1
-1002-425-20-3-31-2-3-10-1-3-2-214-1
0-20-1-3-2-26-2-4-4-2-3-3-20-2-2-3-3-1-2-1
-201-1-300-28-3-3-1-2-1-2-1-2-22-300-1
-1-3-3-3-1-3-3-4-342-310-3-2-1-3-13-3-3-1
-1-2-3-4-1-2-3-4-324-220-3-2-1-2-11-4-3-1
-120-1-311-2-1-3-25-1-3-10-1-3-2-201-1
-1-1-2-3-10-2-3-212-150-2-1-1-1-11-3-1-1
-2-3-3-3-2-3-3-3-100-306-4-2-213-1-3-3-1
-1-2-2-1-3-1-1-2-2-3-3-1-2-47-1-1-4-3-2-2-1-2
1-110-1000-1-2-20-1-2-141-3-2-2000
0-10-1-1-1-1-2-2-1-1-1-1-2-115-2-20-1-10
-3-3-4-4-2-2-3-2-2-3-2-3-11-4-3-2112-3-4-3-2
-2-2-2-3-2-1-2-32-1-1-2-13-3-2-227-1-3-2-1
0-3-3-3-1-2-2-3-331-21-1-2-20-3-14-3-2-1
-2-134-301-10-3-40-3-3-20-1-4-3-341-1
-1001-334-20-3-31-1-3-10-1-3-2-214-1
0-1-1-1-2-1-1-1-1-1-1-1-1-1-200-2-1-1-1-1-1

In [5]:
#extract rank matrix 
import numpy as np

def rank_matrix(B):
    idxs= np.argsort(-B, axis=1)
    for j, row in enumerate(idxs):
        ranks = sorted([(x,i) for i,x in enumerate(row)])
        pos = [i for x,i in ranks]
        if j == 0:
            D= np.array(pos)
        else:
            D = np.vstack([D,pos])
    E = D.astype( float )
    #make symmetric
    D = (E + E.T)/2
    return D

In [6]:
D = rank_matrix(B)

In [7]:
#pretty print rank matrix
print 'Interpretation: element in column j is at distance A_ij from element in row i'
#header = range(D.shape[0])
mat=prep_table(header, D)
make_table(mat)
apply_theme('basic')
set_global_style(float_format = '%0.1f')


Interpretation: element in column j is at distance A_ij from element in row i
Out[7]:
ARNDCQEGHILKMFPSTWYVBZX
0.09.517.015.53.511.59.52.519.09.08.513.08.013.56.01.02.517.515.53.017.513.02.5
9.50.04.514.519.03.05.011.04.018.015.01.011.518.513.012.58.018.515.521.09.56.014.5
17.04.50.02.519.06.07.55.52.518.018.05.017.017.012.53.07.521.514.519.51.59.014.0
15.514.52.50.017.05.52.09.011.515.021.510.018.516.58.07.58.520.520.018.51.04.012.5
3.519.019.017.00.018.022.016.016.07.58.021.08.511.016.59.57.07.011.05.518.521.016.0
11.53.06.05.518.00.02.514.56.516.513.02.57.018.510.58.59.011.58.515.56.51.514.5
9.55.07.52.022.02.50.010.57.015.517.03.517.516.08.09.513.018.014.013.04.00.513.0
2.511.05.59.016.014.510.50.014.522.020.515.020.014.510.54.515.58.518.017.07.514.08.5
19.04.02.511.516.06.57.014.50.019.520.510.014.59.012.010.517.013.02.021.56.07.511.0
9.018.018.015.07.516.515.522.019.50.02.020.03.04.015.513.511.015.06.51.017.016.013.0
8.515.018.021.58.013.017.020.520.52.00.016.01.04.517.514.510.511.07.03.022.017.58.5
13.01.05.010.021.02.53.515.010.020.016.00.010.520.57.57.010.020.017.016.05.53.512.0
8.011.517.018.58.57.017.520.014.53.01.010.50.04.014.011.513.05.58.02.019.013.511.0
13.518.517.016.511.018.516.014.59.04.04.520.54.00.021.514.515.52.01.07.018.019.06.0
6.013.012.58.016.510.58.010.512.015.517.57.514.021.50.08.56.020.520.013.514.07.518.0
1.012.53.07.59.58.59.54.510.513.514.57.011.514.58.50.02.017.015.517.55.57.06.5
2.58.07.58.57.09.013.015.517.011.010.510.013.015.56.02.00.013.015.04.08.011.52.5
17.518.521.520.57.011.518.08.513.015.011.020.05.52.020.517.013.00.01.514.521.519.013.0
15.515.514.520.011.08.514.018.02.06.57.017.08.01.020.015.515.01.50.05.520.516.54.5
3.021.019.518.55.515.513.017.021.51.03.016.02.07.013.517.54.014.55.50.020.514.06.0
17.59.51.51.018.56.54.07.56.017.022.05.519.018.014.05.58.021.520.520.50.04.56.5
13.06.09.04.021.01.50.514.07.516.017.53.513.519.07.57.011.519.016.514.04.51.011.0
2.514.514.012.516.014.513.08.511.013.08.512.011.06.018.06.52.513.04.56.06.511.019.0

In [8]:
#compute vector representation of aa such that their rank distance is respected
n_components = 5
from sklearn.manifold import MDS
feature_map = MDS(n_components=n_components, n_init=1000, max_iter=1000, 
                  n_jobs = -1, dissimilarity='precomputed', random_state = 2)
X_explicit=feature_map.fit_transform(D)

In [9]:
#plot 2D embedding of aa encodings

from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components=2)
X_reduced = pca.fit_transform(X_explicit)

size=11
plt.figure(figsize=(size,size))

#make mesh
h = .1  # step size in the mesh
b = 1 # border size
x_min, x_max = X_reduced[:, 0].min() - b, X_reduced[:, 0].max() + b
y_min, y_max = X_reduced[:, 1].min() - b, X_reduced[:, 1].max() + b
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

from sklearn.svm import OneClassSVM
#induce a predictive model
clf = OneClassSVM(gamma = 0.2, nu = 0.01)
clf.fit(X_reduced)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
if hasattr(clf, "decision_function"):
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
else:
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# Put the result into a color plot
levels = np.linspace(min(Z), max(Z), 40)
Z = Z.reshape(xx.shape)
    
plt.contourf(xx, yy, Z, cmap=plt.get_cmap('YlOrRd'), alpha=.9,levels=levels)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], 
            alpha=.5, 
            s=70, 
            edgecolors='none', 
            c = 'white',
            cmap = plt.get_cmap('YlOrRd'))
for id in range(X_reduced.shape[0]):
    label = aa_list[id] 
    x = X_reduced[id, 0]
    y = X_reduced[id, 1]
    plt.annotate(label,xy = (x,y), xytext = (-12, -3), textcoords = 'offset points')
plt.show()



In [10]:
#save encoding as a numpy matrix
fname = 'BLOSUM62_5D_encoding'
np.save(fname,X_explicit)

In [11]:
#build a dictionary aa->vector encoding
aa_encoding_dict = {}
for aa, row in zip (aa_list, X_explicit):
    aa_encoding_dict[aa] = list(row)

In [12]:
#save
import pickle
pickle.dump( aa_encoding_dict, open( "BLOSUM62_5D_encoding_dict.p", "wb" ) )
#copy it to my home directory
#...

In [13]:
from urllib import urlretrieve
import pickle
urlretrieve ("http://www.bioinf.uni-freiburg.de/~costa/BLOSUM62_5D_encoding_dict.p", "BLOSUM62_5D_encoding_dict.p")
aa_encoding_dict5D = pickle.load( open( "BLOSUM62_5D_encoding_dict.p", "rb" ) )

In [14]:
prot_id = 'http://www.uniprot.org/uniprot/P12345.fasta'
prot_id = 'http://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:9606&random=yes&format=fasta&limit=3'
print prot_id


http://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:9606&random=yes&format=fasta&limit=3

In [15]:
def compose(url_string):
    from eden.modifier.fasta import fasta_to_fasta
    seqs = fasta_to_fasta(url_string)
    
    from eden.converter.fasta import fasta_to_eden
    graphs = fasta_to_eden(seqs)

    from eden.modifier.graph.vertex_attributes import colorize
    graphs = colorize(graphs, output_attribute = 'level', labels = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X'])
    
    from eden.modifier.graph.vertex_attributes import translate 
    graphs = translate(graphs, label_map = aa_encoding_dict5D, default = [0,0,0,0,0])
    
    return graphs