In [1]:
%pylab inline
import pylab as plt
In [2]:
#pretty printing of tables
from ipy_table import *
def prep_table(header, K):
mat = [header]
for row in K:
mat.append(list(row))
return mat
In [3]:
#parse BLOSUM62 from NCBI
def MakeProteinSimilarityMatrix():
url = "http://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt"
import requests
f = requests.get(url).text.split('\n')
m=list()
for line in f:
l = line.strip()
if l and l[0] != '#':
m.append(l)
aa = {i:str(c) for i,c in enumerate(m[0].split()[:-1])}
dim = len(aa)
import numpy as np
B = np.zeros((dim, dim))
for i,row in enumerate(m):
if i > 0 and i < 24:
vec = row.split()
nvec = [int(x) for x in vec[1:-1]]
for j,x in enumerate(nvec):
B[i-1,j]=x
return aa, B
In [4]:
#print BLOSUM62
print 'Blosum matrix'
aa_dict, B = MakeProteinSimilarityMatrix()
aa_list = [aa_dict[key] for key in aa_dict]
print 'Num aminoacids: ', len(aa_dict)
print aa_list
header = range(len(aa_dict))
header = aa_list
mat=prep_table(header, B)
make_table(mat)
apply_theme('basic')
set_global_style(float_format = '%d')
Out[4]:
In [5]:
#extract rank matrix
import numpy as np
def rank_matrix(B):
idxs= np.argsort(-B, axis=1)
for j, row in enumerate(idxs):
ranks = sorted([(x,i) for i,x in enumerate(row)])
pos = [i for x,i in ranks]
if j == 0:
D= np.array(pos)
else:
D = np.vstack([D,pos])
E = D.astype( float )
#make symmetric
D = (E + E.T)/2
return D
In [6]:
D = rank_matrix(B)
In [7]:
#pretty print rank matrix
print 'Interpretation: element in column j is at distance A_ij from element in row i'
#header = range(D.shape[0])
mat=prep_table(header, D)
make_table(mat)
apply_theme('basic')
set_global_style(float_format = '%0.1f')
Out[7]:
In [8]:
#compute vector representation of aa such that their rank distance is respected
n_components = 5
from sklearn.manifold import MDS
feature_map = MDS(n_components=n_components, n_init=1000, max_iter=1000,
n_jobs = -1, dissimilarity='precomputed', random_state = 2)
X_explicit=feature_map.fit_transform(D)
In [9]:
#plot 2D embedding of aa encodings
from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components=2)
X_reduced = pca.fit_transform(X_explicit)
size=11
plt.figure(figsize=(size,size))
#make mesh
h = .1 # step size in the mesh
b = 1 # border size
x_min, x_max = X_reduced[:, 0].min() - b, X_reduced[:, 0].max() + b
y_min, y_max = X_reduced[:, 1].min() - b, X_reduced[:, 1].max() + b
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
from sklearn.svm import OneClassSVM
#induce a predictive model
clf = OneClassSVM(gamma = 0.2, nu = 0.01)
clf.fit(X_reduced)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
if hasattr(clf, "decision_function"):
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
else:
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# Put the result into a color plot
levels = np.linspace(min(Z), max(Z), 40)
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.get_cmap('YlOrRd'), alpha=.9,levels=levels)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1],
alpha=.5,
s=70,
edgecolors='none',
c = 'white',
cmap = plt.get_cmap('YlOrRd'))
for id in range(X_reduced.shape[0]):
label = aa_list[id]
x = X_reduced[id, 0]
y = X_reduced[id, 1]
plt.annotate(label,xy = (x,y), xytext = (-12, -3), textcoords = 'offset points')
plt.show()
In [10]:
#save encoding as a numpy matrix
fname = 'BLOSUM62_5D_encoding'
np.save(fname,X_explicit)
In [11]:
#build a dictionary aa->vector encoding
aa_encoding_dict = {}
for aa, row in zip (aa_list, X_explicit):
aa_encoding_dict[aa] = list(row)
In [12]:
#save
import pickle
pickle.dump( aa_encoding_dict, open( "BLOSUM62_5D_encoding_dict.p", "wb" ) )
#copy it to my home directory
#...
In [13]:
from urllib import urlretrieve
import pickle
urlretrieve ("http://www.bioinf.uni-freiburg.de/~costa/BLOSUM62_5D_encoding_dict.p", "BLOSUM62_5D_encoding_dict.p")
aa_encoding_dict5D = pickle.load( open( "BLOSUM62_5D_encoding_dict.p", "rb" ) )
In [14]:
prot_id = 'http://www.uniprot.org/uniprot/P12345.fasta'
prot_id = 'http://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:9606&random=yes&format=fasta&limit=3'
print prot_id
In [15]:
def compose(url_string):
from eden.modifier.fasta import fasta_to_fasta
seqs = fasta_to_fasta(url_string)
from eden.converter.fasta import fasta_to_eden
graphs = fasta_to_eden(seqs)
from eden.modifier.graph.vertex_attributes import colorize
graphs = colorize(graphs, output_attribute = 'level', labels = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X'])
from eden.modifier.graph.vertex_attributes import translate
graphs = translate(graphs, label_map = aa_encoding_dict5D, default = [0,0,0,0,0])
return graphs