In [3]:
import os
import sys
import Bio
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import ExPASy
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
import prody
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator
import pylab as plt
import cPickle as pkl
In [12]:
MSA_init = AlignIO.read("PF00014_500.clustalw", "clustal")
In [13]:
print MSA_init
In [15]:
MSA = AlignIO.MultipleSeqAlignment([])
In [23]:
for record in MSA_init:
if record.seq.find('X')==-1:
MSA.add_sequence(record.id,str(record.seq))
In [26]:
print MSA
We have distances between each two sequences. For each sequence we compute number of other sequences that have identity score > x
In [27]:
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(MSA)
The identity measure is the following. x -- the fraction of identical positions. Therefore, $v=\sqrt{1-x}$ is the matrix cell value. Or we can rewrite: $x = 1 - v^2$ and therefore $1 - v^2 > x_{thr}$ is the criteria if two sequences are similar
In [6]:
def getNumSimilar(distMatrix, idx, x):
scores=distMatrix.matrix[idx]
addScores=[]
for i in range(idx+1,len(dm.matrix)):
addScores.append(distMatrix.matrix[i][idx])
scores = scores+addScores
return np.sum(np.array(scores)<(1.0-x))
In [7]:
def getSimilarityWeights(distMatrix, x):
wb=[]
for idx,name in enumerate(distMatrix.names):
w = 1.0/getNumSimilar(distMatrix, idx, x)
wb.append(w)
return wb
In [28]:
simW = getSimilarityWeights(dm, 0.9)
In [9]:
def renameResidues(strColumn):
resDict={
'-':0,
'A':1,
'C':2,
'D':3,
'E':4,
'F':5,
'G':6,
'H':7,
'I':8,
'K':9,
'L':10,
'M':11,
'N':12,
'P':13,
'Q':14,
'R':15,
'S':16,
'T':17,
'V':18,
'W':19,
'Y':20
}
numColumn=[]
for symb in strColumn:
numColumn.append(resDict[symb])
return numColumn
In [10]:
def removeColumnWithGapsAndRename(alignment, gapThr):
new_alignment=[]
B = len(alignment)
for idx in range(0,alignment.get_alignment_length()):
col = alignment.get_column(idx)
numGaps = col.count('-')
if float(numGaps)/float(B) < gapThr:
new_alignment.append(renameResidues(col))
return np.transpose(np.array(new_alignment))
The number of a sequence in the alignment is therefore the first index and the number of position in a sequence is the second
In [29]:
newMSA = removeColumnWithGapsAndRename(MSA,0.7)
In [32]:
print len(newMSA[:,0]), len(newMSA[0,:])
In [33]:
print newMSA
In [31]:
fileOut = open("MSA_weignted.pkl","w")
pkl.dump((newMSA, simW),fileOut)
fileOut.close()
In [34]:
a = np.array([[1,2],[3,4]])
b= np.array([[5,6],[7,8]])
In [36]:
ap = a.reshape((4,))
bp = b.reshape((4,))
In [37]:
ap, bp
Out[37]:
In [39]:
np.concatenate((ap,bp))
Out[39]:
In [4]:
fileIn = open("CFN.pkl","r")
CFN = pkl.load(fileIn)
fileIn.close()
In [5]:
for i in range(0,CFN.shape[0]): CFN[i,i] = 0.0
CFN
Out[5]:
In [7]:
plt.imshow(np.abs(CFN)>0.3)
plt.colorbar()
Out[7]:
In [ ]: