In [3]:

    
import os
import sys
import Bio 
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import ExPASy
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
import prody
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator
import pylab as plt
import cPickle as pkl

Superfamily alignment

Got superfamily PF00014 (KU superfamily).
Initial sequence is TD ICKLPKDEGT CRDFILKWYY DPNTKSCARF WYGGCGGNEN KFGSQKECEK VCAPV. (3108 - 3165 from uniprot/P12111 corresponding to the pdb 1KTH)
Ran PBLAST on this sequence (PF00014.fas -- aligned sequences). PF00014_500.fas -- first 500 sequences from the list
Ran ClustalW on PF00014_500.fas (PF00014_500.clustalw)
Downloaded distance matrix for the alignment (PF00014.pim)



In [12]:

    
MSA_init = AlignIO.read("PF00014_500.clustalw", "clustal")



In [13]:

    
print MSA_init









    



SingleLetterAlphabet() alignment with 845 rows and 96 columns
TDRCQLQ-KE-EGLC-----RNF-------IVKWYYDAE---KN...--- gi|556982293|ref|XP_005997469.1|:3558-3614
-DICQLP-KQ-EGSC-----AEF-------ALKWYYDTT---SK...--- gi|432930338|ref|XP_004081431.1|:4241-4293
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|554817341|ref|XP_005919446.1|:2068-2121
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|548433333|ref|XP_005744276.1|:2098-2151
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|584018206|ref|XP_006803826.1|:2113-2166
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|499024864|ref|XP_004563187.1|:2123-2176
-DICKLP-KE-EGTC-----IKF-------VLKWFYDSV---SN...--- gi|597753114|ref|XP_007238398.1|:2293-2346
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571121|ref|XP_009302967.1|:12559-12617
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571119|ref|XP_009302966.1|:12668-12726
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571116|ref|XP_009302965.1|:12695-12753
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571111|ref|XP_009302964.1|:12730-12788
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571109|ref|XP_009302963.1|:12757-12815
-EICQLP-KE-EGAC-----AKF-------VLKWHYDAL---GK...APV gi|642110588|emb|CDQ71231.1|:3265-3323
TDACTLP-KD-EGDC-----RNF-------TLKWFFDGQ---QN...--- gi|573895845|ref|XP_006635665.1|:2650-2703
-DICWLQ-KD-VGTC-----RDF-------VLMWYHDPN---TR...--- gi|675421266|ref|XP_008924155.1|:3030-3082
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250555|ref|XP_008112888.1|:2483-2535
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250549|ref|XP_008112878.1|:2683-2735
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250545|ref|XP_008112875.1|:2884-2936
...
-DMCNLA-VD-SGPC-----RGD-------FRKYYYEPG---LR...--- gi|332020126|gb|EGI60570.1|:2055-2107

Clean alignment



In [15]:

    
MSA = AlignIO.MultipleSeqAlignment([])



In [23]:

    
for record in MSA_init:
    if record.seq.find('X')==-1:
        MSA.add_sequence(record.id,str(record.seq))



In [26]:

    
print MSA









    



SingleLetterAlphabet() alignment with 843 rows and 96 columns
TDRCQLQ-KE-EGLC-----RNF-------IVKWYYDAE---KN...--- gi|556982293|ref|XP_005997469.1|:3558-3614
-DICQLP-KQ-EGSC-----AEF-------ALKWYYDTT---SK...--- gi|432930338|ref|XP_004081431.1|:4241-4293
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|554817341|ref|XP_005919446.1|:2068-2121
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|548433333|ref|XP_005744276.1|:2098-2151
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|584018206|ref|XP_006803826.1|:2113-2166
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|499024864|ref|XP_004563187.1|:2123-2176
-DICKLP-KE-EGTC-----IKF-------VLKWFYDSV---SN...--- gi|597753114|ref|XP_007238398.1|:2293-2346
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571121|ref|XP_009302967.1|:12559-12617
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571119|ref|XP_009302966.1|:12668-12726
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571116|ref|XP_009302965.1|:12695-12753
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571111|ref|XP_009302964.1|:12730-12788
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571109|ref|XP_009302963.1|:12757-12815
-EICQLP-KE-EGAC-----AKF-------VLKWHYDAL---GK...APV gi|642110588|emb|CDQ71231.1|:3265-3323
TDACTLP-KD-EGDC-----RNF-------TLKWFFDGQ---QN...--- gi|573895845|ref|XP_006635665.1|:2650-2703
-DICWLQ-KD-VGTC-----RDF-------VLMWYHDPN---TR...--- gi|675421266|ref|XP_008924155.1|:3030-3082
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250555|ref|XP_008112888.1|:2483-2535
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250549|ref|XP_008112878.1|:2683-2735
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250545|ref|XP_008112875.1|:2884-2936
...
-DMCNLA-VD-SGPC-----RGD-------FRKYYYEPG---LR...--- gi|332020126|gb|EGI60570.1|:2055-2107

Computing weights of each sequence

We have distances between each two sequences. For each sequence we compute number of other sequences that have identity score > x



In [27]:

    
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(MSA)

The identity measure is the following. x -- the fraction of identical positions. Therefore, $v=\sqrt{1-x}$ is the matrix cell value. Or we can rewrite: $x = 1 - v^2$ and therefore $1 - v^2 > x_{thr}$ is the criteria if two sequences are similar



In [6]:

    
def getNumSimilar(distMatrix, idx, x):
    scores=distMatrix.matrix[idx]
    addScores=[]
    for i in range(idx+1,len(dm.matrix)):
        addScores.append(distMatrix.matrix[i][idx])
    scores = scores+addScores
    return np.sum(np.array(scores)<(1.0-x))



In [7]:

    
def getSimilarityWeights(distMatrix, x):
    wb=[]
    for idx,name in enumerate(distMatrix.names):
        w = 1.0/getNumSimilar(distMatrix, idx, x)
        wb.append(w)
    return wb



In [28]:

    
simW = getSimilarityWeights(dm, 0.9)

Removing columns with gaps and translating residues to integers



In [9]:

    
def renameResidues(strColumn):
    resDict={
             '-':0,
             'A':1,
             'C':2,
             'D':3,
             'E':4,
             'F':5,
             'G':6,
             'H':7,
             'I':8,
             'K':9,
             'L':10,
             'M':11,
             'N':12,
             'P':13,
             'Q':14,
             'R':15,
             'S':16,
             'T':17,
             'V':18,
             'W':19,
             'Y':20
             }
    numColumn=[]
    for symb in strColumn:
        numColumn.append(resDict[symb])
    return numColumn



In [10]:

    
def removeColumnWithGapsAndRename(alignment, gapThr):
    new_alignment=[]
    B = len(alignment)
    for idx in range(0,alignment.get_alignment_length()):
        col = alignment.get_column(idx)
        numGaps = col.count('-')
        if float(numGaps)/float(B) < gapThr:
            new_alignment.append(renameResidues(col))
    return np.transpose(np.array(new_alignment))

The number of a sequence in the alignment is therefore the first index and the number of position in a sequence is the second



In [29]:

    
newMSA = removeColumnWithGapsAndRename(MSA,0.7)



In [32]:

    
print len(newMSA[:,0]), len(newMSA[0,:])



In [33]:

    
print newMSA









    



[[ 3 15  2 ...,  2 18 16]
 [ 3  8  2 ...,  2  0  0]
 [ 3  8  2 ...,  2  6  0]
 ..., 
 [ 3  8  2 ...,  2  6  0]
 [ 3 11  2 ...,  2  0  0]
 [ 3 11  2 ...,  2  0  0]]

Saving results to file



In [31]:

    
fileOut = open("MSA_weignted.pkl","w")
pkl.dump((newMSA, simW),fileOut)
fileOut.close()



In [34]:

    
a = np.array([[1,2],[3,4]])
b= np.array([[5,6],[7,8]])



In [36]:

    
ap = a.reshape((4,))
bp = b.reshape((4,))



In [37]:

    
ap, bp









    Out[37]:





(array([1, 2, 3, 4]), array([5, 6, 7, 8]))



In [39]:

    
np.concatenate((ap,bp))









    Out[39]:





array([1, 2, 3, 4, 5, 6, 7, 8])

Result of the algorithm



In [4]:

    
fileIn = open("CFN.pkl","r")
CFN = pkl.load(fileIn)
fileIn.close()



In [5]:

    
for i in range(0,CFN.shape[0]): CFN[i,i] = 0.0 
CFN









    Out[5]:





array([[ 0.        ,  0.25737225,  0.10589921, ...,  0.01642261,
         0.03264588, -0.02584258],
       [ 0.25320939,  0.        , -0.06845689, ..., -0.14908647,
         0.22281266,  0.01676454],
       [ 0.09049025, -0.07483189,  0.        , ...,  0.21146779,
        -0.15201165, -0.00706682],
       ..., 
       [-0.00128436, -0.14158841,  0.21210117, ...,  0.        ,
        -0.14729039, -0.11153589],
       [ 0.02589362,  0.22604362, -0.12928917, ..., -0.13043965,
         0.        ,  0.31250605],
       [-0.03592782,  0.02096606,  0.01075284, ..., -0.10062782,
         0.31577373,  0.        ]])



In [7]:

    
plt.imshow(np.abs(CFN)>0.3)
plt.colorbar()









    Out[7]:





<matplotlib.colorbar.Colorbar instance at 0x30e5290>



In [ ]: