In [3]:
import os
import sys
import Bio 
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import ExPASy
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
import prody
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator
import pylab as plt
import cPickle as pkl

Superfamily alignment

  1. Got superfamily PF00014 (KU superfamily).
  2. Initial sequence is TD ICKLPKDEGT CRDFILKWYY DPNTKSCARF WYGGCGGNEN KFGSQKECEK VCAPV. (3108 - 3165 from uniprot/P12111 corresponding to the pdb 1KTH)
  3. Ran PBLAST on this sequence (PF00014.fas -- aligned sequences). PF00014_500.fas -- first 500 sequences from the list
  4. Ran ClustalW on PF00014_500.fas (PF00014_500.clustalw)
  5. Downloaded distance matrix for the alignment (PF00014.pim)

In [12]:
MSA_init = AlignIO.read("PF00014_500.clustalw", "clustal")

In [13]:
print MSA_init


SingleLetterAlphabet() alignment with 845 rows and 96 columns
TDRCQLQ-KE-EGLC-----RNF-------IVKWYYDAE---KN...--- gi|556982293|ref|XP_005997469.1|:3558-3614
-DICQLP-KQ-EGSC-----AEF-------ALKWYYDTT---SK...--- gi|432930338|ref|XP_004081431.1|:4241-4293
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|554817341|ref|XP_005919446.1|:2068-2121
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|548433333|ref|XP_005744276.1|:2098-2151
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|584018206|ref|XP_006803826.1|:2113-2166
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|499024864|ref|XP_004563187.1|:2123-2176
-DICKLP-KE-EGTC-----IKF-------VLKWFYDSV---SN...--- gi|597753114|ref|XP_007238398.1|:2293-2346
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571121|ref|XP_009302967.1|:12559-12617
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571119|ref|XP_009302966.1|:12668-12726
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571116|ref|XP_009302965.1|:12695-12753
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571111|ref|XP_009302964.1|:12730-12788
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571109|ref|XP_009302963.1|:12757-12815
-EICQLP-KE-EGAC-----AKF-------VLKWHYDAL---GK...APV gi|642110588|emb|CDQ71231.1|:3265-3323
TDACTLP-KD-EGDC-----RNF-------TLKWFFDGQ---QN...--- gi|573895845|ref|XP_006635665.1|:2650-2703
-DICWLQ-KD-VGTC-----RDF-------VLMWYHDPN---TR...--- gi|675421266|ref|XP_008924155.1|:3030-3082
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250555|ref|XP_008112888.1|:2483-2535
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250549|ref|XP_008112878.1|:2683-2735
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250545|ref|XP_008112875.1|:2884-2936
...
-DMCNLA-VD-SGPC-----RGD-------FRKYYYEPG---LR...--- gi|332020126|gb|EGI60570.1|:2055-2107

Clean alignment


In [15]:
MSA = AlignIO.MultipleSeqAlignment([])

In [23]:
for record in MSA_init:
    if record.seq.find('X')==-1:
        MSA.add_sequence(record.id,str(record.seq))

In [26]:
print MSA


SingleLetterAlphabet() alignment with 843 rows and 96 columns
TDRCQLQ-KE-EGLC-----RNF-------IVKWYYDAE---KN...--- gi|556982293|ref|XP_005997469.1|:3558-3614
-DICQLP-KQ-EGSC-----AEF-------ALKWYYDTT---SK...--- gi|432930338|ref|XP_004081431.1|:4241-4293
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|554817341|ref|XP_005919446.1|:2068-2121
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|548433333|ref|XP_005744276.1|:2098-2151
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|584018206|ref|XP_006803826.1|:2113-2166
-DICQLP-QE-EGTC-----AKF-------VLKWHYDAA---NK...--- gi|499024864|ref|XP_004563187.1|:2123-2176
-DICKLP-KE-EGTC-----IKF-------VLKWFYDSV---SN...--- gi|597753114|ref|XP_007238398.1|:2293-2346
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571121|ref|XP_009302967.1|:12559-12617
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571119|ref|XP_009302966.1|:12668-12726
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571116|ref|XP_009302965.1|:12695-12753
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571111|ref|XP_009302964.1|:12730-12788
-DICKLP-KE-EGSC-----AKF-------VLKWHYDPL---SG...APV gi|688571109|ref|XP_009302963.1|:12757-12815
-EICQLP-KE-EGAC-----AKF-------VLKWHYDAL---GK...APV gi|642110588|emb|CDQ71231.1|:3265-3323
TDACTLP-KD-EGDC-----RNF-------TLKWFFDGQ---QN...--- gi|573895845|ref|XP_006635665.1|:2650-2703
-DICWLQ-KD-VGTC-----RDF-------VLMWYHDPN---TR...--- gi|675421266|ref|XP_008924155.1|:3030-3082
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250555|ref|XP_008112888.1|:2483-2535
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250549|ref|XP_008112878.1|:2683-2735
-DICNLS-KE-AGPC-----RDF-------ILKWFFDTT---TK...--- gi|637250545|ref|XP_008112875.1|:2884-2936
...
-DMCNLA-VD-SGPC-----RGD-------FRKYYYEPG---LR...--- gi|332020126|gb|EGI60570.1|:2055-2107

Computing weights of each sequence

We have distances between each two sequences. For each sequence we compute number of other sequences that have identity score > x


In [27]:
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(MSA)

The identity measure is the following. x -- the fraction of identical positions. Therefore, $v=\sqrt{1-x}$ is the matrix cell value. Or we can rewrite: $x = 1 - v^2$ and therefore $1 - v^2 > x_{thr}$ is the criteria if two sequences are similar


In [6]:
def getNumSimilar(distMatrix, idx, x):
    scores=distMatrix.matrix[idx]
    addScores=[]
    for i in range(idx+1,len(dm.matrix)):
        addScores.append(distMatrix.matrix[i][idx])
    scores = scores+addScores
    return np.sum(np.array(scores)<(1.0-x))

In [7]:
def getSimilarityWeights(distMatrix, x):
    wb=[]
    for idx,name in enumerate(distMatrix.names):
        w = 1.0/getNumSimilar(distMatrix, idx, x)
        wb.append(w)
    return wb

In [28]:
simW = getSimilarityWeights(dm, 0.9)

Removing columns with gaps and translating residues to integers


In [9]:
def renameResidues(strColumn):
    resDict={
             '-':0,
             'A':1,
             'C':2,
             'D':3,
             'E':4,
             'F':5,
             'G':6,
             'H':7,
             'I':8,
             'K':9,
             'L':10,
             'M':11,
             'N':12,
             'P':13,
             'Q':14,
             'R':15,
             'S':16,
             'T':17,
             'V':18,
             'W':19,
             'Y':20
             }
    numColumn=[]
    for symb in strColumn:
        numColumn.append(resDict[symb])
    return numColumn

In [10]:
def removeColumnWithGapsAndRename(alignment, gapThr):
    new_alignment=[]
    B = len(alignment)
    for idx in range(0,alignment.get_alignment_length()):
        col = alignment.get_column(idx)
        numGaps = col.count('-')
        if float(numGaps)/float(B) < gapThr:
            new_alignment.append(renameResidues(col))
    return np.transpose(np.array(new_alignment))

The number of a sequence in the alignment is therefore the first index and the number of position in a sequence is the second


In [29]:
newMSA = removeColumnWithGapsAndRename(MSA,0.7)

In [32]:
print len(newMSA[:,0]), len(newMSA[0,:])


843 55

In [33]:
print newMSA


[[ 3 15  2 ...,  2 18 16]
 [ 3  8  2 ...,  2  0  0]
 [ 3  8  2 ...,  2  6  0]
 ..., 
 [ 3  8  2 ...,  2  6  0]
 [ 3 11  2 ...,  2  0  0]
 [ 3 11  2 ...,  2  0  0]]

Saving results to file


In [31]:
fileOut = open("MSA_weignted.pkl","w")
pkl.dump((newMSA, simW),fileOut)
fileOut.close()

In [34]:
a = np.array([[1,2],[3,4]])
b= np.array([[5,6],[7,8]])

In [36]:
ap = a.reshape((4,))
bp = b.reshape((4,))

In [37]:
ap, bp


Out[37]:
(array([1, 2, 3, 4]), array([5, 6, 7, 8]))

In [39]:
np.concatenate((ap,bp))


Out[39]:
array([1, 2, 3, 4, 5, 6, 7, 8])

Result of the algorithm


In [4]:
fileIn = open("CFN.pkl","r")
CFN = pkl.load(fileIn)
fileIn.close()

In [5]:
for i in range(0,CFN.shape[0]): CFN[i,i] = 0.0 
CFN


Out[5]:
array([[ 0.        ,  0.25737225,  0.10589921, ...,  0.01642261,
         0.03264588, -0.02584258],
       [ 0.25320939,  0.        , -0.06845689, ..., -0.14908647,
         0.22281266,  0.01676454],
       [ 0.09049025, -0.07483189,  0.        , ...,  0.21146779,
        -0.15201165, -0.00706682],
       ..., 
       [-0.00128436, -0.14158841,  0.21210117, ...,  0.        ,
        -0.14729039, -0.11153589],
       [ 0.02589362,  0.22604362, -0.12928917, ..., -0.13043965,
         0.        ,  0.31250605],
       [-0.03592782,  0.02096606,  0.01075284, ..., -0.10062782,
         0.31577373,  0.        ]])

In [7]:
plt.imshow(np.abs(CFN)>0.3)
plt.colorbar()


Out[7]:
<matplotlib.colorbar.Colorbar instance at 0x30e5290>

In [ ]: