In [1]:
from sklearn.cluster import KMeans
import numpy as np
import re
import pandas as pd
import seaborn as sns
import sklearn
%pylab inline
%matplotlib inline
In [2]:
import networkx as nx
In [3]:
# Root directory of the PixelDB dataset.
# NOTE(review): hardcoded absolute local path — consider making this configurable.
path = "/media/vince/Postdoc/PixelDB"
In [4]:
# Read all pairwise TM-alignment records; one record per line.
# `with` guarantees the file handle is closed even if readlines() raises.
with open(path + "/other_files/all_pairwise_TM.dat") as f:
    content = f.readlines()
In [5]:
# Load the maximal reported length per PDB entry.
# 3DCOMB was making some weird results (inconsistent lengths across pairs).
# It is being replaced by DeepAlign, which apparently doesn't do that.
# Expected line format (6 space-separated fields):
#   id1 id2 <field> ALI:<n> TOT1:<n> TOT2:<n>
PDBLen = dict()
for l in content:
    sp = l.split(" ")
    if len(sp) != 6:
        # Malformed line: report and skip.
        print(sp)
        continue
    # Parse the two total chain lengths reported for this pair.
    # (The ALI: field is not needed here; it is used in the next cell.)
    alllen = [0, 0]
    m = re.search("TOT1:(\d+)", sp[4])
    alllen[0] = int(m.group(1))
    m = re.search("TOT2:(\d+)", sp[5])
    alllen[1] = int(m.group(1))
    # Keep the largest length ever reported for each structure id.
    for i in range(0, 2):
        if sp[i] not in PDBLen:
            PDBLen[sp[i]] = 0
        if PDBLen[sp[i]] < alllen[i]:
            PDBLen[sp[i]] = alllen[i]
In [6]:
# Build the pairwise TM-score table and the high-similarity graph.
AllTM = dict()  # AllTM[a][b] = worst-case TM score between structures a and b
G = nx.Graph()
for l in content:
    sp = l.split(" ")
    if len(sp) != 6:
        # Malformed line: report and skip (same format check as above).
        print(sp)
        continue
    m = re.search("ALI:(\d+)", sp[3])
    match = int(m.group(1))
    m = re.search("TOT1:(\d+)", sp[4])
    len1 = int(m.group(1))
    m = re.search("TOT2:(\d+)", sp[5])
    len2 = int(m.group(1))
    # Check len, some old 3DCOMB artefact, leave it for the show:
    # force both lengths to the per-structure maximum computed above.
    if len1 != PDBLen[sp[0]]:
        print(len1, PDBLen[sp[0]], sp)
        len1 = PDBLen[sp[0]]
    if len2 != PDBLen[sp[1]]:
        print(len2, PDBLen[sp[1]], sp)
        len2 = PDBLen[sp[1]]
    # We want the worst TM: aligned residues over the LONGER chain.
    maxl = np.max([len1, len2])
    if maxl == 0:
        # Degenerate record: avoid division by zero and force TM to 0.
        maxl = 10
        match = 0
    TM = float(match) / maxl
    # Register both structures in the matrix/graph on first sight.
    for i in range(0, 2):
        if sp[i] not in AllTM:
            G.add_node(sp[i])
            AllTM[sp[i]] = dict()
            AllTM[sp[i]][sp[i]] = 1.0
    # Self comparison: similarity is 1 by definition.
    if sp[1] == sp[0]:
        AllTM[sp[0]][sp[1]] = 1.0
        continue
    # Sanity check: warn if this pair was already seen (possible duplicates).
    if sp[1] in AllTM[sp[0]]:
        print(TM, AllTM[sp[0]][sp[1]])
    if sp[0] in AllTM[sp[1]]:
        print(TM, AllTM[sp[1]][sp[0]])
    # High-similarity pairs become edges; nx.Graph is undirected, so a
    # single add_edge suffices (the original reversed add was a no-op).
    if TM > 0.8:
        G.add_edge(sp[0], sp[1])
    AllTM[sp[0]][sp[1]] = TM
    AllTM[sp[1]][sp[0]] = TM
In [7]:
# Keep a pristine copy of the similarity graph: the clustering loop below
# destructively removes nodes from its working graph.
Gi = G.copy()
In [8]:
# Build the distance DataFrame: distance = 1 - TM score, with pairs that
# never appeared in the input getting the maximal distance of 1.
SimDF = pd.DataFrame(AllTM).fillna(0)
DistDF = 1 - SimDF
print(len(DistDF))
In [ ]:
# Greedy clique clustering: repeatedly extract the largest clique from the
# similarity graph and remove its members, until no nodes remain.
G = Gi.copy()
AllCluster = []  # list of cliques (each a list of node names)
AllName = []     # all clustered node names, in extraction order
ToPrint = []     # members of large (>10 nodes) cliques, for the heatmap below
for i in range(0, len(G.nodes())):
    # max() returns the first clique of maximal size, matching the
    # original first-strictly-greater scan over nx.find_cliques(G).
    MaxClique = max(nx.find_cliques(G), key=len)
    if len(MaxClique) > 10:
        print(len(MaxClique))
        ToPrint += MaxClique
    AllCluster.append(MaxClique)
    AllName += MaxClique
    for no in MaxClique:
        G.remove_node(no)
    if len(G.nodes()) == 0:
        break
In [ ]:
# Distance heatmap over every member of the large cliques.
# Set the default figure size explicitly instead of relying on the
# figsize() helper injected by the deprecated %pylab magic; like figsize(),
# this persists for subsequent figures.
plt.rcParams["figure.figsize"] = (10, 10)
sns.heatmap(DistDF[ToPrint].transpose()[ToPrint], vmax=1.0, vmin=0)
plt.show()
In [ ]:
# Visualize at most ONE pair of big cliques: the first clique with >= 20
# members against the first distinct clique that is also large but not
# bigger than it (both breaks fire after the first outer candidate).
for clu_a in AllCluster:
    if len(clu_a) < 20:
        continue
    for clu_b in AllCluster:
        comparable = (
            clu_b != clu_a
            and len(clu_b) >= 20
            and len(clu_b) <= len(clu_a)
        )
        if not comparable:
            continue
        members = clu_a + clu_b
        sns.heatmap(DistDF[members].transpose()[members], vmax=1.0, vmin=0)
        plt.show()
        break
    break
In [ ]:
# Report the number of clusters found, plus the last-extracted (smallest)
# and first-extracted (largest) cluster contents.
print(len(AllCluster))
print(AllCluster[-1])
print(AllCluster[0])
In [12]:
# Write the clusters to disk, one line per cluster:
#   "<cluster_number> <member1> <member2> ..."
# BUG FIX: the original concatenated path+"other_files/..." without the "/"
# separator (every other cell uses path+"/other_files/..."), so the file was
# written to a sibling "...PixelDBother_files" path instead.
with open(path + "/other_files/cluster.dat", "w") as f:
    CluNum = 1
    for clu in AllCluster:
        f.write(str(CluNum))
        for c in clu:
            f.write(" " + str(c))
        f.write("\n")
        CluNum += 1
In [ ]: