In [1]:
from sklearn.cluster import KMeans
import numpy as np
import re
import pandas as pd
import seaborn as sns
import sklearn
%pylab inline
%matplotlib inline
In [2]:
import networkx as nx
In [3]:
# Root directory of the PixelDB dataset.
# NOTE(review): hardcoded absolute local path — consider making this configurable.
path = "/media/vince/Postdoc/PixelDB"
In [4]:
# Read all pairwise TM-alignment records; one record per line.
# `with` guarantees the file handle is closed even if readlines() raises.
with open(path + "/other_files/all_pairwise_TM.dat") as f:
    content = f.readlines()
In [5]:
# Load the maximal reported length per PDB entry.
# 3DCOMB was making some weird results (inconsistent lengths across pairs).
# It is being replaced by DeepAlign, which apparently doesn't do that.
# Expected line format (6 space-separated fields):
#   id1 id2 <field> ALI:<n> TOT1:<n> TOT2:<n>
PDBLen = dict()
for l in content:
    sp = l.split(" ")
    if len(sp) != 6:
        # Malformed line: report and skip.
        print(sp)
        continue
    # Parse the two total chain lengths reported for this pair.
    # (The ALI: field is not needed here; it is used in the next cell.)
    alllen = [0, 0]
    m = re.search("TOT1:(\d+)", sp[4])
    alllen[0] = int(m.group(1))
    m = re.search("TOT2:(\d+)", sp[5])
    alllen[1] = int(m.group(1))
    # Keep the largest length ever reported for each structure id.
    for i in range(0, 2):
        if sp[i] not in PDBLen:
            PDBLen[sp[i]] = 0
        if PDBLen[sp[i]] < alllen[i]:
            PDBLen[sp[i]] = alllen[i]
In [6]:
# Build the pairwise TM-score table and the high-similarity graph.
AllTM = dict()  # AllTM[a][b] = worst-case TM score between structures a and b
G = nx.Graph()
for l in content:
    sp = l.split(" ")
    if len(sp) != 6:
        # Malformed line: report and skip (same format check as above).
        print(sp)
        continue
    m = re.search("ALI:(\d+)", sp[3])
    match = int(m.group(1))
    m = re.search("TOT1:(\d+)", sp[4])
    len1 = int(m.group(1))
    m = re.search("TOT2:(\d+)", sp[5])
    len2 = int(m.group(1))
    # Check len, some old 3DCOMB artefact, leave it for the show:
    # force both lengths to the per-structure maximum computed above.
    if len1 != PDBLen[sp[0]]:
        print(len1, PDBLen[sp[0]], sp)
        len1 = PDBLen[sp[0]]
    if len2 != PDBLen[sp[1]]:
        print(len2, PDBLen[sp[1]], sp)
        len2 = PDBLen[sp[1]]
    # We want the worst TM: aligned residues over the LONGER chain.
    maxl = np.max([len1, len2])
    if maxl == 0:
        # Degenerate record: avoid division by zero and force TM to 0.
        maxl = 10
        match = 0
    TM = float(match) / maxl
    # Register both structures in the matrix/graph on first sight.
    for i in range(0, 2):
        if sp[i] not in AllTM:
            G.add_node(sp[i])
            AllTM[sp[i]] = dict()
            AllTM[sp[i]][sp[i]] = 1.0
    # Self comparison: similarity is 1 by definition.
    if sp[1] == sp[0]:
        AllTM[sp[0]][sp[1]] = 1.0
        continue
    # Sanity check: warn if this pair was already seen (possible duplicates).
    if sp[1] in AllTM[sp[0]]:
        print(TM, AllTM[sp[0]][sp[1]])
    if sp[0] in AllTM[sp[1]]:
        print(TM, AllTM[sp[1]][sp[0]])
    # High-similarity pairs become edges; nx.Graph is undirected, so a
    # single add_edge suffices (the original reversed add was a no-op).
    if TM > 0.8:
        G.add_edge(sp[0], sp[1])
    AllTM[sp[0]][sp[1]] = TM
    AllTM[sp[1]][sp[0]] = TM
In [7]:
# Keep a pristine copy of the similarity graph: the clustering loop below
# destructively removes nodes from its working graph.
Gi = G.copy()
In [8]:
# Build the distance DataFrame: distance = 1 - TM score, with pairs that
# never appeared in the input getting the maximal distance of 1.
SimDF = pd.DataFrame(AllTM).fillna(0)
DistDF = 1 - SimDF
print(len(DistDF))
In [ ]:
# Greedy clique clustering: repeatedly extract the largest clique from the
# similarity graph and remove its members, until no nodes remain.
G = Gi.copy()
AllCluster = []  # list of cliques (each a list of node names)
AllName = []     # all clustered node names, in extraction order
ToPrint = []     # members of large (>10 nodes) cliques, for the heatmap below
for i in range(0, len(G.nodes())):
    # max() returns the first clique of maximal size, matching the
    # original first-strictly-greater scan over nx.find_cliques(G).
    MaxClique = max(nx.find_cliques(G), key=len)
    if len(MaxClique) > 10:
        print(len(MaxClique))
        ToPrint += MaxClique
    AllCluster.append(MaxClique)
    AllName += MaxClique
    for no in MaxClique:
        G.remove_node(no)
    if len(G.nodes()) == 0:
        break
In [ ]:
# Distance heatmap over every member of the large cliques.
# Set the default figure size explicitly instead of relying on the
# figsize() helper injected by the deprecated %pylab magic; like figsize(),
# this persists for subsequent figures.
plt.rcParams["figure.figsize"] = (10, 10)
sns.heatmap(DistDF[ToPrint].transpose()[ToPrint], vmax=1.0, vmin=0)
plt.show()
In [ ]:
# Visualize at most ONE pair of big cliques: the first clique with >= 20
# members against the first distinct clique that is also large but not
# bigger than it (both breaks fire after the first outer candidate).
for clu_a in AllCluster:
    if len(clu_a) < 20:
        continue
    for clu_b in AllCluster:
        comparable = (
            clu_b != clu_a
            and len(clu_b) >= 20
            and len(clu_b) <= len(clu_a)
        )
        if not comparable:
            continue
        members = clu_a + clu_b
        sns.heatmap(DistDF[members].transpose()[members], vmax=1.0, vmin=0)
        plt.show()
        break
    break
In [ ]:
# Report the number of clusters found, plus the last-extracted (smallest)
# and first-extracted (largest) cluster contents.
print(len(AllCluster))
print(AllCluster[-1])
print(AllCluster[0])
In [12]:
# Write the clusters to disk, one line per cluster:
#   "<cluster_number> <member1> <member2> ..."
# BUG FIX: the original concatenated path+"other_files/..." without the "/"
# separator (every other cell uses path+"/other_files/..."), so the file was
# written to a sibling "...PixelDBother_files" path instead.
with open(path + "/other_files/cluster.dat", "w") as f:
    CluNum = 1
    for clu in AllCluster:
        f.write(str(CluNum))
        for c in clu:
            f.write(" " + str(c))
        f.write("\n")
        CluNum += 1
In [ ]: