In [1]:
from sklearn.cluster import KMeans
import numpy as np
import re
import pandas as pd
import seaborn as sns
import sklearn
%pylab inline
%matplotlib inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import networkx as nx

In [3]:
# Root directory of the PixelDB data set (no trailing slash; later cells append
# "/other_files/...").  The default is the original, machine-specific mount
# point; export PIXELDB_PATH before starting the kernel to run the notebook
# elsewhere without editing it.
import os
path = os.environ.get("PIXELDB_PATH", "/media/vince/Postdoc/PixelDB")

In [4]:
# Read every line of the all-vs-all pairwise alignment table into memory.
# The context manager closes the handle even if reading fails part-way.
with open(path+"/other_files/all_pairwise_TM.dat") as f:
    content = f.readlines()

In [5]:
# Load the maximal length reported for every chain in the pairwise table.
# 3DCOMB was producing some inconsistent lengths; it is being replaced by
# DeepAlign (written "deepaling" in the original note), which apparently
# does not do that.

# Raw strings keep "\d" a regex digit class instead of an invalid string escape.
TOT_PATTERNS = (re.compile(r"TOT1:(\d+)"), re.compile(r"TOT2:(\d+)"))

PDBLen = dict()
for l in content:
    sp = l.split(" ")
    if (len(sp) != 6):
        # Not a 6-field record: show it and skip it.
        print(sp)
        continue

    # Fields 4 and 5 hold the total length of the first and second chain.
    # (The ALI field is not needed here; it is parsed where the score is built.)
    alllen = []
    for field, pattern in zip(sp[4:6], TOT_PATTERNS):
        m = pattern.search(field)
        if m is None:
            # Fail with the offending record instead of an AttributeError on None.
            raise ValueError("Cannot read chain length from %r in line %r" % (field, l))
        alllen.append(int(m.group(1)))

    # Remember the largest length seen so far for each of the two chain names.
    for name, length in zip(sp[:2], alllen):
        PDBLen[name] = max(PDBLen.get(name, 0), length)

In [6]:
# Build the pairwise similarity table (AllTM) and the similarity graph (G).
# The score of a pair is (aligned residues) / (length of the longer chain);
# chains scoring above TM_EDGE_CUTOFF are joined by an edge in G.
TM_EDGE_CUTOFF = 0.8

# Raw strings keep "\d" a regex digit class instead of an invalid string escape.
ALI_RE = re.compile(r"ALI:(\d+)")
TOT1_RE = re.compile(r"TOT1:(\d+)")
TOT2_RE = re.compile(r"TOT2:(\d+)")

AllTM = dict() #All TM
G=nx.Graph()
for l in content:
    sp = l.split(" ")
    if (len(sp) != 6):
        # Not a 6-field record: show it and skip it.
        print(sp)
        continue
    match = int(ALI_RE.search(sp[3]).group(1))
    len1 = int(TOT1_RE.search(sp[4]).group(1))
    len2 = int(TOT2_RE.search(sp[5]).group(1))

    # Check lengths against the per-chain maximum (old 3DCOMB artefact, left in
    # for visibility): report any disagreement and use the recorded maximum.
    if (len1 != PDBLen[sp[0]]):
        print(len1,PDBLen[sp[0]],sp)
        len1 = PDBLen[sp[0]]
    if (len2 != PDBLen[sp[1]]):
        print(len2 ,PDBLen[sp[1]],sp)
        len2 = PDBLen[sp[1]]

    # We want the worst score: dividing by the longer chain gives the smaller
    # of the two possible ratios.  Two zero-length chains score 0.
    maxl = max(len1, len2)
    TM = float(match) / maxl if maxl else 0.0

    # First sighting of a chain: add its graph node and a self-similarity of 1.
    for name in sp[:2]:
        if name not in AllTM:
            G.add_node(name)
            AllTM[name] = {name: 1.0}
    if sp[1] == sp[0]:
        AllTM[sp[0]][sp[1]] = 1.0
        continue

    # A pair that was already scored is reported with its earlier value
    # before being overwritten below.
    if sp[1] in AllTM[sp[0]]:
        print(TM,AllTM[sp[0]][sp[1]])
    if sp[0] in AllTM[sp[1]]:
        print(TM,AllTM[sp[1]][sp[0]])

    # G is undirected, so one add_edge call covers both directions.
    if TM > TM_EDGE_CUTOFF:
        G.add_edge(sp[0],sp[1])
    AllTM[sp[0]][sp[1]] = TM
    AllTM[sp[1]][sp[0]] = TM

In [7]:
# Keep an untouched snapshot of the similarity graph.  The clustering cell
# starts from `Gi.copy()` and removes nodes from its working copy, so this
# snapshot lets that cell be re-run without rebuilding the graph.
Gi = G.copy()

In [8]:
# Build the distance DataFrame: pairs that were never scored count as
# similarity 0, and distance is 1 - similarity.
DistDF = pd.DataFrame(AllTM).fillna(0).rsub(1)
print(DistDF.shape[0])


2536

In [9]:
# Greedy clique partition of the similarity graph: repeatedly take the largest
# maximal clique of what is left, record it as a cluster, and delete its nodes,
# until the working copy of the graph is empty.
#   AllCluster - list of clusters (each a list of chain names), in pick order
#   AllName / ToPrint - the same chain names flattened, in pick order
G = Gi.copy()
AllCluster = []
AllName = []
ToPrint = []
while G.number_of_nodes() > 0:
    # Consume the clique generator directly instead of materialising every
    # maximal clique; max() keeps the first clique of the largest size.
    MaxClique = max(nx.find_cliques(G), key=len)
    if len(MaxClique) > 10:
        # Only report the larger clusters to keep the output short.
        print(len(MaxClique))
    ToPrint += MaxClique
    AllCluster.append(MaxClique)
    AllName += MaxClique
    # Every clique holds at least one node, so the loop always terminates.
    G.remove_nodes_from(MaxClique)


380
197
138
101
62
41
38
35
32
32
31
31
31
29
27
26
26
25
22
22
19
19
17
16
15
15
14
12
12
12
11
11
11
11

In [10]:
# Distance heatmap of all chains, rows and columns ordered cluster by cluster.
figsize(10, 10)
clustered_view = DistDF.loc[ToPrint, ToPrint].T
sns.heatmap(clustered_view, vmin=0, vmax=1.0)
plt.show()



In [11]:
# Draw one joint distance heatmap for a pair of large clusters (20+ members):
# the first large cluster, paired with the first other large cluster that is
# not bigger than it.  Nothing is drawn when no such pair exists.
large_clusters = [clu for clu in AllCluster if not len(clu) < 20]
if large_clusters:
    clu1 = large_clusters[0]
    clu2 = next(
        (other for other in large_clusters
         if other != clu1 and not len(other) > len(clu1)),
        None,
    )
    if clu2 is not None:
        members = clu1 + clu2
        sns.heatmap(DistDF[members].transpose()[members], vmax=1.0, vmin=0)
        plt.show()



In [12]:
# Summary: number of clusters, then the last and the first cluster found.
print(len(AllCluster))
for position in (-1, 0):
    print(AllCluster[position])


486
['2P6B_AC']
['1HHJ_D', '3TBS_D', '3GSX_A', '4G9D_A', '1MHC_A', '2VAA_A', '1T1X_A', '3LKS_A', '4U1L_D', '1B0G_D', '3VCL_A', '3TBW_G', '3LKQ_A', '1VAD_A', '4PRE_A', '1BII_A', '3RWE_A', '3TBY_J', '3PWJ_D', '2ZOK_E', '4E0R_D', '2ZOK_G', '1S9W_A', '3VH8_A', '3GSR_A', '1JPG_A', '3VFW_A', '1FZJ_A', '4G42_D', '3BW9_A', '3KPQ_A', '4U1I_A', '2CIK_A', '1X7Q_A', '3LKO_A', '4F7M_D', '1KPV_A', '2AXG_A', '3TIE_D', '4ZUV_D', '2AXF_A', '4WDI_D', '3QUL_J', '3VJ6_A', '2MHA_C', '2BSS_A', '1AGD_A', '1S7W_J', '1T1W_A', '3GSU_A', '3BZE_G', '3MRD_A', '3RWG_A', '1T0M_D', '2V2W_A', '1IM3_A', '3CZF_A', '1K8D_A', '4U1H_A', '3MR9_A', '3GSO_A', '3ECB_A', '1FZO_A', '3P9M_D', '2CLZ_A', '1K5N_A', '3IXA_D', '1KPU_A', '2DYP_A', '3MRI_A', '1YN6_A', '2RFX_A', '3RL1_A', '4JFP_D', '3MGT_J', '1HSA_D', '3WLB_A', '3BO8_A', '4HWZ_A', '4WU5_D', '2VLL_A', '4ZUU_A', '3V5D_D', '2GTZ_D', '3D25_A', '1JGE_A', '1A1O_A', '2NW3_A', '3X11_A', '1S7X_J', '3VFV_A', '1JGD_A', '4HX1_A', '2HN7_A', '3OX8_D', '3ROO_C', '4PRD_A', '3LN5_A', '3FON_C', '3MRR_A', '1SYS_A', '1A1M_A', '3VXN_A', '3KYN_A', '4QRS_A', '1ZHL_A', '3KPM_A', '4PG9_A', '4WJ5_D', '4JFQ_D', '1KJ3_I', '3VXP_D', '3FOL_A', '1YDP_A', '1S9X_A', '4PGE_A', '2V2X_A', '1WBX_A', '3X12_A', '1JHT_A', '3HPJ_D', '2BST_A', '1NAN_L', '2GIT_D', '1W0W_A', '3MRQ_A', '1ZT1_A', '1I4F_A', '2CLV_A', '3VRJ_A', '1WBZ_A', '3FQT_A', '3X14_A', '1UXS_A', '3FT4_A', '3QQ4_A', '1A9E_A', '3PAB_D', '1XR8_A', '1EEY_D', '3KLA_D', '3GIV_D', '3GSW_A', '2GUO_D', '3SPV_A', '4PRB_A', '4O2F_D', '4ZFZ_A', '3WS3_C', '3VRI_A', '4NT6_A', '3MRH_A', '2VAB_A', '4PR5_A', '3LKR_A', '3NFN_A', '1T22_A', '3MRJ_A', '4O2E_D', '3SKM_A', '4LCY_F', '1I7R_D', '3OXS_A', '3L3G_A', '3DX8_A', '1S7Q_A', '3SKO_A', '4O2C_A', '3DX6_A', '3VFU_A', '1T1Z_A', '4JQV_A', '4LNR_A', '3BWA_A', '3LKP_A', '1T20_A', '1E27_A', '3VFS_A', '4CVZ_A', '1KJM_A', '3P4N_D', '3MRF_A', '2HJL_A', '3WL9_A', '4NO5_A', '3FOM_A', '3QFD_D', '3KYO_C', '3BP7_A', '1JF1_A', '2BVP_A', '3MRB_A', '3LV3_A', '1EEZ_D', '3FQW_A', '1WBY_A', '1S8D_A', '3X13_A', '4WU7_D', 
'4L8D_C', '3UPR_C', '3RWI_A', '3MRL_A', '3MGO_J', '4PRA_A', '1JUF_A', '1ZSD_A', '3VFN_A', '1G7Q_A', '1RK1_A', '3KPP_A', '1XH3_A', '2H6P_A', '3GSV_A', '4I4W_A', '3RWC_D', '3CC5_D', '1HOC_A', '3LKN_A', '3MRN_A', '1FZM_A', '2A83_A', '4U1M_A', '3H9H_D', '1M05_C', '1AGB_A', '4QRU_A', '2YPK_A', '4Z77_D', '1KJV_A', '1Q94_A', '1Q94_D', '1OGT_A', '1LEG_A', '3MYJ_D', '1S7U_J', '3MRK_A', '1TVH_D', '2AV1_D', '3TID_A', '2C7U_A', '1SYV_A', '3PWN_D', '4NNY_A', '3VFP_A', '1RJY_D', '4F7T_D', '3KPN_A', '3OXR_A', '3VFT_A', '1ZHK_A', '1QR1_D', '1T1Y_A', '4E5X_A', '1A1N_A', '3CH1_J', '2FZ3_A', '1LEK_A', '4U6Y_A', '2GT9_D', '4L8B_A', '3DX7_A', '3VFR_A', '1T0N_D', '3PWL_D', '2CLR_D', '3L3D_A', '3BZF_C', '4HUU_D', '4QRT_A', '1T21_A', '3V5K_D', '4NQX_K', '4N8V_A', '1DUZ_D', '3P4O_D', '3MRC_A', '2HJK_A', '3KWW_A', '4U1J_A', '4HUV_A', '1OSZ_A', '2BVO_A', '4PRN_A', '3VFO_A', '3MRM_A', '4U1S_A', '4G8I_A', '3H7B_D', '3P4M_D', '2JCC_A', '1AGE_A', '3I6G_D', '1G7P_A', '3BEV_A', '4JFO_D', '2BSR_A', '1XR9_A', '1OF2_A', '1INQ_A', '3D18_A', '1TVB_D', '3TBT_J', '3RL2_A', '3MRO_A', '4MJ5_A', '1HHI_D', '1AGC_A', '3KPL_A', '1RJY_A', '3L3I_A', '3D2U_E', '2X4U_A', '2X4R_A', '3UTQ_A', '4QRQ_A', '1YN7_A', '4GKS_D', '4NQV_K', '3WS6_B', '3TBV_G', '2AV7_D', '1HHK_D', '3P9L_D', '1QVO_A', '4HV8_C', '3PWU_A', '1QVO_D', '3WUW_A', '2GTW_D', '1VAC_A', '3FT3_A', '3MRP_A', '1MI5_A', '2X4O_A', '3E6H_A', '1NAN_H', '4U1N_A', '4U1K_D', '3GSQ_A', '1QEW_A', '1JPF_A', '1UXW_A', '3I6L_D', '4K7F_D', '1S7V_D', '3MRE_A', '3CPL_C', '3QUK_D', '1I1Y_D', '1S9Y_A', '3C9N_A', '1FZK_A', '3V5H_D', '4XXC_A', '2XFX_A', '4G43_D', '4HUX_A', '1S7T_D', '3BXN_A', '1AGF_A', '4HS3_A', '3BP4_A', '1LK2_A', '1W0V_A', '1I7U_D', '1DUY_D', '3MRG_A', '1S7S_A', '2BVQ_A', '1QO3_A', '4JQX_A', '1VGK_A', '3VFM_A', '3L3J_A', '3E6F_A', '3KPO_A', '1M6O_A', '4Z76_D', '1N2R_A']

In [12]:
# Write one line per cluster: "<cluster number> <member> <member> ...".
# Cluster numbers start at 1 and follow the order of AllCluster.
# The leading "/" matters: the default `path` has no trailing slash, so without
# it the file would be created beside the PixelDB directory under a fused name
# instead of inside its other_files folder.
with open(path+"/other_files/cluster.dat","w") as f:
    for CluNum, clu in enumerate(AllCluster, start=1):
        f.write(" ".join([str(CluNum)] + [str(c) for c in clu]) + "\n")

In [ ]: