In [2]:
import glob
import os
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from small_script.myFunctions import *
import feather
import Bio.PDB as bio
from sklearn.cluster import MiniBatchKMeans
# Module-level shortcuts to helpers exposed by Bio.PDB.Polypeptide.
# NOTE(review): judging by their names these map between three-letter / one-letter
# residue codes and integer indices -- confirm against the installed Biopython version.
d3_to_index = bio.Polypeptide.d3_to_index  # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
one_to_index = bio.Polypeptide.one_to_index
# Default figure size (width, height); the width:height ratio is 1.618033.
plt.rcParams['figure.figsize'] = [16.18033, 10]

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [10]:
def getFragPdb(pdbId, i, outFile=None, fragLength=9,
               pre="/Users/weilu/Research/optimization/fragment/",
               database="/Users/weilu/Research/optimization/fragment/database/dompdb/"):
    """Cut a window of consecutive residues out of a PDB file and save it as a new PDB.

    Parameters
    ----------
    pdbId : str
        File stem; ``{pdbId}.pdb`` is read from ``database``.
    i : int
        Zero-based position, in the chain's residue list, of the first residue to keep.
    outFile : str, optional
        Output path relative to ``pre``. Defaults to ``"{i}_{pdbId}.pdb"``.
    fragLength : int, optional
        Number of residues in the fragment (default 9, the previously hard-coded value).
    pre : str, optional
        Directory prefix that is prepended verbatim to ``outFile``.
    database : str, optional
        Directory holding the source PDB files.

    Notes
    -----
    * The fragment is written as a chain with id "A"; the residue objects are reused
      unchanged, so they keep the ids they have in the source file.
    * Every chain of every model is saved to the same output path, so for an input with
      several chains or models only the last one remains on disk.
    * If fewer than ``fragLength`` residues follow position ``i`` the fragment is
      silently shorter.
    """
    pdb = pdbId + ".pdb"
    if outFile is None:
        outFile = f"{i}_{pdb}"
    parser = bio.PDBParser(QUIET=True)
    structure = parser.get_structure("x", os.path.join(database, pdb))
    # One writer is enough; set_structure() replaces its content on every call.
    io = bio.PDBIO()
    for model in structure:
        for chain in model:
            all_residues = list(chain)
            c = bio.Chain.Chain("A")
            # Attach the selected residues directly to the fresh chain.
            c.child_list = all_residues[i:i+fragLength]
            io.set_structure(c)
            io.save(f'{pre}{outFile}')
def getScore(data, km):
    """Return the Euclidean distance between one sample and its assigned cluster centre.

    ``data`` must expose ``.values`` (e.g. a pandas Series): the final entry is the
    cluster index and all preceding entries are the feature values. ``km`` must expose
    ``cluster_centers_`` indexed by that cluster index.
    """
    values = data.values
    features = values[:-1]
    label = int(values[-1])
    offset = km.cluster_centers_[label] - features
    return np.sqrt((offset ** 2).sum())
def getFromTerminal(CMD):
    """Run ``CMD`` through the shell and return everything it wrote to stdout as text."""
    completed = subprocess.run(CMD, shell=True, stdout=subprocess.PIPE)
    return completed.stdout.decode()

In [3]:
# NOTE(review): in the saved run this read raised FileNotFoundError (traceback below);
# a later cell loads data_jan31.csv from the same folder -- confirm which file is current.
data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/data_jan20.csv")


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-3-84fc16d4cc19> in <module>
----> 1 data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/data_jan20.csv")

~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    700                     skip_blank_lines=skip_blank_lines)
    701 
--> 702         return _read(filepath_or_buffer, kwds)
    703 
    704     parser_f.__name__ = name

~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    427 
    428     # Create the parser.
--> 429     parser = TextFileReader(filepath_or_buffer, **kwds)
    430 
    431     if chunksize or iterator:

~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    893             self.options['has_index_names'] = kwds['has_index_names']
    894 
--> 895         self._make_engine(self.engine)
    896 
    897     def close(self):

~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1120     def _make_engine(self, engine='c'):
   1121         if engine == 'c':
-> 1122             self._engine = CParserWrapper(self.f, **self.options)
   1123         else:
   1124             if engine == 'python':

~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1851         kwds['usecols'] = self.usecols
   1852 
-> 1853         self._reader = parsers.TextReader(src, **kwds)
   1854         self.unnamed_cols = self._reader.unnamed_cols
   1855 

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: [Errno 2] File b'/Users/weilu/Research/optimization/fragment/data_jan20.csv' does not exist: b'/Users/weilu/Research/optimization/fragment/data_jan20.csv'

In [205]:
# Preview the first five rows of the fragment table.
data_original.head()


Out[205]:
pdb i seq caca_1 caca_2 caca_3 caca_4 caca_5 caca_6 caca_7 ... cbcb_12 cbcb_13 cbcb_14 cbcb_15 cbcb_16 cbcb_17 cbcb_18 cbcb_19 cbcb_20 cbcb_21
0 1igqB00 0 DKLKKAIVQ 9.545797 12.242738 11.389445 14.435853 15.702080 18.185148 9.427593 ... 4.605271 8.984904 10.732664 13.098623 10.637720 14.192881 16.225273 11.296279 13.163032 10.095731
1 1igqB00 1 KLKKAIVQV 9.427593 9.508487 13.046163 14.956468 17.811722 21.096529 5.881323 ... 10.637720 14.192881 16.225273 19.915300 11.296279 13.163032 16.472520 10.095731 12.801142 9.801298
2 1igqB00 2 LKKAIVQVE 5.881323 9.385722 11.658462 14.435720 17.857430 21.547594 9.566237 ... 11.296279 13.163032 16.472520 20.721910 10.095731 12.801142 17.423760 9.801298 13.690151 11.383380
3 1igqB00 3 KKAIVQVEH 9.566237 12.596333 15.433746 18.900240 22.492160 25.003511 9.934756 ... 10.095731 12.801142 17.423760 18.518616 9.801298 13.690151 14.967668 11.383380 12.181765 11.096270
4 1igqB00 4 KAIVQVEHD 9.934756 12.622395 16.016058 19.465660 21.763054 25.281502 9.527389 ... 9.801298 13.690151 14.967668 19.082531 11.383380 12.181765 16.957390 11.096270 14.904981 10.865062

5 rows × 87 columns

Get the center fragment PDB of each cluster


In [619]:
# Extract one representative fragment per cluster, then renumber it and convert it to GRO.
pre = "/Users/weilu/Research/optimization/fragment/"
data_original = feather.read_dataframe(f"{pre}cluster100_v2.feather")
# Create the output folders with the standard library instead of shelling out to mkdir.
for sub_dir in ("origin", "pdbs", "gros"):
    os.makedirs(f"{pre}center_cluster100_v2/{sub_dir}/", exist_ok=True)
# First row of every cluster.
# NOTE(review): this is the fragment nearest the centre only if the feather file is
# sorted by (cluster, rmsd), as the export cell does -- confirm.
center = data_original.groupby("cluster").head(1)
for i, row in center.reset_index(drop=True).iterrows():
    print(i, row["pdb"], row["i"], row["cluster"])
    getFragPdb(row["pdb"], int(row["i"]), f"center_cluster100_v2/origin/{row['cluster']}.pdb")
pre = "/Users/weilu/Research/optimization/fragment/center_cluster100_v2//"
# os.system returns the command's exit status; report failures instead of ignoring them.
for i in range(100):
    status = os.system(f"python ~/opt/small_script/pdb_reres.py {pre}origin/{i}.pdb > {pre}pdbs/{i}.pdb")
    if status != 0:
        print(f"pdb_reres.py failed for cluster {i} (exit status {status})")
for i in range(100):
    status = os.system(f"python2 ~/opt/script/Pdb2Gro.py {pre}pdbs/{i}.pdb {pre}gros/{i}.gro")
    if status != 0:
        print(f"Pdb2Gro.py failed for cluster {i} (exit status {status})")


0 1bg6A02 107 0
1 1c17M00 9 1
2 3p6dA00 53 2
3 3ubrA01 238 3
4 3q3eA03 3 4
5 2yj6A02 47 5
6 2fokB01 110 6
7 4qamB00 141 7
8 2ftxA00 23 8
9 4kksA03 29 9
10 2h5nC00 54 10
11 2y3cA00 135 11
12 3tp9A02 12 12
13 4o7oA02 15 13
14 1tz9A00 65 14
15 2autA00 65 15
16 1dk8A02 58 16
17 2vf7B01 225 17
18 5ja1B00 0 18
19 1e8cA02 213 19
20 4khbC00 59 20
21 1ogpA02 110 21
22 2r0qC01 6 22
23 2ri9A00 67 23
24 2duyA00 33 24
25 4lviA01 108 25
26 3la4A03 40 26
27 2icgA00 73 27
28 3abzA03 22 28
29 2wsaA00 318 29
30 3sigA00 42 30
31 4ag4A02 28 31
32 2p17A00 109 32
33 3k1dA01 28 33
34 4k17A03 55 34
35 1qwoA01 358 35
36 1hp1A01 17 36
37 4xjxA03 163 37
38 4q9dB01 152 38
39 1vw4L02 56 39
40 4dohB02 19 40
41 1uf3A00 25 41
42 3fmgA01 41 42
43 3gkuA02 65 43
44 2y2zA02 1 44
45 4ry8A01 1 45
46 3gw6F01 12 46
47 1y80A00 61 47
48 1d4aA00 241 48
49 1m2vB04 78 49
50 3c24A01 112 50
51 4p2cA01 102 51
52 1jfaB00 163 52
53 2lvvA00 175 53
54 3h75A01 77 54
55 2c0uA03 55 55
56 3l2pA01 91 56
57 3r5eA00 164 57
58 3zwcA03 98 58
59 1imjA00 67 59
60 2ivfB00 273 60
61 3b85A00 23 61
62 3qvsA01 82 62
63 2x4mB00 44 63
64 4tm5A02 124 64
65 2fb2B00 61 65
66 1mukA02 406 66
67 3gqnA03 38 67
68 3icrA02 41 68
69 2hy5B00 96 69
70 3zkvA00 456 70
71 2y8yA01 2 71
72 2xl4A00 98 72
73 4nleA03 67 73
74 4rhiA00 126 74
75 5h83A01 3 75
76 1wywA00 104 76
77 3it4D02 90 77
78 3av0A01 89 78
79 3p02A02 98 79
80 3ic8A01 26 80
81 3kb1A01 59 81
82 1nf8A00 164 82
83 3gzdA01 48 83
84 3brwC02 68 84
85 3ephA01 40 85
86 3hn7A03 83 86
87 4v19X00 104 87
88 2cfuA01 37 88
89 2iqgA02 129 89
90 4otpA02 27 90
91 3dupA01 102 91
92 4ie5A02 21 92
93 1u83A00 81 93
94 1w2yA00 52 94
95 2o1mA01 18 95
96 4oc8A01 193 96
97 1fo1B00 125 97
98 2nylB00 148 98
99 2yg8A02 38 99

In [620]:


In [621]:


In [529]:
# Load the fragment table stored in cluster100.feather.
data_original = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100.feather")

In [545]:
# Keep the first row of every cluster.
# NOTE(review): this is the fragment nearest the centre only if the table is sorted
# by (cluster, rmsd) -- confirm.
center = data_original.groupby("cluster").head(1)

In [548]:
# The conversion cell that follows reads its input from center_cluster100/origin/,
# so write the fragments there and make sure the folder exists before saving.
os.makedirs("/Users/weilu/Research/optimization/fragment/center_cluster100/origin/", exist_ok=True)
for i, row in center.reset_index(drop=True).iterrows():
    print(i, row["pdb"], row["i"], row["cluster"])
    getFragPdb(row["pdb"], int(row["i"]), f"center_cluster100/origin/{row['cluster']}.pdb")


0 1g9mG00 206 0
1 5cr9A02 26 1
2 2h7fX02 93 2
3 2q9oA03 200 3
4 4yokA01 40 4
5 1b3tA00 103 5
6 4gxtA01 160 6
7 2phpA00 111 7
8 4gb7A00 199 8
9 1nr0A01 123 9
10 3ahcA01 301 10
11 1q5vB01 6 11
12 5ipyA02 54 12
13 1rj1A00 60 13
14 5d01A01 155 14
15 5eufA02 10 15
16 4ua3A00 119 16
17 3lsoA01 97 17
18 2e63A00 86 18
19 2a90A01 70 19
20 1pujA02 13 20
21 1knyA02 109 21
22 3glvA00 105 22
23 4aweA00 263 23
24 3gg4A01 217 24
25 5e7qA01 317 25
26 1eakA01 31 26
27 1sj1A00 49 27
28 1khiA01 5 28
29 4ps6A00 117 29
30 2lpuA00 108 30
31 2rhkC00 17 31
32 1ogyA03 130 32
33 1tr2A02 35 33
34 2wvsA01 100 34
35 4jb1A02 54 35
36 3c8zA01 24 36
37 2xvyA01 10 37
38 3pohA01 10 38
39 1qknA00 20 39
40 1y8qC00 47 40
41 2inyA01 562 41
42 2rgqA00 104 42
43 3sd2A01 58 43
44 3d8kA00 48 44
45 1rmgA00 92 45
46 4hrwB01 127 46
47 1krlA00 32 47
48 4bmjA00 26 48
49 1uwyA01 235 49
50 1biqA00 294 50
51 1pujA02 45 51
52 1fc4A01 58 52
53 1m4zA01 59 53
54 1iooA00 135 54
55 3atsA02 173 55
56 3h09B02 216 56
57 1nr0A01 125 57
58 5fccA01 1 58
59 4qbuA03 28 59
60 2cvbA00 140 60
61 2aw6A02 54 61
62 3cf4A02 127 62
63 3cfuA00 115 63
64 2lgvA00 11 64
65 3lf7A01 330 65
66 2opwA00 124 66
67 2gw1A02 284 67
68 3q5wA00 2 68
69 2x6rB01 185 69
70 1m2oA05 52 70
71 3m0fB02 93 71
72 4i9cA02 153 72
73 2gjcB02 202 73
74 4nafA00 57 74
75 1ipkB02 141 75
76 4r3aA01 40 76
77 1fgsA01 146 77
78 3w5mA05 48 78
79 1kbpA02 251 79
80 4dndA00 0 80
81 1vddA03 18 81
82 2uvfB02 338 82
83 1xjvA01 31 83
84 1k4nA00 105 84
85 1a0rP01 87 85
86 5dllA03 30 86
87 1wzzA00 160 87
88 1kjnA00 118 88
89 1eyqA02 116 89
90 4h18A00 186 90
91 3w0lB02 77 91
92 4abnA02 123 92
93 4at7B01 83 93
94 1bt9A00 124 94
95 4pxvC00 24 95
96 2c0hA00 73 96
97 3fk5A02 92 97
98 1myrA00 296 98
99 4igbB02 26 99

In [550]:
pre = "/Users/weilu/Research/optimization/fragment/center_cluster100/"
# The shell redirection below cannot create a missing folder, so create pdbs/ up front.
os.makedirs(f"{pre}pdbs/", exist_ok=True)
for i in range(100):
    # os.system returns the exit status; report failures instead of ignoring them.
    status = os.system(f"python ~/opt/small_script/pdb_reres.py {pre}origin/{i}.pdb > {pre}pdbs/{i}.pdb")
    if status != 0:
        print(f"pdb_reres.py failed for cluster {i} (exit status {status})")

In [554]:
# Make sure the output folder exists before the converter writes into it.
os.makedirs(f"{pre}gros/", exist_ok=True)
for i in range(100):
    # os.system returns the exit status; report failures instead of ignoring them.
    status = os.system(f"python2 ~/opt/script/Pdb2Gro.py {pre}pdbs/{i}.pdb {pre}gros/{i}.gro")
    if status != 0:
        print(f"Pdb2Gro.py failed for cluster {i} (exit status {status})")

In [238]:
from sklearn.cluster import KMeans

# Trial run: cluster only the first 1000 fragments into 20 groups.
chosen = data_original.head(1000).reset_index(drop=True)
# Columns 3..86 are the 84 numeric columns that follow pdb, i and seq.
x = chosen.iloc[:, 3:87].values
kmeans = KMeans(
    n_clusters=20,
    random_state=0,
).fit(x)

In [239]:
# Plot the first two feature columns coloured by cluster label, then overlay the
# same two coordinates of every cluster centre in red.
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], c="red")


Out[239]:
<matplotlib.collections.PathCollection at 0x12dc3e400>

In [247]:
from sklearn.cluster import KMeans

# Larger trial: first 10000 fragments, 100 clusters, same 84 feature columns.
chosen = data_original.head(10000).reset_index(drop=True)
x = chosen.iloc[:, 3:87].values
kmeans = KMeans(
    n_clusters=100,
    random_state=0,
).fit(x)

In [245]:
# Preview the first five rows of the working subset.
chosen.head()


Out[245]:
pdb i seq caca_1 caca_2 caca_3 caca_4 caca_5 caca_6 caca_7 ... cbcb_13 cbcb_14 cbcb_15 cbcb_16 cbcb_17 cbcb_18 cbcb_19 cbcb_20 cbcb_21 cluster
0 1igqB00 0 DKLKKAIVQ 9.545797 12.242738 11.389445 14.435853 15.702080 18.185148 9.427593 ... 8.984904 10.732664 13.098623 10.637720 14.192881 16.225273 11.296279 13.163032 10.095731 9
1 1igqB00 1 KLKKAIVQV 9.427593 9.508487 13.046163 14.956468 17.811722 21.096529 5.881323 ... 14.192881 16.225273 19.915300 11.296279 13.163032 16.472520 10.095731 12.801142 9.801298 12
2 1igqB00 2 LKKAIVQVE 5.881323 9.385722 11.658462 14.435720 17.857430 21.547594 9.566237 ... 13.163032 16.472520 20.721910 10.095731 12.801142 17.423760 9.801298 13.690151 11.383380 12
3 1igqB00 3 KKAIVQVEH 9.566237 12.596333 15.433746 18.900240 22.492160 25.003511 9.934756 ... 12.801142 17.423760 18.518616 9.801298 13.690151 14.967668 11.383380 12.181765 11.096270 19
4 1igqB00 4 KAIVQVEHD 9.934756 12.622395 16.016058 19.465660 21.763054 25.281502 9.527389 ... 13.690151 14.967668 19.082531 11.383380 12.181765 16.957390 11.096270 14.904981 10.865062 19

5 rows × 88 columns


In [307]:
# Label the fragments first: getScore reads the cluster index from the LAST element
# of each row, so "cluster" has to exist and be the final column handed to it.
chosen["cluster"] = kmeans.labels_
# The 84 feature columns followed by the cluster label, selected by name so the
# result does not depend on where "cluster" sits in the frame.
score_columns = list(chosen.columns[3:87]) + ["cluster"]
chosen["rmsd"] = chosen[score_columns].apply(lambda row: getScore(row, kmeans), axis=1)

Cluster the complete data set


In [3]:
# Load the fragment table from data_jan31.csv.
data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/data_jan31.csv")

In [4]:
# Work on every row of the table (no head() subset here).
chosen = data_original.reset_index(drop=True)
# Feature matrix: columns 3..86, i.e. everything after the first three columns
# (pdb, i, seq in the preview shown earlier).
x = chosen.iloc[:, 3:87].values

In [5]:
# tol is the early-stopping threshold on the normalised change of the centres.
# The previous value, 1e4, is far above any such change, so the fit was declared
# converged at the first check.
# NOTE(review): 1e-4 is presumably what was intended -- confirm; files written with
# the old setting need to be regenerated.
kmeans = MiniBatchKMeans(n_clusters=100,
        random_state=0,
        batch_size=200,
        max_iter=300,
        tol=1e-4).fit(x)

In [8]:
import pickle
# A context manager guarantees the file is flushed and closed once the dump finishes.
with open("/Users/weilu/Research/optimization/fragment/kmeans_cluster100_v2_2", "wb") as model_file:
    pickle.dump(kmeans, model_file)

In [11]:
# Store the cluster label first; it becomes the last column of the 3:88 slice,
# which is where getScore looks for the cluster index.
chosen["cluster"] = kmeans.labels_
chosen["rmsd"] = chosen.iloc[:, 3:88].apply(
    lambda row: getScore(row, kmeans),
    axis=1,
)

In [12]:
# Order the rows by cluster and, inside each cluster, by increasing rmsd.
reodered_chosen = chosen.sort_values(by=["cluster", "rmsd"])
# Written as cluster100_v2_2.feather (an earlier variant of this cell targeted
# cluster100_v2.feather).
reodered_chosen.reset_index(drop=True).to_feather(
    "/Users/weilu/Research/optimization/fragment/cluster100_v2_2.feather"
)

In [603]:
import pickle
# Loads the model saved as kmeans_cluster100_v2 (not the _v2_2 file written by the
# dump cell). pickle.load can execute arbitrary code, so only load trusted files.
# The context manager closes the file handle as soon as the model is read.
with open("/Users/weilu/Research/optimization/fragment/kmeans_cluster100_v2", "rb") as model_file:
    kmeans = pickle.load(model_file)

In [608]:


In [610]:


In [491]:
%%time
# %%time
# from sklearn.cluster import KMeans
# # x = data_original.head(100).iloc[:, 3:24].values
# # ((kmeans.labels_ == kmeans.predict(x))-1).sum()
# kmeans = KMeans(n_clusters=100, random_state=0).fit(x)
# # chosen["rmsd"] = chosen.iloc[:,3:87].apply(lambda x: getScore(x, kmeans), axis=1)
# chosen["cluster"] = kmeans.labels_
# fit on the whole data
kmeans = MiniBatchKMeans(n_clusters=500,
        random_state=0,
        batch_size=200,
        max_iter=300,
        tol=1e4).fit(x)


CPU times: user 30.3 s, sys: 5.92 s, total: 36.2 s
Wall time: 14.8 s

In [522]:
# Store the cluster label first; it becomes the last column of the 3:88 slice,
# which is where getScore looks for the cluster index.
chosen["cluster"] = kmeans.labels_
chosen["rmsd"] = chosen.iloc[:, 3:88].apply(
    lambda row: getScore(row, kmeans),
    axis=1,
)

In [523]:
# Order the rows by cluster and, inside each cluster, by increasing rmsd.
reodered_chosen = chosen.sort_values(["cluster", "rmsd"])

In [528]:
# Save the sorted table, with a fresh 0..N-1 index, as cluster500.feather.
reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster500.feather")

In [525]:
# Per-cluster summary statistics of rmsd, ordered from the smallest cluster to the largest.
t = reodered_chosen.groupby("cluster")["rmsd"].describe().sort_values("count")

In [527]:
# Show only the clusters whose mean rmsd is below 10.
t.query("mean < 10")


Out[527]:
count mean std min 25% 50% 75% max
cluster
488 211.0 6.405952 2.184369 0.000000 4.851784 6.009832 7.477315 16.365348
419 461.0 8.400869 3.992281 0.000000 5.183176 7.018433 11.271126 21.236263
439 573.0 8.307781 2.359199 0.000000 6.527053 7.899386 9.752821 18.502313
463 631.0 5.769330 1.757134 0.000000 4.605430 5.413044 6.678401 16.163776
394 643.0 7.252995 2.016599 0.000000 5.875750 7.254019 8.251920 16.145780
412 644.0 9.648656 3.423034 0.000000 6.912548 8.817875 12.297921 19.966513
110 752.0 9.885749 2.144300 0.000000 8.255909 9.581002 11.430300 17.489185
324 766.0 9.085841 1.503798 0.000000 8.217952 9.110182 9.995570 13.652618
487 767.0 5.140011 2.070114 0.000000 3.807998 4.542719 5.826916 15.917704
494 778.0 5.416354 1.927569 0.000000 4.031041 5.029221 6.390334 15.819659
174 790.0 8.487826 2.225173 0.000000 6.985582 8.428768 9.779559 16.427066
313 908.0 7.433903 1.261792 0.000000 6.482404 7.349275 8.346862 12.202795
421 923.0 9.091107 3.942319 0.000000 6.315109 7.729651 11.712235 22.461120
484 935.0 5.711914 1.975222 0.000000 4.499907 5.436268 6.514064 18.286431
173 1008.0 9.257256 1.722207 0.000000 8.047716 9.178171 10.330828 14.469612
476 1077.0 5.261427 2.088553 0.000000 3.789850 4.629205 6.109691 13.768129
389 1087.0 6.832852 3.350067 0.000000 4.403291 5.580939 8.678317 17.565477
337 1088.0 8.889376 3.234105 0.000000 6.270527 8.602627 11.020706 18.671643
387 1152.0 8.740118 2.836843 0.000000 6.622372 7.967326 10.555101 19.106496
264 1260.0 9.503432 3.935349 0.000000 6.279775 8.264432 12.399816 22.850360
89 1270.0 8.160869 1.466499 4.405894 7.084065 8.165975 9.131448 14.216164
181 1304.0 9.190776 2.940143 0.000000 7.040864 8.414425 10.739330 19.252077
331 1332.0 9.636055 2.923684 0.000000 7.287349 9.580069 11.760191 18.181023
333 1366.0 8.161093 1.242464 0.000000 7.373075 8.153737 8.916212 15.594951
296 1403.0 7.277364 3.788108 0.000000 4.564008 5.821959 9.728810 19.912605
390 1413.0 9.844129 2.929726 0.000000 7.405211 9.580462 11.988679 18.566266
372 1483.0 9.501800 1.648011 0.000000 8.336452 9.339765 10.457104 15.878713
423 1509.0 7.229744 1.490450 0.000000 6.226085 7.219368 8.186919 13.217369
405 1509.0 7.489281 1.242903 0.000000 6.678965 7.451765 8.267258 12.784382
18 1526.0 6.682512 2.955888 0.000000 4.695716 5.872720 7.737941 20.127146
... ... ... ... ... ... ... ... ...
446 8325.0 5.612743 2.246595 1.686931 4.252918 4.982112 6.037459 17.209269
381 8404.0 9.779719 2.526253 2.738193 7.931354 9.708642 11.634050 18.009054
226 8489.0 3.303255 1.307035 0.804719 2.376314 2.995983 3.900423 13.161018
373 8567.0 8.750373 1.841973 3.518725 7.438240 8.680673 9.985167 15.714770
443 8613.0 5.553135 2.505225 1.731831 3.796030 4.823022 6.640948 20.369276
11 8616.0 5.498414 1.041507 2.041919 4.772000 5.450848 6.162283 10.567001
28 8704.0 4.132531 2.623001 0.923846 2.552216 3.257191 4.620020 21.465496
424 8829.0 8.855451 1.578204 4.113152 7.800379 8.773570 9.812179 16.224047
35 9076.0 6.293140 2.913352 0.925444 4.236622 5.702312 7.629875 20.049576
481 9115.0 4.478269 2.723088 1.071464 2.777123 3.566477 5.053013 21.257613
279 9162.0 9.188380 2.146380 4.574546 7.505149 9.036916 10.681678 18.002513
347 9302.0 9.921108 2.950067 0.000000 7.655886 9.841897 12.274723 18.210627
159 9568.0 8.230792 2.708362 1.947264 6.299143 7.723490 9.853484 19.574172
408 9667.0 6.381713 1.288461 2.215306 5.484409 6.311356 7.215179 12.464428
416 9970.0 7.429243 1.710242 0.000000 6.222167 7.479457 8.668579 15.107802
497 10085.0 2.993684 1.245996 0.941742 2.166304 2.652712 3.467202 12.653784
258 10548.0 9.320128 2.149168 3.320423 7.811364 9.296833 10.755182 16.861263
222 11020.0 7.638585 1.587936 3.538688 6.547909 7.501866 8.567174 15.734313
398 11847.0 7.016219 1.334663 3.169083 6.071185 6.937974 7.909028 12.899463
485 15435.0 2.500596 0.747492 0.750748 1.991833 2.403763 2.891510 8.282556
327 15627.0 2.919554 1.085106 0.851447 2.133238 2.666950 3.455255 10.615247
425 16487.0 2.850683 1.006460 0.792125 2.132705 2.648014 3.375965 14.221093
67 16580.0 5.888814 2.170344 2.562263 4.473827 5.177396 6.515498 18.038824
363 17419.0 4.754982 2.050652 1.052461 3.298524 4.384968 5.774809 20.050853
499 26735.0 2.336397 0.895919 0.603243 1.723335 2.134374 2.719237 8.906935
56 32435.0 2.032252 0.693076 0.589714 1.562771 1.887694 2.339050 11.419610
453 37130.0 2.195190 0.848428 0.518378 1.617063 2.004453 2.558342 12.418647
491 44173.0 1.873969 0.649835 0.552015 1.421173 1.744503 2.181334 9.924016
451 46148.0 2.151608 0.822239 0.591659 1.600590 1.981437 2.495240 11.074528
239 67870.0 1.977638 0.748888 0.503575 1.466526 1.814698 2.310752 9.656071

231 rows × 8 columns


In [524]:
# Per-cluster summary statistics of rmsd, ordered from the smallest cluster to the largest.
reodered_chosen.groupby("cluster")["rmsd"].describe().sort_values("count")


Out[524]:
count mean std min 25% 50% 75% max
cluster
376 59.0 10.838282 3.232352 0.000000 9.192692 10.401419 13.294589 17.287852
211 194.0 15.185838 2.187446 0.000000 13.847370 15.128813 16.679899 20.607763
488 211.0 6.405952 2.184369 0.000000 4.851784 6.009832 7.477315 16.365348
103 284.0 14.563844 1.964411 0.000000 13.410099 14.638768 15.926320 18.627723
125 293.0 15.258882 2.016858 0.000000 14.238791 15.240803 16.369402 21.291743
213 339.0 14.261146 2.222284 0.000000 12.873759 14.402281 15.714741 19.123733
53 387.0 12.113211 3.963168 0.000000 8.359540 13.118751 15.284199 19.085406
145 394.0 13.946271 3.729265 0.000000 12.265290 14.876402 16.538905 22.741212
150 399.0 16.777677 2.235406 0.000000 15.719497 17.039087 18.193809 21.514749
325 412.0 11.905261 4.003662 0.000000 9.107426 11.397496 14.825498 20.814679
426 433.0 10.630110 4.275595 0.000000 7.803763 10.078080 13.454795 21.240280
202 460.0 14.595013 2.425844 0.000000 13.192075 14.587900 16.168760 21.602464
419 461.0 8.400869 3.992281 0.000000 5.183176 7.018433 11.271126 21.236263
118 480.0 13.069946 2.390771 0.000000 11.554023 12.876632 14.549696 20.326118
216 485.0 13.904434 1.905825 0.000000 12.829957 14.063004 15.172137 19.765212
60 551.0 14.802485 3.749959 0.000000 12.790487 15.384159 17.432484 24.452040
439 573.0 8.307781 2.359199 0.000000 6.527053 7.899386 9.752821 18.502313
46 600.0 12.309305 2.773076 0.000000 10.562480 12.333308 14.110623 19.258243
463 631.0 5.769330 1.757134 0.000000 4.605430 5.413044 6.678401 16.163776
394 643.0 7.252995 2.016599 0.000000 5.875750 7.254019 8.251920 16.145780
412 644.0 9.648656 3.423034 0.000000 6.912548 8.817875 12.297921 19.966513
205 649.0 13.248842 2.127469 0.000000 12.245327 13.596784 14.654608 18.177675
244 657.0 12.915141 2.032875 0.000000 11.708049 13.082302 14.343676 17.828327
236 661.0 12.611278 2.706920 0.000000 10.685368 12.348166 14.627941 19.588881
70 700.0 13.799584 2.500682 0.000000 12.564819 14.219139 15.589637 19.439629
137 723.0 13.213361 2.910252 0.000000 11.479426 13.578968 15.101994 24.337267
24 738.0 12.307941 2.568989 0.000000 10.458948 12.615052 14.156888 18.982266
110 752.0 9.885749 2.144300 0.000000 8.255909 9.581002 11.430300 17.489185
291 765.0 12.495069 2.368849 0.000000 10.992476 12.812920 14.291917 17.854387
324 766.0 9.085841 1.503798 0.000000 8.217952 9.110182 9.995570 13.652618
... ... ... ... ... ... ... ... ...
210 8407.0 10.401558 2.378907 3.987519 8.670919 10.390931 12.088837 20.102646
226 8489.0 3.303255 1.307035 0.804719 2.376314 2.995983 3.900423 13.161018
373 8567.0 8.750373 1.841973 3.518725 7.438240 8.680673 9.985167 15.714770
443 8613.0 5.553135 2.505225 1.731831 3.796030 4.823022 6.640948 20.369276
11 8616.0 5.498414 1.041507 2.041919 4.772000 5.450848 6.162283 10.567001
307 8690.0 11.673967 1.972951 6.125048 10.239719 11.473958 13.033092 18.718273
28 8704.0 4.132531 2.623001 0.923846 2.552216 3.257191 4.620020 21.465496
424 8829.0 8.855451 1.578204 4.113152 7.800379 8.773570 9.812179 16.224047
35 9076.0 6.293140 2.913352 0.925444 4.236622 5.702312 7.629875 20.049576
481 9115.0 4.478269 2.723088 1.071464 2.777123 3.566477 5.053013 21.257613
279 9162.0 9.188380 2.146380 4.574546 7.505149 9.036916 10.681678 18.002513
347 9302.0 9.921108 2.950067 0.000000 7.655886 9.841897 12.274723 18.210627
159 9568.0 8.230792 2.708362 1.947264 6.299143 7.723490 9.853484 19.574172
408 9667.0 6.381713 1.288461 2.215306 5.484409 6.311356 7.215179 12.464428
416 9970.0 7.429243 1.710242 0.000000 6.222167 7.479457 8.668579 15.107802
497 10085.0 2.993684 1.245996 0.941742 2.166304 2.652712 3.467202 12.653784
258 10548.0 9.320128 2.149168 3.320423 7.811364 9.296833 10.755182 16.861263
222 11020.0 7.638585 1.587936 3.538688 6.547909 7.501866 8.567174 15.734313
398 11847.0 7.016219 1.334663 3.169083 6.071185 6.937974 7.909028 12.899463
485 15435.0 2.500596 0.747492 0.750748 1.991833 2.403763 2.891510 8.282556
327 15627.0 2.919554 1.085106 0.851447 2.133238 2.666950 3.455255 10.615247
425 16487.0 2.850683 1.006460 0.792125 2.132705 2.648014 3.375965 14.221093
67 16580.0 5.888814 2.170344 2.562263 4.473827 5.177396 6.515498 18.038824
363 17419.0 4.754982 2.050652 1.052461 3.298524 4.384968 5.774809 20.050853
499 26735.0 2.336397 0.895919 0.603243 1.723335 2.134374 2.719237 8.906935
56 32435.0 2.032252 0.693076 0.589714 1.562771 1.887694 2.339050 11.419610
453 37130.0 2.195190 0.848428 0.518378 1.617063 2.004453 2.558342 12.418647
491 44173.0 1.873969 0.649835 0.552015 1.421173 1.744503 2.181334 9.924016
451 46148.0 2.151608 0.822239 0.591659 1.600590 1.981437 2.495240 11.074528
239 67870.0 1.977638 0.748888 0.503575 1.466526 1.814698 2.310752 9.656071

500 rows × 8 columns


In [488]:
# Save the sorted table, with a fresh 0..N-1 index, as cluster100.feather.
reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster100.feather")

In [486]:
# Per-cluster summary statistics of rmsd, ordered from the smallest cluster to the largest.
reodered_chosen.groupby("cluster")["rmsd"].describe().sort_values("count")


Out[486]:
count mean std min 25% 50% 75% max
cluster
99 952.0 14.943018 3.768664 0.000000 12.702428 15.557707 17.871381 22.480900
91 961.0 12.874403 4.222096 0.000001 10.245549 12.638462 16.007383 23.408196
49 1106.0 14.358321 2.872157 0.000002 12.317531 13.987767 16.424921 22.320873
82 2443.0 14.934389 2.465681 0.000003 13.663287 15.177049 16.573827 22.858531
45 2802.0 14.401188 2.831316 0.000000 12.572518 14.603214 16.425568 22.323190
83 2964.0 14.287125 3.500858 0.000002 11.287813 14.561032 16.984719 22.267516
33 3270.0 14.618352 3.334886 3.575489 12.367087 14.941829 16.960734 24.235116
41 3413.0 15.624262 2.506492 0.000002 14.150406 15.694428 17.373564 22.672772
40 3867.0 13.870901 3.225424 0.000000 11.917269 14.192434 16.083955 23.190758
44 3919.0 16.986246 2.257992 0.000000 15.712881 17.144057 18.472072 23.571758
20 4022.0 14.509955 2.770687 0.000002 12.613540 14.684247 16.682598 22.153949
90 4023.0 14.779448 2.714949 0.000000 13.121565 15.047151 16.644111 22.614183
38 4142.0 15.446491 3.136561 0.000001 13.517689 15.827723 17.772601 23.471810
86 4468.0 16.535079 2.966621 0.000000 15.131907 17.064306 18.580954 23.009877
73 4548.0 14.112077 3.449817 0.000000 12.011958 14.362980 16.464117 23.796580
60 4976.0 14.640957 2.778833 4.378706 12.765303 14.762178 16.605331 22.131411
12 5001.0 16.494051 2.401524 0.000000 14.971834 16.729541 18.186580 23.612149
97 5056.0 14.520031 2.333622 0.000003 12.974537 14.563677 16.119334 21.787286
77 5778.0 14.701592 2.687401 0.000000 13.044247 14.991552 16.577533 22.030660
8 5968.0 16.722013 2.405117 6.942118 15.064290 16.874535 18.439966 31.806063
94 6086.0 16.541551 2.447883 8.785505 14.527768 16.212836 18.490690 25.617378
34 6140.0 16.226214 2.547526 0.000000 14.658252 16.397174 17.967023 23.642450
88 6438.0 13.586398 2.311506 0.000002 12.048226 13.689158 15.160343 33.482797
71 6662.0 13.987300 2.772758 0.000002 12.016270 14.073191 15.992690 23.090788
29 6677.0 12.693257 3.656613 3.346590 9.872446 12.675122 15.518500 22.254154
39 6699.0 15.155062 2.187593 0.000000 13.779472 15.362334 16.658834 29.450959
18 6800.0 12.975794 3.038169 0.000000 10.799474 13.162264 15.302423 21.366734
72 6904.0 10.840528 2.849117 6.285384 8.895940 9.890762 11.983059 23.176161
47 6953.0 11.944390 2.294724 4.492983 10.510392 11.980350 13.437744 19.261959
93 7222.0 15.447458 2.775308 0.000002 13.663342 15.632052 17.422167 23.169541
... ... ... ... ... ... ... ... ...
9 19256.0 11.868036 2.760591 4.369682 9.957512 11.909907 13.603477 22.852608
67 19326.0 9.332737 3.310210 2.753179 6.957699 8.515299 11.291503 22.736066
26 19636.0 9.953674 3.098423 3.404686 7.663375 9.477456 11.535308 22.963759
79 19661.0 13.677402 3.149661 5.096248 11.249396 13.916112 16.120006 24.043936
15 20771.0 12.404434 2.488259 5.106200 10.644426 12.371523 14.098956 22.221392
3 21066.0 10.534024 3.727477 2.474252 7.684041 9.366867 13.376781 22.292233
76 21488.0 13.812775 2.363840 6.228645 12.129934 13.771876 15.521418 22.282234
43 22120.0 12.195323 3.673941 4.505613 9.019266 12.269840 15.083801 22.675310
4 22208.0 8.115052 1.782288 3.306458 6.912866 7.940351 9.086553 19.644252
64 22446.0 11.983956 2.377519 5.272052 10.237089 11.629880 13.524192 21.306675
85 23779.0 11.547363 2.432277 5.064709 9.766418 11.350750 13.125555 21.785420
25 24037.0 11.576444 2.297706 4.446398 10.009812 11.391328 12.960079 21.304210
21 25110.0 7.009054 3.672890 1.644417 4.224781 5.991007 9.062640 23.073114
87 26733.0 14.219340 2.324962 6.682571 12.533556 14.018004 15.787119 22.495349
2 28523.0 10.366287 2.318918 4.092718 8.711052 10.278778 11.885385 20.743262
53 29672.0 12.696080 2.295429 5.989480 11.078456 12.568173 14.183804 22.524250
61 29771.0 13.372956 2.239237 6.422474 11.811759 13.236231 14.794222 21.550275
17 29886.0 8.316054 2.063265 3.155554 6.847377 8.007853 9.455852 18.007877
13 30638.0 11.947376 2.701377 5.214697 9.894431 11.466771 13.540095 23.155044
27 31914.0 10.824237 3.878395 2.469289 7.678417 10.729263 13.788632 22.449788
30 33191.0 10.773912 2.349838 4.338749 9.092387 10.551146 12.290083 20.743047
28 34735.0 12.222311 2.835495 3.951257 10.317364 12.233157 14.085700 21.912548
89 35291.0 9.288116 1.982222 3.835690 7.849875 9.060492 10.546546 17.505264
11 37373.0 8.989179 4.073917 1.596455 5.373913 8.477139 12.497340 22.413459
57 38507.0 9.876553 2.291544 3.518744 8.152652 9.705877 11.423994 18.962729
58 43745.0 10.582617 2.227352 4.103513 8.940012 10.506420 12.097370 21.442793
80 45591.0 8.442550 3.123314 2.453882 6.005965 7.805853 10.514389 22.554986
54 59153.0 7.811586 3.759268 1.720934 4.700202 6.692015 10.633417 21.810600
56 98910.0 7.592623 1.993617 2.231323 6.215800 7.363404 8.681848 19.396090
1 382971.0 3.568442 2.394548 0.599623 2.092547 2.786991 4.038103 20.580043

100 rows × 8 columns


In [475]:
%%time
from sklearn.cluster import KMeans
# x = data_original.head(100).iloc[:, 3:24].values
# ((kmeans.labels_ == kmeans.predict(x))-1).sum()
chosen = data_original.head(100000).reset_index(drop=True)
x = chosen.iloc[:, 3:5].values
kmeans = KMeans(n_clusters=10, random_state=0).fit(x)
# chosen["rmsd"] = chosen.iloc[:,3:87].apply(lambda x: getScore(x, kmeans), axis=1)
chosen["cluster"] = kmeans.labels_


CPU times: user 6.66 s, sys: 1.21 s, total: 7.87 s
Wall time: 4.84 s

In [467]:
# Plot the first two feature columns coloured by cluster label, then overlay the
# same two coordinates of every cluster centre in red.
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], c="red")


Out[467]:
<matplotlib.collections.PathCollection at 0x18aa53e10>

In [476]:
%%time
# fit on the whole data
kmeans = MiniBatchKMeans(n_clusters=10,
        random_state=0,
        batch_size=100,
        max_iter=300,
        tol=1e4).fit(x)


CPU times: user 208 ms, sys: 1.98 ms, total: 210 ms
Wall time: 210 ms

In [477]:
# Plot the first two feature columns coloured by cluster label, then overlay the
# same two coordinates of every cluster centre in red.
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], c="red")


Out[477]:
<matplotlib.collections.PathCollection at 0x18c72acf8>

Distance ("rmsd") between cluster centers


In [398]:
from sklearn.cluster import KMeans
# x = data_original.head(100).iloc[:, 3:24].values
# ((kmeans.labels_ == kmeans.predict(x))-1).sum()
# Take the first 10,000 rows and cluster them on positional columns 3..86 (84 features).
chosen = data_original.iloc[:10000].reset_index(drop=True)
x = chosen.iloc[:, 3:87].values
kmeans = KMeans(random_state=0, n_clusters=10)
kmeans.fit(x)
# chosen["rmsd"] = chosen.iloc[:,3:87].apply(lambda x: getScore(x, kmeans), axis=1)
# Store each row's cluster assignment next to its features.
chosen["cluster"] = kmeans.labels_

In [376]:



Out[376]:
(100, 84)

In [399]:
# Pairwise Euclidean distance between the fitted k-means cluster centers.
# (The notebook calls this "rmsd"; no division by the number of features is applied.)
n, k = kmeans.cluster_centers_.shape
t1 = kmeans.cluster_centers_
# Broadcasting (n, 1, k) against (1, n, k) yields the full (n, n) distance matrix.
cluster_rmsd = ((t1[:, np.newaxis, :] - t1[np.newaxis, :, :])**2).sum(axis=2)**0.5
# Overwrite the self-distances by position rather than by testing floats for
# equality with 0.0; the value 100 is kept on the diagonal so the matrix stays
# finite for plotting.
np.fill_diagonal(cluster_rmsd, 100)
# Smallest distance between two *different* centers. Taking the minimum over the
# off-diagonal entries only means the placeholder can never be reported as the
# answer, even when every real distance is larger than 100.
off_diagonal = ~np.eye(n, dtype=bool)
cluster_rmsd[off_diagonal].min()


Out[399]:
18.62429297622066

In [400]:
# Heat map of the center-to-center distance matrix with a colour scale.
plt.rcParams.update({'figure.figsize': [16.18033, 10]})
plt.imshow(cluster_rmsd, cmap="seismic")
plt.colorbar()


Out[400]:
<matplotlib.colorbar.Colorbar at 0x193d84908>

In [391]:
# Pairwise Euclidean distances between the fitted cluster centers, as an (n, n) matrix.
n, k = kmeans.cluster_centers_.shape
t1 = kmeans.cluster_centers_[:n, :k]
# Insert singleton axes so the subtraction broadcasts every center against every other.
cluster_rmsd = ((t1[:, None, :] - t1[None, :, :])**2).sum(axis=2)**0.5

In [392]:
# Replace every exactly-zero entry of the distance matrix with the placeholder 100.
# Presumably this keeps the zero self-distances out of a later min().
cluster_rmsd[cluster_rmsd==0.0] = 100

In [393]:
# Smallest entry currently stored in the center-to-center distance matrix.
cluster_rmsd.min()


Out[393]:
4.854396829416842

In [394]:
# Heat map of the center-to-center distance matrix with a colour scale.
plt.rcParams.update({'figure.figsize': [16.18033, 10]})
plt.imshow(cluster_rmsd, cmap="seismic")
plt.colorbar()


Out[394]:
<matplotlib.colorbar.Colorbar at 0x18e05e7f0>

In [271]:
# Score the first sample on its own; it is passed as a single-row 2-D array.
kmeans.score(x[0][np.newaxis, :])


Out[271]:
-89.79255359750277

In [314]:
# Members of cluster 88, ordered by ascending `rmsd` and re-indexed from 0.
d = (
    chosen
    .query("cluster == 88")
    .sort_values("rmsd")
    .reset_index(drop=True)
)

In [ ]:
# Write cluster88_rmsd.csv with one line per (reference, other) pair of fragment
# PDB files. Only the file numbered 0 is used as the reference; every other
# candidate reference is skipped. The RMSD itself comes from the external
# `calculate_rmsd.py` command, whose printed output is parsed as a float.
pdbList = glob.glob(f"{pre}cluster88/[0-9]*.pdb")
with open(pre+"cluster88_rmsd.csv", "w") as out:
    out.write("i,j,rmsd\n")
    for p1 in pdbList:
        # The numeric file stem (e.g. ".../12.pdb" -> 12) identifies the fragment.
        i1 = int(p1.rsplit("/", 1)[-1].partition(".")[0])
        if i1 == 0:
            print(i1)
            for p2 in pdbList:
                i2 = p2.rsplit("/", 1)[-1].partition(".")[0]
                rmsd = float(getFromTerminal(f"calculate_rmsd.py {p1} {p2}"))
                out.write(f"{i1},{i2},{rmsd}\n")

In [349]:
# Load the pairwise RMSD table stored in cluster88_rmsd.csv under the `pre` directory.
cluster88 = pd.read_csv(pre+"cluster88_rmsd.csv")

The average CA RMSD within the cluster is about 1.3 (max 2.7).


In [354]:
# Ignore self-comparisons, summarise the RMSD values per reference fragment `i`,
# and show the five references with the lowest mean.
(
    cluster88
    .query("i!=j")
    .groupby("i")["rmsd"]
    .describe()
    .sort_values("mean")
    .head()
)


Out[354]:
count mean std min 25% 50% 75% max
i
0 55.0 1.297952 0.621858 0.247209 0.744088 1.234168 1.814403 2.669655
3 55.0 1.302234 0.606989 0.390278 0.766102 1.263094 1.783251 2.655407
2 55.0 1.303887 0.638794 0.174071 0.717086 1.283669 1.838137 2.670576
4 55.0 1.308397 0.648857 0.174071 0.727001 1.307860 1.865656 2.601067
9 55.0 1.331420 0.539353 0.387006 0.917789 1.346706 1.728705 2.387373

In [355]:
# Display the whole selection held in `d` (every row is rendered, not just a preview).
d


Out[355]:
pdb i seq caca_1 caca_2 caca_3 caca_4 caca_5 caca_6 caca_7 ... cbcb_14 cbcb_15 cbcb_16 cbcb_17 cbcb_18 cbcb_19 cbcb_20 cbcb_21 cluster rmsd
0 3bwsA02 124 SVIDRKTKL 9.949520 11.881444 15.007518 14.224649 11.941430 10.004884 9.158696 ... 7.121051 6.843129 4.431560 7.335259 5.484321 7.748463 9.831528 9.043426 88 3.819305
1 2gy5A03 19 GVCHEDTGE 9.729459 12.950154 15.225656 13.891017 11.779313 9.805697 9.412597 ... 6.277780 7.273304 4.563250 6.884191 5.750976 8.210796 10.516659 9.288854 88 4.452417
2 5teeA01 333 KCWDIATLE 10.085794 12.141468 15.270156 14.333816 11.746857 10.142472 9.340342 ... 6.638502 6.956893 4.632072 7.229734 5.380895 6.813114 9.548963 8.939441 88 4.612135
3 5a2fA02 56 KEMDPVTQL 10.218586 12.734818 15.575337 14.460573 11.705797 10.030134 9.314142 ... 6.102289 7.088407 4.625541 6.740666 5.120184 7.463485 9.995037 9.003468 88 4.821177
4 3bwsA02 269 YVIDTTTDT 9.946895 12.214399 15.314721 14.489808 11.962596 10.234323 9.240640 ... 6.559402 6.920260 5.023807 7.303305 5.770678 6.323815 9.489404 8.827125 88 5.162033
5 5teeA01 89 KIWDVETKT 10.036064 12.049080 15.123745 14.087447 11.657670 9.739409 9.672215 ... 7.149391 6.913264 4.877741 7.522132 5.712117 6.512667 9.248618 8.779138 88 5.462373
6 2xt6B03 81 VIVDRKTGE 9.954157 12.611201 15.333824 14.229970 11.283616 9.897378 9.132003 ... 6.168306 7.465668 4.440479 6.005554 4.201615 8.649049 9.925421 8.656796 88 5.566763
7 3bwsA02 35 DVLDINSGQ 9.834118 11.908249 14.979401 14.166902 11.109526 10.022712 9.741758 ... 7.071804 7.131282 4.950830 6.842302 4.385797 7.382644 9.384715 7.943592 88 5.761471
8 3bwsA02 213 EVYDLKEKK 9.741776 12.013299 15.171566 13.940570 12.418761 9.452124 9.560477 ... 8.233245 6.993384 3.955309 7.474078 6.513636 6.924696 9.438110 9.940254 88 5.987422
9 4x36A02 83 YYLDAKEGA 9.940290 12.375504 15.257021 14.755202 11.646676 10.272871 8.773566 ... 5.879449 6.757819 5.124569 5.930865 5.363800 8.044666 10.621360 9.346555 88 6.043243
10 3bwsA02 81 HVFDLKTLA 10.294765 12.111046 15.390107 14.873136 11.918390 10.923274 8.894362 ... 5.934416 6.495215 4.319704 6.379131 4.612946 7.610839 9.810240 8.927789 88 6.138788
11 2czrA02 47 FYKCEECGK 9.168917 12.426137 14.358852 13.570449 11.310402 9.443875 8.913856 ... 6.706592 8.060266 5.050359 6.188032 5.399819 7.802509 10.548103 8.616280 88 6.157605
12 4lpqA02 9 VEIDLDKQI 9.906225 12.531385 15.429649 14.528891 13.127890 10.403035 9.426353 ... 8.736698 7.622255 4.310046 7.259790 5.234055 6.911831 9.290947 9.327504 88 6.414481
13 3tunA02 34 CKVSLESGH 10.379654 12.639275 15.348255 14.291680 10.995448 10.162334 9.374565 ... 5.464734 6.281708 4.522317 6.022815 5.260367 7.944193 10.505252 8.564303 88 6.419547
14 3bwsA02 104 LLYDPIRDL 10.193686 13.003706 15.691595 14.823738 13.057449 10.318852 9.808470 ... 8.385339 7.620235 4.600103 7.006957 4.847549 6.773152 9.199594 8.855232 88 6.504967
15 3bwsA02 301 VISDFLDHQ 9.847713 12.340275 15.451954 14.255663 13.476367 10.383883 8.911294 ... 8.614441 7.075551 3.749933 7.625578 5.056649 8.167385 9.202552 9.558326 88 6.681753
16 1igqB00 31 WLKYEDDGQ 10.160343 13.270386 15.768248 15.105667 12.140276 10.367481 9.581594 ... 6.238425 7.055059 6.029074 6.631764 4.811035 7.643550 9.732194 8.268499 88 6.873870
17 3tunA02 8 GWYSISDES 10.406827 12.695497 15.692054 14.874257 13.157329 10.956638 9.741825 ... 8.121049 7.145075 4.038841 7.135518 4.777005 6.716304 8.934176 8.554501 88 6.953285
18 3bwsA02 114 YCSNWISED 9.621660 12.917183 15.446292 14.177167 13.226645 10.154354 9.548340 ... 9.551937 8.516931 4.503995 7.054341 5.320324 7.854528 9.869858 9.593534 88 7.150779
19 2czrA02 84 FELNFREGK 9.401689 12.725125 14.779818 13.575309 12.958589 10.353744 9.336522 ... 9.104170 8.467405 4.055002 7.074190 5.586054 7.375543 9.794145 8.695016 88 7.353073
20 4x36A02 96 FIQSADGTG 9.316404 12.502723 14.474878 12.100182 12.781625 9.473136 9.100404 ... 8.679495 7.596229 6.392345 5.241836 4.742088 8.686616 9.994502 8.133007 88 8.111603
21 5teeA01 132 FCYWFNRND 9.734386 11.471707 14.944398 14.817686 13.018109 10.472942 9.250990 ... 9.811750 7.862967 5.062631 7.820497 4.361548 7.979190 8.761995 9.243713 88 8.168455
22 3bwsA02 235 IALSPDGKY 9.942943 12.838919 14.970727 12.748605 13.571465 10.319620 9.058799 ... 8.890912 7.295113 6.352217 5.512440 4.280098 9.089517 9.640741 8.166730 88 8.599598
23 3bwsA02 291 LDVSPDNRY 10.274695 13.081531 15.338303 13.379724 13.781996 10.656155 9.302710 ... 8.529108 6.914666 7.092677 5.174410 4.133323 8.836203 9.589263 8.149068 88 8.699957
24 1ka1A02 87 IHTDAMEDV 10.407919 12.130158 14.331053 13.842548 10.225945 10.270689 9.547078 ... 5.289476 5.900444 5.524788 7.152904 5.819712 8.841456 10.445993 8.840138 88 8.776713
25 3bwsA02 146 LLLSKDGKE 9.950219 13.005416 14.986661 12.596710 13.485259 10.565754 9.318662 ... 9.009582 7.589873 6.367220 5.838040 4.157111 9.012595 9.362629 7.554568 88 8.911911
26 3a79B00 508 SAKCSGSGK 10.110187 13.011072 14.885407 13.630521 10.028573 9.704154 10.287563 ... 5.247304 6.889391 5.188907 6.328406 4.973253 8.910455 10.206703 8.177255 88 8.960456
27 3bwsA02 62 TISIPEHNE 9.817282 12.642413 15.391432 14.783654 13.754408 10.523006 9.632155 ... 9.948241 7.800527 4.945644 7.654897 4.541373 6.721983 8.166798 8.948376 88 9.328044
28 3hviA00 192 SSYLEYMKV 9.915005 12.536254 14.963794 13.874313 12.605189 10.819551 9.230969 ... 6.508354 6.251395 4.209615 8.067944 4.002141 11.135752 9.179267 9.378696 88 9.406241
29 1kvkA01 49 SLNLPNVGI 9.872140 12.464895 15.427619 14.898810 13.771154 10.793503 9.835277 ... 10.082244 8.119540 4.712597 7.447677 5.173263 6.850187 8.813840 8.941354 88 9.425864
30 3bwsA02 250 GPNHPTEGY 8.995919 11.200593 14.301685 13.813991 11.312732 9.266978 7.749330 ... 5.335237 4.460575 4.294027 5.952809 7.126341 8.807027 11.041970 11.815134 88 9.625760
31 1p9hA00 9 PNADPALGL 8.204474 10.150581 13.143430 13.322371 12.160578 11.349983 9.665771 ... 7.127809 6.223761 4.310967 7.569828 6.613921 7.029165 9.850493 9.631247 88 10.114729
32 5b3kA00 89 GLGDSSYGD 9.597320 12.154374 15.100858 14.412055 13.749628 10.152723 10.020901 ... 9.491571 4.860025 4.142701 7.559115 4.433785 7.575753 7.348795 8.629077 88 10.596721
33 2yeqA02 154 HFTYGNLAS 10.306349 12.778808 13.628778 13.996952 10.818155 7.468215 9.210052 ... 6.887122 4.880846 5.155641 4.599007 7.822213 7.813149 9.890082 8.855243 88 11.339090
34 1vx7E02 94 MGGFPHYGV 7.975001 9.675704 13.042289 12.716304 10.627055 9.650623 7.693866 ... 5.901504 5.050352 3.800005 6.113456 7.217942 9.328736 11.408817 12.313504 88 11.388345
35 2mj6A00 74 RATGEEGET 10.114860 13.810698 15.537375 13.465596 11.132085 7.529359 10.105418 ... 7.311515 5.777658 7.177136 4.094617 6.927640 8.186397 11.716082 10.635818 88 11.626912
36 4b9gA00 53 KLVNVNNPD 9.657044 11.811341 14.628034 13.417040 11.054585 11.446303 8.837651 ... 6.292343 9.008652 4.260343 7.011172 7.830382 8.814804 12.381727 10.399013 88 12.114169
37 4gyiA03 44 EEKDAEDPS 9.612258 12.716767 15.137066 13.919269 11.202133 11.844838 8.997447 ... 4.381712 8.290860 6.033016 6.653250 8.025368 8.679353 12.191490 9.909112 88 12.458303
38 3tunA02 107 AIPVPGNPD 9.183981 12.203222 14.755483 13.067821 10.905220 11.343682 8.843083 ... 5.455028 8.854027 4.492827 7.458682 7.959353 9.402717 12.204529 10.528821 88 12.475578
39 3majA02 49 LERHGGSLV 9.786978 13.407288 14.910089 12.403946 10.591506 7.053391 10.257120 ... 7.917249 5.814570 8.424906 4.247381 6.721586 6.580546 10.721272 10.583070 88 12.761652
40 2bteA05 49 EIYVPGKIL 9.999669 12.358915 14.762330 13.789237 10.346306 7.235074 9.718364 ... 6.982888 5.248495 6.646159 3.817559 7.485408 8.574403 9.458507 10.122655 88 12.829105
41 2yeqA02 235 PIYSMDSWD 10.442057 12.177690 15.400909 14.613280 14.722425 10.947508 9.156952 ... 9.444396 3.871136 3.824424 8.584323 5.558829 10.608860 7.775544 9.088020 88 13.032639
42 5a2fA02 10 LFLETEQLK 10.247132 13.817410 16.596710 14.396990 13.947150 10.481287 10.136517 ... 9.878394 5.473297 3.925470 8.597685 5.523424 10.738580 9.831265 10.684470 88 13.363569
43 5b3kA00 5 LSGSVYGTA 10.143659 13.758297 15.273129 12.781238 13.092915 9.680444 10.111543 ... 10.398278 5.785300 5.400983 6.153954 3.945883 8.266972 8.379736 9.034787 88 13.395649
44 4kqcA02 19 AAFTSDNKA 7.635839 10.297567 12.887216 11.377438 9.537807 8.587503 10.329976 ... 5.632930 4.144410 5.104934 7.282839 9.187781 6.860199 10.786813 12.001636 88 13.559004
45 1ac5A00 282 RESSQKGTA 9.565839 13.072076 13.702000 10.173500 11.112715 8.762067 10.317171 ... 11.583023 8.858782 5.732589 6.250565 3.782586 8.150827 8.276429 7.952695 88 13.570053
46 1iv8A04 55 INTSWRNQN 7.467202 11.081289 12.271182 11.130205 10.820847 10.029446 10.850627 ... 5.654761 3.746603 5.228187 6.865900 7.643214 7.479899 11.152542 11.235831 88 13.687695
47 5tjjA02 29 VKRLRDDRV 9.804014 13.411267 15.186080 13.200135 10.432072 6.660206 10.220562 ... 7.879859 5.688957 8.363690 4.255258 6.413924 7.371846 10.703006 10.747286 88 13.718079
48 1a48A01 21 IYEVDAGTL 9.616343 12.438126 13.847081 13.413589 10.542364 6.839457 9.359861 ... 7.455886 6.129662 8.608805 3.978680 6.352230 4.970822 9.293366 10.553911 88 13.853912
49 1vx7E02 79 SASTDADIT 8.199695 11.514258 13.286428 11.109321 10.686544 9.954393 10.556925 ... 6.671429 6.060616 4.341811 7.643427 9.053087 7.334598 11.572095 10.136599 88 13.957391
50 1vliA01 270 KTTTAIEGE 9.331764 12.773533 14.553049 12.981730 13.157260 12.276527 9.811162 ... 6.338029 5.398420 4.697480 7.438582 10.186050 6.663074 10.660284 11.929588 88 14.021673
51 1vx7E02 146 DTSSKIGHG 8.886813 9.614176 13.160775 13.560004 12.032557 9.949920 8.166377 ... 8.340906 3.299832 5.298683 8.124020 3.338068 9.770238 6.973861 7.576541 88 14.387859
52 2xt6B03 97 LATNPDGTP 6.727586 8.690116 11.635242 10.416782 10.696363 8.880394 9.847705 ... 7.165048 5.930047 6.727807 4.764903 7.259028 8.913892 12.010137 10.369381 88 14.400125
53 2ebeA00 24 LPGEVAGAR 8.584558 9.919054 13.655498 14.275121 13.300017 11.250684 7.274832 ... 10.058243 6.896407 6.365407 5.926227 5.536994 5.997392 8.515861 10.323625 88 14.456325
54 3bwsA02 194 IVSGNTENK 9.503940 13.078058 14.365262 14.337009 13.430169 10.672255 9.702360 ... 9.891918 7.112747 8.391471 8.114089 3.026186 11.094549 7.104325 4.670869 88 15.448550
55 2gy5A03 12 CTACMNNGV 8.239299 11.777732 14.927156 13.773328 11.719008 9.464096 9.486419 ... 9.481142 11.445315 7.693930 3.574055 6.200655 7.022531 11.150053 10.721931 88 15.909167

56 rows × 89 columns


In [315]:
# Export one fragment file per row of `d`, named cluster88/<row index>.pdb,
# printing the row index, source pdb id and start position as a progress log.
for i, row in d.iterrows():
    print(i, *row[["pdb", "i"]])
    getFragPdb(pdbId=row["pdb"], i=int(row["i"]), outFile=f"cluster88/{i}.pdb")


0 3bwsA02 124
1 2gy5A03 19
2 5teeA01 333
3 5a2fA02 56
4 3bwsA02 269
5 5teeA01 89
6 2xt6B03 81
7 3bwsA02 35
8 3bwsA02 213
9 4x36A02 83
10 3bwsA02 81
11 2czrA02 47
12 4lpqA02 9
13 3tunA02 34
14 3bwsA02 104
15 3bwsA02 301
16 1igqB00 31
17 3tunA02 8
18 3bwsA02 114
19 2czrA02 84
20 4x36A02 96
21 5teeA01 132
22 3bwsA02 235
23 3bwsA02 291
24 1ka1A02 87
25 3bwsA02 146
26 3a79B00 508
27 3bwsA02 62
28 3hviA00 192
29 1kvkA01 49
30 3bwsA02 250
31 1p9hA00 9
32 5b3kA00 89
33 2yeqA02 154
34 1vx7E02 94
35 2mj6A00 74
36 4b9gA00 53
37 4gyiA03 44
38 3tunA02 107
39 3majA02 49
40 2bteA05 49
41 2yeqA02 235
42 5a2fA02 10
43 5b3kA00 5
44 4kqcA02 19
45 1ac5A00 282
46 1iv8A04 55
47 5tjjA02 29
48 1a48A01 21
49 1vx7E02 79
50 1vliA01 270
51 1vx7E02 146
52 2xt6B03 97
53 2ebeA00 24
54 3bwsA02 194
55 2gy5A03 12

In [309]:
# Summary statistics of `rmsd` per cluster, smallest clusters first.
(
    chosen
    .groupby("cluster")["rmsd"]
    .describe()
    .sort_values("count")
)


Out[309]:
count mean std min 25% 50% 75% max
cluster
86 25.0 12.868003 2.429841 8.180901 11.585724 13.128961 15.137856 16.086021
31 27.0 10.473506 2.571394 6.830730 8.555585 10.168139 12.625339 15.197835
20 37.0 11.948910 3.092640 7.513775 9.795009 11.379272 13.952240 18.524402
92 38.0 13.966047 2.389854 8.431391 12.411322 14.141286 16.221890 17.391868
57 41.0 13.543257 2.579857 9.174207 11.875370 13.287241 14.786760 19.987259
52 41.0 12.396760 2.120092 7.802385 10.917835 11.902867 13.991850 17.486862
45 42.0 12.519146 2.424179 6.651546 11.085333 12.652927 13.997425 17.496040
90 42.0 10.328494 3.056165 5.334071 7.579417 9.787187 12.954448 16.926965
69 44.0 11.745726 4.110222 5.725782 7.896013 10.928373 14.615448 19.751685
77 45.0 11.411638 3.054516 6.919495 8.976652 10.449578 14.333481 17.071899
6 46.0 10.668448 2.545219 6.227311 8.140037 11.016674 12.629587 15.423659
0 47.0 14.509444 2.470353 9.226504 13.101469 14.556386 16.572913 19.358230
47 48.0 12.729488 3.242662 6.088605 10.585771 12.452035 14.989300 20.836815
94 49.0 11.592993 3.241832 4.906130 8.873921 12.090305 14.158854 17.384469
50 50.0 12.996113 2.060327 7.903283 11.649837 13.214512 14.436430 16.519939
98 50.0 10.288891 2.125497 5.840846 8.969731 10.230508 11.445786 15.367995
37 52.0 10.737135 3.622544 5.375806 8.121953 10.133434 12.036602 19.093855
12 53.0 11.658254 2.009164 7.229504 10.414187 11.635477 12.870523 16.490421
75 53.0 12.303176 2.765158 7.490998 9.846713 12.810812 14.106641 18.506562
91 53.0 12.253184 3.124894 7.356313 9.992794 12.165720 14.560320 19.125920
33 54.0 15.254683 2.772595 9.718127 13.463367 15.203922 17.276330 20.230821
93 54.0 12.731544 2.447766 8.084662 10.884686 12.367426 14.375135 19.421543
61 55.0 12.135543 2.968750 6.652033 9.700207 11.986171 13.914067 18.366642
4 55.0 12.095456 3.200645 5.986447 9.502334 11.845416 13.925985 18.225360
40 56.0 13.132927 2.459430 8.461646 11.342993 13.111697 14.719820 20.304801
18 56.0 11.472333 2.322243 6.660154 10.031681 11.102281 12.897364 16.418819
88 56.0 9.728882 3.453831 3.819305 6.483612 9.367143 13.115371 15.909167
84 57.0 12.270203 1.977655 8.086094 10.720760 12.208336 13.513403 16.932392
36 58.0 12.126113 2.392913 6.474681 10.996501 12.152336 13.598899 17.566650
11 58.0 11.802073 3.030148 5.937072 9.867863 11.690367 14.526130 18.369063
... ... ... ... ... ... ... ... ...
97 96.0 10.397544 2.156213 4.843555 8.971957 10.500444 12.034870 15.887437
81 96.0 10.893850 1.806490 7.209888 9.664418 10.642898 12.175620 17.066911
29 99.0 11.169434 2.484482 6.119285 9.157915 11.336999 13.020159 17.398069
24 100.0 10.958577 3.195694 6.285791 8.380647 10.144780 12.700818 20.347792
21 102.0 9.274561 3.901041 3.612956 6.361961 8.411638 11.606667 19.017857
42 102.0 8.683971 1.646199 5.457985 7.718123 8.563766 9.620044 13.560157
32 103.0 11.377467 2.587875 7.017511 9.356829 11.070049 13.138082 18.061126
26 103.0 11.673854 2.888604 5.961631 9.681419 11.186433 13.421784 17.845587
72 104.0 10.797946 2.223295 5.842216 9.489124 10.696482 12.322521 16.318550
82 106.0 8.864908 2.846169 4.126482 6.461541 8.885306 10.930845 14.974236
68 110.0 8.798542 3.038090 4.574051 6.556161 8.270263 10.427786 17.689796
99 114.0 9.261581 2.506325 4.685189 7.627691 8.848057 11.181854 16.532881
51 114.0 7.279830 3.422616 3.365311 4.596202 6.227794 9.064487 18.266724
95 115.0 8.240733 1.837165 5.047425 6.738861 8.152049 9.716614 12.962903
63 117.0 8.748993 1.660973 5.493222 7.657481 8.697411 9.703374 14.036003
53 125.0 7.374729 4.107384 2.404638 3.764119 6.233578 9.923549 18.865109
65 126.0 9.250297 1.981388 5.138529 7.795777 9.112192 10.608026 15.085967
1 134.0 8.061704 2.264072 3.513803 6.336191 7.963095 9.393186 16.710189
89 135.0 8.777316 1.859779 4.499449 7.717106 8.683468 9.719918 15.497464
19 135.0 7.835607 1.913975 4.302796 6.350714 7.800876 9.185014 12.179388
25 135.0 6.388283 3.302691 1.961434 3.600076 5.666581 8.153018 18.217225
54 150.0 7.302164 2.970296 2.804589 5.026684 6.687114 8.919654 16.160247
79 159.0 8.747054 1.775704 4.651922 7.499323 8.755172 9.964741 13.639758
16 160.0 6.539418 2.908991 3.385512 4.389139 5.591175 7.529391 17.318794
28 160.0 7.127153 1.459804 3.737256 6.113912 7.129481 8.012175 11.654530
7 208.0 6.800668 1.333470 3.752799 5.853304 6.655734 7.592728 10.969135
14 223.0 7.032772 1.580068 3.240167 5.842751 6.918776 8.082615 11.896784
64 267.0 6.079601 1.436474 2.667226 5.173178 6.006692 6.962764 10.991078
2 285.0 4.884285 2.033753 1.825475 3.610672 4.196315 5.714881 13.594217
73 1699.0 2.801301 1.493118 0.604966 1.870317 2.457331 3.339556 14.398997

100 rows × 8 columns


In [296]:
# Summary statistics of `rmsd` per cluster, in cluster-label order.
(
    chosen
    .groupby("cluster")["rmsd"]
    .describe()
)


Out[296]:
count mean std min 25% 50% 75% max
cluster
0 47.0 14.509444 2.470353 9.226504 13.101469 14.556386 16.572913 19.358230
1 134.0 8.061704 2.264072 3.513803 6.336191 7.963095 9.393186 16.710189
2 285.0 4.884285 2.033753 1.825475 3.610672 4.196315 5.714881 13.594217
3 63.0 11.792323 2.030125 7.925980 10.115502 11.795392 13.097566 15.275605
4 55.0 12.095456 3.200645 5.986447 9.502334 11.845416 13.925985 18.225360
5 88.0 10.875914 3.103294 5.921701 8.141490 10.544263 13.040039 18.224837
6 46.0 10.668448 2.545219 6.227311 8.140037 11.016674 12.629587 15.423659
7 208.0 6.800668 1.333470 3.752799 5.853304 6.655734 7.592728 10.969135
8 64.0 11.639289 2.423901 6.294332 9.685099 11.532024 13.191878 16.898084
9 68.0 13.283440 2.730605 8.283355 11.036373 13.383355 15.224419 20.960029
10 63.0 9.088767 3.612960 3.987563 6.032655 7.856961 12.208410 17.095299
11 58.0 11.802073 3.030148 5.937072 9.867863 11.690367 14.526130 18.369063
12 53.0 11.658254 2.009164 7.229504 10.414187 11.635477 12.870523 16.490421
13 59.0 11.127064 2.586649 6.015839 9.420929 11.241936 12.618502 18.425722
14 223.0 7.032772 1.580068 3.240167 5.842751 6.918776 8.082615 11.896784
15 65.0 12.337664 2.833936 7.081583 9.988564 11.966763 14.439806 19.244873
16 160.0 6.539418 2.908991 3.385512 4.389139 5.591175 7.529391 17.318794
17 70.0 10.064104 3.865495 4.297259 7.014241 8.819931 13.695621 19.649993
18 56.0 11.472333 2.322243 6.660154 10.031681 11.102281 12.897364 16.418819
19 135.0 7.835607 1.913975 4.302796 6.350714 7.800876 9.185014 12.179388
20 37.0 11.948910 3.092640 7.513775 9.795009 11.379272 13.952240 18.524402
21 102.0 9.274561 3.901041 3.612956 6.361961 8.411638 11.606667 19.017857
22 66.0 10.666855 3.175975 4.637165 8.003551 10.195682 13.005901 18.776521
23 63.0 9.545131 2.336334 4.861592 7.906835 9.587818 11.061898 14.832847
24 100.0 10.958577 3.195694 6.285791 8.380647 10.144780 12.700818 20.347792
25 135.0 6.388283 3.302691 1.961434 3.600076 5.666581 8.153018 18.217225
26 103.0 11.673854 2.888604 5.961631 9.681419 11.186433 13.421784 17.845587
27 85.0 9.117133 2.413714 5.050972 7.208351 8.711424 10.913915 14.786774
28 160.0 7.127153 1.459804 3.737256 6.113912 7.129481 8.012175 11.654530
29 99.0 11.169434 2.484482 6.119285 9.157915 11.336999 13.020159 17.398069
... ... ... ... ... ... ... ... ...
70 64.0 13.578291 2.536454 8.479009 11.552321 13.365783 15.193793 20.172172
71 82.0 8.563055 2.027075 3.655412 7.122049 8.289281 10.049077 12.663579
72 104.0 10.797946 2.223295 5.842216 9.489124 10.696482 12.322521 16.318550
73 1699.0 2.801301 1.493118 0.604966 1.870317 2.457331 3.339556 14.398997
74 74.0 10.274946 3.124773 6.902806 8.267877 9.224823 10.704157 20.079225
75 53.0 12.303176 2.765158 7.490998 9.846713 12.810812 14.106641 18.506562
76 71.0 11.688041 2.918578 6.455069 9.388986 11.885100 13.805711 17.627366
77 45.0 11.411638 3.054516 6.919495 8.976652 10.449578 14.333481 17.071899
78 72.0 9.603870 2.805974 5.038361 7.447394 9.040345 11.661425 16.632578
79 159.0 8.747054 1.775704 4.651922 7.499323 8.755172 9.964741 13.639758
80 73.0 14.303270 2.217348 8.772048 12.789239 14.371254 15.990052 19.490860
81 96.0 10.893850 1.806490 7.209888 9.664418 10.642898 12.175620 17.066911
82 106.0 8.864908 2.846169 4.126482 6.461541 8.885306 10.930845 14.974236
83 82.0 10.152878 1.844130 5.890020 8.937818 10.050156 11.240107 15.926636
84 57.0 12.270203 1.977655 8.086094 10.720760 12.208336 13.513403 16.932392
85 68.0 10.445316 2.388606 6.026959 9.001074 10.661674 12.279133 15.423282
86 25.0 12.868003 2.429841 8.180901 11.585724 13.128961 15.137856 16.086021
87 63.0 11.927190 2.376512 8.272821 9.721701 11.669178 13.658085 17.103763
88 56.0 9.728882 3.453831 3.819305 6.483612 9.367143 13.115371 15.909167
89 135.0 8.777316 1.859779 4.499449 7.717106 8.683468 9.719918 15.497464
90 42.0 10.328494 3.056165 5.334071 7.579417 9.787187 12.954448 16.926965
91 53.0 12.253184 3.124894 7.356313 9.992794 12.165720 14.560320 19.125920
92 38.0 13.966047 2.389854 8.431391 12.411322 14.141286 16.221890 17.391868
93 54.0 12.731544 2.447766 8.084662 10.884686 12.367426 14.375135 19.421543
94 49.0 11.592993 3.241832 4.906130 8.873921 12.090305 14.158854 17.384469
95 115.0 8.240733 1.837165 5.047425 6.738861 8.152049 9.716614 12.962903
96 60.0 10.816445 2.397899 6.838141 8.975154 10.862857 12.261697 17.314459
97 96.0 10.397544 2.156213 4.843555 8.971957 10.500444 12.034870 15.887437
98 50.0 10.288891 2.125497 5.840846 8.969731 10.230508 11.445786 15.367995
99 114.0 9.261581 2.506325 4.685189 7.627691 8.848057 11.181854 16.532881

100 rows × 8 columns


In [249]:
# Summary statistics of the `caca_1` feature per cluster, in cluster-label order.
(
    chosen
    .groupby("cluster")["caca_1"]
    .describe()
)


Out[249]:
count mean std min 25% 50% 75% max
cluster
0 47.0 9.107972 0.967933 6.033747 8.792595 9.222053 9.653984 10.783340
1 134.0 9.283012 0.434942 8.082854 9.065539 9.288622 9.464692 10.421309
2 285.0 5.231241 0.302073 3.988152 5.036580 5.179334 5.347704 6.532340
3 63.0 6.605184 1.571319 4.544618 5.137373 6.012451 7.725599 10.031975
4 55.0 6.139642 1.315773 4.880736 5.251148 5.602896 6.290223 9.585359
5 88.0 6.455053 1.470074 4.464227 5.159675 5.598491 7.858600 9.366185
6 46.0 9.717776 0.675211 7.649774 9.463802 9.911917 10.162426 10.543301
7 208.0 9.950973 0.487548 8.361642 9.739443 10.024712 10.305412 10.865170
8 64.0 9.245912 0.768627 7.141240 8.706300 9.304117 9.840726 10.642015
9 68.0 7.343715 1.713498 4.871723 5.481516 7.899383 8.786096 9.809787
10 63.0 9.759267 0.756167 7.502295 9.520896 10.034769 10.182660 10.910431
11 58.0 6.014184 1.061447 4.466815 5.352783 5.676593 6.344498 8.828549
12 53.0 8.833394 0.895318 6.091265 8.552012 8.962975 9.497625 9.933512
13 59.0 7.978848 1.301351 4.934107 7.336499 8.219158 8.912570 10.045939
14 223.0 9.786439 0.521609 8.044416 9.435349 9.828861 10.157747 10.836526
15 65.0 7.312742 1.547518 5.184872 5.753088 7.770713 8.729336 10.152164
16 160.0 8.295052 0.813046 6.261518 7.662178 8.574114 8.984371 9.775524
17 70.0 9.867539 0.650233 7.321350 9.652145 9.966244 10.336061 10.850627
18 56.0 9.083503 0.923239 6.788041 8.580646 9.239259 9.767775 10.545494
19 135.0 9.920066 0.485947 8.154524 9.655799 9.991779 10.253253 10.822775
20 37.0 5.890449 1.040159 4.924730 5.201715 5.456261 6.167805 8.527210
21 102.0 5.598708 1.064236 4.619137 5.054111 5.172285 5.451120 9.318322
22 66.0 8.514215 1.092339 5.422628 7.782361 8.667383 9.250473 10.368338
23 63.0 5.684063 0.705324 4.586082 5.271411 5.431941 5.954486 7.771918
24 100.0 9.637346 1.029738 4.625182 9.482487 9.835333 10.228119 10.868514
25 135.0 8.122323 0.681756 5.145186 8.004938 8.219689 8.500360 9.150583
26 103.0 5.346222 0.576351 4.687043 5.025092 5.189865 5.400290 7.749330
27 85.0 9.795203 0.592879 7.693119 9.523393 9.778906 10.241662 10.992158
28 160.0 8.461235 0.798432 6.702386 7.885981 8.386912 9.000854 10.526746
29 99.0 5.629837 0.868820 4.409707 5.112083 5.384417 5.759179 8.437553
... ... ... ... ... ... ... ... ...
70 64.0 5.803136 0.855240 4.681069 5.293294 5.483550 5.966646 8.468333
71 82.0 7.780235 0.832253 5.466374 7.430647 7.794484 8.251898 9.736248
72 104.0 7.186783 1.567058 4.318045 5.531958 7.865799 8.520729 9.806312
73 1699.0 5.160235 0.235914 4.404136 5.008525 5.136191 5.286947 6.802791
74 74.0 9.634687 0.482808 8.166377 9.323127 9.662062 9.970132 10.393231
75 53.0 6.135036 1.518998 4.822321 5.164798 5.428969 6.046421 10.150519
76 71.0 8.603556 1.076276 5.301748 8.301310 8.953986 9.205549 10.191883
77 45.0 8.173880 0.899719 5.766696 7.818158 8.201186 8.780516 9.705441
78 72.0 9.098500 0.859052 5.198376 8.703559 9.130106 9.696855 10.657252
79 159.0 8.015971 0.867010 5.424811 7.424664 8.109244 8.637223 10.200883
80 73.0 6.431632 1.370623 4.440581 5.251721 5.576448 7.868195 8.885790
81 96.0 6.121469 1.159343 4.399655 5.320935 5.659830 7.216746 9.233649
82 106.0 5.724130 1.108622 4.320683 5.074471 5.247718 5.689854 9.179356
83 82.0 9.798888 0.654166 7.072979 9.596563 9.926461 10.280733 10.722654
84 57.0 6.501368 1.259104 4.295369 5.409472 6.043009 7.475144 9.103723
85 68.0 7.415206 0.968708 5.273489 6.928287 7.352596 8.102690 9.408419
86 25.0 6.279767 1.147799 5.008773 5.382519 6.031190 6.806560 9.179280
87 63.0 9.139912 0.899868 6.668500 8.736752 9.286281 9.837852 10.625488
88 56.0 9.571231 0.805657 6.727586 9.384208 9.825700 10.091892 10.442057
89 135.0 9.717952 0.596464 7.845283 9.374381 9.823113 10.126503 10.808686
90 42.0 6.136051 0.920375 4.891723 5.416706 5.840446 7.064580 8.199224
91 53.0 5.561117 0.862180 4.560911 5.051378 5.437956 5.666302 8.549741
92 38.0 8.371339 1.328756 5.091163 7.665534 8.825273 9.291897 10.406777
93 54.0 6.379349 1.508717 4.740458 5.330225 5.584230 8.049022 9.562082
94 49.0 8.810355 0.762296 7.308287 8.286874 9.070411 9.305305 10.228033
95 115.0 5.762286 0.670392 4.908475 5.319549 5.569041 5.974905 8.491797
96 60.0 5.602232 1.119957 4.692906 4.998555 5.202498 5.513247 8.917312
97 96.0 9.373655 0.736226 7.810011 8.825228 9.392087 10.019339 10.682501
98 50.0 9.584710 0.577891 8.025982 9.198726 9.616575 10.020066 10.673256
99 114.0 9.474343 0.854113 6.201775 9.040631 9.639253 10.104994 10.998346

100 rows × 8 columns


In [233]:
# Rows among the first 1000 whose k-means label is 0.
# NOTE(review): the boolean mask only lines up if `kmeans` was fitted on exactly these 1000 rows.
d = data_original.iloc[:1000][kmeans.labels_ == 0]
# List the row index, pdb id and fragment start position of each selected row.
for i, row in d.iterrows():
    print(i, *row[["pdb", "i"]])


6 1igqB00 6
14 1igqB00 14
20 1igqB00 20
29 1igqB00 29
39 1igqB00 39
65 2r7rA08 16
220 3fcnA00 118
225 3fcnA00 123
230 3fcnA00 128
267 5tjjA02 27
290 5tjjA02 50
296 5tjjA02 56
326 5tjjA02 86
352 5tjjA02 112
369 5tjjA02 129
419 2gy5A03 17
428 2gy5A03 26
441 2gy5A03 39
454 2gy5A03 52
467 2gy5A03 65
475 2gy5A03 73
488 2gy5A03 86
502 2gy5A03 100
509 2gy5A03 107
517 2gy5A03 115
528 2gy5A03 126
566 1ka1A02 37
594 1ka1A02 65
614 1ka1A02 85
622 1ka1A02 93
631 1ka1A02 102
638 1ka1A02 109
662 2czrA02 6
675 2czrA02 19
701 2czrA02 45
709 2czrA02 53
738 2czrA02 82
746 2czrA02 90
753 2czrA02 97
769 1a48A01 1
787 1a48A01 19
802 1a48A01 34
808 1a48A01 40
836 1a48A01 68
841 1a48A01 73
872 1n08A00 1
891 1n08A00 20
906 1n08A00 35
928 1n08A00 57
956 1n08A00 85
964 1n08A00 93

In [232]:
# Per-feature squared difference between sample `i` and the center of its assigned cluster.
i = 0
np.square(x[i] - kmeans.cluster_centers_[kmeans.labels_[i]])


Out[232]:
array([1.99762260e+00, 6.08547783e+00, 1.60923936e-01, 2.37948752e+00,
       7.32518437e-01, 1.82500920e+00, 4.06818698e+00, 1.24109133e+00,
       7.15746328e+00, 6.30259290e+00, 1.00427376e+01, 4.99073131e-01,
       4.70566311e-01, 4.60160346e-01, 1.20339491e+00, 1.86631283e+00,
       3.52674964e+00, 4.53173738e+00, 4.58810165e-01, 5.22721704e-01,
       4.65781076e-02, 3.17588895e+00, 1.27787050e+01, 1.26398678e-02,
       3.30453780e+00, 6.45726474e-01, 5.38713829e-01, 8.91350571e+00,
       5.56125657e-01, 8.53646906e+00, 5.90221803e+00, 6.77068565e+00,
       7.44166894e-01, 7.17918538e-01, 4.57289185e-01, 2.46043792e-01,
       1.80884812e+00, 3.64631819e+00, 2.66007485e+00, 4.56652806e-01,
       1.68522764e-01, 6.33084480e-03, 1.48740274e+00, 4.22502739e+00,
       1.01220387e-01, 2.55282600e-01, 1.24617964e-01, 3.00895134e-03,
       6.86502116e+00, 5.05230963e+00, 1.58429942e+01, 1.61148917e+01,
       2.15972173e+01, 4.29636052e+00, 4.42807599e-01, 5.83744226e-01,
       6.36432948e-02, 1.04878810e+00, 3.25555131e+00, 3.49581231e+00,
       2.80374021e-01, 9.41031106e-02, 1.32645659e-02, 2.58960079e+00,
       1.02770963e+01, 6.00112750e-01, 5.17046586e-01, 1.15277658e-01,
       5.47032365e-01, 1.20201860e+01, 3.92740337e+00, 1.72796302e+01,
       1.58457873e+01, 1.60462948e+01, 5.56059148e+00, 1.59844043e-01,
       6.15317327e-01, 6.74883261e-01, 7.08147322e-01, 3.90341007e+00,
       1.64559057e+00, 4.22006932e-01, 2.41808675e-03, 1.29815240e-02])

In [ ]:


In [ ]:

With only CA-CA distances


In [3]:
# Load the table from clustered_bin2_jan18.csv, using its first column as the row index.
# NOTE(review): the path is an absolute, machine-specific location.
data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/clustered_bin2_jan18.csv", index_col=0)


/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/numpy/lib/arraysetops.py:571: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

In [4]:
# Preview the first rows of the loaded table.
data_original.head()


Out[4]:
pdb i seq dis1 dis2 dis3 dis4 dis5 dis6 dis7 ... d15 d16 d17 d18 d19 d20 d21 dd category count
0 1igqB00 0 DKLKKAIVQ 9.545797 12.242739 11.389445 14.435853 15.702080 18.185148 9.427593 ... 15 9 13 15 9 13 9 9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1... NaN NaN
1 1igqB00 1 KLKKAIVQV 9.427593 9.508488 13.046163 14.956468 17.811722 21.096529 5.881323 ... 19 9 13 17 9 13 9 9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1... NaN NaN
2 1igqB00 2 LKKAIVQVE 5.881323 9.385722 11.658462 14.435720 17.857430 21.547594 9.566237 ... 19 9 13 17 9 13 11 5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,... NaN NaN
3 1igqB00 3 KKAIVQVEH 9.566237 12.596333 15.433746 18.900240 22.492163 25.003511 9.934756 ... 19 9 13 15 11 13 11 9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13... 9987.0 13.0
4 1igqB00 4 KAIVQVEHD 9.934756 12.622395 16.016058 19.465660 21.763054 25.281502 9.527388 ... 19 11 13 17 11 13 11 9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1... 6835.0 18.0

5 rows × 49 columns


In [ ]:


In [70]:
# (number of rows, number of columns) of the loaded table.
data_original.shape


Out[70]:
(1901430, 49)

In [87]:
# Preview the first rows of the loaded table.
data_original.head()


Out[87]:
pdb i seq dis1 dis2 dis3 dis4 dis5 dis6 dis7 ... d15 d16 d17 d18 d19 d20 d21 dd category count
0 1igqB00 0 DKLKKAIVQ 9.545797 12.242739 11.389445 14.435853 15.702080 18.185148 9.427593 ... 15 9 13 15 9 13 9 9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1... NaN NaN
1 1igqB00 1 KLKKAIVQV 9.427593 9.508488 13.046163 14.956468 17.811722 21.096529 5.881323 ... 19 9 13 17 9 13 9 9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1... NaN NaN
2 1igqB00 2 LKKAIVQVE 5.881323 9.385722 11.658462 14.435720 17.857430 21.547594 9.566237 ... 19 9 13 17 9 13 11 5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,... NaN NaN
3 1igqB00 3 KKAIVQVEH 9.566237 12.596333 15.433746 18.900240 22.492163 25.003511 9.934756 ... 19 9 13 15 11 13 11 9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13... 9987.0 13.0
4 1igqB00 4 KAIVQVEHD 9.934756 12.622395 16.016058 19.465660 21.763054 25.281502 9.527388 ... 19 11 13 17 11 13 11 9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1... 6835.0 18.0

5 rows × 49 columns


In [71]:
from sklearn.cluster import KMeans
x = data_original.head(10000).iloc[:, 3:24].values
kmeans = KMeans(n_clusters=1000, random_state=0).fit(x)

In [75]:
# Project the clustering onto the first two feature columns of x: samples are
# coloured by their assigned cluster label, centroids are overlaid in red.
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], c="red")


Out[75]:
<matplotlib.collections.PathCollection at 0x12a4ec940>

In [90]:
# Score a single sample (row 3). The value shown below is the negative of the
# squared distance to the assigned centroid that is computed by hand for row 3
# in a later cell of this notebook.
kmeans.score(x[3:4])


Out[90]:
-3.2598634605765255

In [130]:
# Members of cluster 941: kmeans.labels_ has one entry per row of x, and x was
# built from the same head(10000), so the boolean mask lines up row for row.
data_original.head(10000)[kmeans.labels_ == 941]


Out[130]:
pdb i seq dis1 dis2 dis3 dis4 dis5 dis6 dis7 ... d15 d16 d17 d18 d19 d20 d21 dd category count
0 1igqB00 0 DKLKKAIVQ 9.545797 12.242739 11.389445 14.435853 15.702080 18.185148 9.427593 ... 15 9 13 15 9 13 9 9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1... NaN NaN
2734 3bwsA02 260 KKGLVLGKV 9.273508 11.413203 12.253963 13.338609 17.082760 19.947418 8.448602 ... 15 9 13 17 9 13 11 9,11,13,13,17,19,9,9,11,15,17,7,9,13,15,9,13,1... NaN NaN
3623 2kwbA00 48 PSAEEGAED 10.160047 12.109205 12.734339 13.432731 16.219532 18.111626 9.109225 ... 15 9 13 17 9 13 9 11,13,13,13,17,19,9,11,13,15,17,7,9,13,15,9,13... NaN NaN
4542 4x36A02 113 TLADKPEFT 8.823990 11.467757 11.260575 14.548013 16.793629 19.895035 8.978478 ... 15 9 13 17 9 13 11 9,11,11,15,17,19,9,9,13,15,19,5,9,11,15,9,13,1... NaN NaN
5234 2xt6B03 77 QRHAVIVDR 8.991055 11.222426 11.667209 14.419406 17.422922 17.947512 8.275139 ... 13 11 13 15 9 13 9 9,11,11,15,17,17,9,9,13,17,17,5,9,13,13,11,13,... NaN NaN

5 rows × 49 columns


In [169]:
# Members of cluster 55 (same head(10000) subset that the model was fitted on).
data_original.head(10000)[kmeans.labels_ == 55]


Out[169]:
pdb i seq dis1 dis2 dis3 dis4 dis5 dis6 dis7 ... d15 d16 d17 d18 d19 d20 d21 dd category count
81 2r7rA08 32 HENEIQLYL 5.333650 6.582048 8.963180 10.288549 10.966698 12.824388 5.418285 ... 11 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5 7.0 12033.0
82 2r7rA08 33 ENEIQLYLI 5.418285 6.527355 9.036262 10.305749 11.110831 12.772940 5.267358 ... 11 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5 7.0 12033.0
109 2r7rA08 60 SRDKYRILE 5.273392 7.189122 9.218030 10.316360 11.565206 13.020775 5.393554 ... 9 5 7 9 5 5 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,5,5 119.0 500.0
136 3fcnA00 9 FVWCQQQAD 5.353385 6.903554 8.958042 10.115607 11.136167 12.917888 5.357417 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
203 3fcnA00 101 GVIWRRAVS 5.706202 6.473025 8.968908 10.203383 11.063753 12.901576 5.376695 ... 11 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5 7.0 12033.0
211 3fcnA00 109 SEAKAALIE 5.344415 6.596049 8.857686 10.147244 11.163829 13.002431 5.334358 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
262 5tjjA02 14 AELHQVAAH 5.302562 6.808673 9.016759 10.052061 10.938404 12.879579 5.065632 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1.0 20575.0
316 5tjjA02 76 DESTWRRVL 5.653988 7.055184 9.186121 10.291211 11.365362 13.000630 4.902782 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1.0 20575.0
397 5tjjA02 165 ESAAKVSAW 5.407758 6.636605 9.023434 10.096171 11.093265 12.860629 5.241992 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,5,9,9,5,7,9,5,7,5 32.0 2048.0
573 1ka1A02 44 QAKYCLLAL 5.226872 6.776495 8.958233 10.000200 10.943960 12.758240 4.969972 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1.0 20575.0
604 1ka1A02 75 AAGNVIVHE 5.136070 7.062123 8.969414 10.173305 11.103614 12.970013 5.939757 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
764 2czrA02 108 SEAREFIKE 5.000071 6.672308 8.776916 9.992788 10.722353 12.505007 5.711265 ... 9 5 7 9 5 5 5 5,7,9,9,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,5,5 72.0 857.0
1242 3c2gA02 10 KPTFIHNVL 5.510642 6.534197 8.919077 10.171537 10.914976 12.664318 5.196806 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1.0 20575.0
1349 4e8uA00 66 DWNGLHNGL 5.240033 6.915307 8.927713 10.154488 11.080714 12.896862 5.195405 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1.0 20575.0
1411 4e8uA00 128 TIAELTEEE 5.482339 6.552261 8.640179 10.185078 11.387709 12.941839 5.131206 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,5,9,11,11,5,5,9,9,5,7,9,5,7,5 123.0 488.0
1416 4e8uA00 133 TEEEARKQE 5.517636 6.781759 8.824654 10.205109 11.145656 12.672886 5.547846 ... 9 5 7 9 5 5 5 5,7,9,11,11,13,5,7,9,9,11,5,5,9,9,5,7,9,5,5,5 242.0 250.0
1422 4e8uA00 139 KQELLVQNL 5.110320 6.776401 9.302724 10.483055 11.061368 12.841620 5.393684 ... 11 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5 7.0 12033.0
1424 4e8uA00 141 ELLVQNLRQ 5.569650 6.717126 9.148919 10.256499 10.802386 12.373766 5.528145 ... 9 5 5 9 5 5 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,5,9,5,5,5 134.0 442.0
1440 4e8uA00 157 DMKEIEELC 5.243148 6.879291 9.108886 10.120252 11.231353 12.822567 5.409897 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
1536 2pp4A00 0 GARQLSKLK 6.207573 7.202536 9.400275 10.749393 11.598902 13.362099 5.100773 ... 11 5 5 9 5 7 5 7,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,5,9,5,7,5 728.0 104.0
1541 2pp4A00 5 SKLKRFLTT 5.183907 6.706759 8.854277 10.298901 11.097798 12.910788 5.072634 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
1544 2pp4A00 8 KRFLTTLQQ 5.264502 6.452593 8.704339 10.355394 11.098370 12.683537 5.190423 ... 11 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5 7.0 12033.0
1612 2pp4A00 76 ELLHCARLA 5.472988 6.820663 8.981866 10.199092 11.791062 13.364829 5.001121 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
1948 4kqcA02 43 RAEGFVKRF 5.550560 6.733570 8.984053 10.487504 11.358106 12.615863 5.444900 ... 9 5 7 9 5 5 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,5,5 119.0 500.0
1981 4kqcA02 76 NEADIANEV 5.605928 6.408804 8.675251 10.291935 11.278878 12.624000 5.419741 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,5,9,11,11,5,5,9,9,5,7,9,5,7,5 123.0 488.0
2180 1vx7E02 158 TSEEKVKYY 5.358503 6.696349 8.846507 10.074055 11.011369 12.806045 4.978814 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1.0 20575.0
2854 3wx4A00 69 NIDDVLKTI 5.374019 6.652499 9.082006 10.314707 11.130764 12.851135 5.035265 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
3252 1ac5A00 152 DFLENYFKI 5.352051 6.727336 8.592132 10.314495 11.319632 13.000024 5.516701 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,5,9,11,11,5,5,9,9,5,7,9,5,7,5 123.0 488.0
3278 1ac5A00 178 GQYIPFFAN 5.843970 6.960258 9.138881 10.514634 11.141170 13.012049 6.073991 ... 11 5 5 9 5 7 5 5,7,9,11,11,13,7,7,9,11,11,5,7,9,11,5,5,9,5,7,5 7934.0 16.0
3279 1ac5A00 179 QYIPFFANA 6.073991 6.677140 8.863530 10.379977 11.407236 12.863068 5.448027 ... 9 5 7 9 5 7 5 7,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 852.0 92.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7234 3c1yA02 54 LLRIVEEIR 5.496828 6.834570 9.111869 10.276987 11.216656 12.825080 5.083735 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
7381 16vpA00 61 LPSDVVEWG 5.291775 6.773263 9.009974 10.250907 10.991113 12.455873 5.153724 ... 9 5 7 9 5 5 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5 23.0 3170.0
7436 16vpA00 116 AREESYRTV 5.769338 7.464414 9.355312 10.482346 11.950436 13.531206 5.497259 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
7494 16vpA00 174 RLARVLFLH 5.416078 6.982404 8.758867 10.387445 11.068685 12.976485 5.758302 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
7510 16vpA00 190 EILWAAYAE 5.806808 6.595221 8.930958 10.298653 11.531592 13.227515 5.468965 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
7514 16vpA00 194 AAYAEQMMR 5.534247 6.771947 9.057928 10.301928 11.553566 13.386687 5.153292 ... 11 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5 7.0 12033.0
7561 16vpA00 241 EARRLRELN 5.337707 6.590122 8.774970 10.208087 10.912583 12.535189 5.406728 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
7604 16vpA00 284 RASGYFMVL 5.073411 6.951392 8.907423 10.098037 11.398313 13.217564 5.346855 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
7633 3cjeA00 41 PPLALFIAG 5.453949 6.889291 9.076591 10.533273 11.137803 12.805101 5.271422 ... 9 5 7 9 5 5 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5 23.0 3170.0
7690 3cjeA00 106 PIEAQQALI 5.299537 6.546436 8.796499 10.062986 11.049634 12.792604 5.024445 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1.0 20575.0
7735 2munA00 12 IACGQCRDK 5.168211 6.778906 9.208090 9.972991 11.608543 13.115890 5.561805 ... 9 5 7 9 5 7 5 5,7,9,9,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 9.0 6656.0
7759 2munA00 36 TFKKCQDLL 5.815854 6.762688 8.830809 10.575121 11.575522 13.029126 5.006928 ... 11 5 7 9 5 7 5 5,7,9,11,11,13,5,5,9,9,11,5,7,9,11,5,7,9,5,7,5 16.0 4654.0
7779 3hviA00 18 DPQSVLEAI 5.454301 6.552622 9.117189 10.232352 10.933669 12.845427 4.972138 ... 11 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 4.0 14304.0
7803 3hviA00 42 AKGQIMDAV 5.596153 7.134886 9.099961 10.281355 11.522535 13.294353 5.162168 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
7983 1iv8A04 24 YRYYQVLVG 5.510259 6.778948 9.046924 10.256198 11.195829 12.715963 5.247990 ... 9 5 7 9 5 5 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5 23.0 3170.0
8004 1iv8A04 65 EYENRVMEL 5.419779 6.677068 9.125540 10.355475 10.915956 12.973279 5.006636 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1.0 20575.0
8009 1iv8A04 70 VMELVEETF 5.211782 6.753980 9.108153 10.221592 11.196578 12.860190 5.191833 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
8033 4gyiA03 10 DPASLYADL 5.917629 7.046831 8.958712 10.471177 11.489807 13.168632 5.319816 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 10.0 5374.0
8092 4gyiA03 77 FDRDVQCIK 5.399261 6.542648 8.751293 10.142467 11.301354 12.744295 5.224391 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
8095 4gyiA03 80 DVQCIKRFF 5.184418 6.547073 8.878405 10.018908 10.934995 12.655374 5.282895 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
8167 1y66A00 13 VRRHQEITQ 5.087811 6.584020 8.800280 10.099912 10.866683 12.673161 5.373821 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
8302 1vliA01 141 EISDVHEAW 5.594763 6.623869 8.847273 10.318558 11.200769 12.649756 5.119834 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
8458 5b3kA00 18 RHAQKLLSA 5.135142 6.670463 8.966060 10.143664 10.806757 12.612224 5.248866 ... 9 5 5 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,5,9,5,7,5 25.0 2981.0
8581 5b3kA00 141 AEFAAALKG 5.698404 6.876418 8.993896 10.675476 11.688439 12.668344 5.148262 ... 9 5 7 9 5 5 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5 23.0 3170.0
8762 1d4aA00 82 DIVAEQKKL 5.284836 6.739615 8.933946 10.209151 11.292310 12.962308 5.385168 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
8765 1d4aA00 85 AEQKKLEAA 5.233301 6.579210 8.914511 10.140086 11.058459 12.903560 5.160893 ... 11 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5 7.0 12033.0
9064 2yeqA02 119 EAFVLRRAA 5.471800 6.937684 8.908560 10.209603 11.383506 13.161331 5.411641 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 6.0 12641.0
9193 2yeqA02 248 QRERVINFI 5.267016 6.608962 8.887669 10.006727 11.037183 12.801084 5.218074 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 26.0 2898.0
9338 2yeqA02 393 EEDRFFSHN 6.074079 7.749986 9.214346 10.553204 12.107005 13.593403 5.386916 ... 9 5 7 9 5 7 5 7,7,9,11,13,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5 550.0 134.0
9551 1ydxA03 101 YVPFFYCAL 5.764995 7.315386 9.241748 10.730502 11.593540 13.349720 5.340494 ... 9 5 7 9 5 7 5 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1.0 20575.0

124 rows × 49 columns


In [185]:


In [194]:
# Export every fragment assigned to cluster 450 so the structures can be
# compared side by side. Each output file is named after the row's index label
# and is written under "compare/", relative to the output prefix hard-coded in
# getFragPdb.
d = data_original.head(10000)[kmeans.labels_ == 450]
for i, row in d.iterrows():
    print(i, row["pdb"], row["i"])
    # row["i"] is the fragment's start offset within the domain's residue list.
    getFragPdb(row["pdb"], int(row["i"]), f"compare/{i}.pdb")


331 5tjjA02 91
880 1n08A00 9
1484 1opoA02 43
1797 5j47A03 18
8991 2yeqA02 46
9529 1ydxA03 71

In [175]:
# getFragPdb appends ".pdb" to its first argument itself, so pass the bare
# domain id (as the other calls in this notebook do). Passing "3bwsA02.pdb"
# would make it look for "3bwsA02.pdb.pdb" in the database directory.
getFragPdb("3bwsA02", 260, "test2.pdb")

In [196]:
# No outFile given, so getFragPdb falls back to its default name "<i>_<pdbId>.pdb".
getFragPdb("2r7rA08", 32)

In [197]:
# Neighbouring fragment (offset 33) of the same domain, again with the default output name.
getFragPdb("2r7rA08", 33)

In [97]:
# Squared Euclidean distance from sample i to the centroid of the cluster it
# was assigned to.
i = 0
((x[i] - kmeans.cluster_centers_[kmeans.labels_[i]])**2).sum()


Out[97]:
4.322128020315504

In [95]:
# Same hand computation for row 3; the result matches kmeans.score(x[3:4]) up
# to sign (see the outputs of the two cells).
((x[3] - kmeans.cluster_centers_[kmeans.labels_[3]])**2).sum()


Out[95]:
3.2598634605762107

In [183]:
# Cluster sizes sorted in descending order (value_counts default), skipping the
# 500 largest clusters: the output lists the remaining 500 smaller ones.
pd.Series(kmeans.labels_).value_counts()[500:]


Out[183]:
450    6
497    6
64     6
688    6
494    6
363    6
395    6
123    6
486    6
101    6
919    6
378    6
475    6
483    6
457    6
835    6
539    6
271    6
861    6
869    6
560    6
238    6
626    6
637    6
509    6
415    6
719    6
261    6
665    6
84     6
      ..
917    2
624    2
14     1
681    1
868    1
753    1
15     1
959    1
788    1
531    1
601    1
952    1
18     1
213    1
874    1
406    1
522    1
518    1
308    1
330    1
904    1
872    1
184    1
840    1
832    1
548    1
186    1
954    1
19     1
797    1
Length: 500, dtype: int64

In [178]:
# Histogram of cluster sizes (number of samples per cluster), with a log-scaled axis.
pd.Series(kmeans.labels_).value_counts().hist(bins=50, log=True)


Out[178]:
<matplotlib.axes._subplots.AxesSubplot at 0x12cb08f60>

In [ ]:
import os
# Prototype of getFragPdb: cut the 9-residue fragment that starts at residue
# offset i out of one domain PDB file and save it as <pre>test.pdb.
pdb = "1igqB00.pdb"
pdbId = pdb.split('.')[0]
i = 0
pre = "/Users/weilu/Research/optimization/fragment/"
database = "/Users/weilu/Research/optimization/fragment/database/dompdb/"
parser = bio.PDBParser(QUIET=True)
structure = parser.get_structure("x", os.path.join(database, pdb))
# Every (model, chain) pair writes to the same output path, so for a file with
# several models or chains only the last one processed remains on disk.
for model in structure:
    for chain in model:
        all_residues = list(chain)
        io = bio.PDBIO()
        c = bio.Chain.Chain("A")
        # Fix: the slice used to read from `cc`, a name that is never defined
        # (NameError). The residues to slice are the ones just listed from the
        # current chain, exactly as getFragPdb does.
        c.child_list = all_residues[i:i+9]
        # Renumber the fragment's residues 1..9.
        # NOTE(review): this mutates residue objects that still belong to the
        # parsed structure, and getFragPdb keeps this step commented out --
        # confirm the renumbering is still wanted here.
        for ii, res in enumerate(c):
            res.id = (' ', ii+1, ' ')
        io.set_structure(c)
        io.save(f'{pre}test.pdb')

In [ ]:
import numpy as np
# Toy KMeans example on six 2-D points, split into two clusters.
# NOTE: this rebinds the notebook-level name `kmeans`, which other cells use
# for the 1000-cluster model fitted on data_original. After running this cell,
# those cells see the toy model until the larger fit is re-run.
X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
# Only the last expression of a cell is displayed, so the two expressions
# below are evaluated but produce no visible output.
kmeans.labels_

kmeans.predict([[0, 0], [4, 4]])

kmeans.cluster_centers_

In [20]:
# Plot the six toy points (first column against second column of X).
plt.scatter(X[:, 0], X[:, 1])


Out[20]:
<matplotlib.collections.PathCollection at 0x128479208>

In [7]:
kmeans.labels_


Out[7]:
array([0, 0, 0, 1, 1, 1], dtype=int32)

In [6]:
X


Out[6]:
array([[1, 2],
       [1, 4],
       [1, 0],
       [4, 2],
       [4, 4],
       [4, 0]])

In [3]:
# Build the compact table used for classification: identifiers, the "dd"
# string, the category/count columns, and one integer-encoded column per
# sequence position (s1..s9).
#
# Columns are selected BEFORE fillna so that only the six kept columns are
# filled rather than the whole frame. fillna/astype each return a fresh
# DataFrame, so the column assignments below do not write into a slice of
# data_original (which is what triggers pandas' SettingWithCopyWarning).
# Missing category/count values become the sentinel -1, which allows the cast
# to int.
data = (
    data_original[["pdb", "i", "seq", "dd", "category", "count"]]
    .fillna(-1)
    .astype({"category": int, "count": int})
)
# Encode residue i (1-based) of every 9-mer with Biopython's one_to_index.
# The .str accessor replaces a lambda that closed over the loop variable.
for i in range(1, 10):
    data[f"s{i}"] = data["seq"].str[i - 1].map(one_to_index)

In [4]:


In [15]:
# Persist the prepared table to disk in Feather format.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data.to_feather("/Users/weilu/Research/optimization/fragment/feather_cluster_data.feather")

In [13]:
data.head()


Out[13]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
0 1igqB00 0 DKLKKAIVQ 9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1... -1 -1 2 8 9 8 8 0 7 17 13
1 1igqB00 1 KLKKAIVQV 9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1... -1 -1 8 9 8 8 0 7 17 13 17
2 1igqB00 2 LKKAIVQVE 5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,... -1 -1 9 8 8 0 7 17 13 17 3
3 1igqB00 3 KKAIVQVEH 9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13... 9987 13 8 8 0 7 17 13 17 3 6
4 1igqB00 4 KAIVQVEHD 9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1... 6835 18 8 0 7 17 13 17 3 6 2

In [14]:
data.tail()


Out[14]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
1901425 1xjhA00 49 NAMDIAEIR 5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 1 20575 11 0 10 2 7 0 3 7 14
1901426 1xjhA00 50 AMDIAEIRN 5,7,9,9,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 2 18996 0 10 2 7 0 3 7 14 11
1901427 1xjhA00 51 MDIAEIRNN 5,7,9,9,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 3 18382 10 2 7 0 3 7 14 11 11
1901428 1xjhA00 52 DIAEIRNNA 5,7,9,11,11,11,5,7,9,9,11,5,7,9,11,5,7,9,5,5,5 81 745 2 7 0 3 7 14 11 11 0
1901429 1xjhA00 53 IAEIRNNAS 5,7,9,9,11,11,5,7,9,11,11,5,7,9,11,5,5,7,5,5,5 2071 46 7 0 3 7 14 11 11 0 15

In [6]:
data.shape


Out[6]:
(1901430, 15)

In [7]:
# -1 is the sentinel written by fillna(-1) when `data` was built, so this
# counts the rows whose "count" value was not missing.
data.query("count != -1").shape


Out[7]:
(753824, 15)

In [8]:
# Size of the subset with count > 500 -- the same filter later used to build `test`.
data.query("count > 500").shape


Out[8]:
(325598, 15)

In [12]:
data.query("count != -1").sample(10)


Out[12]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
83349 4pz0A01 131 GDSFEVKGI 11,13,17,21,21,19,11,13,17,19,17,11,13,15,13,1... 6062 20 5 2 15 4 3 17 8 5 7
376107 3ot5A02 26 MQGMFEAVR 5,7,9,9,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 3 18382 10 13 5 10 4 3 0 17 14
77904 1fmzA00 147 QYIGIHRDR 9,13,15,19,21,25,11,13,17,19,23,9,13,15,19,11,... 927 87 13 19 7 5 7 6 14 2 14
1335697 2je2A00 3 AEFNDKGEL 11,13,15,11,11,7,11,11,9,9,7,7,5,5,5,5,5,7,7,9,9 1664 55 0 3 4 11 2 8 5 3 9
531509 3i5xA02 26 HIKKQIKER 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,5,5 34 1811 6 7 8 8 13 7 8 3 14
912144 4qtuB00 151 DQVDDILQS 5,7,9,9,11,13,5,7,9,9,11,5,7,9,11,5,5,9,5,7,5 14 4800 2 13 17 2 2 7 9 13 15
1182231 1sbzA00 155 ARVLDQFGL 5,7,9,11,11,9,5,7,9,9,9,5,7,9,9,5,7,5,5,5,7 60 1011 0 14 17 9 2 13 4 5 9
1438574 2cfuA01 327 GNAEIVEVL 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 4 14304 5 11 0 3 7 17 3 17 9
1346608 3p32A01 12 DRAALPRAI 5,7,9,11,11,13,5,7,9,11,11,7,7,9,11,5,7,9,5,7,5 328 207 2 14 0 0 9 12 14 0 7
1468369 4ds7E00 11 LVLACVRMK 5,7,9,9,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5 0 39130 9 17 9 0 1 17 14 10 8

In [11]:
data.query("category == 0").shape


Out[11]:
(39130, 15)

In [17]:
data.head()


Out[17]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
0 1igqB00 0 DKLKKAIVQ 9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1... -1 -1 2 8 9 8 8 0 7 17 13
1 1igqB00 1 KLKKAIVQV 9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1... -1 -1 8 9 8 8 0 7 17 13 17
2 1igqB00 2 LKKAIVQVE 5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,... -1 -1 9 8 8 0 7 17 13 17 3
3 1igqB00 3 KKAIVQVEH 9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13... 9987 13 8 8 0 7 17 13 17 3 6
4 1igqB00 4 KAIVQVEHD 9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1... 6835 18 8 0 7 17 13 17 3 6 2

In [108]:
data.head().iloc[:,6:]


Out[108]:
s1 s2 s3 s4 s5 s6 s7 s8 s9
0 2 8 9 8 8 0 7 17 13
1 8 9 8 8 0 7 17 13 17
2 9 8 8 0 7 17 13 17 3
3 8 8 0 7 17 13 17 3 6
4 8 0 7 17 13 17 3 6 2

In [18]:
# Keep only rows whose "count" exceeds 500.
# NOTE: despite its name, `test` is the frame the training arrays
# (train_x / train_y) are built from in the next cell.
test = data.query("count > 500")

In [19]:
# Features: every column from position 6 onward, i.e. the encoded residues
# s1..s9 shown in the data.head() previews. Labels: the "category" column.
train_x = test.iloc[:,6:].values
train_y = test["category"].values

In [ ]:
train_x.shape

In [ ]:
from sklearn.svm import SVC
# Fit a linear-kernel SVM and predict on the very same rows it was trained on,
# so svm_predictions are in-sample (no held-out data is used in this cell).
# NOTE(review): SVC training cost grows quickly with the number of samples --
# confirm this completes in reasonable time on the full train_x.
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(train_x, train_y)
svm_predictions = svm_model_linear.predict(train_x)

In [ ]:
from sklearn.metrics import confusion_matrix
# Accuracy on the TRAINING data (train_x / train_y) -- there is no separate
# test split here, so this is an in-sample figure.
accuracy = svm_model_linear.score(train_x, train_y)

# Confusion matrix of the true training labels against the in-sample predictions.
cm = confusion_matrix(train_y, svm_predictions)

In [111]:
train_x.shape


Out[111]:
(1901430, 9)

In [115]:
train_y


Out[115]:
array([  -1,   -1,   -1, ...,    3,   81, 2071])

In [114]:
train_y.shape


Out[114]:
(1901430,)

In [ ]:


In [68]:
# Restrict data_original to 9-mer sequences that occur more than once.
seq_count = data["seq"].value_counts()
# Turn the counts into a two-column frame: "seq" (the sequence) and "index"
# (its number of occurrences; the column name is kept from the original code
# and avoids a clash with data_original's own "count" column).
# rename_axis + reset_index(name=...) yields these names regardless of how the
# installed pandas labels the result of value_counts(); the previous
# rename({"seq": "index", "index": "seq"}) only worked with the pre-2.0 labels.
filtered_seq_count = (
    seq_count[seq_count > 1]
    .rename_axis("seq")
    .reset_index(name="index")
)
# Inner merge: rows whose sequence is not repeated are dropped.
data_filtered = data_original.merge(filtered_seq_count, on="seq")

In [95]:
def get_total_std(a):
    """Return the summed per-column standard deviation of columns 3, 4 and 5.

    `a` is a DataFrame (here: one group of rows sharing a sequence). The
    columns are picked by position, the standard deviation of each is taken
    with pandas' default settings, and the three values are added up.
    """
    selected_columns = a.iloc[:, 3:6]
    per_column_std = selected_columns.std()
    return per_column_std.sum()

In [99]:
# One value per repeated sequence: get_total_std applied to all rows that
# share that sequence.
data_filtered_total_std = data_filtered.groupby("seq").apply(get_total_std)

Sequences with a small total standard deviation dominate the distribution.


In [104]:
# Distribution of the per-sequence total standard deviation (log-scaled axis).
data_filtered_total_std.hist(bins=50, log=True)


Out[104]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a7ad255c0>

In [46]:
data_original.head()


Out[46]:
pdb i seq dis1 dis2 dis3 dis4 dis5 dis6 dis7 ... d15 d16 d17 d18 d19 d20 d21 dd category count
0 1igqB00 0 DKLKKAIVQ 9.545797 12.242739 11.389445 14.435853 15.702080 18.185148 9.427593 ... 15 9 13 15 9 13 9 9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1... NaN NaN
1 1igqB00 1 KLKKAIVQV 9.427593 9.508488 13.046163 14.956468 17.811722 21.096529 5.881323 ... 19 9 13 17 9 13 9 9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1... NaN NaN
2 1igqB00 2 LKKAIVQVE 5.881323 9.385722 11.658462 14.435720 17.857430 21.547594 9.566237 ... 19 9 13 17 9 13 11 5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,... NaN NaN
3 1igqB00 3 KKAIVQVEH 9.566237 12.596333 15.433746 18.900240 22.492163 25.003511 9.934756 ... 19 9 13 15 11 13 11 9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13... 9987.0 13.0
4 1igqB00 4 KAIVQVEHD 9.934756 12.622395 16.016058 19.465660 21.763054 25.281502 9.527388 ... 19 11 13 17 11 13 11 9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1... 6835.0 18.0

5 rows × 49 columns


In [44]:
data.head()


Out[44]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
0 1igqB00 0 DKLKKAIVQ 9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1... -1 -1 2 8 9 8 8 0 7 17 13
1 1igqB00 1 KLKKAIVQV 9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1... -1 -1 8 9 8 8 0 7 17 13 17
2 1igqB00 2 LKKAIVQVE 5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,... -1 -1 9 8 8 0 7 17 13 17 3
3 1igqB00 3 KKAIVQVEH 9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13... 9987 13 8 8 0 7 17 13 17 3 6
4 1igqB00 4 KAIVQVEHD 9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1... 6835 18 8 0 7 17 13 17 3 6 2

In [6]:
data.dtypes


Out[6]:
pdb          object
i             int64
seq          object
dis1        float64
dis2        float64
dis3        float64
dis4        float64
dis5        float64
dis6        float64
dis7        float64
dis8        float64
dis9        float64
dis10       float64
dis11       float64
dis12       float64
dis13       float64
dis14       float64
dis15       float64
dis16       float64
dis17       float64
dis18       float64
dis19       float64
dis20       float64
dis21       float64
DisType      object
d1            int64
d2            int64
d3            int64
d4            int64
d5            int64
d6            int64
d7            int64
d8            int64
d9            int64
d10           int64
d11           int64
d12           int64
d13           int64
d14           int64
d15           int64
d16           int64
d17           int64
d18           int64
d19           int64
d20           int64
d21           int64
dd           object
category    float64
count       float64
dtype: object

In [ ]: