In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from small_script.myFunctions import *
import feather
import Bio.PDB as bio
import subprocess
from sklearn.cluster import MiniBatchKMeans
d3_to_index = bio.Polypeptide.d3_to_index # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
one_to_index = bio.Polypeptide.one_to_index
plt.rcParams['figure.figsize'] = [16.18033, 10]
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [10]:
def getFragPdb(pdbId, i, outFile=None):
pdb = pdbId + ".pdb"
if outFile is None:
outFile = f"{i}_{pdb}"
# pdb = "1igqB00.pdb"
# pdbId = pdb.split('.')[0]
pre = "/Users/weilu/Research/optimization/fragment/"
database = "/Users/weilu/Research/optimization/fragment/database/dompdb/"
parser = bio.PDBParser(QUIET=True)
structure = parser.get_structure("x", os.path.join(database, pdb))
for model in structure:
for chain in model:
all_residues = list(chain)
io = bio.PDBIO()
c = bio.Chain.Chain("A")
c.child_list = all_residues[i:i+9]
# for ii, res in enumerate(c):
# res.id = (' ', ii+1, ' ')
io.set_structure(c)
io.save(f'{pre}{outFile}')
def getScore(data, km):
# return km.score(data.iloc[:, 3:87].values)
# return data.values[3:4])
# return km.score(np.array([1]*84).reshape(1,-1))
# return np.sqrt(-km.score(data.values[3:87].reshape(1,-1)))
# return np.sqrt(-km.score(data.values.reshape(1,-1)))
return np.sqrt(((km.cluster_centers_[int(data.values[-1])] - data.values[:-1])**2).sum())
def getFromTerminal(CMD):
return subprocess.Popen(CMD,stdout=subprocess.PIPE,shell=True).communicate()[0].decode()
In [3]:
data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/data_jan20.csv")
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-3-84fc16d4cc19> in <module>
----> 1 data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/data_jan20.csv")
~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
700 skip_blank_lines=skip_blank_lines)
701
--> 702 return _read(filepath_or_buffer, kwds)
703
704 parser_f.__name__ = name
~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
427
428 # Create the parser.
--> 429 parser = TextFileReader(filepath_or_buffer, **kwds)
430
431 if chunksize or iterator:
~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
893 self.options['has_index_names'] = kwds['has_index_names']
894
--> 895 self._make_engine(self.engine)
896
897 def close(self):
~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
1120 def _make_engine(self, engine='c'):
1121 if engine == 'c':
-> 1122 self._engine = CParserWrapper(self.f, **self.options)
1123 else:
1124 if engine == 'python':
~/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
1851 kwds['usecols'] = self.usecols
1852
-> 1853 self._reader = parsers.TextReader(src, **kwds)
1854 self.unnamed_cols = self._reader.unnamed_cols
1855
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: [Errno 2] File b'/Users/weilu/Research/optimization/fragment/data_jan20.csv' does not exist: b'/Users/weilu/Research/optimization/fragment/data_jan20.csv'
In [205]:
data_original.head()
Out[205]:
pdb
i
seq
caca_1
caca_2
caca_3
caca_4
caca_5
caca_6
caca_7
...
cbcb_12
cbcb_13
cbcb_14
cbcb_15
cbcb_16
cbcb_17
cbcb_18
cbcb_19
cbcb_20
cbcb_21
0
1igqB00
0
DKLKKAIVQ
9.545797
12.242738
11.389445
14.435853
15.702080
18.185148
9.427593
...
4.605271
8.984904
10.732664
13.098623
10.637720
14.192881
16.225273
11.296279
13.163032
10.095731
1
1igqB00
1
KLKKAIVQV
9.427593
9.508487
13.046163
14.956468
17.811722
21.096529
5.881323
...
10.637720
14.192881
16.225273
19.915300
11.296279
13.163032
16.472520
10.095731
12.801142
9.801298
2
1igqB00
2
LKKAIVQVE
5.881323
9.385722
11.658462
14.435720
17.857430
21.547594
9.566237
...
11.296279
13.163032
16.472520
20.721910
10.095731
12.801142
17.423760
9.801298
13.690151
11.383380
3
1igqB00
3
KKAIVQVEH
9.566237
12.596333
15.433746
18.900240
22.492160
25.003511
9.934756
...
10.095731
12.801142
17.423760
18.518616
9.801298
13.690151
14.967668
11.383380
12.181765
11.096270
4
1igqB00
4
KAIVQVEHD
9.934756
12.622395
16.016058
19.465660
21.763054
25.281502
9.527389
...
9.801298
13.690151
14.967668
19.082531
11.383380
12.181765
16.957390
11.096270
14.904981
10.865062
5 rows × 87 columns
In [619]:
pre = "/Users/weilu/Research/optimization/fragment/"
data_original = feather.read_dataframe(f"{pre}cluster100_v2.feather")
os.system(f"mkdir -p {pre}center_cluster100_v2/origin/")
os.system(f"mkdir -p {pre}center_cluster100_v2/pdbs/")
os.system(f"mkdir -p {pre}center_cluster100_v2/gros/")
center = data_original.groupby("cluster").head(1)
for i, row in center.reset_index(drop=True).iterrows():
print(i, row["pdb"], row["i"], row["cluster"])
getFragPdb(row["pdb"], int(row["i"]), f"center_cluster100_v2/origin/{row['cluster']}.pdb")
pre = "/Users/weilu/Research/optimization/fragment/center_cluster100_v2//"
for i in range(100):
os.system(f"python ~/opt/small_script/pdb_reres.py {pre}origin/{i}.pdb > {pre}pdbs/{i}.pdb")
for i in range(100):
os.system(f"python2 ~/opt/script/Pdb2Gro.py {pre}pdbs/{i}.pdb {pre}gros/{i}.gro")
0 1bg6A02 107 0
1 1c17M00 9 1
2 3p6dA00 53 2
3 3ubrA01 238 3
4 3q3eA03 3 4
5 2yj6A02 47 5
6 2fokB01 110 6
7 4qamB00 141 7
8 2ftxA00 23 8
9 4kksA03 29 9
10 2h5nC00 54 10
11 2y3cA00 135 11
12 3tp9A02 12 12
13 4o7oA02 15 13
14 1tz9A00 65 14
15 2autA00 65 15
16 1dk8A02 58 16
17 2vf7B01 225 17
18 5ja1B00 0 18
19 1e8cA02 213 19
20 4khbC00 59 20
21 1ogpA02 110 21
22 2r0qC01 6 22
23 2ri9A00 67 23
24 2duyA00 33 24
25 4lviA01 108 25
26 3la4A03 40 26
27 2icgA00 73 27
28 3abzA03 22 28
29 2wsaA00 318 29
30 3sigA00 42 30
31 4ag4A02 28 31
32 2p17A00 109 32
33 3k1dA01 28 33
34 4k17A03 55 34
35 1qwoA01 358 35
36 1hp1A01 17 36
37 4xjxA03 163 37
38 4q9dB01 152 38
39 1vw4L02 56 39
40 4dohB02 19 40
41 1uf3A00 25 41
42 3fmgA01 41 42
43 3gkuA02 65 43
44 2y2zA02 1 44
45 4ry8A01 1 45
46 3gw6F01 12 46
47 1y80A00 61 47
48 1d4aA00 241 48
49 1m2vB04 78 49
50 3c24A01 112 50
51 4p2cA01 102 51
52 1jfaB00 163 52
53 2lvvA00 175 53
54 3h75A01 77 54
55 2c0uA03 55 55
56 3l2pA01 91 56
57 3r5eA00 164 57
58 3zwcA03 98 58
59 1imjA00 67 59
60 2ivfB00 273 60
61 3b85A00 23 61
62 3qvsA01 82 62
63 2x4mB00 44 63
64 4tm5A02 124 64
65 2fb2B00 61 65
66 1mukA02 406 66
67 3gqnA03 38 67
68 3icrA02 41 68
69 2hy5B00 96 69
70 3zkvA00 456 70
71 2y8yA01 2 71
72 2xl4A00 98 72
73 4nleA03 67 73
74 4rhiA00 126 74
75 5h83A01 3 75
76 1wywA00 104 76
77 3it4D02 90 77
78 3av0A01 89 78
79 3p02A02 98 79
80 3ic8A01 26 80
81 3kb1A01 59 81
82 1nf8A00 164 82
83 3gzdA01 48 83
84 3brwC02 68 84
85 3ephA01 40 85
86 3hn7A03 83 86
87 4v19X00 104 87
88 2cfuA01 37 88
89 2iqgA02 129 89
90 4otpA02 27 90
91 3dupA01 102 91
92 4ie5A02 21 92
93 1u83A00 81 93
94 1w2yA00 52 94
95 2o1mA01 18 95
96 4oc8A01 193 96
97 1fo1B00 125 97
98 2nylB00 148 98
99 2yg8A02 38 99
In [620]:
In [621]:
In [529]:
data_original = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100.feather")
In [545]:
center = data_original.groupby("cluster").head(1)
In [548]:
for i, row in center.reset_index(drop=True).iterrows():
print(i, row["pdb"], row["i"], row["cluster"])
getFragPdb(row["pdb"], int(row["i"]), f"center_cluster100/{row['cluster']}.pdb")
0 1g9mG00 206 0
1 5cr9A02 26 1
2 2h7fX02 93 2
3 2q9oA03 200 3
4 4yokA01 40 4
5 1b3tA00 103 5
6 4gxtA01 160 6
7 2phpA00 111 7
8 4gb7A00 199 8
9 1nr0A01 123 9
10 3ahcA01 301 10
11 1q5vB01 6 11
12 5ipyA02 54 12
13 1rj1A00 60 13
14 5d01A01 155 14
15 5eufA02 10 15
16 4ua3A00 119 16
17 3lsoA01 97 17
18 2e63A00 86 18
19 2a90A01 70 19
20 1pujA02 13 20
21 1knyA02 109 21
22 3glvA00 105 22
23 4aweA00 263 23
24 3gg4A01 217 24
25 5e7qA01 317 25
26 1eakA01 31 26
27 1sj1A00 49 27
28 1khiA01 5 28
29 4ps6A00 117 29
30 2lpuA00 108 30
31 2rhkC00 17 31
32 1ogyA03 130 32
33 1tr2A02 35 33
34 2wvsA01 100 34
35 4jb1A02 54 35
36 3c8zA01 24 36
37 2xvyA01 10 37
38 3pohA01 10 38
39 1qknA00 20 39
40 1y8qC00 47 40
41 2inyA01 562 41
42 2rgqA00 104 42
43 3sd2A01 58 43
44 3d8kA00 48 44
45 1rmgA00 92 45
46 4hrwB01 127 46
47 1krlA00 32 47
48 4bmjA00 26 48
49 1uwyA01 235 49
50 1biqA00 294 50
51 1pujA02 45 51
52 1fc4A01 58 52
53 1m4zA01 59 53
54 1iooA00 135 54
55 3atsA02 173 55
56 3h09B02 216 56
57 1nr0A01 125 57
58 5fccA01 1 58
59 4qbuA03 28 59
60 2cvbA00 140 60
61 2aw6A02 54 61
62 3cf4A02 127 62
63 3cfuA00 115 63
64 2lgvA00 11 64
65 3lf7A01 330 65
66 2opwA00 124 66
67 2gw1A02 284 67
68 3q5wA00 2 68
69 2x6rB01 185 69
70 1m2oA05 52 70
71 3m0fB02 93 71
72 4i9cA02 153 72
73 2gjcB02 202 73
74 4nafA00 57 74
75 1ipkB02 141 75
76 4r3aA01 40 76
77 1fgsA01 146 77
78 3w5mA05 48 78
79 1kbpA02 251 79
80 4dndA00 0 80
81 1vddA03 18 81
82 2uvfB02 338 82
83 1xjvA01 31 83
84 1k4nA00 105 84
85 1a0rP01 87 85
86 5dllA03 30 86
87 1wzzA00 160 87
88 1kjnA00 118 88
89 1eyqA02 116 89
90 4h18A00 186 90
91 3w0lB02 77 91
92 4abnA02 123 92
93 4at7B01 83 93
94 1bt9A00 124 94
95 4pxvC00 24 95
96 2c0hA00 73 96
97 3fk5A02 92 97
98 1myrA00 296 98
99 4igbB02 26 99
In [550]:
pre = "/Users/weilu/Research/optimization/fragment/center_cluster100/"
for i in range(100):
os.system(f"python ~/opt/small_script/pdb_reres.py {pre}origin/{i}.pdb > {pre}pdbs/{i}.pdb")
In [554]:
for i in range(100):
os.system(f"python2 ~/opt/script/Pdb2Gro.py {pre}pdbs/{i}.pdb {pre}gros/{i}.gro")
In [238]:
from sklearn.cluster import KMeans
# x = data_original.head(100).iloc[:, 3:24].values
chosen = data_original.head(1000).reset_index(drop=True)
x = chosen.iloc[:, 3:87].values
kmeans = KMeans(n_clusters=20, random_state=0).fit(x)
In [239]:
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], c="red")
Out[239]:
<matplotlib.collections.PathCollection at 0x12dc3e400>
In [247]:
from sklearn.cluster import KMeans
# x = data_original.head(100).iloc[:, 3:24].values
# ((kmeans.labels_ == kmeans.predict(x))-1).sum()
chosen = data_original.head(10000).reset_index(drop=True)
x = chosen.iloc[:, 3:87].values
kmeans = KMeans(n_clusters=100, random_state=0).fit(x)
In [245]:
chosen.head()
Out[245]:
pdb
i
seq
caca_1
caca_2
caca_3
caca_4
caca_5
caca_6
caca_7
...
cbcb_13
cbcb_14
cbcb_15
cbcb_16
cbcb_17
cbcb_18
cbcb_19
cbcb_20
cbcb_21
cluster
0
1igqB00
0
DKLKKAIVQ
9.545797
12.242738
11.389445
14.435853
15.702080
18.185148
9.427593
...
8.984904
10.732664
13.098623
10.637720
14.192881
16.225273
11.296279
13.163032
10.095731
9
1
1igqB00
1
KLKKAIVQV
9.427593
9.508487
13.046163
14.956468
17.811722
21.096529
5.881323
...
14.192881
16.225273
19.915300
11.296279
13.163032
16.472520
10.095731
12.801142
9.801298
12
2
1igqB00
2
LKKAIVQVE
5.881323
9.385722
11.658462
14.435720
17.857430
21.547594
9.566237
...
13.163032
16.472520
20.721910
10.095731
12.801142
17.423760
9.801298
13.690151
11.383380
12
3
1igqB00
3
KKAIVQVEH
9.566237
12.596333
15.433746
18.900240
22.492160
25.003511
9.934756
...
12.801142
17.423760
18.518616
9.801298
13.690151
14.967668
11.383380
12.181765
11.096270
19
4
1igqB00
4
KAIVQVEHD
9.934756
12.622395
16.016058
19.465660
21.763054
25.281502
9.527389
...
13.690151
14.967668
19.082531
11.383380
12.181765
16.957390
11.096270
14.904981
10.865062
19
5 rows × 88 columns
In [307]:
chosen["rmsd"] = chosen.iloc[:,3:87].apply(lambda x: getScore(x, kmeans), axis=1)
chosen["cluster"] = kmeans.labels_
In [3]:
data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/data_jan31.csv")
In [4]:
chosen = data_original.reset_index(drop=True)
x = chosen.iloc[:, 3:87].values
In [5]:
kmeans = MiniBatchKMeans(n_clusters=100,
random_state=0,
batch_size=200,
max_iter=300,
tol=1e4).fit(x)
In [8]:
import pickle
pickle.dump(kmeans, open("/Users/weilu/Research/optimization/fragment/kmeans_cluster100_v2_2", "wb"))
In [11]:
chosen["cluster"] = kmeans.labels_
chosen["rmsd"] = chosen.iloc[:,3:88].apply(lambda x: getScore(x, kmeans), axis=1)
In [12]:
reodered_chosen = chosen.sort_values(["cluster", "rmsd"])
# reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster100_v2.feather")
reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster100_v2_2.feather")
In [603]:
import pickle
# pickle.dump(kmeans, open("/Users/weilu/Research/optimization/fragment/kmeans_cluster100_v2_2", "wb"))
kmeans = pickle.load(open("/Users/weilu/Research/optimization/fragment/kmeans_cluster100_v2", "rb"))
In [608]:
In [610]:
In [491]:
%%time
# %%time
# from sklearn.cluster import KMeans
# # x = data_original.head(100).iloc[:, 3:24].values
# # ((kmeans.labels_ == kmeans.predict(x))-1).sum()
# kmeans = KMeans(n_clusters=100, random_state=0).fit(x)
# # chosen["rmsd"] = chosen.iloc[:,3:87].apply(lambda x: getScore(x, kmeans), axis=1)
# chosen["cluster"] = kmeans.labels_
# fit on the whole data
kmeans = MiniBatchKMeans(n_clusters=500,
random_state=0,
batch_size=200,
max_iter=300,
tol=1e4).fit(x)
CPU times: user 30.3 s, sys: 5.92 s, total: 36.2 s
Wall time: 14.8 s
In [522]:
chosen["cluster"] = kmeans.labels_
chosen["rmsd"] = chosen.iloc[:,3:88].apply(lambda x: getScore(x, kmeans), axis=1)
In [523]:
reodered_chosen = chosen.sort_values(["cluster", "rmsd"])
In [528]:
reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster500.feather")
In [525]:
t = reodered_chosen.groupby("cluster")["rmsd"].describe().sort_values("count")
In [527]:
t.query("mean < 10")
Out[527]:
count
mean
std
min
25%
50%
75%
max
cluster
488
211.0
6.405952
2.184369
0.000000
4.851784
6.009832
7.477315
16.365348
419
461.0
8.400869
3.992281
0.000000
5.183176
7.018433
11.271126
21.236263
439
573.0
8.307781
2.359199
0.000000
6.527053
7.899386
9.752821
18.502313
463
631.0
5.769330
1.757134
0.000000
4.605430
5.413044
6.678401
16.163776
394
643.0
7.252995
2.016599
0.000000
5.875750
7.254019
8.251920
16.145780
412
644.0
9.648656
3.423034
0.000000
6.912548
8.817875
12.297921
19.966513
110
752.0
9.885749
2.144300
0.000000
8.255909
9.581002
11.430300
17.489185
324
766.0
9.085841
1.503798
0.000000
8.217952
9.110182
9.995570
13.652618
487
767.0
5.140011
2.070114
0.000000
3.807998
4.542719
5.826916
15.917704
494
778.0
5.416354
1.927569
0.000000
4.031041
5.029221
6.390334
15.819659
174
790.0
8.487826
2.225173
0.000000
6.985582
8.428768
9.779559
16.427066
313
908.0
7.433903
1.261792
0.000000
6.482404
7.349275
8.346862
12.202795
421
923.0
9.091107
3.942319
0.000000
6.315109
7.729651
11.712235
22.461120
484
935.0
5.711914
1.975222
0.000000
4.499907
5.436268
6.514064
18.286431
173
1008.0
9.257256
1.722207
0.000000
8.047716
9.178171
10.330828
14.469612
476
1077.0
5.261427
2.088553
0.000000
3.789850
4.629205
6.109691
13.768129
389
1087.0
6.832852
3.350067
0.000000
4.403291
5.580939
8.678317
17.565477
337
1088.0
8.889376
3.234105
0.000000
6.270527
8.602627
11.020706
18.671643
387
1152.0
8.740118
2.836843
0.000000
6.622372
7.967326
10.555101
19.106496
264
1260.0
9.503432
3.935349
0.000000
6.279775
8.264432
12.399816
22.850360
89
1270.0
8.160869
1.466499
4.405894
7.084065
8.165975
9.131448
14.216164
181
1304.0
9.190776
2.940143
0.000000
7.040864
8.414425
10.739330
19.252077
331
1332.0
9.636055
2.923684
0.000000
7.287349
9.580069
11.760191
18.181023
333
1366.0
8.161093
1.242464
0.000000
7.373075
8.153737
8.916212
15.594951
296
1403.0
7.277364
3.788108
0.000000
4.564008
5.821959
9.728810
19.912605
390
1413.0
9.844129
2.929726
0.000000
7.405211
9.580462
11.988679
18.566266
372
1483.0
9.501800
1.648011
0.000000
8.336452
9.339765
10.457104
15.878713
423
1509.0
7.229744
1.490450
0.000000
6.226085
7.219368
8.186919
13.217369
405
1509.0
7.489281
1.242903
0.000000
6.678965
7.451765
8.267258
12.784382
18
1526.0
6.682512
2.955888
0.000000
4.695716
5.872720
7.737941
20.127146
...
...
...
...
...
...
...
...
...
446
8325.0
5.612743
2.246595
1.686931
4.252918
4.982112
6.037459
17.209269
381
8404.0
9.779719
2.526253
2.738193
7.931354
9.708642
11.634050
18.009054
226
8489.0
3.303255
1.307035
0.804719
2.376314
2.995983
3.900423
13.161018
373
8567.0
8.750373
1.841973
3.518725
7.438240
8.680673
9.985167
15.714770
443
8613.0
5.553135
2.505225
1.731831
3.796030
4.823022
6.640948
20.369276
11
8616.0
5.498414
1.041507
2.041919
4.772000
5.450848
6.162283
10.567001
28
8704.0
4.132531
2.623001
0.923846
2.552216
3.257191
4.620020
21.465496
424
8829.0
8.855451
1.578204
4.113152
7.800379
8.773570
9.812179
16.224047
35
9076.0
6.293140
2.913352
0.925444
4.236622
5.702312
7.629875
20.049576
481
9115.0
4.478269
2.723088
1.071464
2.777123
3.566477
5.053013
21.257613
279
9162.0
9.188380
2.146380
4.574546
7.505149
9.036916
10.681678
18.002513
347
9302.0
9.921108
2.950067
0.000000
7.655886
9.841897
12.274723
18.210627
159
9568.0
8.230792
2.708362
1.947264
6.299143
7.723490
9.853484
19.574172
408
9667.0
6.381713
1.288461
2.215306
5.484409
6.311356
7.215179
12.464428
416
9970.0
7.429243
1.710242
0.000000
6.222167
7.479457
8.668579
15.107802
497
10085.0
2.993684
1.245996
0.941742
2.166304
2.652712
3.467202
12.653784
258
10548.0
9.320128
2.149168
3.320423
7.811364
9.296833
10.755182
16.861263
222
11020.0
7.638585
1.587936
3.538688
6.547909
7.501866
8.567174
15.734313
398
11847.0
7.016219
1.334663
3.169083
6.071185
6.937974
7.909028
12.899463
485
15435.0
2.500596
0.747492
0.750748
1.991833
2.403763
2.891510
8.282556
327
15627.0
2.919554
1.085106
0.851447
2.133238
2.666950
3.455255
10.615247
425
16487.0
2.850683
1.006460
0.792125
2.132705
2.648014
3.375965
14.221093
67
16580.0
5.888814
2.170344
2.562263
4.473827
5.177396
6.515498
18.038824
363
17419.0
4.754982
2.050652
1.052461
3.298524
4.384968
5.774809
20.050853
499
26735.0
2.336397
0.895919
0.603243
1.723335
2.134374
2.719237
8.906935
56
32435.0
2.032252
0.693076
0.589714
1.562771
1.887694
2.339050
11.419610
453
37130.0
2.195190
0.848428
0.518378
1.617063
2.004453
2.558342
12.418647
491
44173.0
1.873969
0.649835
0.552015
1.421173
1.744503
2.181334
9.924016
451
46148.0
2.151608
0.822239
0.591659
1.600590
1.981437
2.495240
11.074528
239
67870.0
1.977638
0.748888
0.503575
1.466526
1.814698
2.310752
9.656071
231 rows × 8 columns
In [524]:
reodered_chosen.groupby("cluster")["rmsd"].describe().sort_values("count")
Out[524]:
count
mean
std
min
25%
50%
75%
max
cluster
376
59.0
10.838282
3.232352
0.000000
9.192692
10.401419
13.294589
17.287852
211
194.0
15.185838
2.187446
0.000000
13.847370
15.128813
16.679899
20.607763
488
211.0
6.405952
2.184369
0.000000
4.851784
6.009832
7.477315
16.365348
103
284.0
14.563844
1.964411
0.000000
13.410099
14.638768
15.926320
18.627723
125
293.0
15.258882
2.016858
0.000000
14.238791
15.240803
16.369402
21.291743
213
339.0
14.261146
2.222284
0.000000
12.873759
14.402281
15.714741
19.123733
53
387.0
12.113211
3.963168
0.000000
8.359540
13.118751
15.284199
19.085406
145
394.0
13.946271
3.729265
0.000000
12.265290
14.876402
16.538905
22.741212
150
399.0
16.777677
2.235406
0.000000
15.719497
17.039087
18.193809
21.514749
325
412.0
11.905261
4.003662
0.000000
9.107426
11.397496
14.825498
20.814679
426
433.0
10.630110
4.275595
0.000000
7.803763
10.078080
13.454795
21.240280
202
460.0
14.595013
2.425844
0.000000
13.192075
14.587900
16.168760
21.602464
419
461.0
8.400869
3.992281
0.000000
5.183176
7.018433
11.271126
21.236263
118
480.0
13.069946
2.390771
0.000000
11.554023
12.876632
14.549696
20.326118
216
485.0
13.904434
1.905825
0.000000
12.829957
14.063004
15.172137
19.765212
60
551.0
14.802485
3.749959
0.000000
12.790487
15.384159
17.432484
24.452040
439
573.0
8.307781
2.359199
0.000000
6.527053
7.899386
9.752821
18.502313
46
600.0
12.309305
2.773076
0.000000
10.562480
12.333308
14.110623
19.258243
463
631.0
5.769330
1.757134
0.000000
4.605430
5.413044
6.678401
16.163776
394
643.0
7.252995
2.016599
0.000000
5.875750
7.254019
8.251920
16.145780
412
644.0
9.648656
3.423034
0.000000
6.912548
8.817875
12.297921
19.966513
205
649.0
13.248842
2.127469
0.000000
12.245327
13.596784
14.654608
18.177675
244
657.0
12.915141
2.032875
0.000000
11.708049
13.082302
14.343676
17.828327
236
661.0
12.611278
2.706920
0.000000
10.685368
12.348166
14.627941
19.588881
70
700.0
13.799584
2.500682
0.000000
12.564819
14.219139
15.589637
19.439629
137
723.0
13.213361
2.910252
0.000000
11.479426
13.578968
15.101994
24.337267
24
738.0
12.307941
2.568989
0.000000
10.458948
12.615052
14.156888
18.982266
110
752.0
9.885749
2.144300
0.000000
8.255909
9.581002
11.430300
17.489185
291
765.0
12.495069
2.368849
0.000000
10.992476
12.812920
14.291917
17.854387
324
766.0
9.085841
1.503798
0.000000
8.217952
9.110182
9.995570
13.652618
...
...
...
...
...
...
...
...
...
210
8407.0
10.401558
2.378907
3.987519
8.670919
10.390931
12.088837
20.102646
226
8489.0
3.303255
1.307035
0.804719
2.376314
2.995983
3.900423
13.161018
373
8567.0
8.750373
1.841973
3.518725
7.438240
8.680673
9.985167
15.714770
443
8613.0
5.553135
2.505225
1.731831
3.796030
4.823022
6.640948
20.369276
11
8616.0
5.498414
1.041507
2.041919
4.772000
5.450848
6.162283
10.567001
307
8690.0
11.673967
1.972951
6.125048
10.239719
11.473958
13.033092
18.718273
28
8704.0
4.132531
2.623001
0.923846
2.552216
3.257191
4.620020
21.465496
424
8829.0
8.855451
1.578204
4.113152
7.800379
8.773570
9.812179
16.224047
35
9076.0
6.293140
2.913352
0.925444
4.236622
5.702312
7.629875
20.049576
481
9115.0
4.478269
2.723088
1.071464
2.777123
3.566477
5.053013
21.257613
279
9162.0
9.188380
2.146380
4.574546
7.505149
9.036916
10.681678
18.002513
347
9302.0
9.921108
2.950067
0.000000
7.655886
9.841897
12.274723
18.210627
159
9568.0
8.230792
2.708362
1.947264
6.299143
7.723490
9.853484
19.574172
408
9667.0
6.381713
1.288461
2.215306
5.484409
6.311356
7.215179
12.464428
416
9970.0
7.429243
1.710242
0.000000
6.222167
7.479457
8.668579
15.107802
497
10085.0
2.993684
1.245996
0.941742
2.166304
2.652712
3.467202
12.653784
258
10548.0
9.320128
2.149168
3.320423
7.811364
9.296833
10.755182
16.861263
222
11020.0
7.638585
1.587936
3.538688
6.547909
7.501866
8.567174
15.734313
398
11847.0
7.016219
1.334663
3.169083
6.071185
6.937974
7.909028
12.899463
485
15435.0
2.500596
0.747492
0.750748
1.991833
2.403763
2.891510
8.282556
327
15627.0
2.919554
1.085106
0.851447
2.133238
2.666950
3.455255
10.615247
425
16487.0
2.850683
1.006460
0.792125
2.132705
2.648014
3.375965
14.221093
67
16580.0
5.888814
2.170344
2.562263
4.473827
5.177396
6.515498
18.038824
363
17419.0
4.754982
2.050652
1.052461
3.298524
4.384968
5.774809
20.050853
499
26735.0
2.336397
0.895919
0.603243
1.723335
2.134374
2.719237
8.906935
56
32435.0
2.032252
0.693076
0.589714
1.562771
1.887694
2.339050
11.419610
453
37130.0
2.195190
0.848428
0.518378
1.617063
2.004453
2.558342
12.418647
491
44173.0
1.873969
0.649835
0.552015
1.421173
1.744503
2.181334
9.924016
451
46148.0
2.151608
0.822239
0.591659
1.600590
1.981437
2.495240
11.074528
239
67870.0
1.977638
0.748888
0.503575
1.466526
1.814698
2.310752
9.656071
500 rows × 8 columns
In [488]:
reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster100.feather")
In [486]:
reodered_chosen.groupby("cluster")["rmsd"].describe().sort_values("count")
Out[486]:
count
mean
std
min
25%
50%
75%
max
cluster
99
952.0
14.943018
3.768664
0.000000
12.702428
15.557707
17.871381
22.480900
91
961.0
12.874403
4.222096
0.000001
10.245549
12.638462
16.007383
23.408196
49
1106.0
14.358321
2.872157
0.000002
12.317531
13.987767
16.424921
22.320873
82
2443.0
14.934389
2.465681
0.000003
13.663287
15.177049
16.573827
22.858531
45
2802.0
14.401188
2.831316
0.000000
12.572518
14.603214
16.425568
22.323190
83
2964.0
14.287125
3.500858
0.000002
11.287813
14.561032
16.984719
22.267516
33
3270.0
14.618352
3.334886
3.575489
12.367087
14.941829
16.960734
24.235116
41
3413.0
15.624262
2.506492
0.000002
14.150406
15.694428
17.373564
22.672772
40
3867.0
13.870901
3.225424
0.000000
11.917269
14.192434
16.083955
23.190758
44
3919.0
16.986246
2.257992
0.000000
15.712881
17.144057
18.472072
23.571758
20
4022.0
14.509955
2.770687
0.000002
12.613540
14.684247
16.682598
22.153949
90
4023.0
14.779448
2.714949
0.000000
13.121565
15.047151
16.644111
22.614183
38
4142.0
15.446491
3.136561
0.000001
13.517689
15.827723
17.772601
23.471810
86
4468.0
16.535079
2.966621
0.000000
15.131907
17.064306
18.580954
23.009877
73
4548.0
14.112077
3.449817
0.000000
12.011958
14.362980
16.464117
23.796580
60
4976.0
14.640957
2.778833
4.378706
12.765303
14.762178
16.605331
22.131411
12
5001.0
16.494051
2.401524
0.000000
14.971834
16.729541
18.186580
23.612149
97
5056.0
14.520031
2.333622
0.000003
12.974537
14.563677
16.119334
21.787286
77
5778.0
14.701592
2.687401
0.000000
13.044247
14.991552
16.577533
22.030660
8
5968.0
16.722013
2.405117
6.942118
15.064290
16.874535
18.439966
31.806063
94
6086.0
16.541551
2.447883
8.785505
14.527768
16.212836
18.490690
25.617378
34
6140.0
16.226214
2.547526
0.000000
14.658252
16.397174
17.967023
23.642450
88
6438.0
13.586398
2.311506
0.000002
12.048226
13.689158
15.160343
33.482797
71
6662.0
13.987300
2.772758
0.000002
12.016270
14.073191
15.992690
23.090788
29
6677.0
12.693257
3.656613
3.346590
9.872446
12.675122
15.518500
22.254154
39
6699.0
15.155062
2.187593
0.000000
13.779472
15.362334
16.658834
29.450959
18
6800.0
12.975794
3.038169
0.000000
10.799474
13.162264
15.302423
21.366734
72
6904.0
10.840528
2.849117
6.285384
8.895940
9.890762
11.983059
23.176161
47
6953.0
11.944390
2.294724
4.492983
10.510392
11.980350
13.437744
19.261959
93
7222.0
15.447458
2.775308
0.000002
13.663342
15.632052
17.422167
23.169541
...
...
...
...
...
...
...
...
...
9
19256.0
11.868036
2.760591
4.369682
9.957512
11.909907
13.603477
22.852608
67
19326.0
9.332737
3.310210
2.753179
6.957699
8.515299
11.291503
22.736066
26
19636.0
9.953674
3.098423
3.404686
7.663375
9.477456
11.535308
22.963759
79
19661.0
13.677402
3.149661
5.096248
11.249396
13.916112
16.120006
24.043936
15
20771.0
12.404434
2.488259
5.106200
10.644426
12.371523
14.098956
22.221392
3
21066.0
10.534024
3.727477
2.474252
7.684041
9.366867
13.376781
22.292233
76
21488.0
13.812775
2.363840
6.228645
12.129934
13.771876
15.521418
22.282234
43
22120.0
12.195323
3.673941
4.505613
9.019266
12.269840
15.083801
22.675310
4
22208.0
8.115052
1.782288
3.306458
6.912866
7.940351
9.086553
19.644252
64
22446.0
11.983956
2.377519
5.272052
10.237089
11.629880
13.524192
21.306675
85
23779.0
11.547363
2.432277
5.064709
9.766418
11.350750
13.125555
21.785420
25
24037.0
11.576444
2.297706
4.446398
10.009812
11.391328
12.960079
21.304210
21
25110.0
7.009054
3.672890
1.644417
4.224781
5.991007
9.062640
23.073114
87
26733.0
14.219340
2.324962
6.682571
12.533556
14.018004
15.787119
22.495349
2
28523.0
10.366287
2.318918
4.092718
8.711052
10.278778
11.885385
20.743262
53
29672.0
12.696080
2.295429
5.989480
11.078456
12.568173
14.183804
22.524250
61
29771.0
13.372956
2.239237
6.422474
11.811759
13.236231
14.794222
21.550275
17
29886.0
8.316054
2.063265
3.155554
6.847377
8.007853
9.455852
18.007877
13
30638.0
11.947376
2.701377
5.214697
9.894431
11.466771
13.540095
23.155044
27
31914.0
10.824237
3.878395
2.469289
7.678417
10.729263
13.788632
22.449788
30
33191.0
10.773912
2.349838
4.338749
9.092387
10.551146
12.290083
20.743047
28
34735.0
12.222311
2.835495
3.951257
10.317364
12.233157
14.085700
21.912548
89
35291.0
9.288116
1.982222
3.835690
7.849875
9.060492
10.546546
17.505264
11
37373.0
8.989179
4.073917
1.596455
5.373913
8.477139
12.497340
22.413459
57
38507.0
9.876553
2.291544
3.518744
8.152652
9.705877
11.423994
18.962729
58
43745.0
10.582617
2.227352
4.103513
8.940012
10.506420
12.097370
21.442793
80
45591.0
8.442550
3.123314
2.453882
6.005965
7.805853
10.514389
22.554986
54
59153.0
7.811586
3.759268
1.720934
4.700202
6.692015
10.633417
21.810600
56
98910.0
7.592623
1.993617
2.231323
6.215800
7.363404
8.681848
19.396090
1
382971.0
3.568442
2.394548
0.599623
2.092547
2.786991
4.038103
20.580043
100 rows × 8 columns
In [475]:
%%time
from sklearn.cluster import KMeans
# x = data_original.head(100).iloc[:, 3:24].values
# ((kmeans.labels_ == kmeans.predict(x))-1).sum()
chosen = data_original.head(100000).reset_index(drop=True)
x = chosen.iloc[:, 3:5].values
kmeans = KMeans(n_clusters=10, random_state=0).fit(x)
# chosen["rmsd"] = chosen.iloc[:,3:87].apply(lambda x: getScore(x, kmeans), axis=1)
chosen["cluster"] = kmeans.labels_
CPU times: user 6.66 s, sys: 1.21 s, total: 7.87 s
Wall time: 4.84 s
In [467]:
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], c="red")
Out[467]:
<matplotlib.collections.PathCollection at 0x18aa53e10>
In [476]:
%%time
# fit on the whole data
kmeans = MiniBatchKMeans(n_clusters=10,
random_state=0,
batch_size=100,
max_iter=300,
tol=1e4).fit(x)
CPU times: user 208 ms, sys: 1.98 ms, total: 210 ms
Wall time: 210 ms
In [477]:
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], c="red")
Out[477]:
<matplotlib.collections.PathCollection at 0x18c72acf8>
In [398]:
from sklearn.cluster import KMeans
# x = data_original.head(100).iloc[:, 3:24].values
# ((kmeans.labels_ == kmeans.predict(x))-1).sum()
chosen = data_original.head(10000).reset_index(drop=True)
x = chosen.iloc[:, 3:87].values
kmeans = KMeans(n_clusters=10, random_state=0).fit(x)
# chosen["rmsd"] = chosen.iloc[:,3:87].apply(lambda x: getScore(x, kmeans), axis=1)
chosen["cluster"] = kmeans.labels_
In [376]:
Out[376]:
(100, 84)
In [399]:
# ((t1.reshape(n,1,k) - t1.reshape(1,n,k))**2).sum(axis=2)**0.5
# kmeans.cluster_centers_.shape
n = kmeans.cluster_centers_.shape[0]
k = kmeans.cluster_centers_.shape[1]
t1 = kmeans.cluster_centers_[:n, :k]
cluster_rmsd =((t1.reshape(n,1,k) - t1.reshape(1,n,k))**2).sum(axis=2)**0.5
cluster_rmsd[cluster_rmsd==0.0] = 100
cluster_rmsd.min()
Out[399]:
18.62429297622066
In [400]:
plt.rcParams['figure.figsize'] = [16.18033, 10]
plt.imshow(cluster_rmsd, cmap="seismic")
plt.colorbar()
Out[400]:
<matplotlib.colorbar.Colorbar at 0x193d84908>
In [391]:
n = kmeans.cluster_centers_.shape[0]
k = kmeans.cluster_centers_.shape[1]
t1 = kmeans.cluster_centers_[:n, :k]
cluster_rmsd =((t1.reshape(n,1,k) - t1.reshape(1,n,k))**2).sum(axis=2)**0.5
In [392]:
cluster_rmsd[cluster_rmsd==0.0] = 100
In [393]:
cluster_rmsd.min()
Out[393]:
4.854396829416842
In [394]:
plt.rcParams['figure.figsize'] = [16.18033, 10]
plt.imshow(cluster_rmsd, cmap="seismic")
plt.colorbar()
Out[394]:
<matplotlib.colorbar.Colorbar at 0x18e05e7f0>
In [271]:
kmeans.score(x[0].reshape(1,-1))
Out[271]:
-89.79255359750277
In [314]:
d = chosen.query("cluster == 88").sort_values("rmsd").reset_index(drop=True)
In [ ]:
# compute the rmsd with respect to the pdb that closest to the cluster center
pdbList = glob.glob(f"{pre}cluster88/[0-9]*.pdb")
with open(pre+"cluster88_rmsd.csv", "w") as out:
out.write("i,j,rmsd\n")
for p1 in pdbList:
i1 = int(p1.split("/")[-1].split(".")[0])
if i1 != 0:
continue
print(i1)
for p2 in pdbList:
i2 = p2.split("/")[-1].split(".")[0]
rmsd = float(getFromTerminal(f"calculate_rmsd.py {p1} {p2}"))
out.write(f"{i1},{i2},{rmsd}\n")
In [349]:
cluster88 = pd.read_csv(pre+"cluster88_rmsd.csv")
In [354]:
cluster88.query("i!=j").groupby("i")["rmsd"].describe().sort_values("mean").head()
Out[354]:
count
mean
std
min
25%
50%
75%
max
i
0
55.0
1.297952
0.621858
0.247209
0.744088
1.234168
1.814403
2.669655
3
55.0
1.302234
0.606989
0.390278
0.766102
1.263094
1.783251
2.655407
2
55.0
1.303887
0.638794
0.174071
0.717086
1.283669
1.838137
2.670576
4
55.0
1.308397
0.648857
0.174071
0.727001
1.307860
1.865656
2.601067
9
55.0
1.331420
0.539353
0.387006
0.917789
1.346706
1.728705
2.387373
In [355]:
d
Out[355]:
pdb
i
seq
caca_1
caca_2
caca_3
caca_4
caca_5
caca_6
caca_7
...
cbcb_14
cbcb_15
cbcb_16
cbcb_17
cbcb_18
cbcb_19
cbcb_20
cbcb_21
cluster
rmsd
0
3bwsA02
124
SVIDRKTKL
9.949520
11.881444
15.007518
14.224649
11.941430
10.004884
9.158696
...
7.121051
6.843129
4.431560
7.335259
5.484321
7.748463
9.831528
9.043426
88
3.819305
1
2gy5A03
19
GVCHEDTGE
9.729459
12.950154
15.225656
13.891017
11.779313
9.805697
9.412597
...
6.277780
7.273304
4.563250
6.884191
5.750976
8.210796
10.516659
9.288854
88
4.452417
2
5teeA01
333
KCWDIATLE
10.085794
12.141468
15.270156
14.333816
11.746857
10.142472
9.340342
...
6.638502
6.956893
4.632072
7.229734
5.380895
6.813114
9.548963
8.939441
88
4.612135
3
5a2fA02
56
KEMDPVTQL
10.218586
12.734818
15.575337
14.460573
11.705797
10.030134
9.314142
...
6.102289
7.088407
4.625541
6.740666
5.120184
7.463485
9.995037
9.003468
88
4.821177
4
3bwsA02
269
YVIDTTTDT
9.946895
12.214399
15.314721
14.489808
11.962596
10.234323
9.240640
...
6.559402
6.920260
5.023807
7.303305
5.770678
6.323815
9.489404
8.827125
88
5.162033
5
5teeA01
89
KIWDVETKT
10.036064
12.049080
15.123745
14.087447
11.657670
9.739409
9.672215
...
7.149391
6.913264
4.877741
7.522132
5.712117
6.512667
9.248618
8.779138
88
5.462373
6
2xt6B03
81
VIVDRKTGE
9.954157
12.611201
15.333824
14.229970
11.283616
9.897378
9.132003
...
6.168306
7.465668
4.440479
6.005554
4.201615
8.649049
9.925421
8.656796
88
5.566763
7
3bwsA02
35
DVLDINSGQ
9.834118
11.908249
14.979401
14.166902
11.109526
10.022712
9.741758
...
7.071804
7.131282
4.950830
6.842302
4.385797
7.382644
9.384715
7.943592
88
5.761471
8
3bwsA02
213
EVYDLKEKK
9.741776
12.013299
15.171566
13.940570
12.418761
9.452124
9.560477
...
8.233245
6.993384
3.955309
7.474078
6.513636
6.924696
9.438110
9.940254
88
5.987422
9
4x36A02
83
YYLDAKEGA
9.940290
12.375504
15.257021
14.755202
11.646676
10.272871
8.773566
...
5.879449
6.757819
5.124569
5.930865
5.363800
8.044666
10.621360
9.346555
88
6.043243
10
3bwsA02
81
HVFDLKTLA
10.294765
12.111046
15.390107
14.873136
11.918390
10.923274
8.894362
...
5.934416
6.495215
4.319704
6.379131
4.612946
7.610839
9.810240
8.927789
88
6.138788
11
2czrA02
47
FYKCEECGK
9.168917
12.426137
14.358852
13.570449
11.310402
9.443875
8.913856
...
6.706592
8.060266
5.050359
6.188032
5.399819
7.802509
10.548103
8.616280
88
6.157605
12
4lpqA02
9
VEIDLDKQI
9.906225
12.531385
15.429649
14.528891
13.127890
10.403035
9.426353
...
8.736698
7.622255
4.310046
7.259790
5.234055
6.911831
9.290947
9.327504
88
6.414481
13
3tunA02
34
CKVSLESGH
10.379654
12.639275
15.348255
14.291680
10.995448
10.162334
9.374565
...
5.464734
6.281708
4.522317
6.022815
5.260367
7.944193
10.505252
8.564303
88
6.419547
14
3bwsA02
104
LLYDPIRDL
10.193686
13.003706
15.691595
14.823738
13.057449
10.318852
9.808470
...
8.385339
7.620235
4.600103
7.006957
4.847549
6.773152
9.199594
8.855232
88
6.504967
15
3bwsA02
301
VISDFLDHQ
9.847713
12.340275
15.451954
14.255663
13.476367
10.383883
8.911294
...
8.614441
7.075551
3.749933
7.625578
5.056649
8.167385
9.202552
9.558326
88
6.681753
16
1igqB00
31
WLKYEDDGQ
10.160343
13.270386
15.768248
15.105667
12.140276
10.367481
9.581594
...
6.238425
7.055059
6.029074
6.631764
4.811035
7.643550
9.732194
8.268499
88
6.873870
17
3tunA02
8
GWYSISDES
10.406827
12.695497
15.692054
14.874257
13.157329
10.956638
9.741825
...
8.121049
7.145075
4.038841
7.135518
4.777005
6.716304
8.934176
8.554501
88
6.953285
18
3bwsA02
114
YCSNWISED
9.621660
12.917183
15.446292
14.177167
13.226645
10.154354
9.548340
...
9.551937
8.516931
4.503995
7.054341
5.320324
7.854528
9.869858
9.593534
88
7.150779
19
2czrA02
84
FELNFREGK
9.401689
12.725125
14.779818
13.575309
12.958589
10.353744
9.336522
...
9.104170
8.467405
4.055002
7.074190
5.586054
7.375543
9.794145
8.695016
88
7.353073
20
4x36A02
96
FIQSADGTG
9.316404
12.502723
14.474878
12.100182
12.781625
9.473136
9.100404
...
8.679495
7.596229
6.392345
5.241836
4.742088
8.686616
9.994502
8.133007
88
8.111603
21
5teeA01
132
FCYWFNRND
9.734386
11.471707
14.944398
14.817686
13.018109
10.472942
9.250990
...
9.811750
7.862967
5.062631
7.820497
4.361548
7.979190
8.761995
9.243713
88
8.168455
22
3bwsA02
235
IALSPDGKY
9.942943
12.838919
14.970727
12.748605
13.571465
10.319620
9.058799
...
8.890912
7.295113
6.352217
5.512440
4.280098
9.089517
9.640741
8.166730
88
8.599598
23
3bwsA02
291
LDVSPDNRY
10.274695
13.081531
15.338303
13.379724
13.781996
10.656155
9.302710
...
8.529108
6.914666
7.092677
5.174410
4.133323
8.836203
9.589263
8.149068
88
8.699957
24
1ka1A02
87
IHTDAMEDV
10.407919
12.130158
14.331053
13.842548
10.225945
10.270689
9.547078
...
5.289476
5.900444
5.524788
7.152904
5.819712
8.841456
10.445993
8.840138
88
8.776713
25
3bwsA02
146
LLLSKDGKE
9.950219
13.005416
14.986661
12.596710
13.485259
10.565754
9.318662
...
9.009582
7.589873
6.367220
5.838040
4.157111
9.012595
9.362629
7.554568
88
8.911911
26
3a79B00
508
SAKCSGSGK
10.110187
13.011072
14.885407
13.630521
10.028573
9.704154
10.287563
...
5.247304
6.889391
5.188907
6.328406
4.973253
8.910455
10.206703
8.177255
88
8.960456
27
3bwsA02
62
TISIPEHNE
9.817282
12.642413
15.391432
14.783654
13.754408
10.523006
9.632155
...
9.948241
7.800527
4.945644
7.654897
4.541373
6.721983
8.166798
8.948376
88
9.328044
28
3hviA00
192
SSYLEYMKV
9.915005
12.536254
14.963794
13.874313
12.605189
10.819551
9.230969
...
6.508354
6.251395
4.209615
8.067944
4.002141
11.135752
9.179267
9.378696
88
9.406241
29
1kvkA01
49
SLNLPNVGI
9.872140
12.464895
15.427619
14.898810
13.771154
10.793503
9.835277
...
10.082244
8.119540
4.712597
7.447677
5.173263
6.850187
8.813840
8.941354
88
9.425864
30
3bwsA02
250
GPNHPTEGY
8.995919
11.200593
14.301685
13.813991
11.312732
9.266978
7.749330
...
5.335237
4.460575
4.294027
5.952809
7.126341
8.807027
11.041970
11.815134
88
9.625760
31
1p9hA00
9
PNADPALGL
8.204474
10.150581
13.143430
13.322371
12.160578
11.349983
9.665771
...
7.127809
6.223761
4.310967
7.569828
6.613921
7.029165
9.850493
9.631247
88
10.114729
32
5b3kA00
89
GLGDSSYGD
9.597320
12.154374
15.100858
14.412055
13.749628
10.152723
10.020901
...
9.491571
4.860025
4.142701
7.559115
4.433785
7.575753
7.348795
8.629077
88
10.596721
33
2yeqA02
154
HFTYGNLAS
10.306349
12.778808
13.628778
13.996952
10.818155
7.468215
9.210052
...
6.887122
4.880846
5.155641
4.599007
7.822213
7.813149
9.890082
8.855243
88
11.339090
34
1vx7E02
94
MGGFPHYGV
7.975001
9.675704
13.042289
12.716304
10.627055
9.650623
7.693866
...
5.901504
5.050352
3.800005
6.113456
7.217942
9.328736
11.408817
12.313504
88
11.388345
35
2mj6A00
74
RATGEEGET
10.114860
13.810698
15.537375
13.465596
11.132085
7.529359
10.105418
...
7.311515
5.777658
7.177136
4.094617
6.927640
8.186397
11.716082
10.635818
88
11.626912
36
4b9gA00
53
KLVNVNNPD
9.657044
11.811341
14.628034
13.417040
11.054585
11.446303
8.837651
...
6.292343
9.008652
4.260343
7.011172
7.830382
8.814804
12.381727
10.399013
88
12.114169
37
4gyiA03
44
EEKDAEDPS
9.612258
12.716767
15.137066
13.919269
11.202133
11.844838
8.997447
...
4.381712
8.290860
6.033016
6.653250
8.025368
8.679353
12.191490
9.909112
88
12.458303
38
3tunA02
107
AIPVPGNPD
9.183981
12.203222
14.755483
13.067821
10.905220
11.343682
8.843083
...
5.455028
8.854027
4.492827
7.458682
7.959353
9.402717
12.204529
10.528821
88
12.475578
39
3majA02
49
LERHGGSLV
9.786978
13.407288
14.910089
12.403946
10.591506
7.053391
10.257120
...
7.917249
5.814570
8.424906
4.247381
6.721586
6.580546
10.721272
10.583070
88
12.761652
40
2bteA05
49
EIYVPGKIL
9.999669
12.358915
14.762330
13.789237
10.346306
7.235074
9.718364
...
6.982888
5.248495
6.646159
3.817559
7.485408
8.574403
9.458507
10.122655
88
12.829105
41
2yeqA02
235
PIYSMDSWD
10.442057
12.177690
15.400909
14.613280
14.722425
10.947508
9.156952
...
9.444396
3.871136
3.824424
8.584323
5.558829
10.608860
7.775544
9.088020
88
13.032639
42
5a2fA02
10
LFLETEQLK
10.247132
13.817410
16.596710
14.396990
13.947150
10.481287
10.136517
...
9.878394
5.473297
3.925470
8.597685
5.523424
10.738580
9.831265
10.684470
88
13.363569
43
5b3kA00
5
LSGSVYGTA
10.143659
13.758297
15.273129
12.781238
13.092915
9.680444
10.111543
...
10.398278
5.785300
5.400983
6.153954
3.945883
8.266972
8.379736
9.034787
88
13.395649
44
4kqcA02
19
AAFTSDNKA
7.635839
10.297567
12.887216
11.377438
9.537807
8.587503
10.329976
...
5.632930
4.144410
5.104934
7.282839
9.187781
6.860199
10.786813
12.001636
88
13.559004
45
1ac5A00
282
RESSQKGTA
9.565839
13.072076
13.702000
10.173500
11.112715
8.762067
10.317171
...
11.583023
8.858782
5.732589
6.250565
3.782586
8.150827
8.276429
7.952695
88
13.570053
46
1iv8A04
55
INTSWRNQN
7.467202
11.081289
12.271182
11.130205
10.820847
10.029446
10.850627
...
5.654761
3.746603
5.228187
6.865900
7.643214
7.479899
11.152542
11.235831
88
13.687695
47
5tjjA02
29
VKRLRDDRV
9.804014
13.411267
15.186080
13.200135
10.432072
6.660206
10.220562
...
7.879859
5.688957
8.363690
4.255258
6.413924
7.371846
10.703006
10.747286
88
13.718079
48
1a48A01
21
IYEVDAGTL
9.616343
12.438126
13.847081
13.413589
10.542364
6.839457
9.359861
...
7.455886
6.129662
8.608805
3.978680
6.352230
4.970822
9.293366
10.553911
88
13.853912
49
1vx7E02
79
SASTDADIT
8.199695
11.514258
13.286428
11.109321
10.686544
9.954393
10.556925
...
6.671429
6.060616
4.341811
7.643427
9.053087
7.334598
11.572095
10.136599
88
13.957391
50
1vliA01
270
KTTTAIEGE
9.331764
12.773533
14.553049
12.981730
13.157260
12.276527
9.811162
...
6.338029
5.398420
4.697480
7.438582
10.186050
6.663074
10.660284
11.929588
88
14.021673
51
1vx7E02
146
DTSSKIGHG
8.886813
9.614176
13.160775
13.560004
12.032557
9.949920
8.166377
...
8.340906
3.299832
5.298683
8.124020
3.338068
9.770238
6.973861
7.576541
88
14.387859
52
2xt6B03
97
LATNPDGTP
6.727586
8.690116
11.635242
10.416782
10.696363
8.880394
9.847705
...
7.165048
5.930047
6.727807
4.764903
7.259028
8.913892
12.010137
10.369381
88
14.400125
53
2ebeA00
24
LPGEVAGAR
8.584558
9.919054
13.655498
14.275121
13.300017
11.250684
7.274832
...
10.058243
6.896407
6.365407
5.926227
5.536994
5.997392
8.515861
10.323625
88
14.456325
54
3bwsA02
194
IVSGNTENK
9.503940
13.078058
14.365262
14.337009
13.430169
10.672255
9.702360
...
9.891918
7.112747
8.391471
8.114089
3.026186
11.094549
7.104325
4.670869
88
15.448550
55
2gy5A03
12
CTACMNNGV
8.239299
11.777732
14.927156
13.773328
11.719008
9.464096
9.486419
...
9.481142
11.445315
7.693930
3.574055
6.200655
7.022531
11.150053
10.721931
88
15.909167
56 rows × 89 columns
In [315]:
for i, row in d.iterrows():
print(i, row["pdb"], row["i"])
getFragPdb(row["pdb"], int(row["i"]), f"cluster88/{i}.pdb")
0 3bwsA02 124
1 2gy5A03 19
2 5teeA01 333
3 5a2fA02 56
4 3bwsA02 269
5 5teeA01 89
6 2xt6B03 81
7 3bwsA02 35
8 3bwsA02 213
9 4x36A02 83
10 3bwsA02 81
11 2czrA02 47
12 4lpqA02 9
13 3tunA02 34
14 3bwsA02 104
15 3bwsA02 301
16 1igqB00 31
17 3tunA02 8
18 3bwsA02 114
19 2czrA02 84
20 4x36A02 96
21 5teeA01 132
22 3bwsA02 235
23 3bwsA02 291
24 1ka1A02 87
25 3bwsA02 146
26 3a79B00 508
27 3bwsA02 62
28 3hviA00 192
29 1kvkA01 49
30 3bwsA02 250
31 1p9hA00 9
32 5b3kA00 89
33 2yeqA02 154
34 1vx7E02 94
35 2mj6A00 74
36 4b9gA00 53
37 4gyiA03 44
38 3tunA02 107
39 3majA02 49
40 2bteA05 49
41 2yeqA02 235
42 5a2fA02 10
43 5b3kA00 5
44 4kqcA02 19
45 1ac5A00 282
46 1iv8A04 55
47 5tjjA02 29
48 1a48A01 21
49 1vx7E02 79
50 1vliA01 270
51 1vx7E02 146
52 2xt6B03 97
53 2ebeA00 24
54 3bwsA02 194
55 2gy5A03 12
In [309]:
chosen.groupby("cluster")["rmsd"].describe().sort_values("count")
Out[309]:
count
mean
std
min
25%
50%
75%
max
cluster
86
25.0
12.868003
2.429841
8.180901
11.585724
13.128961
15.137856
16.086021
31
27.0
10.473506
2.571394
6.830730
8.555585
10.168139
12.625339
15.197835
20
37.0
11.948910
3.092640
7.513775
9.795009
11.379272
13.952240
18.524402
92
38.0
13.966047
2.389854
8.431391
12.411322
14.141286
16.221890
17.391868
57
41.0
13.543257
2.579857
9.174207
11.875370
13.287241
14.786760
19.987259
52
41.0
12.396760
2.120092
7.802385
10.917835
11.902867
13.991850
17.486862
45
42.0
12.519146
2.424179
6.651546
11.085333
12.652927
13.997425
17.496040
90
42.0
10.328494
3.056165
5.334071
7.579417
9.787187
12.954448
16.926965
69
44.0
11.745726
4.110222
5.725782
7.896013
10.928373
14.615448
19.751685
77
45.0
11.411638
3.054516
6.919495
8.976652
10.449578
14.333481
17.071899
6
46.0
10.668448
2.545219
6.227311
8.140037
11.016674
12.629587
15.423659
0
47.0
14.509444
2.470353
9.226504
13.101469
14.556386
16.572913
19.358230
47
48.0
12.729488
3.242662
6.088605
10.585771
12.452035
14.989300
20.836815
94
49.0
11.592993
3.241832
4.906130
8.873921
12.090305
14.158854
17.384469
50
50.0
12.996113
2.060327
7.903283
11.649837
13.214512
14.436430
16.519939
98
50.0
10.288891
2.125497
5.840846
8.969731
10.230508
11.445786
15.367995
37
52.0
10.737135
3.622544
5.375806
8.121953
10.133434
12.036602
19.093855
12
53.0
11.658254
2.009164
7.229504
10.414187
11.635477
12.870523
16.490421
75
53.0
12.303176
2.765158
7.490998
9.846713
12.810812
14.106641
18.506562
91
53.0
12.253184
3.124894
7.356313
9.992794
12.165720
14.560320
19.125920
33
54.0
15.254683
2.772595
9.718127
13.463367
15.203922
17.276330
20.230821
93
54.0
12.731544
2.447766
8.084662
10.884686
12.367426
14.375135
19.421543
61
55.0
12.135543
2.968750
6.652033
9.700207
11.986171
13.914067
18.366642
4
55.0
12.095456
3.200645
5.986447
9.502334
11.845416
13.925985
18.225360
40
56.0
13.132927
2.459430
8.461646
11.342993
13.111697
14.719820
20.304801
18
56.0
11.472333
2.322243
6.660154
10.031681
11.102281
12.897364
16.418819
88
56.0
9.728882
3.453831
3.819305
6.483612
9.367143
13.115371
15.909167
84
57.0
12.270203
1.977655
8.086094
10.720760
12.208336
13.513403
16.932392
36
58.0
12.126113
2.392913
6.474681
10.996501
12.152336
13.598899
17.566650
11
58.0
11.802073
3.030148
5.937072
9.867863
11.690367
14.526130
18.369063
...
...
...
...
...
...
...
...
...
97
96.0
10.397544
2.156213
4.843555
8.971957
10.500444
12.034870
15.887437
81
96.0
10.893850
1.806490
7.209888
9.664418
10.642898
12.175620
17.066911
29
99.0
11.169434
2.484482
6.119285
9.157915
11.336999
13.020159
17.398069
24
100.0
10.958577
3.195694
6.285791
8.380647
10.144780
12.700818
20.347792
21
102.0
9.274561
3.901041
3.612956
6.361961
8.411638
11.606667
19.017857
42
102.0
8.683971
1.646199
5.457985
7.718123
8.563766
9.620044
13.560157
32
103.0
11.377467
2.587875
7.017511
9.356829
11.070049
13.138082
18.061126
26
103.0
11.673854
2.888604
5.961631
9.681419
11.186433
13.421784
17.845587
72
104.0
10.797946
2.223295
5.842216
9.489124
10.696482
12.322521
16.318550
82
106.0
8.864908
2.846169
4.126482
6.461541
8.885306
10.930845
14.974236
68
110.0
8.798542
3.038090
4.574051
6.556161
8.270263
10.427786
17.689796
99
114.0
9.261581
2.506325
4.685189
7.627691
8.848057
11.181854
16.532881
51
114.0
7.279830
3.422616
3.365311
4.596202
6.227794
9.064487
18.266724
95
115.0
8.240733
1.837165
5.047425
6.738861
8.152049
9.716614
12.962903
63
117.0
8.748993
1.660973
5.493222
7.657481
8.697411
9.703374
14.036003
53
125.0
7.374729
4.107384
2.404638
3.764119
6.233578
9.923549
18.865109
65
126.0
9.250297
1.981388
5.138529
7.795777
9.112192
10.608026
15.085967
1
134.0
8.061704
2.264072
3.513803
6.336191
7.963095
9.393186
16.710189
89
135.0
8.777316
1.859779
4.499449
7.717106
8.683468
9.719918
15.497464
19
135.0
7.835607
1.913975
4.302796
6.350714
7.800876
9.185014
12.179388
25
135.0
6.388283
3.302691
1.961434
3.600076
5.666581
8.153018
18.217225
54
150.0
7.302164
2.970296
2.804589
5.026684
6.687114
8.919654
16.160247
79
159.0
8.747054
1.775704
4.651922
7.499323
8.755172
9.964741
13.639758
16
160.0
6.539418
2.908991
3.385512
4.389139
5.591175
7.529391
17.318794
28
160.0
7.127153
1.459804
3.737256
6.113912
7.129481
8.012175
11.654530
7
208.0
6.800668
1.333470
3.752799
5.853304
6.655734
7.592728
10.969135
14
223.0
7.032772
1.580068
3.240167
5.842751
6.918776
8.082615
11.896784
64
267.0
6.079601
1.436474
2.667226
5.173178
6.006692
6.962764
10.991078
2
285.0
4.884285
2.033753
1.825475
3.610672
4.196315
5.714881
13.594217
73
1699.0
2.801301
1.493118
0.604966
1.870317
2.457331
3.339556
14.398997
100 rows × 8 columns
In [296]:
chosen.groupby("cluster")["rmsd"].describe()
Out[296]:
count
mean
std
min
25%
50%
75%
max
cluster
0
47.0
14.509444
2.470353
9.226504
13.101469
14.556386
16.572913
19.358230
1
134.0
8.061704
2.264072
3.513803
6.336191
7.963095
9.393186
16.710189
2
285.0
4.884285
2.033753
1.825475
3.610672
4.196315
5.714881
13.594217
3
63.0
11.792323
2.030125
7.925980
10.115502
11.795392
13.097566
15.275605
4
55.0
12.095456
3.200645
5.986447
9.502334
11.845416
13.925985
18.225360
5
88.0
10.875914
3.103294
5.921701
8.141490
10.544263
13.040039
18.224837
6
46.0
10.668448
2.545219
6.227311
8.140037
11.016674
12.629587
15.423659
7
208.0
6.800668
1.333470
3.752799
5.853304
6.655734
7.592728
10.969135
8
64.0
11.639289
2.423901
6.294332
9.685099
11.532024
13.191878
16.898084
9
68.0
13.283440
2.730605
8.283355
11.036373
13.383355
15.224419
20.960029
10
63.0
9.088767
3.612960
3.987563
6.032655
7.856961
12.208410
17.095299
11
58.0
11.802073
3.030148
5.937072
9.867863
11.690367
14.526130
18.369063
12
53.0
11.658254
2.009164
7.229504
10.414187
11.635477
12.870523
16.490421
13
59.0
11.127064
2.586649
6.015839
9.420929
11.241936
12.618502
18.425722
14
223.0
7.032772
1.580068
3.240167
5.842751
6.918776
8.082615
11.896784
15
65.0
12.337664
2.833936
7.081583
9.988564
11.966763
14.439806
19.244873
16
160.0
6.539418
2.908991
3.385512
4.389139
5.591175
7.529391
17.318794
17
70.0
10.064104
3.865495
4.297259
7.014241
8.819931
13.695621
19.649993
18
56.0
11.472333
2.322243
6.660154
10.031681
11.102281
12.897364
16.418819
19
135.0
7.835607
1.913975
4.302796
6.350714
7.800876
9.185014
12.179388
20
37.0
11.948910
3.092640
7.513775
9.795009
11.379272
13.952240
18.524402
21
102.0
9.274561
3.901041
3.612956
6.361961
8.411638
11.606667
19.017857
22
66.0
10.666855
3.175975
4.637165
8.003551
10.195682
13.005901
18.776521
23
63.0
9.545131
2.336334
4.861592
7.906835
9.587818
11.061898
14.832847
24
100.0
10.958577
3.195694
6.285791
8.380647
10.144780
12.700818
20.347792
25
135.0
6.388283
3.302691
1.961434
3.600076
5.666581
8.153018
18.217225
26
103.0
11.673854
2.888604
5.961631
9.681419
11.186433
13.421784
17.845587
27
85.0
9.117133
2.413714
5.050972
7.208351
8.711424
10.913915
14.786774
28
160.0
7.127153
1.459804
3.737256
6.113912
7.129481
8.012175
11.654530
29
99.0
11.169434
2.484482
6.119285
9.157915
11.336999
13.020159
17.398069
...
...
...
...
...
...
...
...
...
70
64.0
13.578291
2.536454
8.479009
11.552321
13.365783
15.193793
20.172172
71
82.0
8.563055
2.027075
3.655412
7.122049
8.289281
10.049077
12.663579
72
104.0
10.797946
2.223295
5.842216
9.489124
10.696482
12.322521
16.318550
73
1699.0
2.801301
1.493118
0.604966
1.870317
2.457331
3.339556
14.398997
74
74.0
10.274946
3.124773
6.902806
8.267877
9.224823
10.704157
20.079225
75
53.0
12.303176
2.765158
7.490998
9.846713
12.810812
14.106641
18.506562
76
71.0
11.688041
2.918578
6.455069
9.388986
11.885100
13.805711
17.627366
77
45.0
11.411638
3.054516
6.919495
8.976652
10.449578
14.333481
17.071899
78
72.0
9.603870
2.805974
5.038361
7.447394
9.040345
11.661425
16.632578
79
159.0
8.747054
1.775704
4.651922
7.499323
8.755172
9.964741
13.639758
80
73.0
14.303270
2.217348
8.772048
12.789239
14.371254
15.990052
19.490860
81
96.0
10.893850
1.806490
7.209888
9.664418
10.642898
12.175620
17.066911
82
106.0
8.864908
2.846169
4.126482
6.461541
8.885306
10.930845
14.974236
83
82.0
10.152878
1.844130
5.890020
8.937818
10.050156
11.240107
15.926636
84
57.0
12.270203
1.977655
8.086094
10.720760
12.208336
13.513403
16.932392
85
68.0
10.445316
2.388606
6.026959
9.001074
10.661674
12.279133
15.423282
86
25.0
12.868003
2.429841
8.180901
11.585724
13.128961
15.137856
16.086021
87
63.0
11.927190
2.376512
8.272821
9.721701
11.669178
13.658085
17.103763
88
56.0
9.728882
3.453831
3.819305
6.483612
9.367143
13.115371
15.909167
89
135.0
8.777316
1.859779
4.499449
7.717106
8.683468
9.719918
15.497464
90
42.0
10.328494
3.056165
5.334071
7.579417
9.787187
12.954448
16.926965
91
53.0
12.253184
3.124894
7.356313
9.992794
12.165720
14.560320
19.125920
92
38.0
13.966047
2.389854
8.431391
12.411322
14.141286
16.221890
17.391868
93
54.0
12.731544
2.447766
8.084662
10.884686
12.367426
14.375135
19.421543
94
49.0
11.592993
3.241832
4.906130
8.873921
12.090305
14.158854
17.384469
95
115.0
8.240733
1.837165
5.047425
6.738861
8.152049
9.716614
12.962903
96
60.0
10.816445
2.397899
6.838141
8.975154
10.862857
12.261697
17.314459
97
96.0
10.397544
2.156213
4.843555
8.971957
10.500444
12.034870
15.887437
98
50.0
10.288891
2.125497
5.840846
8.969731
10.230508
11.445786
15.367995
99
114.0
9.261581
2.506325
4.685189
7.627691
8.848057
11.181854
16.532881
100 rows × 8 columns
In [249]:
chosen.groupby("cluster")["caca_1"].describe()
Out[249]:
count
mean
std
min
25%
50%
75%
max
cluster
0
47.0
9.107972
0.967933
6.033747
8.792595
9.222053
9.653984
10.783340
1
134.0
9.283012
0.434942
8.082854
9.065539
9.288622
9.464692
10.421309
2
285.0
5.231241
0.302073
3.988152
5.036580
5.179334
5.347704
6.532340
3
63.0
6.605184
1.571319
4.544618
5.137373
6.012451
7.725599
10.031975
4
55.0
6.139642
1.315773
4.880736
5.251148
5.602896
6.290223
9.585359
5
88.0
6.455053
1.470074
4.464227
5.159675
5.598491
7.858600
9.366185
6
46.0
9.717776
0.675211
7.649774
9.463802
9.911917
10.162426
10.543301
7
208.0
9.950973
0.487548
8.361642
9.739443
10.024712
10.305412
10.865170
8
64.0
9.245912
0.768627
7.141240
8.706300
9.304117
9.840726
10.642015
9
68.0
7.343715
1.713498
4.871723
5.481516
7.899383
8.786096
9.809787
10
63.0
9.759267
0.756167
7.502295
9.520896
10.034769
10.182660
10.910431
11
58.0
6.014184
1.061447
4.466815
5.352783
5.676593
6.344498
8.828549
12
53.0
8.833394
0.895318
6.091265
8.552012
8.962975
9.497625
9.933512
13
59.0
7.978848
1.301351
4.934107
7.336499
8.219158
8.912570
10.045939
14
223.0
9.786439
0.521609
8.044416
9.435349
9.828861
10.157747
10.836526
15
65.0
7.312742
1.547518
5.184872
5.753088
7.770713
8.729336
10.152164
16
160.0
8.295052
0.813046
6.261518
7.662178
8.574114
8.984371
9.775524
17
70.0
9.867539
0.650233
7.321350
9.652145
9.966244
10.336061
10.850627
18
56.0
9.083503
0.923239
6.788041
8.580646
9.239259
9.767775
10.545494
19
135.0
9.920066
0.485947
8.154524
9.655799
9.991779
10.253253
10.822775
20
37.0
5.890449
1.040159
4.924730
5.201715
5.456261
6.167805
8.527210
21
102.0
5.598708
1.064236
4.619137
5.054111
5.172285
5.451120
9.318322
22
66.0
8.514215
1.092339
5.422628
7.782361
8.667383
9.250473
10.368338
23
63.0
5.684063
0.705324
4.586082
5.271411
5.431941
5.954486
7.771918
24
100.0
9.637346
1.029738
4.625182
9.482487
9.835333
10.228119
10.868514
25
135.0
8.122323
0.681756
5.145186
8.004938
8.219689
8.500360
9.150583
26
103.0
5.346222
0.576351
4.687043
5.025092
5.189865
5.400290
7.749330
27
85.0
9.795203
0.592879
7.693119
9.523393
9.778906
10.241662
10.992158
28
160.0
8.461235
0.798432
6.702386
7.885981
8.386912
9.000854
10.526746
29
99.0
5.629837
0.868820
4.409707
5.112083
5.384417
5.759179
8.437553
...
...
...
...
...
...
...
...
...
70
64.0
5.803136
0.855240
4.681069
5.293294
5.483550
5.966646
8.468333
71
82.0
7.780235
0.832253
5.466374
7.430647
7.794484
8.251898
9.736248
72
104.0
7.186783
1.567058
4.318045
5.531958
7.865799
8.520729
9.806312
73
1699.0
5.160235
0.235914
4.404136
5.008525
5.136191
5.286947
6.802791
74
74.0
9.634687
0.482808
8.166377
9.323127
9.662062
9.970132
10.393231
75
53.0
6.135036
1.518998
4.822321
5.164798
5.428969
6.046421
10.150519
76
71.0
8.603556
1.076276
5.301748
8.301310
8.953986
9.205549
10.191883
77
45.0
8.173880
0.899719
5.766696
7.818158
8.201186
8.780516
9.705441
78
72.0
9.098500
0.859052
5.198376
8.703559
9.130106
9.696855
10.657252
79
159.0
8.015971
0.867010
5.424811
7.424664
8.109244
8.637223
10.200883
80
73.0
6.431632
1.370623
4.440581
5.251721
5.576448
7.868195
8.885790
81
96.0
6.121469
1.159343
4.399655
5.320935
5.659830
7.216746
9.233649
82
106.0
5.724130
1.108622
4.320683
5.074471
5.247718
5.689854
9.179356
83
82.0
9.798888
0.654166
7.072979
9.596563
9.926461
10.280733
10.722654
84
57.0
6.501368
1.259104
4.295369
5.409472
6.043009
7.475144
9.103723
85
68.0
7.415206
0.968708
5.273489
6.928287
7.352596
8.102690
9.408419
86
25.0
6.279767
1.147799
5.008773
5.382519
6.031190
6.806560
9.179280
87
63.0
9.139912
0.899868
6.668500
8.736752
9.286281
9.837852
10.625488
88
56.0
9.571231
0.805657
6.727586
9.384208
9.825700
10.091892
10.442057
89
135.0
9.717952
0.596464
7.845283
9.374381
9.823113
10.126503
10.808686
90
42.0
6.136051
0.920375
4.891723
5.416706
5.840446
7.064580
8.199224
91
53.0
5.561117
0.862180
4.560911
5.051378
5.437956
5.666302
8.549741
92
38.0
8.371339
1.328756
5.091163
7.665534
8.825273
9.291897
10.406777
93
54.0
6.379349
1.508717
4.740458
5.330225
5.584230
8.049022
9.562082
94
49.0
8.810355
0.762296
7.308287
8.286874
9.070411
9.305305
10.228033
95
115.0
5.762286
0.670392
4.908475
5.319549
5.569041
5.974905
8.491797
96
60.0
5.602232
1.119957
4.692906
4.998555
5.202498
5.513247
8.917312
97
96.0
9.373655
0.736226
7.810011
8.825228
9.392087
10.019339
10.682501
98
50.0
9.584710
0.577891
8.025982
9.198726
9.616575
10.020066
10.673256
99
114.0
9.474343
0.854113
6.201775
9.040631
9.639253
10.104994
10.998346
100 rows × 8 columns
In [233]:
d = data_original.head(1000)[kmeans.labels_ == 0]
for i, row in d.iterrows():
print(i, row["pdb"], row["i"])
6 1igqB00 6
14 1igqB00 14
20 1igqB00 20
29 1igqB00 29
39 1igqB00 39
65 2r7rA08 16
220 3fcnA00 118
225 3fcnA00 123
230 3fcnA00 128
267 5tjjA02 27
290 5tjjA02 50
296 5tjjA02 56
326 5tjjA02 86
352 5tjjA02 112
369 5tjjA02 129
419 2gy5A03 17
428 2gy5A03 26
441 2gy5A03 39
454 2gy5A03 52
467 2gy5A03 65
475 2gy5A03 73
488 2gy5A03 86
502 2gy5A03 100
509 2gy5A03 107
517 2gy5A03 115
528 2gy5A03 126
566 1ka1A02 37
594 1ka1A02 65
614 1ka1A02 85
622 1ka1A02 93
631 1ka1A02 102
638 1ka1A02 109
662 2czrA02 6
675 2czrA02 19
701 2czrA02 45
709 2czrA02 53
738 2czrA02 82
746 2czrA02 90
753 2czrA02 97
769 1a48A01 1
787 1a48A01 19
802 1a48A01 34
808 1a48A01 40
836 1a48A01 68
841 1a48A01 73
872 1n08A00 1
891 1n08A00 20
906 1n08A00 35
928 1n08A00 57
956 1n08A00 85
964 1n08A00 93
In [232]:
i = 0
((x[i] - kmeans.cluster_centers_[kmeans.labels_[i]])**2)
Out[232]:
array([1.99762260e+00, 6.08547783e+00, 1.60923936e-01, 2.37948752e+00,
7.32518437e-01, 1.82500920e+00, 4.06818698e+00, 1.24109133e+00,
7.15746328e+00, 6.30259290e+00, 1.00427376e+01, 4.99073131e-01,
4.70566311e-01, 4.60160346e-01, 1.20339491e+00, 1.86631283e+00,
3.52674964e+00, 4.53173738e+00, 4.58810165e-01, 5.22721704e-01,
4.65781076e-02, 3.17588895e+00, 1.27787050e+01, 1.26398678e-02,
3.30453780e+00, 6.45726474e-01, 5.38713829e-01, 8.91350571e+00,
5.56125657e-01, 8.53646906e+00, 5.90221803e+00, 6.77068565e+00,
7.44166894e-01, 7.17918538e-01, 4.57289185e-01, 2.46043792e-01,
1.80884812e+00, 3.64631819e+00, 2.66007485e+00, 4.56652806e-01,
1.68522764e-01, 6.33084480e-03, 1.48740274e+00, 4.22502739e+00,
1.01220387e-01, 2.55282600e-01, 1.24617964e-01, 3.00895134e-03,
6.86502116e+00, 5.05230963e+00, 1.58429942e+01, 1.61148917e+01,
2.15972173e+01, 4.29636052e+00, 4.42807599e-01, 5.83744226e-01,
6.36432948e-02, 1.04878810e+00, 3.25555131e+00, 3.49581231e+00,
2.80374021e-01, 9.41031106e-02, 1.32645659e-02, 2.58960079e+00,
1.02770963e+01, 6.00112750e-01, 5.17046586e-01, 1.15277658e-01,
5.47032365e-01, 1.20201860e+01, 3.92740337e+00, 1.72796302e+01,
1.58457873e+01, 1.60462948e+01, 5.56059148e+00, 1.59844043e-01,
6.15317327e-01, 6.74883261e-01, 7.08147322e-01, 3.90341007e+00,
1.64559057e+00, 4.22006932e-01, 2.41808675e-03, 1.29815240e-02])
In [ ]:
In [ ]:
In [3]:
data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/clustered_bin2_jan18.csv", index_col=0)
/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/numpy/lib/arraysetops.py:571: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
mask |= (ar1 == a)
In [4]:
data_original.head()
Out[4]:
pdb
i
seq
dis1
dis2
dis3
dis4
dis5
dis6
dis7
...
d15
d16
d17
d18
d19
d20
d21
dd
category
count
0
1igqB00
0
DKLKKAIVQ
9.545797
12.242739
11.389445
14.435853
15.702080
18.185148
9.427593
...
15
9
13
15
9
13
9
9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1...
NaN
NaN
1
1igqB00
1
KLKKAIVQV
9.427593
9.508488
13.046163
14.956468
17.811722
21.096529
5.881323
...
19
9
13
17
9
13
9
9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1...
NaN
NaN
2
1igqB00
2
LKKAIVQVE
5.881323
9.385722
11.658462
14.435720
17.857430
21.547594
9.566237
...
19
9
13
17
9
13
11
5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,...
NaN
NaN
3
1igqB00
3
KKAIVQVEH
9.566237
12.596333
15.433746
18.900240
22.492163
25.003511
9.934756
...
19
9
13
15
11
13
11
9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13...
9987.0
13.0
4
1igqB00
4
KAIVQVEHD
9.934756
12.622395
16.016058
19.465660
21.763054
25.281502
9.527388
...
19
11
13
17
11
13
11
9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1...
6835.0
18.0
5 rows × 49 columns
In [ ]:
In [70]:
data_original.shape
Out[70]:
(1901430, 49)
In [87]:
data_original.head()
Out[87]:
pdb
i
seq
dis1
dis2
dis3
dis4
dis5
dis6
dis7
...
d15
d16
d17
d18
d19
d20
d21
dd
category
count
0
1igqB00
0
DKLKKAIVQ
9.545797
12.242739
11.389445
14.435853
15.702080
18.185148
9.427593
...
15
9
13
15
9
13
9
9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1...
NaN
NaN
1
1igqB00
1
KLKKAIVQV
9.427593
9.508488
13.046163
14.956468
17.811722
21.096529
5.881323
...
19
9
13
17
9
13
9
9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1...
NaN
NaN
2
1igqB00
2
LKKAIVQVE
5.881323
9.385722
11.658462
14.435720
17.857430
21.547594
9.566237
...
19
9
13
17
9
13
11
5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,...
NaN
NaN
3
1igqB00
3
KKAIVQVEH
9.566237
12.596333
15.433746
18.900240
22.492163
25.003511
9.934756
...
19
9
13
15
11
13
11
9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13...
9987.0
13.0
4
1igqB00
4
KAIVQVEHD
9.934756
12.622395
16.016058
19.465660
21.763054
25.281502
9.527388
...
19
11
13
17
11
13
11
9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1...
6835.0
18.0
5 rows × 49 columns
In [71]:
from sklearn.cluster import KMeans
x = data_original.head(10000).iloc[:, 3:24].values
kmeans = KMeans(n_clusters=1000, random_state=0).fit(x)
In [75]:
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], c="red")
Out[75]:
<matplotlib.collections.PathCollection at 0x12a4ec940>
In [90]:
kmeans.score(x[3:4])
Out[90]:
-3.2598634605765255
In [130]:
data_original.head(10000)[kmeans.labels_ == 941]
Out[130]:
pdb
i
seq
dis1
dis2
dis3
dis4
dis5
dis6
dis7
...
d15
d16
d17
d18
d19
d20
d21
dd
category
count
0
1igqB00
0
DKLKKAIVQ
9.545797
12.242739
11.389445
14.435853
15.702080
18.185148
9.427593
...
15
9
13
15
9
13
9
9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1...
NaN
NaN
2734
3bwsA02
260
KKGLVLGKV
9.273508
11.413203
12.253963
13.338609
17.082760
19.947418
8.448602
...
15
9
13
17
9
13
11
9,11,13,13,17,19,9,9,11,15,17,7,9,13,15,9,13,1...
NaN
NaN
3623
2kwbA00
48
PSAEEGAED
10.160047
12.109205
12.734339
13.432731
16.219532
18.111626
9.109225
...
15
9
13
17
9
13
9
11,13,13,13,17,19,9,11,13,15,17,7,9,13,15,9,13...
NaN
NaN
4542
4x36A02
113
TLADKPEFT
8.823990
11.467757
11.260575
14.548013
16.793629
19.895035
8.978478
...
15
9
13
17
9
13
11
9,11,11,15,17,19,9,9,13,15,19,5,9,11,15,9,13,1...
NaN
NaN
5234
2xt6B03
77
QRHAVIVDR
8.991055
11.222426
11.667209
14.419406
17.422922
17.947512
8.275139
...
13
11
13
15
9
13
9
9,11,11,15,17,17,9,9,13,17,17,5,9,13,13,11,13,...
NaN
NaN
5 rows × 49 columns
In [169]:
data_original.head(10000)[kmeans.labels_ == 55]
Out[169]:
pdb
i
seq
dis1
dis2
dis3
dis4
dis5
dis6
dis7
...
d15
d16
d17
d18
d19
d20
d21
dd
category
count
81
2r7rA08
32
HENEIQLYL
5.333650
6.582048
8.963180
10.288549
10.966698
12.824388
5.418285
...
11
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5
7.0
12033.0
82
2r7rA08
33
ENEIQLYLI
5.418285
6.527355
9.036262
10.305749
11.110831
12.772940
5.267358
...
11
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5
7.0
12033.0
109
2r7rA08
60
SRDKYRILE
5.273392
7.189122
9.218030
10.316360
11.565206
13.020775
5.393554
...
9
5
7
9
5
5
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,5,5
119.0
500.0
136
3fcnA00
9
FVWCQQQAD
5.353385
6.903554
8.958042
10.115607
11.136167
12.917888
5.357417
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
203
3fcnA00
101
GVIWRRAVS
5.706202
6.473025
8.968908
10.203383
11.063753
12.901576
5.376695
...
11
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5
7.0
12033.0
211
3fcnA00
109
SEAKAALIE
5.344415
6.596049
8.857686
10.147244
11.163829
13.002431
5.334358
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
262
5tjjA02
14
AELHQVAAH
5.302562
6.808673
9.016759
10.052061
10.938404
12.879579
5.065632
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1.0
20575.0
316
5tjjA02
76
DESTWRRVL
5.653988
7.055184
9.186121
10.291211
11.365362
13.000630
4.902782
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1.0
20575.0
397
5tjjA02
165
ESAAKVSAW
5.407758
6.636605
9.023434
10.096171
11.093265
12.860629
5.241992
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,5,9,9,5,7,9,5,7,5
32.0
2048.0
573
1ka1A02
44
QAKYCLLAL
5.226872
6.776495
8.958233
10.000200
10.943960
12.758240
4.969972
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1.0
20575.0
604
1ka1A02
75
AAGNVIVHE
5.136070
7.062123
8.969414
10.173305
11.103614
12.970013
5.939757
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
764
2czrA02
108
SEAREFIKE
5.000071
6.672308
8.776916
9.992788
10.722353
12.505007
5.711265
...
9
5
7
9
5
5
5
5,7,9,9,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,5,5
72.0
857.0
1242
3c2gA02
10
KPTFIHNVL
5.510642
6.534197
8.919077
10.171537
10.914976
12.664318
5.196806
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1.0
20575.0
1349
4e8uA00
66
DWNGLHNGL
5.240033
6.915307
8.927713
10.154488
11.080714
12.896862
5.195405
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1.0
20575.0
1411
4e8uA00
128
TIAELTEEE
5.482339
6.552261
8.640179
10.185078
11.387709
12.941839
5.131206
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,5,9,11,11,5,5,9,9,5,7,9,5,7,5
123.0
488.0
1416
4e8uA00
133
TEEEARKQE
5.517636
6.781759
8.824654
10.205109
11.145656
12.672886
5.547846
...
9
5
7
9
5
5
5
5,7,9,11,11,13,5,7,9,9,11,5,5,9,9,5,7,9,5,5,5
242.0
250.0
1422
4e8uA00
139
KQELLVQNL
5.110320
6.776401
9.302724
10.483055
11.061368
12.841620
5.393684
...
11
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5
7.0
12033.0
1424
4e8uA00
141
ELLVQNLRQ
5.569650
6.717126
9.148919
10.256499
10.802386
12.373766
5.528145
...
9
5
5
9
5
5
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,5,9,5,5,5
134.0
442.0
1440
4e8uA00
157
DMKEIEELC
5.243148
6.879291
9.108886
10.120252
11.231353
12.822567
5.409897
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
1536
2pp4A00
0
GARQLSKLK
6.207573
7.202536
9.400275
10.749393
11.598902
13.362099
5.100773
...
11
5
5
9
5
7
5
7,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,5,9,5,7,5
728.0
104.0
1541
2pp4A00
5
SKLKRFLTT
5.183907
6.706759
8.854277
10.298901
11.097798
12.910788
5.072634
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
1544
2pp4A00
8
KRFLTTLQQ
5.264502
6.452593
8.704339
10.355394
11.098370
12.683537
5.190423
...
11
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5
7.0
12033.0
1612
2pp4A00
76
ELLHCARLA
5.472988
6.820663
8.981866
10.199092
11.791062
13.364829
5.001121
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
1948
4kqcA02
43
RAEGFVKRF
5.550560
6.733570
8.984053
10.487504
11.358106
12.615863
5.444900
...
9
5
7
9
5
5
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,5,5
119.0
500.0
1981
4kqcA02
76
NEADIANEV
5.605928
6.408804
8.675251
10.291935
11.278878
12.624000
5.419741
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,5,9,11,11,5,5,9,9,5,7,9,5,7,5
123.0
488.0
2180
1vx7E02
158
TSEEKVKYY
5.358503
6.696349
8.846507
10.074055
11.011369
12.806045
4.978814
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1.0
20575.0
2854
3wx4A00
69
NIDDVLKTI
5.374019
6.652499
9.082006
10.314707
11.130764
12.851135
5.035265
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
3252
1ac5A00
152
DFLENYFKI
5.352051
6.727336
8.592132
10.314495
11.319632
13.000024
5.516701
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,5,9,11,11,5,5,9,9,5,7,9,5,7,5
123.0
488.0
3278
1ac5A00
178
GQYIPFFAN
5.843970
6.960258
9.138881
10.514634
11.141170
13.012049
6.073991
...
11
5
5
9
5
7
5
5,7,9,11,11,13,7,7,9,11,11,5,7,9,11,5,5,9,5,7,5
7934.0
16.0
3279
1ac5A00
179
QYIPFFANA
6.073991
6.677140
8.863530
10.379977
11.407236
12.863068
5.448027
...
9
5
7
9
5
7
5
7,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
852.0
92.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
7234
3c1yA02
54
LLRIVEEIR
5.496828
6.834570
9.111869
10.276987
11.216656
12.825080
5.083735
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
7381
16vpA00
61
LPSDVVEWG
5.291775
6.773263
9.009974
10.250907
10.991113
12.455873
5.153724
...
9
5
7
9
5
5
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5
23.0
3170.0
7436
16vpA00
116
AREESYRTV
5.769338
7.464414
9.355312
10.482346
11.950436
13.531206
5.497259
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
7494
16vpA00
174
RLARVLFLH
5.416078
6.982404
8.758867
10.387445
11.068685
12.976485
5.758302
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
7510
16vpA00
190
EILWAAYAE
5.806808
6.595221
8.930958
10.298653
11.531592
13.227515
5.468965
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
7514
16vpA00
194
AAYAEQMMR
5.534247
6.771947
9.057928
10.301928
11.553566
13.386687
5.153292
...
11
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5
7.0
12033.0
7561
16vpA00
241
EARRLRELN
5.337707
6.590122
8.774970
10.208087
10.912583
12.535189
5.406728
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
7604
16vpA00
284
RASGYFMVL
5.073411
6.951392
8.907423
10.098037
11.398313
13.217564
5.346855
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
7633
3cjeA00
41
PPLALFIAG
5.453949
6.889291
9.076591
10.533273
11.137803
12.805101
5.271422
...
9
5
7
9
5
5
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5
23.0
3170.0
7690
3cjeA00
106
PIEAQQALI
5.299537
6.546436
8.796499
10.062986
11.049634
12.792604
5.024445
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1.0
20575.0
7735
2munA00
12
IACGQCRDK
5.168211
6.778906
9.208090
9.972991
11.608543
13.115890
5.561805
...
9
5
7
9
5
7
5
5,7,9,9,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
9.0
6656.0
7759
2munA00
36
TFKKCQDLL
5.815854
6.762688
8.830809
10.575121
11.575522
13.029126
5.006928
...
11
5
7
9
5
7
5
5,7,9,11,11,13,5,5,9,9,11,5,7,9,11,5,7,9,5,7,5
16.0
4654.0
7779
3hviA00
18
DPQSVLEAI
5.454301
6.552622
9.117189
10.232352
10.933669
12.845427
4.972138
...
11
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5
4.0
14304.0
7803
3hviA00
42
AKGQIMDAV
5.596153
7.134886
9.099961
10.281355
11.522535
13.294353
5.162168
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
7983
1iv8A04
24
YRYYQVLVG
5.510259
6.778948
9.046924
10.256198
11.195829
12.715963
5.247990
...
9
5
7
9
5
5
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5
23.0
3170.0
8004
1iv8A04
65
EYENRVMEL
5.419779
6.677068
9.125540
10.355475
10.915956
12.973279
5.006636
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1.0
20575.0
8009
1iv8A04
70
VMELVEETF
5.211782
6.753980
9.108153
10.221592
11.196578
12.860190
5.191833
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
8033
4gyiA03
10
DPASLYADL
5.917629
7.046831
8.958712
10.471177
11.489807
13.168632
5.319816
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5
10.0
5374.0
8092
4gyiA03
77
FDRDVQCIK
5.399261
6.542648
8.751293
10.142467
11.301354
12.744295
5.224391
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
8095
4gyiA03
80
DVQCIKRFF
5.184418
6.547073
8.878405
10.018908
10.934995
12.655374
5.282895
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
8167
1y66A00
13
VRRHQEITQ
5.087811
6.584020
8.800280
10.099912
10.866683
12.673161
5.373821
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
8302
1vliA01
141
EISDVHEAW
5.594763
6.623869
8.847273
10.318558
11.200769
12.649756
5.119834
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
8458
5b3kA00
18
RHAQKLLSA
5.135142
6.670463
8.966060
10.143664
10.806757
12.612224
5.248866
...
9
5
5
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,5,9,5,7,5
25.0
2981.0
8581
5b3kA00
141
AEFAAALKG
5.698404
6.876418
8.993896
10.675476
11.688439
12.668344
5.148262
...
9
5
7
9
5
5
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5
23.0
3170.0
8762
1d4aA00
82
DIVAEQKKL
5.284836
6.739615
8.933946
10.209151
11.292310
12.962308
5.385168
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
8765
1d4aA00
85
AEQKKLEAA
5.233301
6.579210
8.914511
10.140086
11.058459
12.903560
5.160893
...
11
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5
7.0
12033.0
9064
2yeqA02
119
EAFVLRRAA
5.471800
6.937684
8.908560
10.209603
11.383506
13.161331
5.411641
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
6.0
12641.0
9193
2yeqA02
248
QRERVINFI
5.267016
6.608962
8.887669
10.006727
11.037183
12.801084
5.218074
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
26.0
2898.0
9338
2yeqA02
393
EEDRFFSHN
6.074079
7.749986
9.214346
10.553204
12.107005
13.593403
5.386916
...
9
5
7
9
5
7
5
7,7,9,11,13,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5
550.0
134.0
9551
1ydxA03
101
YVPFFYCAL
5.764995
7.315386
9.241748
10.730502
11.593540
13.349720
5.340494
...
9
5
7
9
5
7
5
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1.0
20575.0
124 rows × 49 columns
In [185]:
In [194]:
d = data_original.head(10000)[kmeans.labels_ == 450]
for i, row in d.iterrows():
print(i, row["pdb"], row["i"])
getFragPdb(row["pdb"], int(row["i"]), f"compare/{i}.pdb")
331 5tjjA02 91
880 1n08A00 9
1484 1opoA02 43
1797 5j47A03 18
8991 2yeqA02 46
9529 1ydxA03 71
In [175]:
getFragPdb("3bwsA02.pdb", 260, "test2.pdb")
In [196]:
getFragPdb("2r7rA08", 32)
In [197]:
getFragPdb("2r7rA08", 33)
In [97]:
i = 0
((x[i] - kmeans.cluster_centers_[kmeans.labels_[i]])**2).sum()
Out[97]:
4.322128020315504
In [95]:
((x[3] - kmeans.cluster_centers_[kmeans.labels_[3]])**2).sum()
Out[95]:
3.2598634605762107
In [183]:
pd.Series(kmeans.labels_).value_counts()[500:]
Out[183]:
450 6
497 6
64 6
688 6
494 6
363 6
395 6
123 6
486 6
101 6
919 6
378 6
475 6
483 6
457 6
835 6
539 6
271 6
861 6
869 6
560 6
238 6
626 6
637 6
509 6
415 6
719 6
261 6
665 6
84 6
..
917 2
624 2
14 1
681 1
868 1
753 1
15 1
959 1
788 1
531 1
601 1
952 1
18 1
213 1
874 1
406 1
522 1
518 1
308 1
330 1
904 1
872 1
184 1
840 1
832 1
548 1
186 1
954 1
19 1
797 1
Length: 500, dtype: int64
In [178]:
pd.Series(kmeans.labels_).value_counts().hist(bins=50, log=True)
Out[178]:
<matplotlib.axes._subplots.AxesSubplot at 0x12cb08f60>
In [ ]:
import os
# get the fragment pdb
pdb = "1igqB00.pdb"
pdbId = pdb.split('.')[0]
i = 0
pre = "/Users/weilu/Research/optimization/fragment/"
database = "/Users/weilu/Research/optimization/fragment/database/dompdb/"
parser = bio.PDBParser(QUIET=True)
structure = parser.get_structure("x", os.path.join(database, pdb))
for model in structure:
for chain in model:
all_residues = list(chain)
io = bio.PDBIO()
c = bio.Chain.Chain("A")
c.child_list = cc.child_list[i:i+9]
for ii, res in enumerate(c):
res.id = (' ', ii+1, ' ')
io.set_structure(c)
io.save(f'{pre}test.pdb')
In [ ]:
import numpy as np
X = np.array([[1, 2], [1, 4], [1, 0],
[4, 2], [4, 4], [4, 0]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
kmeans.labels_
kmeans.predict([[0, 0], [4, 4]])
kmeans.cluster_centers_
In [20]:
plt.scatter(X[:, 0], X[:, 1])
Out[20]:
<matplotlib.collections.PathCollection at 0x128479208>
In [7]:
kmeans.labels_
Out[7]:
array([0, 0, 0, 1, 1, 1], dtype=int32)
In [6]:
X
Out[6]:
array([[1, 2],
[1, 4],
[1, 0],
[4, 2],
[4, 4],
[4, 0]])
In [3]:
data = data_original.fillna(-1)[["pdb", "i", "seq","dd", "category", "count"]]
data["category"] = data["category"].astype(int)
data["count"] = data["count"].astype(int)
for i in range(1,10):
data[f"s{i}"] = data["seq"].apply(lambda x: one_to_index(x[i-1]))
In [4]:
In [15]:
data.to_feather("/Users/weilu/Research/optimization/fragment/feather_cluster_data.feather")
In [13]:
data.head()
Out[13]:
pdb
i
seq
dd
category
count
s1
s2
s3
s4
s5
s6
s7
s8
s9
0
1igqB00
0
DKLKKAIVQ
9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1...
-1
-1
2
8
9
8
8
0
7
17
13
1
1igqB00
1
KLKKAIVQV
9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1...
-1
-1
8
9
8
8
0
7
17
13
17
2
1igqB00
2
LKKAIVQVE
5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,...
-1
-1
9
8
8
0
7
17
13
17
3
3
1igqB00
3
KKAIVQVEH
9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13...
9987
13
8
8
0
7
17
13
17
3
6
4
1igqB00
4
KAIVQVEHD
9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1...
6835
18
8
0
7
17
13
17
3
6
2
In [14]:
data.tail()
Out[14]:
pdb
i
seq
dd
category
count
s1
s2
s3
s4
s5
s6
s7
s8
s9
1901425
1xjhA00
49
NAMDIAEIR
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
1
20575
11
0
10
2
7
0
3
7
14
1901426
1xjhA00
50
AMDIAEIRN
5,7,9,9,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5
2
18996
0
10
2
7
0
3
7
14
11
1901427
1xjhA00
51
MDIAEIRNN
5,7,9,9,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
3
18382
10
2
7
0
3
7
14
11
11
1901428
1xjhA00
52
DIAEIRNNA
5,7,9,11,11,11,5,7,9,9,11,5,7,9,11,5,7,9,5,5,5
81
745
2
7
0
3
7
14
11
11
0
1901429
1xjhA00
53
IAEIRNNAS
5,7,9,9,11,11,5,7,9,11,11,5,7,9,11,5,5,7,5,5,5
2071
46
7
0
3
7
14
11
11
0
15
In [6]:
data.shape
Out[6]:
(1901430, 15)
In [7]:
data.query("count != -1").shape
Out[7]:
(753824, 15)
In [8]:
data.query("count > 500").shape
Out[8]:
(325598, 15)
In [12]:
data.query("count != -1").sample(10)
Out[12]:
pdb
i
seq
dd
category
count
s1
s2
s3
s4
s5
s6
s7
s8
s9
83349
4pz0A01
131
GDSFEVKGI
11,13,17,21,21,19,11,13,17,19,17,11,13,15,13,1...
6062
20
5
2
15
4
3
17
8
5
7
376107
3ot5A02
26
MQGMFEAVR
5,7,9,9,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5
3
18382
10
13
5
10
4
3
0
17
14
77904
1fmzA00
147
QYIGIHRDR
9,13,15,19,21,25,11,13,17,19,23,9,13,15,19,11,...
927
87
13
19
7
5
7
6
14
2
14
1335697
2je2A00
3
AEFNDKGEL
11,13,15,11,11,7,11,11,9,9,7,7,5,5,5,5,5,7,7,9,9
1664
55
0
3
4
11
2
8
5
3
9
531509
3i5xA02
26
HIKKQIKER
5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,5,5
34
1811
6
7
8
8
13
7
8
3
14
912144
4qtuB00
151
DQVDDILQS
5,7,9,9,11,13,5,7,9,9,11,5,7,9,11,5,5,9,5,7,5
14
4800
2
13
17
2
2
7
9
13
15
1182231
1sbzA00
155
ARVLDQFGL
5,7,9,11,11,9,5,7,9,9,9,5,7,9,9,5,7,5,5,5,7
60
1011
0
14
17
9
2
13
4
5
9
1438574
2cfuA01
327
GNAEIVEVL
5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5
4
14304
5
11
0
3
7
17
3
17
9
1346608
3p32A01
12
DRAALPRAI
5,7,9,11,11,13,5,7,9,11,11,7,7,9,11,5,7,9,5,7,5
328
207
2
14
0
0
9
12
14
0
7
1468369
4ds7E00
11
LVLACVRMK
5,7,9,9,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5
0
39130
9
17
9
0
1
17
14
10
8
In [11]:
data.query("category == 0").shape
Out[11]:
(39130, 15)
In [17]:
data.head()
Out[17]:
pdb
i
seq
dd
category
count
s1
s2
s3
s4
s5
s6
s7
s8
s9
0
1igqB00
0
DKLKKAIVQ
9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1...
-1
-1
2
8
9
8
8
0
7
17
13
1
1igqB00
1
KLKKAIVQV
9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1...
-1
-1
8
9
8
8
0
7
17
13
17
2
1igqB00
2
LKKAIVQVE
5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,...
-1
-1
9
8
8
0
7
17
13
17
3
3
1igqB00
3
KKAIVQVEH
9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13...
9987
13
8
8
0
7
17
13
17
3
6
4
1igqB00
4
KAIVQVEHD
9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1...
6835
18
8
0
7
17
13
17
3
6
2
In [108]:
data.head().iloc[:,6:]
Out[108]:
s1
s2
s3
s4
s5
s6
s7
s8
s9
0
2
8
9
8
8
0
7
17
13
1
8
9
8
8
0
7
17
13
17
2
9
8
8
0
7
17
13
17
3
3
8
8
0
7
17
13
17
3
6
4
8
0
7
17
13
17
3
6
2
In [18]:
test = data.query("count > 500")
In [19]:
train_x = test.iloc[:,6:].values
train_y = test["category"].values
In [ ]:
train_x.shape
In [ ]:
from sklearn.svm import SVC
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(train_x, train_y)
svm_predictions = svm_model_linear.predict(train_x)
In [ ]:
from sklearn.metrics import confusion_matrix
# model accuracy for X_test
accuracy = svm_model_linear.score(train_x, train_y)
# creating a confusion matrix
cm = confusion_matrix(train_y, svm_predictions)
In [111]:
train_x.shape
Out[111]:
(1901430, 9)
In [115]:
train_y
Out[115]:
array([ -1, -1, -1, ..., 3, 81, 2071])
In [114]:
train_y.shape
Out[114]:
(1901430,)
In [ ]:
In [68]:
seq_count = data["seq"].value_counts()
filtered_seq_count = seq_count[seq_count>1].reset_index().rename(columns={"seq":"index", "index":"seq"})
data_filtered = data_original.merge(filtered_seq_count, on="seq")
In [95]:
def get_total_std(a):
return a.iloc[:,3:6].std().sum()
In [99]:
data_filtered_total_std = data_filtered.groupby("seq").apply(get_total_std)
In [104]:
data_filtered_total_std.hist(bins=50, log=True)
Out[104]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a7ad255c0>
In [46]:
data_original.head()
Out[46]:
pdb
i
seq
dis1
dis2
dis3
dis4
dis5
dis6
dis7
...
d15
d16
d17
d18
d19
d20
d21
dd
category
count
0
1igqB00
0
DKLKKAIVQ
9.545797
12.242739
11.389445
14.435853
15.702080
18.185148
9.427593
...
15
9
13
15
9
13
9
9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1...
NaN
NaN
1
1igqB00
1
KLKKAIVQV
9.427593
9.508488
13.046163
14.956468
17.811722
21.096529
5.881323
...
19
9
13
17
9
13
9
9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1...
NaN
NaN
2
1igqB00
2
LKKAIVQVE
5.881323
9.385722
11.658462
14.435720
17.857430
21.547594
9.566237
...
19
9
13
17
9
13
11
5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,...
NaN
NaN
3
1igqB00
3
KKAIVQVEH
9.566237
12.596333
15.433746
18.900240
22.492163
25.003511
9.934756
...
19
9
13
15
11
13
11
9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13...
9987.0
13.0
4
1igqB00
4
KAIVQVEHD
9.934756
12.622395
16.016058
19.465660
21.763054
25.281502
9.527388
...
19
11
13
17
11
13
11
9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1...
6835.0
18.0
5 rows × 49 columns
In [44]:
data.head()
Out[44]:
pdb
i
seq
dd
category
count
s1
s2
s3
s4
s5
s6
s7
s8
s9
0
1igqB00
0
DKLKKAIVQ
9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1...
-1
-1
2
8
9
8
8
0
7
17
13
1
1igqB00
1
KLKKAIVQV
9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1...
-1
-1
8
9
8
8
0
7
17
13
17
2
1igqB00
2
LKKAIVQVE
5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,...
-1
-1
9
8
8
0
7
17
13
17
3
3
1igqB00
3
KKAIVQVEH
9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13...
9987
13
8
8
0
7
17
13
17
3
6
4
1igqB00
4
KAIVQVEHD
9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1...
6835
18
8
0
7
17
13
17
3
6
2
In [6]:
data.dtypes
Out[6]:
pdb object
i int64
seq object
dis1 float64
dis2 float64
dis3 float64
dis4 float64
dis5 float64
dis6 float64
dis7 float64
dis8 float64
dis9 float64
dis10 float64
dis11 float64
dis12 float64
dis13 float64
dis14 float64
dis15 float64
dis16 float64
dis17 float64
dis18 float64
dis19 float64
dis20 float64
dis21 float64
DisType object
d1 int64
d2 int64
d3 int64
d4 int64
d5 int64
d6 int64
d7 int64
d8 int64
d9 int64
d10 int64
d11 int64
d12 int64
d13 int64
d14 int64
d15 int64
d16 int64
d17 int64
d18 int64
d19 int64
d20 int64
d21 int64
dd object
category float64
count float64
dtype: object
In [ ]:
Content source: luwei0917/awsemmd_script
Similar notebooks: