In [1]:
import os
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from small_script.myFunctions import *
import feather
import Bio.PDB as bio
from sklearn.cluster import MiniBatchKMeans
d3_to_index = bio.Polypeptide.d3_to_index  # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
one_to_index = bio.Polypeptide.one_to_index
plt.rcParams['figure.figsize'] = [16.18033, 10]

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
def getFragPdb(pdbId, i, outFile=None):
    """Extract a 9-residue fragment starting at residue offset `i` from a
    domain PDB file and save it as a new single-chain ("A") PDB file.

    Parameters
    ----------
    pdbId : str
        Basename of the PDB file (without ".pdb") inside the dompdb database.
    i : int
        0-based index of the first residue of the fragment within the chain.
    outFile : str, optional
        Output path relative to `pre`; defaults to "{i}_{pdbId}.pdb".

    NOTE(review): input/output paths are hardcoded to one machine — consider
    turning `pre` and `database` into parameters.
    """
    pdb = pdbId + ".pdb"
    if outFile is None:
        outFile = f"{i}_{pdb}"
    pre = "/Users/weilu/Research/optimization/fragment/"
    database = "/Users/weilu/Research/optimization/fragment/database/dompdb/"
    parser = bio.PDBParser(QUIET=True)
    structure = parser.get_structure("x", os.path.join(database, pdb))
    # The original wrote the output file once per model/chain, so with
    # multiple chains the file was silently overwritten and the LAST chain
    # won. Keep that final result, but perform the save only once.
    fragment = None
    for model in structure:
        for chain in model:
            all_residues = list(chain)
            fragment = all_residues[i:i + 9]
    if fragment is not None:
        c = bio.Chain.Chain("A")
        # Attach the residues directly to the synthetic chain (Bio.PDB
        # shortcut; residue ids keep their original numbering).
        c.child_list = fragment
        io = bio.PDBIO()
        io.set_structure(c)
        io.save(f'{pre}{outFile}')
def getScore(data, km):
    """Return the Euclidean distance from a sample to its assigned cluster
    center (an RMSD-like score used to rank fragments within a cluster).

    Parameters
    ----------
    data : pandas.Series
        Feature vector whose LAST element is the integer cluster label and
        whose remaining elements are the feature values.
    km : fitted k-means model
        Anything exposing `cluster_centers_` (e.g. MiniBatchKMeans).

    Returns
    -------
    float
        sqrt(sum((center - features)**2)).
    """
    # Dead commented-out experiments removed; the last element of `data`
    # selects the row's cluster center, the rest are the coordinates.
    cluster = int(data.values[-1])
    diff = km.cluster_centers_[cluster] - data.values[:-1]
    return np.sqrt((diff ** 2).sum())
def getFromTerminal(CMD):
    """Run a shell command and return its captured stdout as a decoded string.

    NOTE(review): CMD is handed to the shell verbatim (shell=True) — only use
    with trusted, non-user-supplied input.
    """
    completed = subprocess.run(CMD, shell=True, stdout=subprocess.PIPE)
    return completed.stdout.decode()

In [ ]:
# pre = "/Users/weilu/Research/optimization/fragment/"
# data_original = feather.read_dataframe(f"{pre}cluster100_v2.feather")
# os.system(f"mkdir -p {pre}center_cluster100_v2/origin/")
# os.system(f"mkdir -p {pre}center_cluster100_v2/pdbs/")
# os.system(f"mkdir -p {pre}center_cluster100_v2/gros/")
# center = data_original.groupby("cluster").head(1)
# for i, row in center.reset_index(drop=True).iterrows():
#     print(i, row["pdb"], row["i"], row["cluster"])
#     getFragPdb(row["pdb"], int(row["i"]), f"center_cluster100_v2/origin/{row['cluster']}.pdb")
# pre = "/Users/weilu/Research/optimization/fragment/center_cluster100_v2//"
# for i in range(100):
#     os.system(f"python ~/opt/small_script/pdb_reres.py {pre}origin/{i}.pdb > {pre}pdbs/{i}.pdb")
# for i in range(100):
#     os.system(f"python2 ~/opt/script/Pdb2Gro.py {pre}pdbs/{i}.pdb {pre}gros/{i}.gro")

In [ ]:


In [ ]:
# data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/data_jan31.csv")
# chosen = data_original.reset_index(drop=True)
# x = chosen.iloc[:, 3:87].values
# kmeans = MiniBatchKMeans(n_clusters=100,
#         random_state=0,
#         batch_size=200,
#         max_iter=300,
#         tol=1e4).fit(x)
# import pickle
# # pickle.dump(kmeans, open("/Users/weilu/Research/optimization/fragment/kmeans_cluster100_v2_2", "wb"))
# chosen["cluster"] = kmeans.labels_
# chosen["rmsd"] = chosen.iloc[:,3:88].apply(lambda x: getScore(x, kmeans), axis=1)
# reodered_chosen = chosen.sort_values(["cluster", "rmsd"])
# # reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster100_v2.feather")
# reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster100_v2_2.feather")

In [ ]:


In [ ]:


In [ ]:


In [4]:
# Load the clustered fragment table produced above (~1.9M rows x 89 cols).
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2_2.feather")

In [14]:
# Quick sanity check on the table size.
data.shape


Out[14]:
(1912985, 89)

In [13]:
# Membership count for cluster 71 (labels are ints, so index with int 71).
data["cluster"].value_counts()[71]


Out[13]:
55296

In [12]:
# Cluster size distribution; cluster 55 dominates with ~337k members.
data["cluster"].value_counts()


Out[12]:
55    337166
1      98515
71     55296
38     40610
70     37302
19     36340
45     32528
2      32112
0      31681
32     30514
30     30463
89     29997
23     28138
8      27691
48     27179
33     26388
86     26028
14     25705
16     25236
82     23936
5      23714
80     22254
20     21146
99     20855
31     20690
65     20377
96     19858
79     19337
52     19092
63     18638
       ...  
17      9783
64      9571
4       9430
44      9252
39      9154
81      9027
6       8376
88      8029
58      7743
34      7312
42      7250
18      6982
87      6915
41      6746
94      6583
43      6485
9       5543
93      4862
24      4549
73      4544
26      4303
28      4301
27      3625
60      3568
78      2932
68      2873
61      2216
85      1752
59      1516
46      1342
Name: cluster, Length: 100, dtype: int64

In [8]:
# value_counts() here is indexed by the original int64 cluster labels, so the
# key must be an int; indexing with the string "71" raised KeyError: '71'.
data["cluster"].value_counts()[71]

In [ ]: