In [1]:
import os
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from small_script.myFunctions import *
import feather
import Bio.PDB as bio
from sklearn.cluster import MiniBatchKMeans
d3_to_index = bio.Polypeptide.d3_to_index # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
one_to_index = bio.Polypeptide.one_to_index
plt.rcParams['figure.figsize'] = [16.18033, 10]
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
def getFragPdb(pdbId, i, outFile=None,
               pre="/Users/weilu/Research/optimization/fragment/",
               database="/Users/weilu/Research/optimization/fragment/database/dompdb/"):
    """Extract a 9-residue fragment from a domain PDB file and save it as a new PDB.

    Parameters
    ----------
    pdbId : str
        Basename of the PDB file in `database` (".pdb" is appended).
    i : int
        Index of the first residue of the fragment; residues [i, i+9) are kept.
    outFile : str, optional
        Output path relative to `pre`; defaults to "{i}_{pdbId}.pdb".
    pre : str, optional
        Output directory prefix (must end with "/").
    database : str, optional
        Directory containing the source domain PDB files.
    """
    pdb = pdbId + ".pdb"
    if outFile is None:
        outFile = f"{i}_{pdb}"
    parser = bio.PDBParser(QUIET=True)
    structure = parser.get_structure("x", os.path.join(database, pdb))
    # Walk every model/chain; all_residues ends up holding the residues of the
    # LAST chain visited. Presumably the domain files are single-chain — TODO confirm.
    for model in structure:
        for chain in model:
            all_residues = list(chain)
    io = bio.PDBIO()
    # Re-house the 9-residue window in a fresh chain "A" and write it out.
    c = bio.Chain.Chain("A")
    c.child_list = all_residues[i:i+9]
    io.set_structure(c)
    io.save(f'{pre}{outFile}')
def getScore(data, km):
    """Euclidean distance between a sample and its assigned k-means center.

    Parameters
    ----------
    data : pandas.Series
        Feature vector with the assigned cluster index appended as the
        LAST element (everything before it is the feature vector itself).
    km : fitted clusterer
        Any object exposing `cluster_centers_` (e.g. MiniBatchKMeans).

    Returns
    -------
    float
        sqrt of the squared distance to the corresponding cluster center.
    """
    values = data.values
    cluster_idx = int(values[-1])
    center = km.cluster_centers_[cluster_idx]
    diff = center - values[:-1]
    return np.sqrt((diff ** 2).sum())
def getFromTerminal(CMD):
    """Run CMD in a shell, wait for it, and return its stdout decoded as text.

    stderr is not captured (it passes through to the notebook output).
    """
    completed = subprocess.run(CMD, shell=True, stdout=subprocess.PIPE)
    return completed.stdout.decode()
In [ ]:
# pre = "/Users/weilu/Research/optimization/fragment/"
# data_original = feather.read_dataframe(f"{pre}cluster100_v2.feather")
# os.system(f"mkdir -p {pre}center_cluster100_v2/origin/")
# os.system(f"mkdir -p {pre}center_cluster100_v2/pdbs/")
# os.system(f"mkdir -p {pre}center_cluster100_v2/gros/")
# center = data_original.groupby("cluster").head(1)
# for i, row in center.reset_index(drop=True).iterrows():
# print(i, row["pdb"], row["i"], row["cluster"])
# getFragPdb(row["pdb"], int(row["i"]), f"center_cluster100_v2/origin/{row['cluster']}.pdb")
# pre = "/Users/weilu/Research/optimization/fragment/center_cluster100_v2//"
# for i in range(100):
# os.system(f"python ~/opt/small_script/pdb_reres.py {pre}origin/{i}.pdb > {pre}pdbs/{i}.pdb")
# for i in range(100):
# os.system(f"python2 ~/opt/script/Pdb2Gro.py {pre}pdbs/{i}.pdb {pre}gros/{i}.gro")
In [ ]:
In [ ]:
# data_original = pd.read_csv("/Users/weilu/Research/optimization/fragment/data_jan31.csv")
# chosen = data_original.reset_index(drop=True)
# x = chosen.iloc[:, 3:87].values
# kmeans = MiniBatchKMeans(n_clusters=100,
# random_state=0,
# batch_size=200,
# max_iter=300,
# tol=1e4).fit(x)
# import pickle
# # pickle.dump(kmeans, open("/Users/weilu/Research/optimization/fragment/kmeans_cluster100_v2_2", "wb"))
# chosen["cluster"] = kmeans.labels_
# chosen["rmsd"] = chosen.iloc[:,3:88].apply(lambda x: getScore(x, kmeans), axis=1)
# reodered_chosen = chosen.sort_values(["cluster", "rmsd"])
# # reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster100_v2.feather")
# reodered_chosen.reset_index(drop=True).to_feather("/Users/weilu/Research/optimization/fragment/cluster100_v2_2.feather")
In [ ]:
In [ ]:
In [ ]:
In [4]:
# Sorted (cluster, rmsd) fragment table written by the commented-out k-means pipeline above.
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2_2.feather")
In [14]:
data.shape  # sanity check: (rows, cols) of the fragment table
Out[14]:
In [13]:
data["cluster"].value_counts()[71]  # number of fragments assigned to cluster 71 (labels are ints)
Out[13]:
In [12]:
data["cluster"].value_counts()  # cluster sizes, largest first
Out[12]:
In [8]:
# Cluster labels come from kmeans.labels_ and are integers, so the index key
# must be the int 71 — the original string key "71" raised a KeyError.
data["cluster"].value_counts()[71]
In [ ]: