In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
import matplotlib as mpl
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [2]:
plt.rcParams['figure.figsize'] = [16.18033, 10] #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
In [3]:
data = pd.read_csv("/Users/weilu/Research/data/survey_represent_x_com_complete.csv", index_col=0)
In [4]:
data_res = data.query("ResName == 'TYR'").reset_index(drop=True)
sns.jointplot(x="r1", y="r2", kind="kde", data=data_res)
# plt.show()
Out[4]:
In [5]:
import plotly.express as px
In [6]:
fig = px.scatter_3d(data_res, x='r1', y='r2', z='r3', color="r")
fig.show()
In [10]:
X = data_res[["r1", "r2", "r3"]].values
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
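KMeans seeds its centroids randomly, so the labels can change between runs; a minimal reproducibility sketch (the random_state and n_init values here are arbitrary choices):
In [ ]:
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X)  # same clustering on every run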
In [15]:
data_res["predicted"] = y_kmeans
In [45]:
fig = px.scatter_3d(data_res, x='r1', y='r2', z='r3', opacity=0.02)
fig.add_scatter3d(x=kmeans.cluster_centers_[:,0], y=kmeans.cluster_centers_[:,1], z=kmeans.cluster_centers_[:,2], mode="markers")
fig.show()
In [5]:
X = data_res[["r1", "r2", "r3"]].values
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3).fit(X)
labels = gmm.predict(X)
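The choice of n_components=3 above is ad hoc; a quick model-selection sketch comparing BIC across component counts (assuming BIC is an adequate criterion for this data):
In [ ]:
for k in range(1, 7):
    m = GaussianMixture(n_components=k, random_state=0).fit(X)
    print(k, m.bic(X))  # lower BIC indicates a better size/fit trade-off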
In [6]:
gmm.covariances_
Out[6]:
In [9]:
pc = gmm.precisions_cholesky_
In [15]:
1/gmm.covariances_
Out[15]:
In [14]:
pc[0].dot(pc[0].T)
Out[14]:
In [18]:
pc[0].T.dot(pc[0])
Out[18]:
In [17]:
pc
Out[17]:
In [16]:
gmm.precisions_
Out[16]:
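The two products above resolve the ambiguity: for the default covariance_type='full', precisions_cholesky_ holds a matrix L_k with precision_k = L_k @ L_k.T, so pc[0].dot(pc[0].T) reproduces precisions_[0] while pc[0].T.dot(pc[0]) does not in general (and 1/gmm.covariances_ is only an elementwise reciprocal, not a matrix inverse). A minimal check:
In [ ]:
assert np.allclose(pc[0].dot(pc[0].T), gmm.precisions_[0])
assert np.allclose(np.linalg.inv(gmm.precisions_[0]), gmm.covariances_[0])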
In [223]:
gmm.means_
Out[223]:
In [228]:
x = np.array([[1,2,3]]).T
m = gmm.means_[0]
sigma = gmm.covariances_[0]
In [232]:
gmm.precisions_cholesky_
Out[232]:
In [ ]:
# Reference code pasted from scikit-learn's mixture module
# (sklearn/mixture/_base.py and _gaussian_mixture.py) for inspection;
# the functions taking `self` are methods on BaseMixture / GaussianMixture.
def _estimate_log_prob(self, X):
    return _estimate_log_gaussian_prob(
        X, self.means_, self.precisions_cholesky_, self.covariance_type)

# Gaussian mixture probability estimators
def _compute_log_det_cholesky(matrix_chol, covariance_type, n_features):
    """Compute the log-det of the cholesky decomposition of matrices.

    Parameters
    ----------
    matrix_chol : array-like
        Cholesky decompositions of the matrices.
        'full' : shape of (n_components, n_features, n_features)
        'tied' : shape of (n_features, n_features)
        'diag' : shape of (n_components, n_features)
        'spherical' : shape of (n_components,)

    covariance_type : {'full', 'tied', 'diag', 'spherical'}

    n_features : int
        Number of features.

    Returns
    -------
    log_det_precision_chol : array-like, shape (n_components,)
        The determinant of the precision matrix for each component.
    """
    if covariance_type == 'full':
        n_components, _, _ = matrix_chol.shape
        log_det_chol = (np.sum(np.log(
            matrix_chol.reshape(
                n_components, -1)[:, ::n_features + 1]), 1))
    elif covariance_type == 'tied':
        log_det_chol = (np.sum(np.log(np.diag(matrix_chol))))
    elif covariance_type == 'diag':
        log_det_chol = (np.sum(np.log(matrix_chol), axis=1))
    else:
        log_det_chol = n_features * (np.log(matrix_chol))
    return log_det_chol

def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type):
    """Estimate the log Gaussian probability.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    means : array-like, shape (n_components, n_features)

    precisions_chol : array-like
        Cholesky decompositions of the precision matrices.
        'full' : shape of (n_components, n_features, n_features)
        'tied' : shape of (n_features, n_features)
        'diag' : shape of (n_components, n_features)
        'spherical' : shape of (n_components,)

    covariance_type : {'full', 'tied', 'diag', 'spherical'}

    Returns
    -------
    log_prob : array, shape (n_samples, n_components)
    """
    n_samples, n_features = X.shape
    n_components, _ = means.shape
    # det(precision_chol) is half of det(precision)
    log_det = _compute_log_det_cholesky(
        precisions_chol, covariance_type, n_features)

    if covariance_type == 'full':
        log_prob = np.empty((n_samples, n_components))
        for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
            y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)
    elif covariance_type == 'tied':
        log_prob = np.empty((n_samples, n_components))
        for k, mu in enumerate(means):
            y = np.dot(X, precisions_chol) - np.dot(mu, precisions_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)
    elif covariance_type == 'diag':
        precisions = precisions_chol ** 2
        log_prob = (np.sum((means ** 2 * precisions), 1) -
                    2. * np.dot(X, (means * precisions).T) +
                    np.dot(X ** 2, precisions.T))
    elif covariance_type == 'spherical':
        precisions = precisions_chol ** 2
        log_prob = (np.sum(means ** 2, 1) * precisions -
                    2 * np.dot(X, means.T * precisions) +
                    np.outer(row_norms(X, squared=True), precisions))
    return -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det

def score_samples(self, X):
    """Compute the weighted log probabilities for each sample.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.

    Returns
    -------
    log_prob : array, shape (n_samples,)
        Log probabilities of each data point in X.
    """
    check_is_fitted(self)
    X = _check_X(X, None, self.means_.shape[1])

    return logsumexp(self._estimate_weighted_log_prob(X), axis=1)

def _estimate_weighted_log_prob(self, X):
    """Estimate the weighted log-probabilities, log P(X | Z) + log weights.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Returns
    -------
    weighted_log_prob : array, shape (n_samples, n_component)
    """
    return self._estimate_log_prob(X) + self._estimate_log_weights()

def _estimate_log_prob(self, X):
    """Estimate the log-probabilities log P(X | Z).

    Compute the log-probabilities per each component for each sample.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Returns
    -------
    log_prob : array, shape (n_samples, n_component)
    """
    pass

def _estimate_log_weights(self):
    """Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm.

    Returns
    -------
    log_weight : array, shape (n_components, )
    """
    pass

def _estimate_log_weights(self):
    return np.log(self.weights_)
In [ ]:
def _compute_log_det_cholesky(matrix_chol, covariance_type, n_features):
    """Compute the log-det of the cholesky decomposition of matrices.

    Parameters
    ----------
    matrix_chol : array-like
        Cholesky decompositions of the matrices.
        'full' : shape of (n_components, n_features, n_features)
        'tied' : shape of (n_features, n_features)
        'diag' : shape of (n_components, n_features)
        'spherical' : shape of (n_components,)

    covariance_type : {'full', 'tied', 'diag', 'spherical'}

    n_features : int
        Number of features.

    Returns
    -------
    log_det_precision_chol : array-like, shape (n_components,)
        The determinant of the precision matrix for each component.
    """
    if covariance_type == 'full':
        n_components, _, _ = matrix_chol.shape
        log_det_chol = (np.sum(np.log(
            matrix_chol.reshape(
                n_components, -1)[:, ::n_features + 1]), 1))
    return log_det_chol
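The slicing trick in the 'full' branch works because a row-major flatten of an (n_features, n_features) matrix places its diagonal entries every n_features + 1 positions. A tiny check:
In [ ]:
M = np.arange(9.0).reshape(1, 3, 3)
# stride n_features + 1 = 4 picks out the diagonal of each flattened matrix
assert np.allclose(M.reshape(1, -1)[:, ::4], np.diagonal(M, axis1=1, axis2=2))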
In [253]:
gmm.means_.shape
Out[253]:
In [299]:
gmm.means_
Out[299]:
In [300]:
gmm.score_samples(gmm.means_)
Out[300]:
In [301]:
gmm.weights_
Out[301]:
In [283]:
precisions_chol = gmm.precisions_cholesky_
means = gmm.means_
log_prob = np.empty((n_samples, n_components))
for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
    y = np.dot(x, prec_chol) - np.dot(mu, prec_chol)
    log_prob[:, k] = np.sum(np.square(y), axis=1)
In [284]:
-.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det
Out[284]:
In [269]:
log_prob[0]
Out[269]:
In [271]:
y = np.dot(x, precisions_chol[0]) - np.dot(gmm.means_[0], precisions_chol[0])
np.sum(np.square(y), axis=1)
Out[271]:
In [360]:
x = np.array([[5.22568112, 3.87951258, 4.01458734]])
x = np.array([[4.7, 3.9, 8]])
x = np.array([[4.7, 3.9, 7]])  # successive trial points; only this last x is used
In [349]:
X.max(axis=0)
Out[349]:
In [365]:
x = X.max(axis=0).reshape(1,3)
# x = np.array([[5.22568112, 3.87951258, 4.01458734]])
In [377]:
means = gmm.means_
In [366]:
n_samples, n_features = x.shape
matrix_chol = gmm.precisions_cholesky_
n_components, _ = gmm.means_.shape
n_components, _, _ = matrix_chol.shape
log_det_chol = (np.sum(np.log(
    matrix_chol.reshape(
        n_components, -1)[:, ::n_features + 1]), 1))
log_det = log_det_chol
mean_dot_precisions_chol = np.zeros((3, 3))
log_prob = np.zeros(3)
for i in range(n_components):
    mean_dot_precisions_chol[i] = np.dot(gmm.means_[i], precisions_chol[i])
    y = np.dot(x, precisions_chol[i]) - mean_dot_precisions_chol[i]
    log_prob[i] = np.sum(np.square(y))
log_gaussian_prob = -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det
score = np.log(np.sum(np.exp(log_gaussian_prob + np.log(gmm.weights_))))
kt = 1
E_side_chain = -score * kt
print(E_side_chain)
In [383]:
def compute_side_chain_energy_for_x(x, means, precisions_chol, log_det):
    n_samples, n_features = x.shape
    n_components, _ = means.shape
    mean_dot_precisions_chol = np.zeros((n_components, n_features))
    log_prob = np.zeros(n_components)
    for i in range(n_components):
        mean_dot_precisions_chol[i] = np.dot(means[i], precisions_chol[i])
        y = np.dot(x, precisions_chol[i]) - mean_dot_precisions_chol[i]
        log_prob[i] = np.sum(np.square(y))
    log_gaussian_prob = -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det
    # note: gmm.weights_ is still read from the enclosing scope
    score = np.log(np.sum(np.exp(log_gaussian_prob + np.log(gmm.weights_))))
    kt = 1
    E_side_chain = -score * kt
    # print(E_side_chain)
    return E_side_chain
In [384]:
compute_side_chain_energy_for_x(x, means, precisions_chol, log_det)
Out[384]:
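As a sanity check (a sketch, assuming the same fitted gmm and the log_det computed above), the hand-rolled energy should agree with sklearn's log-density up to sign, since E = -kT log p(x) with kT = 1:
In [ ]:
E_manual = compute_side_chain_energy_for_x(x, means, precisions_chol, log_det)
assert np.isclose(E_manual, -gmm.score_samples(x)[0])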
In [385]:
x
Out[385]:
In [367]:
log_gaussian_prob + np.log(gmm.weights_)
Out[367]:
In [373]:
np.log(np.sum(np.exp(np.array([0,-10, -10]))))
Out[373]:
In [374]:
np.log(np.sum(np.exp(np.array([0,0, -10]))))
Out[374]:
In [372]:
np.exp(np.array([1e-3,1e-10, 1e-10]))
Out[372]:
In [370]:
np.sum(np.exp(np.array([1e-3,1e-10, 1e-10])))
Out[370]:
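These experiments show why the naive np.log(np.sum(np.exp(...))) is fragile: once every term is very negative (below about -745), np.exp underflows to 0.0 and the result is -inf. sklearn's score_samples avoids this by using scipy's stable logsumexp:
In [ ]:
from scipy.special import logsumexp
a = np.array([-1000., -1001., -1002.])
print(np.log(np.sum(np.exp(a))))  # -inf (underflow), with a runtime warning
print(logsumexp(a))               # about -999.59, computed stably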
In [344]:
log_gaussian_prob + np.log(gmm.weights_)
Out[344]:
In [323]:
log_gaussian_prob
Out[323]:
In [310]:
log_gaussian_prob + np.log(gmm.weights_)
Out[310]:
In [316]:
precisions_chol
Out[316]:
In [313]:
np.log(np.sum(np.exp(np.array([0,0,-40]))))
Out[313]:
In [298]:
log_gaussian_prob + np.log(gmm.weights_)
Out[298]:
In [293]:
np.sum(np.exp(log_gaussian_prob + np.log(gmm.weights_)))
Out[293]:
In [281]:
log_prob
Out[281]:
In [274]:
np.sum(np.square(y), axis=1)
Out[274]:
In [270]:
log_prob
Out[270]:
In [ ]:
n_components, _, _ = matrix_chol.shape
log_det_chol = (np.sum(np.log(
    matrix_chol.reshape(
        n_components, -1)[:, ::n_features + 1]), 1))
In [235]:
x = np.array([[1,2,3]])
In [240]:
from scipy.special import logsumexp
In [241]:
logsumexp(gmm._estimate_log_prob(x) + gmm._estimate_log_weights(), axis=1)
Out[241]:
In [248]:
np.log(np.sum(np.exp(gmm._estimate_log_prob(x) + np.log(gmm.weights_))))
Out[248]:
In [291]:
np.log(np.sum(np.exp(log_gaussian_prob + np.log(gmm.weights_))))
Out[291]:
In [290]:
np.log(gmm.weights_)
Out[290]:
In [249]:
gmm._estimate_log_prob(x)
Out[249]:
In [250]:
np.log(gmm.weights_)
Out[250]:
In [245]:
np.exp(gmm._estimate_log_prob(x) + gmm._estimate_log_weights())
Out[245]:
In [242]:
gmm.score([[1,2,3]])
Out[242]:
In [ ]:
log_prob = np.empty((n_samples, n_components))
for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
    y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
    log_prob[:, k] = np.sum(np.square(y), axis=1)
In [231]:
(x - m.reshape(3, 1)).T.dot(np.linalg.inv(sigma)).dot(x - m.reshape(3, 1))  # reshape m to a column so this is the scalar Mahalanobis form
Out[231]:
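This quadratic form is exactly what the Cholesky trick inside _estimate_log_gaussian_prob computes: with U = precisions_cholesky_[0], U @ U.T = inv(sigma), so the squared norm of (x - m) projected through U equals the Mahalanobis form. A minimal check (x is the (3, 1) column defined above):
In [ ]:
U = gmm.precisions_cholesky_[0]
d = x.flatten() - m
y = np.dot(d, U)
assert np.allclose(np.sum(np.square(y)), d.dot(np.linalg.inv(sigma)).dot(d))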
In [230]:
np.linalg.inv(sigma)
Out[230]:
In [229]:
sigma.shape
Out[229]:
In [227]:
x.shape
Out[227]:
In [219]:
sampled = gmm.sample(10000)[0]
sampled = pd.DataFrame(sampled, columns=["r1", "r2", "r3"])
In [220]:
fig = px.scatter_3d(sampled, x='r1', y='r2', z='r3', opacity=0.02)
fig.show()
In [209]:
sns.jointplot(x="r1", y="r2", kind="kde", data=sampled)
Out[209]:
In [188]:
sns.jointplot(x="r1", y="r2", kind="kde", data=data_res)
Out[188]:
In [48]:
data_lys = data.query("ResName == 'LYS'").reset_index(drop=True)
sns.jointplot(x="r1", y="r2", kind="kde", data=data_lys)
# plt.show()
Out[48]:
In [50]:
data_lys = data.query("ResName == 'LYS'").reset_index(drop=True)
sns.jointplot(x="r1", y="r3", kind="kde", data=data_lys)
# plt.show()
Out[50]:
In [58]:
X_lys = data_lys[["r1", "r2", "r3"]].values
from sklearn.cluster import KMeans
kmeans_lys = KMeans(n_clusters=3)
kmeans_lys.fit(X_lys)  # fit and predict on the same LYS data
y_kmeans = kmeans_lys.predict(X_lys)
In [165]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3).fit(X_lys)
labels = gmm.predict(X_lys)
min_x = X_lys.min(axis=0)
max_x = X_lys.max(axis=0)
n = 50
r1 = np.linspace(min_x[0], max_x[0], n)
r2 = np.linspace(min_x[1], max_x[1], n)
r3 = np.linspace(min_x[2], max_x[2], n)
xx, yy, zz = np.meshgrid(r1, r2, r3)
d_ = np.zeros((n**3, 4))
d_[:,0] = xx.flatten()
d_[:,1] = yy.flatten()
d_[:,2] = zz.flatten()
d_[:,3] = gmm.score_samples(d_[:,:3])
d = pd.DataFrame(d_, columns=["r1", "r2", "r3", "score"])
d_filtered = d.query("score > 0")
fig = px.scatter_3d(d_filtered, x='r1', y='r2', z='r3', color="score")
fig.show()
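score_samples returns the log of the fitted probability density, so the score > 0 filter keeps only grid points where the density exceeds e^0 = 1; a quick look at how selective that is:
In [ ]:
print(len(d_filtered), "of", len(d), "grid points have density > 1")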
In [136]:
plt.hist(gmm.score_samples(X_lys), bins=50)
Out[136]:
In [134]:
gmm.score_samples(X_lys)
Out[134]:
In [118]:
prob = np.array(score_list).reshape(100, 100, 100)  # matches the 100-point grid used for score_list
In [ ]:
In [112]:
plt.imshow(prob[:, :, prob.shape[2] // 2])  # imshow needs a 2D array; show the middle r3 slice
plt.clim([-5, 0])
plt.colorbar()
Out[112]:
In [ ]:
r1 = np.linspace(2, 5, 100)
r2 = np.linspace(2, 5, 100)
r3 = np.linspace(2, 5, 100)
score_list = []
for r1_ in r1:
    for r2_ in r2:
        for r3_ in r3:
            score = gmm.score([[r1_, r2_, r3_]])
            score_list.append(score)
In [88]:
gmm.score([[1,2,3]])
Out[88]:
In [89]:
gmm.predict_proba([[1,2,3]])
Out[89]:
In [63]:
data_lys["predicted"] = labels
In [71]:
from scipy import stats
kernel = stats.gaussian_kde(X_lys.T)  # gaussian_kde expects data with shape (n_dims, n_samples)
In [74]:
kernel.weights
Out[74]:
In [75]:
from sklearn.neighbors import KernelDensity
In [76]:
kde = KernelDensity()
In [77]:
kde.fit(X_lys)
Out[77]:
In [85]:
kde.tree_
Out[85]:
In [79]:
kde.score([[1,2,3]])
Out[79]:
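KernelDensity defaults to bandwidth=1.0, which strongly shapes the estimate; a minimal cross-validated bandwidth search (the grid range here is an ad-hoc assumption):
In [ ]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KernelDensity(), {"bandwidth": np.logspace(-1, 0, 10)}, cv=5)
grid.fit(X_lys)
print(grid.best_params_)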
In [59]:
fig = px.scatter_3d(data_lys, x='r1', y='r2', z='r3', opacity=0.02)
fig.add_scatter3d(x=kmeans_lys.cluster_centers_[:,0], y=kmeans_lys.cluster_centers_[:,1], z=kmeans_lys.cluster_centers_[:,2], mode="markers")
fig.show()
In [53]:
fig = px.scatter_3d(data_lys, x='r1', y='r2', z='r3', opacity=0.1)
fig.show()
In [55]:
data_ala = data.query("ResName == 'ALA'").reset_index(drop=True)
sns.jointplot(x="r1", y="r2", kind="kde", data=data_ala)
# plt.show()
Out[55]:
In [56]:
fig = px.scatter_3d(data_ala, x='r1', y='r2', z='r3', color="r")
fig.show()
In [57]:
fig = px.scatter_3d(data_ala, x='r1', y='r2', z='r3', opacity=0.1)
fig.show()
In [23]:
res_set = {}
res_set['GLY'] = {'CA', 'N', 'O', 'C'}
res_set['ALA'] = {'CA', 'CB', 'N', 'O', 'C'}
res_set['VAL'] = {'CA', 'CB', 'CG2', 'CG1', 'N', 'O', 'C'}
res_set['CYS'] = {'CA', 'CB', 'SG', 'N', 'O', 'C'}
res_set['PRO'] = {'CA', 'CB', 'CG', 'CD', 'N', 'O', 'C'}
res_set['LEU'] = {'CD1', 'CA', 'CD2', 'CB', 'CG', 'N', 'O', 'C'}
res_set['ILE'] = {'CD1', 'CA', 'CB', 'CG2', 'CG1', 'N', 'O', 'C'}
res_set['MET'] = {'CA', 'CB', 'SD', 'CG', 'CE', 'N', 'O', 'C'}
res_set['TRP'] = {'CD1', 'CA', 'CZ3', 'NE1', 'CD2', 'CB', 'CG', 'CZ2', 'CH2', 'CE3', 'N', 'O', 'CE2', 'C'}
res_set['PHE'] = {'CD1', 'CA', 'CD2', 'CB', 'CE1', 'CG', 'CZ', 'N', 'O', 'CE2', 'C'}
res_set['SER'] = {'CA', 'OG', 'CB', 'N', 'O', 'C'}
res_set['THR'] = {'CA', 'CB', 'CG2', 'N', 'OG1', 'O', 'C'}
res_set['TYR'] = {'CD1', 'CA', 'CD2', 'CB', 'CE1', 'OH', 'CG', 'CZ', 'N', 'O', 'CE2', 'C'}
res_set["GLN"] = {'CA', 'CB', 'CG', 'CD', 'OE1', 'NE2', 'N', 'O', 'C'}
res_set['ASN'] = {'CA', 'OD1', 'CB', 'CG', 'ND2', 'N', 'O', 'C'}
res_set['LYS'] = {'CA', 'CB', 'CE', 'CG', 'CD', 'NZ', 'N', 'O', 'C'}
res_set['ARG'] = {'CA', 'NE', 'CB', 'CG', 'CD', 'NH2', 'CZ', 'NH1', 'N', 'O', 'C'}
res_set['HIS'] = {'CA', 'ND1', 'CD2', 'CB', 'CE1', 'CG', 'NE2', 'N', 'O', 'C'}
res_set['ASP'] = {'OD2', 'CA', 'OD1', 'CB', 'CG', 'N', 'O', 'C'}
res_set['GLU'] = {'CA', 'OE2', 'CB', 'CG', 'CD', 'OE1', 'N', 'O', 'C'}
def dis(a, b):
    return ((a[0]-b[0])**2 + (a[1]-b[1])**2 + (a[2]-b[2])**2)**0.5

def get_side_chain_center_of_mass(atoms):
    # ensure complete first
    total = np.array([0., 0., 0.])
    total_mass = 0
    for atom in atoms:
        if atom.get_name() in ["N", "CA", "C", "O"]:
            continue
        if atom.element == "H":
            continue
        total += atom.mass * atom.get_coord()
        total_mass += atom.mass
        # print(atom.get_name(), atom.get_coord())
    x_com = total / total_mass
    return x_com

def get_all_non_H_atoms(res):
    atoms = res.get_atoms()
    name_list = []
    for atom in atoms:
        name = atom.get_name()
        if atom.element == "H":
            continue
        name_list.append(name)
    return set(name_list)

def get_x_more_check(res, res_set=res_set):
    resName = res.get_resname()
    if resName == "GLY":
        x_com = res["CA"].get_coord()
    else:
        set_of_all_non_H_atoms = get_all_non_H_atoms(res)
        if set_of_all_non_H_atoms != res_set[resName]:
            print("atom not complete", set_of_all_non_H_atoms, "should be ", res_set[resName])
        x_com = get_side_chain_center_of_mass(res.get_atoms())
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()
    A = np.zeros((3,3))
    A[:,0] = n
    A[:,1] = ca
    A[:,2] = c
    A_inv = np.linalg.inv(A)
    x = A_inv.dot(x_com)
    return x

def get_x(res):
    x_com = get_side_chain_center_of_mass(res.get_atoms())
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()
    A = np.zeros((3,3))
    A[:,0] = n
    A[:,1] = ca
    A[:,2] = c
    A_inv = np.linalg.inv(A)
    x = A_inv.dot(x_com)
    return x

def get_x_with_regularization(res, alpha=0.001):
    x_com = get_side_chain_center_of_mass(res.get_atoms())
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()
    A = np.zeros((3,3))
    A[:,0] = n
    A[:,1] = ca
    A[:,2] = c
    # A_inv = np.linalg.inv(A)
    # x = A_inv.dot(x_com)
    # return x
    A_inv_regularized = np.linalg.inv(A.T.dot(A) + alpha*np.eye(3)).dot(A.T)
    x_regularized = A_inv_regularized.dot(x_com)
    r = dis(A.dot(x_regularized), x_com)
    return x_regularized, r

def get_x_vector_based(res, alpha=0.001):
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()
    x_com = get_side_chain_center_of_mass(res.get_atoms())
    ca_x_com = x_com - ca
    n_ca = ca - n
    ca_c = c - ca
    x3 = np.cross(n_ca, ca_c)
    n_ca_unit = n_ca / np.linalg.norm(n_ca)
    ca_c_unit = ca_c / np.linalg.norm(ca_c)
    x3_unit = x3 / np.linalg.norm(x3)
    A = np.zeros((3,3))
    A[:,0] = n_ca_unit
    A[:,1] = ca_c_unit
    A[:,2] = x3_unit
    A_inv = np.linalg.inv(A)
    x = A_inv.dot(ca_x_com)
    return x, 0

def get_x_distance_based(res, alpha=0.001):
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()
    x_com = get_side_chain_center_of_mass(res.get_atoms())
    x = np.array([dis(x_com, n), dis(x_com, ca), dis(x_com, c)])
    return x, dis(ca, x_com)
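A minimal usage sketch for these parametrizations (the PDB path is hypothetical; any structure with complete backbone atoms, parsed with the Bio.PDB parser imported above, will do):
In [ ]:
parser = PDBParser(QUIET=True)
structure = parser.get_structure("x", "example.pdb")  # hypothetical file
res = next(r for r in structure.get_residues() if r.get_resname() != "GLY")
x_dist, r = get_x_distance_based(res)  # (|x-N|, |x-CA|, |x-C|) plus the CA distance
x_vec, _ = get_x_vector_based(res)     # coordinates in the local backbone frame
print(x_dist, r, x_vec)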
In [ ]:
import glob
a_list = glob.glob("/Users/weilu/Research/server/feb_2020/energy_evaluations_and_database_survey/my_CATH_database/*.pdb")
In [ ]:
%%time
parser = PDBParser(QUIET=True)  # Bio.PDB parser for the survey
info = []
skipped_residues_count = 0
# for i, a in enumerate(a_list[:500]):
for i, a in enumerate(a_list):
    pdb = a.split("/")[-1].split(".")[0]
    structure = parser.get_structure("x", a)
    for res in structure.get_residues():
        resName = res.get_resname()
        set_of_all_non_H_atoms = get_all_non_H_atoms(res)
        if resName == "GLY":
            continue
        if set_of_all_non_H_atoms != res_set[resName]:
            skipped_residues_count += 1
            # print("skip", resName, res.get_id(), "atom not complete", set_of_all_non_H_atoms, "should be ", res_set[resName])
            continue
        x, r = get_x_distance_based(res, alpha=0.001)
        info.append([pdb, res.id[1], resName, x[0], x[1], x[2], r])
print("skipped_residues_count", skipped_residues_count)
In [ ]:
X = data_res[["r1", "r2", "r3"]].values
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3).fit(X)
labels = gmm.predict(X)
min_x = X.min(axis=0)
max_x = X.max(axis=0)
n = 50
r1 = np.linspace(min_x[0], max_x[0], n)
r2 = np.linspace(min_x[1], max_x[1], n)
r3 = np.linspace(min_x[2], max_x[2], n)
xx, yy, zz = np.meshgrid(r1, r2, r3)
d_ = np.zeros((n**3, 4))
d_[:,0] = xx.flatten()
d_[:,1] = yy.flatten()
d_[:,2] = zz.flatten()
d_[:,3] = gmm.score_samples(d_[:,:3])
d = pd.DataFrame(d_, columns=["r1", "r2", "r3", "score"])
d_filtered = d.query("score > 0")
fig = px.scatter_3d(d_filtered, x='r1', y='r2', z='r3', color="score")
fig.show()