In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir

import matplotlib.pyplot as plt
import matplotlib as mpl
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions

from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180)    #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2

In [2]:
plt.rcParams['figure.figsize'] = [16.18033, 10]    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100

In [3]:
data = pd.read_csv("/Users/weilu/Research/data/survey_represent_x_com_complete.csv", index_col=0)

In [4]:
data_res = data.query("ResName == 'TYR'").reset_index(drop=True)
sns.jointplot("r1", "r2", kind="kde", data=data_res)
# plt.show()


Out[4]:
<seaborn.axisgrid.JointGrid at 0x1a1f12df60>

In [5]:
import plotly.express as px

In [6]:
fig = px.scatter_3d(data_res, x='r1', y='r2', z='r3', color="r")
fig.show()



In [10]:
X = data_res[["r1", "r2", "r3"]].values

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
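
A quick, optional sanity check on the choice of n_clusters=3 (not part of the original analysis): sweep the KMeans inertia over a few cluster counts and look for an elbow near 3.

In [ ]:
# exploratory sketch: inertia vs. number of clusters for the TYR data X
inertias = [KMeans(n_clusters=k, n_init=10).fit(X).inertia_ for k in range(1, 7)]
plt.plot(range(1, 7), inertias, "o-")
plt.xlabel("n_clusters")
plt.ylabel("inertia")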

In [15]:
data_res["predicted"] = y_kmeans

In [45]:
fig = px.scatter_3d(data_res, x='r1', y='r2', z='r3', opacity=0.02)
fig.add_scatter3d(x=kmeans.cluster_centers_[:,0], y=kmeans.cluster_centers_[:,1], z=kmeans.cluster_centers_[:,2])
fig.show()



In [5]:
X = data_res[["r1", "r2", "r3"]].values
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3).fit(X)
labels = gmm.predict(X)

In [6]:
gmm.covariances_


Out[6]:
array([[[ 0.00289093,  0.00220976,  0.00098427],
        [ 0.00220976,  0.0030833 ,  0.00477244],
        [ 0.00098427,  0.00477244,  0.03741773]],

       [[ 0.03945493,  0.00484687, -0.00370831],
        [ 0.00484687,  0.00319935,  0.00227712],
        [-0.00370831,  0.00227712,  0.00429153]],

       [[ 0.06486049,  0.00546383, -0.04913508],
        [ 0.00546383,  0.00289318,  0.00341033],
        [-0.04913508,  0.00341033,  0.07603875]]])

In [9]:
pc = gmm.precisions_cholesky_

In [15]:
1/gmm.covariances_


Out[15]:
array([[[ 345.90946024,  452.53879933, 1015.97861812],
        [ 452.53879933,  324.32787309,  209.53631185],
        [1015.97861812,  209.53631185,   26.72529802]],

       [[  25.34537362,  206.31882585, -269.66487862],
        [ 206.31882585,  312.56365561,  439.15160653],
        [-269.66487862,  439.15160653,  233.01693996]],

       [[  15.41770569,  183.02169154,  -20.35205635],
        [ 183.02169154,  345.63987873,  293.22691319],
        [ -20.35205635,  293.22691319,   13.15118987]]])

In [14]:
pc[0].dot(pc[0].T)


Out[14]:
array([[ 901.20971099, -759.03861429,   73.10523653],
       [-759.03861429, 1043.40177074, -113.11412853],
       [  73.10523653, -113.11412853,   39.22940183]])

In [18]:
pc[0].T.dot(pc[0])


Out[18]:
array([[ 345.90946024, -380.73482187,  217.08202464],
       [-380.73482187, 1136.31460705, -722.60357473],
       [ 217.08202464, -722.60357473,  501.61681627]])

In [17]:
pc


Out[17]:
array([[[ 18.59864135, -20.4711094 ,  11.67192918],
        [  0.        ,  26.7814915 , -18.05971993],
        [  0.        ,   0.        ,   6.26333791]],

       [[  5.0344189 ,  -2.40738223,   6.79789195],
        [  0.        ,  19.59680227, -32.00419269],
        [  0.        ,   0.        ,  30.49647854]],

       [[  3.92653864,  -1.70786655,   8.21358263],
        [  0.        ,  20.27387412, -25.01316343],
        [  0.        ,   0.        ,   8.06082409]]])

In [16]:
gmm.precisions_


Out[16]:
array([[[ 901.20971099, -759.03861429,   73.10523653],
        [-759.03861429, 1043.40177074, -113.11412853],
        [  73.10523653, -113.11412853,   39.22940183]],

       [[  77.35219785, -264.7380376 ,  207.31176607],
        [-264.7380376 , 1408.3030094 , -976.01517562],
        [ 207.31176607, -976.01517562,  930.0352032 ]],

       [[  85.79745344, -240.07275608,   66.2082447 ],
        [-240.07275608, 1036.68831688, -201.62671028],
        [  66.2082447 , -201.62671028,   64.97688496]]])
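
The three probes above fit together as follows: for covariance_type='full', sklearn stores precisions_cholesky_ such that precisions_[k] equals pc[k] @ pc[k].T and also inv(covariances_[k]), while the elementwise 1/gmm.covariances_ is not a matrix inverse. A quick check (a sketch, assuming the fitted gmm and pc from the cells above):

In [ ]:
# verify: precisions_[k] == pc[k] @ pc[k].T == inv(covariances_[k]);
# 1/gmm.covariances_ is elementwise and is NOT the matrix inverse
for k in range(gmm.n_components):
    assert np.allclose(gmm.precisions_[k], pc[k].dot(pc[k].T))
    assert np.allclose(gmm.precisions_[k], np.linalg.inv(gmm.covariances_[k]))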

In [223]:
gmm.means_


Out[223]:
array([[5.22568112, 3.87951258, 4.01458734],
       [4.1477059 , 3.87660711, 5.26290152],
       [4.15129429, 3.9084444 , 4.11559124]])

In [228]:
x = np.array([[1,2,3]]).T
m = gmm.means_[0]
sigma = gmm.covariances_[0]

In [232]:
gmm.precisions_cholesky_


Out[232]:
array([[[ 18.59864135, -20.4711094 ,  11.67192918],
        [  0.        ,  26.7814915 , -18.05971993],
        [  0.        ,   0.        ,   6.26333791]],

       [[  5.0344189 ,  -2.40738223,   6.79789195],
        [  0.        ,  19.59680227, -32.00419269],
        [  0.        ,   0.        ,  30.49647854]],

       [[  3.92653864,  -1.70786655,   8.21358263],
        [  0.        ,  20.27387412, -25.01316343],
        [  0.        ,   0.        ,   8.06082409]]])

In [ ]:
# reference paste from sklearn.mixture, kept to trace how score_samples
# is computed; _estimate_log_gaussian_prob and the helpers below are
# copied from sklearn/mixture/_gaussian_mixture.py (row_norms, used in
# the 'spherical' branch, comes from sklearn.utils.extmath)
def _estimate_log_prob(self, X):
    return _estimate_log_gaussian_prob(
        X, self.means_, self.precisions_cholesky_, self.covariance_type)

# Gaussian mixture probability estimators
def _compute_log_det_cholesky(matrix_chol, covariance_type, n_features):
    """Compute the log-det of the cholesky decomposition of matrices.
    Parameters
    ----------
    matrix_chol : array-like
        Cholesky decompositions of the matrices.
        'full' : shape of (n_components, n_features, n_features)
        'tied' : shape of (n_features, n_features)
        'diag' : shape of (n_components, n_features)
        'spherical' : shape of (n_components,)
    covariance_type : {'full', 'tied', 'diag', 'spherical'}
    n_features : int
        Number of features.
    Returns
    -------
    log_det_precision_chol : array-like, shape (n_components,)
        The determinant of the precision matrix for each component.
    """
    if covariance_type == 'full':
        n_components, _, _ = matrix_chol.shape
        log_det_chol = (np.sum(np.log(
            matrix_chol.reshape(
                n_components, -1)[:, ::n_features + 1]), 1))

    elif covariance_type == 'tied':
        log_det_chol = (np.sum(np.log(np.diag(matrix_chol))))

    elif covariance_type == 'diag':
        log_det_chol = (np.sum(np.log(matrix_chol), axis=1))

    else:
        log_det_chol = n_features * (np.log(matrix_chol))

    return log_det_chol

def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type):
    """Estimate the log Gaussian probability.
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    means : array-like, shape (n_components, n_features)
    precisions_chol : array-like
        Cholesky decompositions of the precision matrices.
        'full' : shape of (n_components, n_features, n_features)
        'tied' : shape of (n_features, n_features)
        'diag' : shape of (n_components, n_features)
        'spherical' : shape of (n_components,)
    covariance_type : {'full', 'tied', 'diag', 'spherical'}
    Returns
    -------
    log_prob : array, shape (n_samples, n_components)
    """
    n_samples, n_features = X.shape
    n_components, _ = means.shape
    # the log-det of precision_chol is half the log-det of precision
    log_det = _compute_log_det_cholesky(
        precisions_chol, covariance_type, n_features)

    if covariance_type == 'full':
        log_prob = np.empty((n_samples, n_components))
        for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
            y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)

    elif covariance_type == 'tied':
        log_prob = np.empty((n_samples, n_components))
        for k, mu in enumerate(means):
            y = np.dot(X, precisions_chol) - np.dot(mu, precisions_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)

    elif covariance_type == 'diag':
        precisions = precisions_chol ** 2
        log_prob = (np.sum((means ** 2 * precisions), 1) -
                    2. * np.dot(X, (means * precisions).T) +
                    np.dot(X ** 2, precisions.T))

    elif covariance_type == 'spherical':
        precisions = precisions_chol ** 2
        log_prob = (np.sum(means ** 2, 1) * precisions -
                    2 * np.dot(X, means.T * precisions) +
                    np.outer(row_norms(X, squared=True), precisions))
    return -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det

def score_samples(self, X):
    """Compute the weighted log probabilities for each sample.
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.
    Returns
    -------
    log_prob : array, shape (n_samples,)
        Log probabilities of each data point in X.
    """
    check_is_fitted(self)
    X = _check_X(X, None, self.means_.shape[1])

    return logsumexp(self._estimate_weighted_log_prob(X), axis=1)


def _estimate_weighted_log_prob(self, X):
    """Estimate the weighted log-probabilities, log P(X | Z) + log weights.
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    Returns
    -------
    weighted_log_prob : array, shape (n_samples, n_components)
    """
    return self._estimate_log_prob(X) + self._estimate_log_weights()

# abstract in BaseMixture; GaussianMixture overrides it with the
# _estimate_log_prob shown at the top of this cell
def _estimate_log_prob(self, X):
    """Estimate the log-probabilities log P(X | Z).
    Compute the log-probabilities per each component for each sample.
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    Returns
    -------
    log_prob : array, shape (n_samples, n_components)
    """
    pass

# GaussianMixture's concrete implementation
def _estimate_log_weights(self):
    """Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm.
    Returns
    -------
    log_weight : array, shape (n_components, )
    """
    return np.log(self.weights_)
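
The 'full' branch of _compute_log_det_cholesky extracts each factor's diagonal with a reshape/stride trick. A small check (a sketch using pc from In [9]) that it matches the direct sum of log-diagonals:

In [ ]:
# the slice [:, ::n_features + 1] of the flattened (3, 3) factors picks
# out exactly the diagonal entries; compare against np.diag directly
n_features = 3
log_det_direct = np.array([np.sum(np.log(np.diag(m))) for m in pc])
log_det_strided = np.sum(np.log(pc.reshape(len(pc), -1)[:, ::n_features + 1]), 1)
assert np.allclose(log_det_direct, log_det_strided)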

In [ ]:
def _compute_log_det_cholesky(matrix_chol, covariance_type, n_features):
    """Compute the log-det of the Cholesky decomposition of matrices.

    Trimmed copy of the sklearn helper: only the 'full' branch is kept,
    since covariance_type='full' is what this notebook uses.

    Returns
    -------
    log_det_precision_chol : array-like, shape (n_components,)
        The log-determinant of the precision Cholesky factor per component.
    """
    if covariance_type == 'full':
        n_components, _, _ = matrix_chol.shape
        log_det_chol = (np.sum(np.log(
            matrix_chol.reshape(
                n_components, -1)[:, ::n_features + 1]), 1))

    return log_det_chol

In [253]:
gmm.means_.shape


Out[253]:
(3, 3)

In [299]:
gmm.means_


Out[299]:
array([[5.22568112, 3.87951258, 4.01458734],
       [4.1477059 , 3.87660711, 5.26290152],
       [4.15129429, 3.9084444 , 4.11559124]])

In [300]:
gmm.score_samples(gmm.means_)


Out[300]:
array([4.5064429 , 4.52522597, 0.88353589])

In [301]:
gmm.weights_


Out[301]:
array([0.45737378, 0.48324372, 0.0593825 ])

In [283]:
precisions_chol = gmm.precisions_cholesky_
means = gmm.means_
log_prob = np.empty((n_samples, n_components))
for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
    y = np.dot(x, prec_chol) - np.dot(mu, prec_chol)
    log_prob[:, k] = np.sum(np.square(y), axis=1)

In [284]:
-.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det


Out[284]:
array([[-3973.29074742, -1007.09479573,  -710.30321622]])

In [286]:



Out[286]:
array([-3973.29074742, -1007.09479573,  -710.30321622])

In [269]:
log_prob.shape


Out[269]:
(1, 3)

In [271]:
y = np.dot(x, precisions_chol[0]) - np.dot(gmm.means_[0], precisions_chol[0])
np.sum(np.square(y), axis=1)


Out[271]:
array([7957.15888929])

In [360]:
x = np.array([[5.22568112, 3.87951258, 4.01458734]])
x = np.array([[4.7, 3.9, 8]])
x = np.array([[4.7, 3.9, 7]])

In [349]:
X.max(axis=0)


Out[349]:
array([5.46133935, 4.14118887, 5.53369196])

In [365]:
x = X.max(axis=0).reshape(1,3)
# x = np.array([[5.22568112, 3.87951258, 4.01458734]])

In [376]:
gmm.means_


Out[376]:
array([[5.22568112, 3.87951258, 4.01458734],
       [4.1477059 , 3.87660711, 5.26290152],
       [4.15129429, 3.9084444 , 4.11559124]])

In [377]:
means = gmm.means_

In [366]:
n_samples, n_features = x.shape
matrix_chol = gmm.precisions_cholesky_
n_components, _, _ = matrix_chol.shape
log_det_chol = (np.sum(np.log(
    matrix_chol.reshape(
        n_components, -1)[:, ::n_features + 1]), 1))
log_det = log_det_chol

mean_dot_precisions_chol = np.zeros((n_components, n_features))
log_prob = np.zeros(n_components)
for i in range(n_components):
    mean_dot_precisions_chol[i] = np.dot(gmm.means_[i], precisions_chol[i])
    y = np.dot(x, precisions_chol[i]) - mean_dot_precisions_chol[i]
    log_prob[i] = np.sum(np.square(y))

log_gaussian_prob = -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det

score = np.log(np.sum(np.exp(log_gaussian_prob + np.log(gmm.weights_))))
kt = 1
E_side_chain = -score * kt

print(E_side_chain)


35.90492633763506

In [383]:
def compute_side_chain_energy_for_x(x, means, precisions_chol, log_det):
    n_samples, n_features = x.shape
    n_components, _ = means.shape

    mean_dot_precisions_chol = np.zeros((n_components, n_features))
    log_prob = np.zeros(n_components)
    for i in range(n_components):
        mean_dot_precisions_chol[i] = np.dot(means[i], precisions_chol[i])
        y = np.dot(x, precisions_chol[i]) - mean_dot_precisions_chol[i]
        log_prob[i] = np.sum(np.square(y))

    log_gaussian_prob = -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det

    # note: gmm.weights_ is read from the enclosing scope, and the naive
    # log(sum(exp(...))) underflows far from all components
    score = np.log(np.sum(np.exp(log_gaussian_prob + np.log(gmm.weights_))))
    kt = 1
    E_side_chain = -score * kt
    # print(E_side_chain)
    return E_side_chain

In [384]:
compute_side_chain_energy_for_x(x, means, precisions_chol, log_det)


Out[384]:
35.90492633763506
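
compute_side_chain_energy_for_x reproduces gmm.score for a single point, but its naive np.log(np.sum(np.exp(...))) underflows once a point is far from every component (see the 1.96e-310 sum in Out[293] below). A numerically stable variant, a sketch that uses scipy.special.logsumexp and takes the weights explicitly instead of reading the global gmm:

In [ ]:
from scipy.special import logsumexp

def compute_side_chain_energy_for_x_stable(x, means, precisions_chol, log_det,
                                           weights, kt=1.0):
    # same math as compute_side_chain_energy_for_x, but with logsumexp in
    # place of log(sum(exp(...))) so very negative log-probabilities do
    # not underflow to 0
    n_samples, n_features = x.shape
    n_components, _ = means.shape
    log_prob = np.zeros(n_components)
    for i in range(n_components):
        y = np.dot(x, precisions_chol[i]) - np.dot(means[i], precisions_chol[i])
        log_prob[i] = np.sum(np.square(y))
    log_gaussian_prob = -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det
    score = logsumexp(log_gaussian_prob + np.log(weights))
    return -score * kt

compute_side_chain_energy_for_x_stable(x, means, precisions_chol, log_det, gmm.weights_)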

In [385]:
x


Out[385]:
array([[5.46133935, 4.14118887, 5.53369196]])

In [367]:
log_gaussian_prob + np.log(gmm.weights_)


Out[367]:
array([ -35.90492634,  -57.41091197, -149.40589214])

In [373]:
np.log(np.sum(np.exp(np.array([0,-10, -10]))))


Out[373]:
9.079573746725622e-05

In [374]:
np.log(np.sum(np.exp(np.array([0,0, -10]))))


Out[374]:
0.6931698802671862

In [372]:
np.exp(np.array([1e-3,1e-10, 1e-10]))


Out[372]:
array([1.0010005, 1.       , 1.       ])

In [370]:
np.sum(np.exp(np.array([1e-3,1e-10, 1e-10])))


Out[370]:
3.0010005003667084

In [344]:
log_gaussian_prob + np.log(gmm.weights_)


Out[344]:
array([ -110283.22064057, -2604273.10594069,  -189982.03657247])

In [323]:
log_gaussian_prob


Out[323]:
array([ -7627311.16605359, -53149974.99266396,  -7925574.8941635 ])

In [310]:
log_gaussian_prob + np.log(gmm.weights_)


Out[310]:
array([   4.5064429 , -488.79635896,  -49.08856454])

In [316]:
precisions_chol


Out[316]:
array([[[ 18.59864135, -20.4711094 ,  11.67192918],
        [  0.        ,  26.7814915 , -18.05971993],
        [  0.        ,   0.        ,   6.26333791]],

       [[  5.0344189 ,  -2.40738223,   6.79789195],
        [  0.        ,  19.59680227, -32.00419269],
        [  0.        ,   0.        ,  30.49647854]],

       [[  3.92653864,  -1.70786655,   8.21358263],
        [  0.        ,  20.27387412, -25.01316343],
        [  0.        ,   0.        ,   8.06082409]]])

In [313]:
np.log(np.sum(np.exp(np.array([0,0,-40]))))


Out[313]:
0.6931471805599453

In [298]:
log_gaussian_prob + np.log(gmm.weights_)


Out[298]:
array([-3974.07300175, -1007.82202989,  -713.12697188])

In [293]:
np.sum(np.exp(log_gaussian_prob + np.log(gmm.weights_)))


Out[293]:
1.9628685536941e-310
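
The sum above only survives because 1.96e-310 is still representable as a subnormal float; for slightly more negative log-probabilities np.exp underflows to exactly 0 and the log becomes -inf. scipy.special.logsumexp (used in In [241] below) avoids this by shifting by the maximum first. A small illustration with made-up values:

In [ ]:
from scipy.special import logsumexp
# naive log-sum-exp underflows for large negative inputs; logsumexp does not
v = np.array([-800., -1000., -1200.])
print(np.log(np.sum(np.exp(v))))   # -inf, since exp(-800) underflows to 0
print(logsumexp(v))                # -800.0, shifted by the max first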

In [281]:
log_prob


Out[281]:
array([7957.15888929, 2024.69451172, 1428.02101553])

In [274]:
np.sum(np.square(y), axis=1)


Out[274]:
array([7957.15888929])

In [270]:
log_prob


Out[270]:
array([[7957.15888929, 2024.69451172, 1428.02101553]])

In [ ]:
n_components, _, _ = matrix_chol.shape
log_det_chol = (np.sum(np.log(
    matrix_chol.reshape(
        n_components, -1)[:, ::n_features + 1]), 1))

In [235]:
x = np.array([[1,2,3]])

In [240]:
from scipy.special import logsumexp

In [241]:
logsumexp(gmm._estimate_log_prob(x) + gmm._estimate_log_weights(), axis=1)


Out[241]:
array([-713.12697188])

In [248]:
np.log(np.sum(np.exp(gmm._estimate_log_prob(x) + np.log(gmm.weights_))))


Out[248]:
-713.1269718770509

In [291]:
np.log(np.sum(np.exp(log_gaussian_prob + np.log(gmm.weights_))))


Out[291]:
-713.1269718770509

In [290]:
np.log(gmm.weights_)


Out[290]:
array([-0.78225432, -0.72723416, -2.82375566])

In [249]:
gmm._estimate_log_prob(x)


Out[249]:
array([[-3973.29074742, -1007.09479573,  -710.30321622]])

In [250]:
np.log(gmm.weights_)


Out[250]:
array([-0.78225432, -0.72723416, -2.82375566])

In [245]:
np.exp(gmm._estimate_log_prob(x) + gmm._estimate_log_weights())


Out[245]:
array([[0.00000000e+000, 0.00000000e+000, 1.96286855e-310]])

In [242]:
gmm.score([[1,2,3]])


Out[242]:
-713.1269718770509

In [ ]:
log_prob = np.empty((n_samples, n_components))
for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
    y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
    log_prob[:, k] = np.sum(np.square(y), axis=1)

In [231]:
# note: x has shape (3, 1) and m shape (3,), so x - m broadcasts to a
# (3, 3) matrix; that is why the quadratic form below is 3x3, not a scalar
(x-m).T.dot(np.linalg.inv(sigma)).dot(x-m)


Out[231]:
array([[6201.77602469, 4235.90014669, 4433.15643657],
       [4235.90014669, 2969.06126271, 3096.17607241],
       [4433.15643657, 3096.17607241, 3230.32888909]])
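
A sketch of the intended scalar Mahalanobis quadratic form, with the broadcasting fixed:

In [ ]:
# flatten x so x - m is a (3,) vector and the quadratic form is a scalar
diff = x.flatten() - m
diff.dot(np.linalg.inv(sigma)).dot(diff)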

In [230]:
np.linalg.inv(sigma)


Out[230]:
array([[ 901.20971099, -759.03861429,   73.10523653],
       [-759.03861429, 1043.40177074, -113.11412853],
       [  73.10523653, -113.11412853,   39.22940183]])

In [229]:
sigma.shape


Out[229]:
(3, 3)

In [227]:
x.shape


Out[227]:
(3, 1)

In [219]:
sampled = gmm.sample(10000)[0]
sampled = pd.DataFrame(sampled, columns=["r1", "r2", "r3"])

In [220]:
fig = px.scatter_3d(sampled, x='r1', y='r2', z='r3', opacity=0.02)
fig.show()



In [209]:
sns.jointplot("r1", "r2", kind="kde", data=sampled)


Out[209]:
<seaborn.axisgrid.JointGrid at 0x1a1e6350f0>

In [188]:
sns.jointplot("r1", "r2", kind="kde", data=data_res)


Out[188]:
<seaborn.axisgrid.JointGrid at 0x1a2b1c89b0>

In [48]:
data_lys = data.query("ResName == 'LYS'").reset_index(drop=True)
sns.jointplot("r1", "r2", kind="kde", data=data_lys)
# plt.show()


Out[48]:
<seaborn.axisgrid.JointGrid at 0x1a2b402a20>

In [50]:
data_lys = data.query("ResName == 'LYS'").reset_index(drop=True)
sns.jointplot("r1", "r3", kind="kde", data=data_lys)
# plt.show()


Out[50]:
<seaborn.axisgrid.JointGrid at 0x1a1edf40b8>

In [58]:
X_lys = data_lys[["r1", "r2", "r3"]].values

from sklearn.cluster import KMeans
kmeans_lys = KMeans(n_clusters=3)
kmeans_lys.fit(X_lys)
y_kmeans = kmeans_lys.predict(X_lys)

In [165]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3).fit(X_lys)
labels = gmm.predict(X_lys)


min_x = X_lys.min(axis=0)
max_x = X_lys.max(axis=0)
n = 50
r1 = np.linspace(min_x[0], max_x[0], n)
r2 = np.linspace(min_x[1], max_x[1], n)
r3 = np.linspace(min_x[2], max_x[2], n)

xx, yy, zz = np.meshgrid(r1, r2, r3)

d_ = np.zeros((n**3, 4))
d_[:,0] = xx.flatten()
d_[:,1] = yy.flatten()
d_[:,2] = zz.flatten()
d_[:,3] = gmm.score_samples(d_[:,:3])

d = pd.DataFrame(d_, columns=["r1", "r2", "r3", "score"])
d_filtered = d.query("score > 0")


fig = px.scatter_3d(d_filtered, x='r1', y='r2', z='r3', color="score")
fig.show()
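
Note that gmm.score_samples returns the log of the probability density, so the score > 0 filter keeps only grid points where the estimated density exceeds 1 in these (r1, r2, r3) units; raising the cutoff in the query string shrinks the plotted high-density core.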



In [136]:
plt.hist(gmm.score_samples(X_lys), bins=50)


Out[136]:
(array([  1.,   0.,   1.,   0.,   1.,   1.,   1.,   0.,   0.,   0.,   1.,
          2.,   2.,   2.,   1.,   3.,   0.,   2.,   3.,   3.,   2.,   5.,
          7.,   6.,   4.,  10.,   5.,  12.,  16.,  13.,  31.,  32.,  50.,
         85., 122., 163., 189., 297., 532., 749., 343., 133., 193., 205.,
        235., 368., 496., 332., 392., 179.]),
 array([-15.54486776, -15.16409846, -14.78332915, -14.40255984,
        -14.02179053, -13.64102122, -13.26025192, -12.87948261,
        -12.4987133 , -12.11794399, -11.73717469, -11.35640538,
        -10.97563607, -10.59486676, -10.21409745,  -9.83332815,
         -9.45255884,  -9.07178953,  -8.69102022,  -8.31025092,
         -7.92948161,  -7.5487123 ,  -7.16794299,  -6.78717368,
         -6.40640438,  -6.02563507,  -5.64486576,  -5.26409645,
         -4.88332715,  -4.50255784,  -4.12178853,  -3.74101922,
         -3.36024991,  -2.97948061,  -2.5987113 ,  -2.21794199,
         -1.83717268,  -1.45640338,  -1.07563407,  -0.69486476,
         -0.31409545,   0.06667386,   0.44744316,   0.82821247,
          1.20898178,   1.58975109,   1.97052039,   2.3512897 ,
          2.73205901,   3.11282832,   3.49359763]),
 <a list of 50 Patch objects>)

In [134]:
gmm.score_samples(X_lys)


Out[134]:
array([-0.74202584, -1.45329625,  2.36560947, ...,  1.2158689 ,
        1.65569531, -2.66455865])

In [118]:
prob = np.array(score_list).reshape(n,n,n)

In [112]:
# imshow needs a 2D array, so show a single r3 slice of the (n, n, n) grid
plt.imshow(prob[:, :, 0])
plt.clim([-5, 0])
plt.colorbar()


Out[112]:
<matplotlib.colorbar.Colorbar at 0x1a268d1a58>

In [ ]:
r1 = np.linspace(2, 5, 100)
r2 = np.linspace(2, 5, 100)
r3 = np.linspace(2, 5, 100)

score_list = []
# the inner loop over r3 was missing (r3_ was undefined); In [118] above
# reshapes score_list to (n, n, n), so all three axes must be swept with
# a matching grid size
for r1_ in r1:
    for r2_ in r2:
        for r3_ in r3:
            score = gmm.score([[r1_, r2_, r3_]])
            score_list.append(score)

In [88]:
gmm.score([[1,2,3]])


Out[88]:
-44.924811181615404

In [89]:
gmm.predict_proba([[1,2,3]])


Out[89]:
array([[0.00000000e+000, 1.00000000e+000, 8.78636648e-227]])
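
predict_proba returns the per-component responsibilities, i.e. the weighted log-probabilities normalized with logsumexp. A quick consistency check (a sketch that, like the _estimate_log_prob calls above, leans on sklearn's private helpers):

In [ ]:
from scipy.special import logsumexp
# responsibilities = exp(weighted_log_prob - logsumexp(weighted_log_prob))
wlp = gmm._estimate_weighted_log_prob(np.array([[1., 2., 3.]]))
np.exp(wlp - logsumexp(wlp, axis=1, keepdims=True))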

In [63]:
data_lys["predicted"] = labels

In [71]:
from scipy import stats
# note: gaussian_kde expects data of shape (n_dims, n_samples); passing
# X_lys as (n_samples, 3) makes it treat the three columns as the dataset,
# which is why kernel.weights below has length 3 (use X_lys.T to build a
# KDE over the actual points)
kernel = stats.gaussian_kde(X_lys)

In [74]:
kernel.weights


Out[74]:
array([0.33333333, 0.33333333, 0.33333333])

In [75]:
from sklearn.neighbors import KernelDensity

In [76]:
kde = KernelDensity()

In [77]:
kde.fit(X_lys)


Out[77]:
KernelDensity(algorithm='auto', atol=0, bandwidth=1.0, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [85]:
kde.tree_


Out[85]:
<sklearn.neighbors.kd_tree.KDTree at 0x7fb8f8725c18>

In [79]:
kde.score([[1,2,3]])


Out[79]:
-9.160519807383077

In [59]:
fig = px.scatter_3d(data_lys, x='r1', y='r2', z='r3', opacity=0.02)
fig.add_scatter3d(x=kmeans_lys.cluster_centers_[:,0], y=kmeans_lys.cluster_centers_[:,1], z=kmeans_lys.cluster_centers_[:,2])
fig.show()



In [53]:
fig = px.scatter_3d(data_lys, x='r1', y='r2', z='r3', opacity=0.1)
fig.show()



In [55]:
data_ala = data.query("ResName == 'ALA'").reset_index(drop=True)
sns.jointplot("r1", "r2", kind="kde", data=data_ala)
# plt.show()


Out[55]:
<seaborn.axisgrid.JointGrid at 0x1a259f0898>

In [56]:
fig = px.scatter_3d(data_ala, x='r1', y='r2', z='r3', color="r")
fig.show()



In [57]:
fig = px.scatter_3d(data_ala, x='r1', y='r2', z='r3', opacity=0.1)
fig.show()



In [23]:
res_set = {}
res_set['GLY'] = {'CA', 'N', 'O', 'C'}
res_set['ALA'] = {'CA', 'CB', 'N', 'O', 'C'}
res_set['VAL'] = {'CA', 'CB', 'CG2', 'CG1', 'N', 'O', 'C'}
res_set['CYS'] = {'CA', 'CB', 'SG', 'N', 'O', 'C'}
res_set['PRO'] = {'CA', 'CB', 'CG', 'CD', 'N', 'O', 'C'}
res_set['LEU'] = {'CD1', 'CA', 'CD2', 'CB', 'CG', 'N', 'O', 'C'}
res_set['ILE'] = {'CD1', 'CA', 'CB', 'CG2', 'CG1', 'N', 'O', 'C'}
res_set['MET'] = {'CA', 'CB', 'SD', 'CG', 'CE', 'N', 'O', 'C'}
res_set['TRP'] = {'CD1', 'CA', 'CZ3', 'NE1', 'CD2', 'CB', 'CG', 'CZ2', 'CH2', 'CE3', 'N', 'O', 'CE2', 'C'}
res_set['PHE'] = {'CD1', 'CA', 'CD2', 'CB', 'CE1', 'CG', 'CZ', 'N', 'O', 'CE2', 'C'}


res_set['SER'] = {'CA', 'OG', 'CB', 'N', 'O', 'C'}
res_set['THR'] = {'CA', 'CB', 'CG2', 'N', 'OG1', 'O', 'C'}
res_set['TYR'] = {'CD1', 'CA', 'CD2', 'CB', 'CE1', 'OH', 'CG', 'CZ', 'N', 'O', 'CE2', 'C'}
res_set["GLN"] = {'CA', 'CB', 'CG', 'CD', 'OE1', 'NE2', 'N', 'O', 'C'}
res_set['ASN'] = {'CA', 'OD1', 'CB', 'CG', 'ND2', 'N', 'O', 'C'}


res_set['LYS'] = {'CA', 'CB', 'CE', 'CG', 'CD', 'NZ', 'N', 'O', 'C'}
res_set['ARG'] = {'CA', 'NE', 'CB', 'CG', 'CD', 'NH2', 'CZ', 'NH1', 'N', 'O', 'C'}
res_set['HIS'] = {'CA', 'ND1', 'CD2', 'CB', 'CE1', 'CG', 'NE2', 'N', 'O', 'C'}


res_set['ASP'] = {'OD2', 'CA', 'OD1', 'CB', 'CG', 'N', 'O', 'C'}
res_set['GLU'] = {'CA', 'OE2', 'CB', 'CG', 'CD', 'OE1', 'N', 'O', 'C'}

def dis(a, b):
    return ((a[0]-b[0])**2 + (a[1]-b[1])**2 + (a[2]-b[2])**2)**0.5

def get_side_chain_center_of_mass(atoms):
    # ensure complete first
    total = np.array([0., 0., 0.])
    total_mass = 0
    for atom in atoms:
        if atom.get_name() in ["N", "CA", "C", "O"]:
            continue
        if atom.element == "H":
            continue
        total += atom.mass * atom.get_coord()
        total_mass += atom.mass
        # print(atom.get_name(), atom.get_coord())
    x_com = total / total_mass
    return x_com

def get_all_non_H_atoms(res):
    atoms = res.get_atoms()
    name_list = []
    for atom in atoms:
        name = atom.get_name()
        if atom.element == "H":
            continue
        name_list.append(name)
    return set(name_list)


def get_x_more_check(res, res_set=res_set):
    resName = res.get_resname()
    if resName == "GLY":
        x_com = res["CA"].get_coord()
    else:
        set_of_all_non_H_atoms = get_all_non_H_atoms(res)
        if set_of_all_non_H_atoms != res_set[resName]:
            print("atom not complete", set_of_all_non_H_atoms, "should be ", res_set[resName])
        x_com = get_side_chain_center_of_mass(res.get_atoms())
    
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()

    A = np.zeros((3,3))
    A[:,0] = n
    A[:,1] = ca
    A[:,2] = c

    A_inv = np.linalg.inv(A)
    x = A_inv.dot(x_com)
    return x


def get_x(res):
    x_com = get_side_chain_center_of_mass(res.get_atoms())
    
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()

    A = np.zeros((3,3))
    A[:,0] = n
    A[:,1] = ca
    A[:,2] = c

    A_inv = np.linalg.inv(A)
    x = A_inv.dot(x_com)
    return x

def get_x_with_regularization(res, alpha=0.001):
    x_com = get_side_chain_center_of_mass(res.get_atoms())
    
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()

    A = np.zeros((3,3))
    A[:,0] = n
    A[:,1] = ca
    A[:,2] = c

    # A_inv = np.linalg.inv(A)
    # x = A_inv.dot(x_com)
    # return x
    A_inv_regularized = np.linalg.inv(A.T.dot(A) + alpha*np.eye(3)).dot(A.T)
    x_regularized = A_inv_regularized.dot(x_com)
    r = dis(A.dot(x_regularized), x_com)
    return x_regularized, r


def get_x_vector_based(res, alpha=0.001):
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()

    x_com = get_side_chain_center_of_mass(res.get_atoms())
    ca_x_com = x_com - ca

    n_ca = ca - n
    ca_c = c - ca
    x3 = np.cross(n_ca, ca_c)

    n_ca_unit = n_ca / np.linalg.norm(n_ca)
    ca_c_unit = ca_c / np.linalg.norm(ca_c)
    x3_unit = x3 / np.linalg.norm(x3)

    A = np.zeros((3,3))
    A[:,0] = n_ca_unit
    A[:,1] = ca_c_unit
    A[:,2] = x3_unit

    A_inv = np.linalg.inv(A)
    x = A_inv.dot(ca_x_com)
    return x, 0

def get_x_distance_based(res, alpha=0.001):
    n = res["N"].get_coord()
    ca = res["CA"].get_coord()
    c = res["C"].get_coord()

    x_com = get_side_chain_center_of_mass(res.get_atoms())

    x = np.array([dis(x_com, n), dis(x_com, ca), dis(x_com, c)])
    return x, dis(ca, x_com)
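
get_x and its variants invert A explicitly; since A x = x_com is just a 3x3 linear solve (the columns of A being the N, CA, and C coordinates), np.linalg.solve is the more stable, idiomatic equivalent. A minimal sketch:

In [ ]:
# sketch: solve A @ x = x_com directly instead of forming inv(A)
def get_x_solve(res):
    x_com = get_side_chain_center_of_mass(res.get_atoms())
    A = np.array([res["N"].get_coord(),
                  res["CA"].get_coord(),
                  res["C"].get_coord()]).T   # columns are N, CA, C
    return np.linalg.solve(A, x_com)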

In [ ]:
import glob
a_list = glob.glob("/Users/weilu/Research/server/feb_2020/energy_evaluations_and_database_survey/my_CATH_database/*.pdb")

In [ ]:
%%time
parser = PDBParser(QUIET=True)  # was undefined here; PDBParser is imported at the top
info = []
skipped_residues_count = 0
# for i, a in enumerate(a_list[:500]):
for i, a in enumerate(a_list):
    pdb = a.split("/")[-1].split(".")[0]
    structure = parser.get_structure("x", a)
    for res in structure.get_residues():
        resName = res.get_resname()
        set_of_all_non_H_atoms = get_all_non_H_atoms(res)
        if resName == "GLY":
            continue
        if set_of_all_non_H_atoms != res_set[resName]:
            skipped_residues_count += 1
            # print("skip", resName, res.get_id(), "atom not complete", set_of_all_non_H_atoms, "should be ", res_set[resName])
            continue
        x, r = get_x_distance_based(res, alpha=0.001)
        info.append([pdb, res.id[1], resName, x[0], x[1], x[2], r])
print("skipped_residues_count", skipped_residues_count)

In [ ]:
X = data_res[["r1", "r2", "r3"]].values
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3).fit(X)
labels = gmm.predict(X)


min_x = X.min(axis=0)
max_x = X.max(axis=0)
n = 50
r1 = np.linspace(min_x[0], max_x[0], n)
r2 = np.linspace(min_x[1], max_x[1], n)
r3 = np.linspace(min_x[2], max_x[2], n)

xx, yy, zz = np.meshgrid(r1, r2, r3)

d_ = np.zeros((n**3, 4))
d_[:,0] = xx.flatten()
d_[:,1] = yy.flatten()
d_[:,2] = zz.flatten()
d_[:,3] = gmm.score_samples(d_[:,:3])

d = pd.DataFrame(d_, columns=["r1", "r2", "r3", "score"])
d_filtered = d.query("score > 0")


fig = px.scatter_3d(d_filtered, x='r1', y='r2', z='r3', color="score")
fig.show()