In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [2]:
# Notebook-wide matplotlib defaults, applied in one call.
plt.rcParams.update({
    'figure.figsize': [16.18033, 10],  # width:height in the golden ratio
    'figure.facecolor': 'w',
    'figure.dpi': 100,
})
In [4]:
# Load the center-of-mass distance survey table; the first CSV column becomes the index.
survey_csv_path = "/Users/weilu/Research/data/survey_center_of_mass_distance_complete.csv"
data = pd.read_csv(survey_csv_path, index_col=0)
In [5]:
# For every (ResName1, ResName2) pair keep the row with the smallest r_com_com.
y = "r_com_com"
d = data
t = d.groupby(["ResName1", "ResName2"])[y].idxmin().reset_index()
# idxmin() returns index *labels*, not positions, so the rows must be picked
# with .loc; .iloc is only correct if the index happens to be 0..n-1 in order.
selected = d.loc[t[y].to_list()].reset_index(drop=True)
In [7]:
In [26]:
# Heat map of the smallest r_com_com per residue pair (rows: ResName1, columns: ResName2).
weight_list = ["ALA", "SER", "PRO", "VAL", "THR", "CYS", "ILE", "LEU", "ASN", "ASP", "GLN", "LYS", "GLU", "MET", "HIS", "PHE", "ARG", "TYR", "TRP"]
res_to_index = {res: i for i, res in enumerate(weight_list)}
n_res = len(weight_list)
min_r_com_com_matrix = np.zeros((n_res, n_res))
for i, line in selected.iterrows():
    res1 = line["ResName1"]
    res2 = line["ResName2"]
    # NOTE(review): the "- 3.5 + 0.3" shift is carried over unchanged; its
    # meaning is not documented in this notebook — confirm before reuse.
    min_r_com_com_matrix[res_to_index[res1], res_to_index[res2]] = line["r_com_com"] - 3.5 + 0.3
# imshow only accepts origin="upper" or "lower"; the old origin=0 is rejected
# by current matplotlib. "lower" puts row 0 at the bottom.
plt.imshow(min_r_com_com_matrix, origin="lower", cmap="seismic", vmin=-2, vmax=2)
plt.colorbar()
# Label both axes so each cell can be read as a residue pair.
plt.xticks(ticks=np.arange(n_res), labels=weight_list)
plt.yticks(ticks=np.arange(n_res), labels=weight_list)
In [27]:
Out[27]:
In [6]:
# Display the per-pair minimum-distance rows held in `selected`.
selected
Out[6]:
In [19]:
# Residue names iterated over when plotting one histogram per pair.
res_list = (
    "ARG ASP PRO TRP THR HIS GLU LEU SER "
    "LYS VAL PHE ILE TYR GLN ALA ASN MET "
    "CYS"
).split()
In [ ]:
# Save one r_com_com histogram (50 bins, x limited to [0, 20]) per residue pair.
for res1 in res_list:
    for res2 in res_list:
        pair_data = data.query(f"ResName1 == '{res1}' and ResName2 == '{res2}'")
        # DataFrame.hist opens a new figure on every call unless given an Axes,
        # and plt.clf() only clears it. Draw on an explicit figure and close it
        # so the loop does not accumulate one open figure per pair.
        fig, ax = plt.subplots()
        pair_data.hist("r_com_com", bins=50, ax=ax)
        ax.set_xlim([0, 20])
        ax.set_title(f"{res1}_{res2}")
        fig.savefig(f"/Users/weilu/Research/database/survey_center_of_mass_distance/{res1}_{res2}.png", dpi=300)
        plt.close(fig)
In [63]:
# Smallest r_com_com value in the whole survey table.
data.loc[:, "r_com_com"].min()
Out[63]:
In [248]:
Out[248]:
In [249]:
# Fit a 3-component Gaussian mixture to r_com_com (<= 7.5) for every residue
# pair and save a histogram with the fitted density drawn on top.
from sklearn.mixture import GaussianMixture

short_list = ['ARG', 'ASP', 'PRO', 'TRP', 'THR', 'HIS', 'GLU', 'LEU', 'SER',
              'LYS', 'VAL', 'PHE', 'ILE', 'TYR', 'GLN', 'ALA', 'ASN', 'MET',
              'CYS']
n_components = 3
df_ = []  # not filled in this cell; still reset here as before
for res1 in short_list:
    for res2 in short_list:
        data_one = data.query(f"ResName1 == '{res1}' and ResName2 == '{res2}'").reset_index(drop=True)
        lower_data = data_one[data_one.r_com_com <= 7.5].reset_index(drop=True)
        data_res = lower_data
        X = data_res[["r_com_com"]].values
        # GaussianMixture cannot be fitted with fewer samples than components.
        if len(X) < n_components:
            print(f"skip {res1}_{res2}: only {len(X)} samples")
            continue
        # Fixed seed so re-running the cell reproduces the same fit.
        gmm = GaussianMixture(n_components=n_components, random_state=0).fit(X)
        counts, edges, patches = plt.hist(X.ravel(), bins=20)
        x_min = data_res["r_com_com"].min()
        x_max = data_res["r_com_com"].max()
        x_hat = np.linspace(x_min, x_max).reshape(-1, 1)
        # Expected count per bin = N * bin_width * pdf. This replaces the
        # hand-tuned "len(X)/6" factor, which only matched one bin width.
        bin_width = edges[1] - edges[0]
        y_hat = len(X) * bin_width * np.exp(gmm.score_samples(x_hat))
        plt.plot(x_hat, y_hat)
        plt.title(f"gmm_fit_{res1}_{res2}")
        plt.savefig(f"/Users/weilu/Research/server/mar_2020/cmd_cmd_exclude_volume/figures/gmm_fit_{res1}_{res2}.png", dpi=300)
        plt.clf()
In [157]:
# For each residue pair: histogram r_com_com (<= 7.5) into 10 bins, turn the
# pseudo-counted bin frequencies into -log(p), plot and save the curve, and
# collect all curves into one CSV.
short_list = (
    "ARG ASP PRO TRP THR HIS GLU LEU SER "
    "LYS VAL PHE ILE TYR GLN ALA ASN MET CYS"
).split()
df_ = []
for res1, res2 in product(short_list, short_list):
    pair_query = f"ResName1 == '{res1}' and ResName2 == '{res2}'"
    data_one = data.query(pair_query).reset_index(drop=True)
    lower_data = data_one[data_one.r_com_com <= 7.5].reset_index(drop=True)
    hist, bins = np.histogram(lower_data["r_com_com"], bins=10)
    bin_centers = (bins[:-1] + bins[1:]) / 2
    # +10 pseudo-count per bin keeps empty bins finite under the log.
    padded = hist + 10
    y = -np.log(padded / np.sum(padded))
    df_temp = pd.DataFrame(np.column_stack([bin_centers, y]), columns=["r_cbd_cbd", "energy"])
    df_temp["ResName1"] = res1
    df_temp["ResName2"] = res2
    df_.append(df_temp)
    plt.plot(bin_centers, y)
    plt.title(f"energy_{res1}_{res2}")
    plt.savefig(f"/Users/weilu/Research/server/mar_2020/cmd_cmd_exclude_volume/figures/energy_{res1}_{res2}.png", dpi=300)
    plt.clf()
df = pd.concat(df_).reset_index(drop=True)
df.to_csv('/Users/weilu/Research/server/mar_2020/cmd_cmd_exclude_volume/cbd_cbd_energy.csv', index=False)
In [147]:
# Rebuild the per-pair energy table from whatever bin_centers / y / res1 / res2
# are currently in the kernel.
df_temp = pd.DataFrame(np.column_stack([bin_centers, y]), columns=["r_cbd_cbd", "energy"])
df_temp["ResName1"], df_temp["ResName2"] = res1, res2
In [194]:
# Scratch check of list repetition: evaluates to [10, 20, 10, 20].
[10, 20]*2
Out[194]:
In [201]:
# Histogram of r_com_com for the `lower_data` currently in the kernel.
lower_data.hist(column="r_com_com")
Out[201]:
In [ ]:
In [270]:
# A bare identifier is not a valid query expression. Filter on the Protein
# column instead, the same column another cell uses for '4ft3A02'.
data.query("Protein == '3c18A03'")
Out[270]:
In [254]:
# Draw 1000 points from the fitted mixture; element [0] holds the samples.
a = gmm.sample(n_samples=1000)[0]
In [259]:
# Smallest value in the single-column array X.
X[:, 0].min()
Out[259]:
In [268]:
# Histogram of r_com_com (left axis) with the GMM negative log-density
# (right axis) for the `lower_data` currently in the kernel.
fig, ax1 = plt.subplots()
from sklearn.mixture import GaussianMixture
data_res = lower_data
X = data_res[["r_com_com"]].values
# Fixed seed so the fit is reproducible between runs.
gmm = GaussianMixture(n_components=3, random_state=0).fit(X)
x_hat = np.linspace(X.min(), X.max(), 200).reshape(-1, 1)
log_density = gmm.score_samples(x_hat)
e_hat = -log_density
color = 'tab:blue'
ax1.set_xlabel('r_com_com')
ax1.set_ylabel('count', color=color)
ax1.tick_params(axis='y', labelcolor=color)
counts, edges, patches = ax1.hist(X.ravel(), bins=20)
# Expected count per bin = N * bin_width * pdf (replaces the heuristic len(X)/6).
y_hat = len(X) * (edges[1] - edges[0]) * np.exp(log_density)
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
color = 'tab:red'
# The right axis shows e_hat, not a sine; the old 'sin' label was left over
# from the matplotlib two-scales example.
ax2.set_ylabel('-log density (GMM)', color=color)
ax2.plot(x_hat, e_hat, color=color)
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()
In [269]:
# Plot the scaled GMM density y_hat against x_hat; both must already exist in the kernel.
plt.plot(x_hat, y_hat)
Out[269]:
In [251]:
# Histogram of X (left axis) with -e_hat (right axis).
# X, x_hat and e_hat are read from the kernel; run the GMM-fit cell first.
# The mock t / data1 / data2 arrays copied from the matplotlib example were
# never plotted and overwrote the notebook's `t`, so they are gone.
fig, ax1 = plt.subplots()
color = 'tab:red'
ax1.set_xlabel('r_com_com')
ax1.set_ylabel('count', color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax1.hist(X, bins=20)
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
color = 'tab:blue'
# NOTE(review): the quantity depends on how e_hat was last defined; with
# e_hat = -gmm.score_samples(x_hat) this curve is the GMM log-density.
ax2.set_ylabel('-e_hat', color=color)
ax2.plot(x_hat, -e_hat, color=color)
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()
In [243]:
# Histogram of X with the fitted GMM density scaled to expected counts per bin.
# The x range and bin width come from this histogram's own edges rather than
# from a `bins` array left behind by another cell.
counts, edges, patches = plt.hist(X.ravel(), bins=20)
x_hat = np.linspace(edges[0], edges[-1]).reshape(-1, 1)
# Expected count per bin = N * bin_width * pdf (replaces the heuristic len(X)/6).
y_hat = len(X) * (edges[1] - edges[0]) * np.exp(gmm.score_samples(x_hat))
plt.plot(x_hat, y_hat)
Out[243]:
In [224]:
# Histogram of 1000 values drawn from the fitted mixture.
drawn_points = gmm.sample(1000)[0]
plt.hist(drawn_points.flatten().tolist())
Out[224]:
In [226]:
# Log-density of the fitted mixture at each histogram bin centre.
centers_column = np.array(bin_centers).reshape(-1, 1)
gmm.score_samples(centers_column)
Out[226]:
In [228]:
# Negative log-density of the mixture at the bin centres, plotted as a curve.
centers_column = np.array(bin_centers).reshape(-1, 1)
y_hat = -gmm.score_samples(centers_column)
plt.plot(bin_centers, y_hat)
Out[228]:
In [200]:
# Energy curve (-log of pseudo-counted bin frequencies) for one residue pair.
from scipy.signal import savgol_filter

res1, res2 = "CYS", "MET"
pair_query = f"ResName1 == '{res1}' and ResName2 == '{res2}'"
data_one = data.query(pair_query).reset_index(drop=True)
lower_data = data_one[data_one.r_com_com <= 7.5].reset_index(drop=True)
n_bins = 10 * 1
data_selected = lower_data["r_com_com"].to_list()
hist, bins = np.histogram(data_selected, bins=n_bins)
bin_centers = (bins[:-1] + bins[1:]) / 2
# +10 pseudo-count per bin keeps empty bins finite under the log.
padded = hist + 10
y = -np.log(padded / np.sum(padded))
df_temp = pd.DataFrame(np.column_stack([bin_centers, y]), columns=["r_cbd_cbd", "energy"])
df_temp["ResName1"] = res1
df_temp["ResName2"] = res2
plt.plot(bin_centers, y)
plt.title(f"energy_{res1}_{res2}")
Out[200]:
In [ ]:
# NOTE(review): `scatter` is not defined in any visible cell and this cell was
# never executed — presumably an unfinished stub or a name expected from
# `from pyCodeLib import *`; confirm or delete.
scatter()
In [174]:
# Savitzky-Golay smoothing demo on a noisy sine wave.
from scipy.signal import savgol_filter

x = np.linspace(0, 2*np.pi, 100)
y = np.sin(x) + np.random.random(x.size) * 0.2
# Window length 5, polynomial order 3.
yhat = savgol_filter(y, window_length=5, polyorder=3)
plt.plot(x, y)
plt.plot(x, yhat, color='red')
plt.show()
In [106]:
# Per residue pair (r_com_com <= 7.5, 10 bins): centre of the fullest bin
# ("Max"), left edge of the first bin ("Lowest"), smallest distance ("Min")
# and the fullest bin's count ("max_hist"); written to CSV.
short_list = ['ARG', 'ASP', 'PRO', 'TRP', 'THR', 'HIS', 'GLU', 'LEU', 'SER',
              'LYS', 'VAL', 'PHE', 'ILE', 'TYR', 'GLN', 'ALA', 'ASN', 'MET',
              'CYS']
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call, so collect plain records and build the frame once.
records = []
for res1 in short_list:
    for res2 in short_list:
        data_one = data.query(f"ResName1 == '{res1}' and ResName2 == '{res2}'").reset_index(drop=True)
        lower_data = data_one[data_one.r_com_com <= 7.5].reset_index(drop=True)
        hist, bins = np.histogram(lower_data["r_com_com"], bins=10)
        max_index = int(np.argmax(hist))  # first fullest bin, as list.index(max) gave
        max_value = (bins[max_index + 1] + bins[max_index]) / 2
        lowest_value = bins[0]
        min_value = lower_data["r_com_com"].min()
        max_hist = hist.max()
        records.append((res1, res2, max_value, lowest_value, min_value, max_hist))
df = pd.DataFrame(records, columns=["ResName1", "ResName2", "Max", "Lowest", "Min", "max_hist"])
df.to_csv('/Users/weilu/Research/server/mar_2020/cmd_cmd_exclude_volume/cbd_cbd_max_min_bins10.csv', index=False)
In [64]:
# Per residue pair (r_com_com <= 7.5, 30 bins): centre of the fullest bin
# ("Max"), left edge of the first bin ("Lowest") and smallest distance ("Min").
# Saves an annotated histogram per pair and the summary table as CSV.
short_list = ['ARG', 'ASP', 'PRO', 'TRP', 'THR', 'HIS', 'GLU', 'LEU', 'SER',
              'LYS', 'VAL', 'PHE', 'ILE', 'TYR', 'GLN', 'ALA', 'ASN', 'MET',
              'CYS']
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call, so collect plain records and build the frame once.
records = []
for res1 in short_list:
    for res2 in short_list:
        data_one = data.query(f"ResName1 == '{res1}' and ResName2 == '{res2}'")
        lower_data = data_one[data_one.r_com_com <= 7.5].reset_index(drop=True)
        hist, bins = np.histogram(lower_data["r_com_com"], bins=30)
        max_index = int(np.argmax(hist))  # first fullest bin, as list.index(max) gave
        max_value = (bins[max_index + 1] + bins[max_index]) / 2
        lowest_value = bins[0]
        min_value = lower_data["r_com_com"].min()
        records.append((res1, res2, max_value, lowest_value, min_value))
        plt.hist(lower_data["r_com_com"], bins=bins)
        plt.axvline(max_value, color='black')   # centre of the fullest bin
        plt.axvline(lowest_value, color='red')  # left edge of the first bin
        plt.savefig(f"/Users/weilu/Research/server/mar_2020/cmd_cmd_exclude_volume/figures/{res1}_{res2}.png", dpi=300)
        plt.clf()
df = pd.DataFrame(records, columns=["ResName1", "ResName2", "Max", "Lowest", "Min"])
print(df)
df.to_csv('/Users/weilu/Research/server/mar_2020/cmd_cmd_exclude_volume/cbd_cbd_max_min.csv', index=False)
In [66]:
# Gap between the first bin's left edge and the smallest observed distance.
df["diff"] = df["Lowest"].sub(df["Min"])
In [74]:
# Smallest r_com_com in the `lower_data` currently in the kernel.
lower_data.loc[:, "r_com_com"].min()
Out[74]:
In [93]:
# Reload the summary table written by the 30-bin cell.
bin30_csv_path = "/Users/weilu/Research/server/mar_2020/cmd_cmd_exclude_volume/cbd_cbd_max_min.csv"
df_bin30 = pd.read_csv(bin30_csv_path)
In [96]:
# Row-by-row difference in "Lowest" between the reloaded 30-bin table and the current df.
lowest_delta = df_bin30["Lowest"].sub(df["Lowest"])
plt.plot(lowest_delta)
Out[96]:
In [107]:
# Row-by-row difference in "Max" between the reloaded 30-bin table and the current df.
max_delta = df_bin30["Max"].sub(df["Max"])
plt.plot(max_delta)
Out[107]:
In [97]:
# Same "Max" difference plot as the previous cell, from an earlier execution.
max_delta = df_bin30["Max"].sub(df["Max"])
plt.plot(max_delta)
Out[97]:
In [77]:
# Heat map of the "Min" column of df (rows: ResName1, columns: ResName2).
weight_list = ["ALA", "SER", "PRO", "VAL", "THR", "CYS", "ILE", "LEU", "ASN", "ASP", "GLN", "LYS", "GLU", "MET", "HIS", "PHE", "ARG", "TYR", "TRP"]
res_to_index = {res: i for i, res in enumerate(weight_list)}
n_res = len(weight_list)
min_r_com_com_matrix = np.zeros((n_res, n_res))
for i, line in df.iterrows():
    res1 = line["ResName1"]
    res2 = line["ResName2"]
    min_r_com_com_matrix[res_to_index[res1], res_to_index[res2]] = line["Min"]
# imshow only accepts origin="upper" or "lower"; the old origin=0 is rejected
# by current matplotlib. "lower" puts row 0 at the bottom.
plt.imshow(min_r_com_com_matrix, origin="lower", cmap="seismic")
plt.colorbar()
# Label both axes so each cell can be read as a residue pair.
plt.xticks(ticks=np.arange(n_res), labels=weight_list)
plt.yticks(ticks=np.arange(n_res), labels=weight_list)
Out[77]:
In [78]:
# Heat map of the "Max" column of df (rows: ResName1, columns: ResName2).
# The matrix keeps its historical name min_r_com_com_matrix although it holds
# "Max" values here, because other cells may read that name.
weight_list = ["ALA", "SER", "PRO", "VAL", "THR", "CYS", "ILE", "LEU", "ASN", "ASP", "GLN", "LYS", "GLU", "MET", "HIS", "PHE", "ARG", "TYR", "TRP"]
res_to_index = {res: i for i, res in enumerate(weight_list)}
n_res = len(weight_list)
min_r_com_com_matrix = np.zeros((n_res, n_res))
for i, line in df.iterrows():
    res1 = line["ResName1"]
    res2 = line["ResName2"]
    min_r_com_com_matrix[res_to_index[res1], res_to_index[res2]] = line["Max"]
# imshow only accepts origin="upper" or "lower"; the old origin=0 is rejected
# by current matplotlib. "lower" puts row 0 at the bottom.
plt.imshow(min_r_com_com_matrix, origin="lower", cmap="seismic")
plt.colorbar()
# Label both axes so each cell can be read as a residue pair.
plt.xticks(ticks=np.arange(n_res), labels=weight_list)
plt.yticks(ticks=np.arange(n_res), labels=weight_list)
Out[78]:
In [32]:
# TRP-TRP rows of protein 4ft3A02, closest pairs first.
trp_trp_4ft3 = (
    (data["ResName1"] == "TRP")
    & (data["ResName2"] == "TRP")
    & (data["Protein"] == "4ft3A02")
)
data[trp_trp_4ft3].sort_values(by="r_com_com")
Out[32]:
In [133]:
# Display the histogram bin edges currently held in `bins`.
bins
Out[133]:
In [132]:
# Display the bin mid-points currently held in `bin_centers`.
bin_centers
Out[132]:
In [114]:
# Display the (counts, bin edges) pair from the most recent np.histogram call.
hist,bins
Out[114]:
In [134]:
# Energy curve (-log of pseudo-counted bin frequencies) for ARG-TRP pairs with r_com_com < 7.5.
arg_trp_close = "ResName1 == 'ARG' and ResName2 == 'TRP' and r_com_com < 7.5"
selected_data = data.query(arg_trp_close)
hist, bins = np.histogram(selected_data["r_com_com"], bins=10)
bin_centers = (bins[:-1] + bins[1:]) / 2
# +10 pseudo-count per bin keeps empty bins finite under the log.
padded = hist + 10
y = -np.log(padded / np.sum(padded))
plt.plot(bin_centers, y)
Out[134]:
In [120]:
# Sanity check: the pseudo-counted bin frequencies should sum to 1.
padded = hist + 10
np.sum(padded / np.sum(padded))
Out[120]:
In [112]:
# ARG-TRP pairs with r_com_com < 7.5, 30-bin histogram.
arg_trp_close = (
    (data["ResName1"] == "ARG")
    & (data["ResName2"] == "TRP")
    & (data["r_com_com"] < 7.5)
)
data[arg_trp_close].hist(column="r_com_com", bins=30)
Out[112]:
In [111]:
# ARG-TRP pairs with r_com_com < 7.5, 10-bin histogram.
arg_trp_close = (
    (data["ResName1"] == "ARG")
    & (data["ResName2"] == "TRP")
    & (data["r_com_com"] < 7.5)
)
data[arg_trp_close].hist(column="r_com_com", bins=10)
Out[111]:
In [113]:
# ARG-TRP pairs with r_com_com < 7.5, 20-bin histogram.
arg_trp_close = (
    (data["ResName1"] == "ARG")
    & (data["ResName2"] == "TRP")
    & (data["r_com_com"] < 7.5)
)
data[arg_trp_close].hist(column="r_com_com", bins=20)
Out[113]:
In [105]:
# TRP-TRP pairs with r_com_com < 7.5, 10-bin histogram.
trp_trp_close = (
    (data["ResName1"] == "TRP")
    & (data["ResName2"] == "TRP")
    & (data["r_com_com"] < 7.5)
)
data[trp_trp_close].hist(column="r_com_com", bins=10)
Out[105]:
In [13]:
# All TRP-TRP pairs, 50-bin histogram of r_com_com.
trp_trp = (data["ResName1"] == "TRP") & (data["ResName2"] == "TRP")
data[trp_trp].hist(column="r_com_com", bins=50)
Out[13]:
In [20]:
# All ALA-ALA pairs, 50-bin histogram of r_com_com.
ala_ala = (data["ResName1"] == "ALA") & (data["ResName2"] == "ALA")
data[ala_ala].hist(column="r_com_com", bins=50)
Out[20]:
In [ ]:
In [30]:
# All ASN-ARG pairs, 50-bin histogram of r_com_com.
asn_arg = (data["ResName1"] == "ASN") & (data["ResName2"] == "ARG")
data[asn_arg].hist(column="r_com_com", bins=50)
Out[30]:
In [32]:
# The two ASN-ARG rows with the smallest r_com_com.
asn_arg = (data["ResName1"] == "ASN") & (data["ResName2"] == "ARG")
data[asn_arg].sort_values(by="r_com_com").head(n=2)
Out[32]:
In [33]:
# The two ARG-ASN rows with the smallest r_com_com (reverse order of the ASN-ARG cell).
arg_asn = (data["ResName1"] == "ARG") & (data["ResName2"] == "ASN")
data[arg_asn].sort_values(by="r_com_com").head(n=2)
Out[33]:
In [29]:
# All ARG-ASN pairs, 50-bin histogram of r_com_com.
arg_asn = (data["ResName1"] == "ARG") & (data["ResName2"] == "ASN")
data[arg_asn].hist(column="r_com_com", bins=50)
Out[29]:
In [39]:
# Interactive 3-D scatter of the three distance columns for PHE-PHE pairs with r_com_com < 7.5.
import plotly.express as px

res = "PHE"
close_same_res = f"ResName1 == '{res}' and ResName2 == '{res}' and r_com_com < 7.5"
data_res = data.query(close_same_res).reset_index(drop=True)
print(res, data_res.shape)
fig = px.scatter_3d(
    data_res,
    x='r_ca_ca',
    y='r_com_com',
    z='r_ca_com',
    opacity=0.1,
)
fig.show()
In [40]:
# Interactive 3-D scatter of the three distance columns for all PHE-PHE pairs.
import plotly.express as px

res = "PHE"
same_res = f"ResName1 == '{res}' and ResName2 == '{res}'"
data_res = data.query(same_res).reset_index(drop=True)
print(res, data_res.shape)
fig = px.scatter_3d(
    data_res,
    x='r_ca_ca',
    y='r_com_com',
    z='r_ca_com',
    opacity=0.1,
)
fig.show()
In [62]:
# ARG-MET pairs with r_com_com < 7.5 whose id1/id2 differ by fewer than 12.
arg_met_close = (
    (data["ResName1"] == "ARG")
    & (data["ResName2"] == "MET")
    & (data["r_com_com"] < 7.5)
)
data_res = data[arg_met_close]
id_gap = (data_res["id1"] - data_res["id2"]).abs()
data_res_filtered = data_res[id_gap < 12]
data_res_filtered.hist(column="r_com_com", bins=20)
Out[62]:
In [61]:
# ARG-MET pairs with r_com_com < 7.5 whose id1/id2 differ by more than 12.
# NOTE(review): a gap of exactly 12 falls in neither this cell nor the "< 12" one.
arg_met_close = (
    (data["ResName1"] == "ARG")
    & (data["ResName2"] == "MET")
    & (data["r_com_com"] < 7.5)
)
data_res = data[arg_met_close]
id_gap = (data_res["id1"] - data_res["id2"]).abs()
data_res_filtered = data_res[id_gap > 12]
data_res_filtered.hist(column="r_com_com", bins=20)
Out[61]:
In [53]:
# All ARG-ARG rows, re-indexed from 0.
res = "ARG"
same_res = f"ResName1 == '{res}' and ResName2 == '{res}'"
data_res = data.query(same_res).reset_index(drop=True)
In [51]:
# Display the column dtypes of the `data_res` currently in the kernel.
data_res.dtypes
Out[51]:
In [55]:
# Rows of data_res with r_com_com strictly between 7 and 8.
in_window = (data_res["r_com_com"] > 7) & (data_res["r_com_com"] < 8)
data_res[in_window]
Out[55]:
In [46]:
# Scatter of two summed-distance columns for TRP-TRP pairs with r_com_com < 7.5;
# marker size encodes r_com_com.
import plotly.express as px
res = "TRP"
data_res = data.query(f"ResName1 == '{res}' and ResName2 == '{res}' and r_com_com < 7.5").reset_index(drop=True)
data_res["r_com_com_plus_r_com_ca"] = data_res["r_com_com"] + data_res["r_com_ca"]
data_res["r_ca_com_plus_r_com_com"] = data_res["r_ca_com"] + data_res["r_com_com"]
print(res, data_res.shape)
# seaborn >= 0.12 takes x and y as keywords only; passing them positionally
# raises TypeError.
sns.scatterplot(x="r_com_com_plus_r_com_ca", y="r_ca_com_plus_r_com_com", size="r_com_com", data=data_res)
Out[46]:
In [41]:
# Interactive 3-D scatter of the three distance columns for all TRP-TRP pairs.
import plotly.express as px

res = "TRP"
same_res = f"ResName1 == '{res}' and ResName2 == '{res}'"
data_res = data.query(same_res).reset_index(drop=True)
print(res, data_res.shape)
fig = px.scatter_3d(
    data_res,
    x='r_ca_ca',
    y='r_com_com',
    z='r_ca_com',
    opacity=0.1,
)
fig.show()
In [34]:
# r_ca_ca vs r_com_com for all PHE-PHE pairs.
data_res = data.query("ResName1 == 'PHE' and ResName2 == 'PHE'").reset_index(drop=True)
# seaborn >= 0.12 takes x and y as keywords only; positional use raises TypeError.
sns.scatterplot(x="r_ca_ca", y="r_com_com", data=data_res)
Out[34]:
In [22]:
# r_ca_ca vs r_com_com for all ALA-ALA pairs.
data_res = data.query("ResName1 == 'ALA' and ResName2 == 'ALA'").reset_index(drop=True)
# seaborn >= 0.12 takes x and y as keywords only; positional use raises TypeError.
sns.scatterplot(x="r_ca_ca", y="r_com_com", data=data_res)
Out[22]:
In [23]:
# r_ca_ca vs r_com_com for all TRP-TRP pairs.
data_res = data.query("ResName1 == 'TRP' and ResName2 == 'TRP'").reset_index(drop=True)
# seaborn >= 0.12 takes x and y as keywords only; positional use raises TypeError.
sns.scatterplot(x="r_ca_ca", y="r_com_com", data=data_res)
Out[23]:
In [24]:
# r_ca_ca vs r_com_com for all GLU-GLU pairs.
data_res = data.query("ResName1 == 'GLU' and ResName2 == 'GLU'").reset_index(drop=True)
# seaborn >= 0.12 takes x and y as keywords only; positional use raises TypeError.
sns.scatterplot(x="r_ca_ca", y="r_com_com", data=data_res)
Out[24]:
In [10]:
# Per-pair minimum-distance rows, closest pairs first.
selected.sort_values(by="r_com_com")
Out[10]:
In [ ]: