In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.Polypeptide import one_to_three
from Bio.PDB.Polypeptide import three_to_one
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
# from small_script.myFunctions import *
sys.path.insert(0, "/Users/weilu/openmmawsem")
from helperFunctions.myFunctions import *
from collections import defaultdict
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180)    #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2

In [2]:
plt.rcParams['figure.figsize'] = np.array([16.18033, 10])    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
plt.rcParams.update({'font.size': 22})

In [3]:
from simtk.openmm.app import *
from simtk.openmm import *
from simtk.unit import *
import numpy as np

se_map_1_letter = {'A': 0,  'P': 1,  'K': 2,  'N': 3,  'R': 4,
                   'F': 5,  'D': 6,  'Q': 7,  'E': 8,  'G': 9,
                   'I': 10, 'H': 11, 'L': 12, 'C': 13, 'M': 14,
                   'S': 15, 'T': 16, 'Y': 17, 'V': 18, 'W': 19}

def isChainStart(residueId, chain_starts, n=2):
    # return true if residue is near chain starts.
    # n=0 means always return False
    # n=1 means only return True if residue is the the first residue of a chain.
    # n=2 means return True if residue is the first or the one nearest to the first residue of a chain.
    atBegin = False
    for i in range(n):
        if (residueId-i) in chain_starts:
            atBegin = True
    return atBegin

def isChainEnd(residueId, chain_ends, n=2):
    # return true if residue is near chain ends.
    # n=0 means always return False
    # n=1 means only return True if residue is the the last residue of a chain.
    # n=2 means return True if residue is the last or the one nearest to the last residue of a chain.
    atEnd = False
    for i in range(n):
        if (residueId+i) in chain_ends:
            atEnd = True
    return atEnd
def isChainEdge(residueId, chain_starts, chain_ends, n=2):
    # n is how far away from the two ends count as in chain edge.
    return (isChainStart(residueId, chain_starts, n) or isChainEnd(residueId, chain_ends, n))
    # atBegin = False
    # atEnd = False
    # for i in range(n):
    #     if (residueId-i) in chain_starts:
    #         atBegin = True
    # for i in range(n):
    #     if (residueId+i) in chain_ends:
    #         atEnd = True
    # return (atBegin or atEnd)

def inWhichChain(residueId, chain_ends):
    chain_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    for i, end_of_chain_resId in enumerate(chain_ends):
        if end_of_chain_resId < residueId:
            pass
        else:
            return chain_table[i]

def read_beta_parameters():
    ### directly copied from Nick Schafer's
    # os.chdir(parameter_directory)
    in_anti_HB = open("anti_HB", 'r').readlines()
    in_anti_NHB = open("anti_NHB", 'r').readlines()
    in_para_HB = open("para_HB", 'r').readlines()
    in_para_one = open("para_one", 'r').readlines()
    in_anti_one = open("anti_one", 'r').readlines()

    p_par = np.zeros((20))
    p_anti = np.zeros((20))
    p_antihb = np.zeros((20,20,2))
    p_antinhb = np.zeros((20,20,2))
    p_parhb = np.zeros((20,20,2))

    for i in range(20):
        p_par[i] = float(in_para_one[i].strip())
        p_anti[i] = float(in_anti_one[i].strip())
        for j in range(20):
            p_antihb[i][j][0] = float(in_anti_HB[i].strip().split()[j])
            p_antinhb[i][j][0] = float(in_anti_NHB[i].strip().split()[j])
            p_parhb[i][j][0] = float(in_para_HB[i].strip().split()[j])

    for i in range(20):
        for j in range(20):
            p_antihb[i][j][1] = float(in_anti_HB[i+21].strip().split()[j])
            p_antinhb[i][j][1] = float(in_anti_NHB[i+21].strip().split()[j])
            p_parhb[i][j][1] = float(in_para_HB[i+21].strip().split()[j])
    return p_par, p_anti, p_antihb, p_antinhb, p_parhb


def get_lambda_by_index(i, j, lambda_i):


    lambda_table = [[1.37, 1.36, 1.17],
                    [3.89, 3.50, 3.52],
                    [0.00, 3.47, 3.62]]
    if abs(j-i) >= 4 and abs(j-i) < 18:
        return lambda_table[lambda_i][0]
    elif abs(j-i) >= 18 and abs(j-i) < 45:
        return lambda_table[lambda_i][1]
    elif abs(j-i) >= 45:
        return lambda_table[lambda_i][2]
    else:
        return 0

def get_alpha_by_index(i, j, alpha_i):
    alpha_table = [[1.30, 1.30, 1.30],
                    [1.32, 1.32, 1.32],
                    [1.22, 1.22, 1.22],
                    [0.00, 0.33, 0.33],
                    [0.00, 1.01, 1.01]]
    if abs(j-i) >= 4 and abs(j-i) < 18:
        return alpha_table[alpha_i][0]
    elif abs(j-i) >= 18 and abs(j-i) < 45:
        return alpha_table[alpha_i][1]
    elif abs(j-i) >= 45:
        return alpha_table[alpha_i][2]
    else:
        return 0






def get_Lambda_2(i, j, p_par, p_anti, p_antihb, p_antinhb, p_parhb, a):
    Lambda = get_lambda_by_index(i, j, 1)
    Lambda += -0.5*get_alpha_by_index(i, j, 0)*p_antihb[a[i], a[j]][0]
    Lambda += -0.25*get_alpha_by_index(i, j, 1)*(p_antinhb[a[i+1], a[j-1]][0] + p_antinhb[a[i-1], a[j+1]][0])
    Lambda += -get_alpha_by_index(i, j, 2)*(p_anti[a[i]] + p_anti[a[j]])
    return Lambda

def get_Lambda_3(i, j, p_par, p_anti, p_antihb, p_antinhb, p_parhb, a):
    Lambda = get_lambda_by_index(i, j, 2)
    Lambda += -get_alpha_by_index(i, j, 3)*p_parhb[a[i+1], a[j]][0]
    Lambda += -get_alpha_by_index(i, j, 4)*p_par[a[i+1]]
    Lambda += -get_alpha_by_index(i, j, 3)*p_par[a[j]]
    return Lambda


def pap_term_1(oa, k_pap=4.184, dis_i_to_i4=1.2, forceGroup=28):
    print("pap_1 term ON")
    # dis_i_to_i4 should be in nm, it disfavor hydrogen bond when ca_i and ca_i+4 are 1.2 nm apart away.
    nres, ca = oa.nres, oa.ca
    # r0 = 2.0 # nm
    r0 = 0.8  # nm
    eta_pap = 70  # nm^-1
    gamma_aph = 1.0
    gamma_ap = 0.4
    gamma_p = 0.4

    gamma_1 = np.zeros((nres, nres))
    gamma_2 = np.zeros((nres, nres))
    for i in range(nres):
        for j in range(nres):
            resId1 = i
            chain1 = inWhichChain(resId1, oa.chain_ends)
            resId2 = j
            chain2 = inWhichChain(resId2, oa.chain_ends)
            gamma_1[i][j] = get_pap_gamma_APH(i, j, chain1, chain2, gamma_aph)
            gamma_2[i][j] = get_pap_gamma_AP(i, j, chain1, chain2, gamma_ap)

    constraint_i_and_i4 = f"0.5*(1+tanh({eta_pap}*(distance(a1,a2)-{dis_i_to_i4})))"
    constraint_i_and_i4 = 1
    pap_function = f"-{k_pap}*(gamma_1(donor_idx,acceptor_idx)+gamma_2(donor_idx,acceptor_idx))\
                        *0.5*(1+tanh({eta_pap}*({r0}-distance(a1,d1))))\
                        *0.5*(1+tanh({eta_pap}*({r0}-distance(a2,d2))))\
                        *{constraint_i_and_i4}"

    # pap_function = f"-{k_pap}*(gamma_1(donor_idx,acceptor_idx))\
    #                     *0.5*(1+tanh({eta_pap}*({r0}-distance(a1,d1))))\
    #                     *0.5*(1+tanh({eta_pap}*({r0}-distance(a2,d2))))"

    # pap_function = f"-{k_pap}*distance(a1,d1)"
    pap = CustomHbondForce(pap_function)
    pap.addPerDonorParameter("donor_idx")
    pap.addPerAcceptorParameter("acceptor_idx")
    pap.addTabulatedFunction("gamma_1", Discrete2DFunction(nres, nres, gamma_1.T.flatten()))
    pap.addTabulatedFunction("gamma_2", Discrete2DFunction(nres, nres, gamma_2.T.flatten()))
    # print(ca)
    # count = 0;
    i = 0
    # pap.addAcceptor(ca[0], ca[4], -1, [0])
    # pap.addAcceptor(ca[20], ca[8], -1, [4])
    # pap.addDonor(ca[20], ca[0], -1, [4])
    for i in range(nres):
        if not isChainEnd(i, oa.chain_ends, n=4):
            pap.addAcceptor(ca[i], ca[i+4], -1, [i])

        # if i > 13 and not isChainStart(i, oa.chain_starts, n=4):
        if not isChainStart(i, oa.chain_starts, n=4):
            pap.addDonor(oa.n[i], oa.n[i-4], -1, [i])

    pap.setNonbondedMethod(CustomHbondForce.CutoffNonPeriodic)
    pap.setCutoffDistance(1.0)
    # print(count)
    pap.setForceGroup(forceGroup)
    return pap

def pap_term_2(oa, k_pap=4.184, dis_i_to_i4=1.2, forceGroup=28):
    print("pap_2 term ON")
    nres, ca = oa.nres, oa.ca
    # r0 = 2.0 # nm
    r0 = 0.8  # nm
    eta_pap = 70  # nm^-1
    gamma_aph = 1.0
    gamma_ap = 0.4
    gamma_p = 0.4

    gamma_3 = np.zeros((nres, nres))
    for i in range(nres):
        for j in range(nres):
            resId1 = i
            chain1 = inWhichChain(resId1, oa.chain_ends)
            resId2 = j
            chain2 = inWhichChain(resId2, oa.chain_ends)
            gamma_3[i][j] = get_pap_gamma_P(i, j, chain1, chain2, gamma_p)
            # gamma_3[j][i] = gamma_3[i][j]

    # constraint_i_and_i4 = f"0.5*(1+tanh({eta_pap}*(distance(a1,a2)-{dis_i_to_i4})))"
    constraint_i_and_i4 = 1

    pap_function = f"-{k_pap}*gamma_3(donor_idx,acceptor_idx)\
                        *0.5*(1+tanh({eta_pap}*({r0}-distance(a1,d1))))\
                        *0.5*(1+tanh({eta_pap}*({r0}-distance(a2,d2))))\
                        *{constraint_i_and_i4}"
    pap = CustomHbondForce(pap_function)
    pap.addPerDonorParameter("donor_idx")
    pap.addPerAcceptorParameter("acceptor_idx")
    pap.addTabulatedFunction("gamma_3", Discrete2DFunction(nres, nres, gamma_3.T.flatten()))
    # print(oa.n)
    # count = 0;
    for i in range(nres):
        if not isChainEnd(i, oa.chain_ends, n=13):
            pap.addAcceptor(ca[i], ca[i+4], -1, [i])
        if not isChainEnd(i, oa.chain_ends, n=4):
            # pap.addDonor(ca[i], ca[i+4], -1, [i])
            if oa.n[i] != -1 and oa.n[i+4] != -1:
                pap.addDonor(oa.n[i], oa.n[i+4], -1, [i])

    pap.setNonbondedMethod(CustomHbondForce.CutoffNonPeriodic)
    pap.setCutoffDistance(1.0)
    # print(count)
    pap.setForceGroup(forceGroup)
    return pap

In [55]:
parser = PDBParser()
structure = parser.get_structure("X", "/Users/weilu/Research/server/nov_2019/beta_pap_term_energy/openMM_chainA/chainA-cleaned.pdb")

all_res = list(structure.get_residues())
chains = [res.get_full_id()[2] for res in all_res]
n = len(all_res)

In [56]:
def get_lambda_by_index(i, j, lambda_i):


    lambda_table = [[1.37, 1.36, 1.17],
                    [3.89, 3.50, 3.52],
                    [0.00, 3.47, 3.62]]
    if abs(j-i) >= 4 and abs(j-i) < 18:
        return lambda_table[lambda_i][0]
    elif abs(j-i) >= 18 and abs(j-i) < 45:
        return lambda_table[lambda_i][1]
    elif abs(j-i) >= 45:
        return lambda_table[lambda_i][2]
    else:
        return 0

def get_alpha_by_index(i, j, alpha_i):
    alpha_table = [[1.30, 1.30, 1.30],
                    [1.32, 1.32, 1.32],
                    [1.22, 1.22, 1.22],
                    [0.00, 0.33, 0.33],
                    [0.00, 1.01, 1.01]]
    if abs(j-i) >= 4 and abs(j-i) < 18:
        return alpha_table[alpha_i][0]
    elif abs(j-i) >= 18 and abs(j-i) < 45:
        return alpha_table[alpha_i][1]
    elif abs(j-i) >= 45:
        return alpha_table[alpha_i][2]
    else:
        return 0

In [57]:
nres = n
lambda_1 = np.zeros((nres, nres))
for i in range(nres):
    for j in range(nres):
        lambda_1[i][j] = get_lambda_by_index(i, j, 0)

In [63]:
r_ON = .298 * 10
sigma_NO = .068 * 10
r_OH = .206 * 10
sigma_HO = .076 * 10
eta_beta_1 = 10.0 * 0.1
eta_beta_2 = 5.0 * 0.1
# r_HB_c = 0.4
r_HB_c = 1.2 * 10
V_beta1 = 0
for i in range(n):
    for j in range(n):
#         if abs(i-j) < 2:
#             continue
        chain_i = chains[i]
        chain_j = chains[j]
        if all_res[j].get_resname() == "PRO":
            continue
        r_Oi_Nj = all_res[i]["O"] - all_res[j]["N"]
        r_Oi_Hj = all_res[i]["O"] - all_res[j]["H"]
#         if all_res[i].get_resname() == "PRO":
#             continue
#         r_Oi_Nj = all_res[j]["O"] - all_res[i]["N"]
#         r_Oi_Hj = all_res[j]["O"] - all_res[i]["H"]
        theta_ij = np.exp(-(r_Oi_Nj-r_ON)**2/(2*sigma_NO**2)-(r_Oi_Hj-r_OH)**2/(2*sigma_HO**2))
        if i < 2 or i >= n-2:
            nu_i = 1
        else:
            r_CAim2_CAip2 = all_res[i-2]["CA"] - all_res[i+2]["CA"]
            nu_i = 0.5*(1+tanh(eta_beta_1*(r_CAim2_CAip2-r_HB_c)))
        if j < 2 or j >= n-2:
            nu_j = 1
        else:
            r_CAjm2_CAjp2 = all_res[j-2]["CA"] - all_res[j+2]["CA"]
            nu_j = 0.5*(1+tanh(eta_beta_2*(r_CAjm2_CAjp2-r_HB_c)))
        V_ij = lambda_1[i][j] * theta_ij * nu_i * nu_j
        V_beta1 += V_ij
#         if abs(V_ij) > 1e-3:
#             print(i, j, V_ij,lambda_1[i][j], theta_ij, nu_i, nu_j, r_Oi_Nj, r_Oi_Hj)
V_beta1


Out[63]:
30.895846092148226

In [ ]:


In [60]:
res = all_res[9]

In [61]:
res.get_resname()


Out[61]:
'VAL'

In [ ]:
res.get_atoms()

In [51]:
j


Out[51]:
11

In [52]:
i


Out[52]:
1

In [47]:
i


Out[47]:
0

In [ ]:


In [62]:
list(res.get_atoms())


Out[62]:
[<Atom N>,
 <Atom H>,
 <Atom CA>,
 <Atom HA>,
 <Atom C>,
 <Atom O>,
 <Atom CB>,
 <Atom HB>,
 <Atom CG1>,
 <Atom HG11>,
 <Atom HG12>,
 <Atom HG13>,
 <Atom CG2>,
 <Atom HG21>,
 <Atom HG22>,
 <Atom HG23>]

In [10]:
res = all_res[0]

In [12]:
res.get_resname()


Out[12]:
'GLN'

In [16]:
# chains = pd.read_csv(io.StringIO(a), sep=" ", names=["res", "chain"])

all_res = list(structure.get_residues())
chains = [res.get_full_id()[2] for res in all_res]
n = len(all_res)

In [ ]:
# APH term

In [46]:
def get_pap_gamma_APH(donor_idx, acceptor_idx, chain_i, chain_j, gamma_APH):
    # if chain_i == chain_j and abs(j-i) < 13 or abs(j-i) > 16:
    # if abs(j-i) < 13 or abs(j-i) > 16:
    # if i-j < 13 or i-j > 16:
    # if (donor_idx - acceptor_idx >= 13 and donor_idx - acceptor_idx <= 16) or chain_i != chain_j:
    if (donor_idx - acceptor_idx >= 13 and donor_idx - acceptor_idx <= 16):
        return gamma_APH
    else:
        return 0
    # if donor_idx - acceptor_idx < 13 or donor_idx - acceptor_idx > 16:
    #     return 0
    # else:
    #     return gamma_APH

In [48]:
eta = 7
r0 = 8
a_ = 0 
for i in range(n-4):
    for j in range(4, n):
        chain_i = chains.iloc[i][1]
        chain_j = chains.iloc[j][1]
        gamma = get_pap_gamma_APH(j, i, chain_i, chain_j, 1)
        dis = all_res[i]["CA"] - all_res[j]["CA"]
        dis_p4 = all_res[i+4]["CA"] - all_res[j-4]["CA"]
        rho1 = 0.5*(1+np.tanh(eta*(r0-dis)))
        rho2 = 0.5*(1+np.tanh(eta*(r0-dis_p4)))
        show = -gamma * rho1 * rho2
        a_ += show
#         if show > 1e-6:
#             print(i, j, show, rho1, rho2)
#         if i == 0:
#             print(i, j, show, rho1, rho2)
#     break
print(a_)


56.30805174575171

In [ ]:
# P term

In [44]:
def get_pap_gamma_AP(donor_idx, acceptor_idx, chain_i, chain_j, gamma_AP):
    if (donor_idx - acceptor_idx >= 17) or chain_i != chain_j:
        #     if (donor_idx - acceptor_idx >= 17):
        return gamma_AP
    else:
        return 0

In [45]:
eta = 7
r0 = 8
a_ = 0 
for i in range(n-4):
    for j in range(4, n):
        chain_i = chains.iloc[i][1]
        chain_j = chains.iloc[j][1]
        gamma = get_pap_gamma_AP(j, i, chain_i, chain_j, 0.4)
        dis = all_res[i]["CA"] - all_res[j]["CA"]
        dis_p4 = all_res[i+4]["CA"] - all_res[j-4]["CA"]
        rho1 = 0.5*(1+np.tanh(eta*(r0-dis)))
        rho2 = 0.5*(1+np.tanh(eta*(r0-dis_p4)))
        show = gamma * rho1 * rho2
        a_ += show
#         if show > 1e-6:
#             print(i, j, show, rho1, rho2)
#         if i == 0:
#             print(i, j, show, rho1, rho2)
#     break
print(a_)


90.8770620506384

In [ ]:
# AP term

In [54]:
print(a_)


88.6965610402261

In [60]:
def get_pap_gamma_P(donor_idx, acceptor_idx, chain_i, chain_j, gamma_P):
    if (donor_idx - acceptor_idx >= 9) or chain_i != chain_j:
        return gamma_P
    else:
        return 0

In [61]:
eta = 7
r0 = 8
a_ = 0 
for i in range(n-4):
    for j in range(n-4):
        chain_i = chains.iloc[i][1]
        chain_j = chains.iloc[j][1]
        gamma = get_pap_gamma_P(j, i, chain_i, chain_j, 0.4)
        dis = all_res[i]["CA"] - all_res[j]["CA"]
        dis_p4 = all_res[i+4]["CA"] - all_res[j+4]["CA"]
        rho1 = 0.5*(1+np.tanh(eta*(r0-dis)))
        rho2 = 0.5*(1+np.tanh(eta*(r0-dis_p4)))
        show = gamma * rho1 * rho2
        a_ += show
#         if show > 1e-6:
#             print(i, j, show)

In [62]:
print(a_)


22.350501757230532

In [56]:
eta = 7
r0 = 8
a_ = 0 
for i in range(n-4):
    for j in range(i+9, n-4):
        gamma = 0.4
        dis = all_res[i]["CA"] - all_res[j]["CA"]
        dis_p4 = all_res[i+4]["CA"] - all_res[j+4]["CA"]
        rho1 = 0.5*(1+np.tanh(eta*(r0-dis)))
        rho2 = 0.5*(1+np.tanh(eta*(r0-dis_p4)))
        show = gamma * rho1 * rho2
        a_ += show
#         if show > 1e-6:
#             print(i, j, show)

In [57]:
print(a_)


22.3505017569809

In [30]:
-22.983010/22.350501756


Out[30]:
-1.0282995098233176

In [7]:
from pdbfixer import PDBFixer

In [2]:
pdbfixer.absolute_import


Out[2]:
_Feature((2, 5, 0, 'alpha', 1), (3, 0, 0, 'alpha', 0), 16384)

In [64]:
fixer = PDBFixer(filename="/Users/weilu/Research/server/nov_2019/benchmark/20191025_benchmark/original/6d3r.pdb")

In [8]:
fixer.findMissingResidues()
# add missing residues in the middle of a chain, not ones at the start or end of the chain.
chains = list(fixer.topology.chains())
keys = fixer.missingResidues.keys()

In [67]:
keys


Out[67]:
dict_keys([])

In [9]:
fixer = PDBFixer(pdbid='1VII')
fixer.findMissingResidues()
missing_residues = fixer.missingResidues
missing_residues


Out[9]:
{}

In [69]:


In [11]:
fixer = pdbfixer.PDBFixer(pdbid="5uak")
fixer.findMissingResidues()
keys = fixer.missingResidues.keys()
keys


Out[11]:
dict_keys([(0, 0), (0, 398), (0, 605), (0, 645), (0, 909), (0, 1139)])

In [71]:
6D3R


Out[71]:
{}

In [72]:
fixer.missingResidues.keys()


Out[72]:
dict_keys([])

In [4]:
fixer = pdbfixer.PDBFixer(filename="/Users/weilu/Research/server/nov_2019/benchmark/20191025_benchmark/original/6d3r.pdb")
fixer.findMissingResidues()
keys = fixer.missingResidues.keys()
keys


Out[4]:
dict_keys([])

In [73]:
fixer = PDBFixer(filename="/Users/weilu/Research/server/nov_2019/benchmark/20191025_benchmark/original/6jdq.pdb")
fixer.findMissingResidues()
keys = fixer.missingResidues.keys()
keys


Out[73]:
dict_keys([(0, 0), (0, 136), (0, 443), (0, 1069)])

In [74]:
fixer.missingResidues


Out[74]:
{(0, 0): ['MET', 'ALA', 'ALA', 'PHE', 'LYS', 'PRO', 'ASN'],
 (0, 136): ['ARG', 'LYS'],
 (0, 443): ['TYR', 'GLY', 'LYS', 'LYS', 'ASN'],
 (0, 1069): ['GLU', 'HIS', 'HIS', 'HIS', 'HIS', 'HIS', 'HIS', 'HIS', 'HIS']}

In [ ]: