In [2]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.PDBParser import PDBParser

# from small_script.myFunctions import *
# from pyCodeLib import *

%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180)    #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2

In [3]:
plt.rcParams['figure.figsize'] = [16.18033, 10]    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100

In [21]:
def getSeqFromFasta(fastaFile):
    # "/Users/weilu/Research/server/aug_2019/hybrid_protein_simulation/setup/1pv6/1pv6.fasta"
    with open(fastaFile) as f:
        a = f.readlines()
    seq = ""
    for line in a:
        if line[0] == ">":
            continue
        seq += line.strip()
    return seq

def getContactMapFromPDB(pdbFile, n):
    # n = 472
    # pdbFile = "/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/5xr8_clean.pdb"
    cutoff = 9.5
    MAX_OFFSET = 0
    parser = PDBParser()
    structure = parser.get_structure('target', pdbFile)
    contact_table = np.ones((n,n)) * 99
    all_residues = list(structure.get_residues())
    for idx_i, res1 in enumerate(all_residues):
        i = res1.get_id()[1] - 1 
        for idx_j, res2 in enumerate(all_residues):
            j = res2.get_id()[1] - 1
            contact_table[i][j] = res1["CA"]-res2["CA"]

    data = (contact_table < cutoff)
    remove_band = np.eye(n)
    for i in range(1, MAX_OFFSET):
        remove_band += np.eye(n, k=i)
        remove_band += np.eye(n, k=-i)
    data[remove_band==1] = 0
    return data

def get_contactFromDMP(fileLocation, n, threshold=0.2):
    a = np.zeros((n,n))
    c_list = []
    with open(fileLocation, "r") as f:
        # for i in range(9):
        #     next(f)
        for line in f:
            # print(line)
            try:
                i,j,_,_,_,p = line.split(" ")
                # print(i,j,p)
                a[int(i)-1,int(j)-1] = float(p)
                a[int(j)-1,int(i)-1] = float(p)
                if float(p) > threshold:
                    c_list.append([int(i),int(j),float(p)])
            except Exception as e:
                print(e)
                pass
    return a, np.array(c_list)

In [25]:
fastaFile = "/Users/weilu/Research/server/jun_week1_2020/no_DNA_protein_DNA_benchmark/setups/1a36/1a36.fasta"
seq = getSeqFromFasta(fastaFile)

n = len(seq)

pdbFile = "/Users/weilu/Research/server/jun_week1_2020/no_DNA_protein_DNA_benchmark/setups/1a36/1a36.pdb"
data = getContactMapFromPDB(pdbFile, n)

fileLocation = "/Users/weilu/Research/server/jun_week1_2020/no_DNA_protein_DNA_benchmark/DMP/1a36/1a36.deepmetapsicov.con"
a, _ = get_contactFromDMP(fileLocation, n)

In [8]:


In [45]:
plt.rcParams['figure.figsize'] = 1.5*np.array([16.18033, 10])    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100

In [56]:
np.tri(n, k=-2)


Out[56]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 1., 0., 0.]])

In [60]:
# data_s = data.astype(float)
threshold=0.5

native = data.astype(float)
predicted = (a>threshold).astype(float)

combined = native + predicted * 2
upper = combined * np.tri(n, k=-6)

# data_s = data.astype(float)
# t_s = (a>0.5).astype(float)
# combined = data_s + t_s * 2
combined = native + predicted * 2

lower = combined * (1 - np.tri(n, k=6))
combined = upper + lower
from matplotlib import colors
# red is in crystal but not in predicted.
# blue is in predicted but not in crystal
# black is in both
cmap = colors.ListedColormap(['white', '#e6194B', '#4363d8', '#bfef45'])
cmap = colors.ListedColormap(['white', '#fabed4', '#42d4f4', '#000000'])
# cmap = colors.ListedColormap(['white', 'red', 'green', 'blue'])
# cmap = colors.ListedColormap(['white', '#bfef45', '#4363d8', '#e6194B'])
# cmap = colors.ListedColormap(['white', 'red', 'blue', '#bfef45'])

bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
plt.title("Red is only in crystal, Blue is only in Predicted(upper), DMP in lower part")
# plt.savefig("/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/contact.png")


Out[60]:
Text(0.5, 1.0, 'Red is only in crystal, Blue is only in Predicted(upper), DMP in lower part')

In [48]:


In [49]:
fileLocation = "/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/cannabinoid_receptor.deepmetapsicov.con"
a, _ = get_contactFromDMP(fileLocation, 472)
n = 472
pdbFile = "/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/5xr8_clean.pdb"
data = getContactMapFromPDB(pdbFile, n)

In [50]:
pdbFile = "/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/lastFrame.pdb"
lastFrame = getContactMapFromPDB(pdbFile, n)

In [ ]:
data_s = data.astype(float)
t_s = lastFrame.astype(float)
combined = data_s + t_s * 2
from matplotlib import colors
# red is in crystal but not in predicted.
# blue is in predicted but not in crystal
# black is in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
plt.title("Red is only in crystal, Blue is only in Predicted")
# plt.savefig("/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/contact.png")

In [62]:
data_s = (a>0.5).astype(float)
t_s = lastFrame.astype(float)
combined = data_s + t_s * 2
from matplotlib import colors
# red is in crystal but not in predicted.
# blue is in predicted but not in crystal
# black is in both
cmap = colors.ListedColormap(['white', 'red', 'green', 'blue'])
# cmap = "Accent"
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
plt.title("Red is only in crystal, Blue is only in Predicted")
# plt.savefig("/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/contact.png")


Out[62]:
Text(0.5, 1.0, 'Red is only in crystal, Blue is only in Predicted')

In [80]:



Out[80]:
array([[0., 1., 1., ..., 1., 1., 1.],
       [0., 0., 1., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [83]:
data_s = data.astype(float)
t_s = lastFrame.astype(float)
combined = data_s + t_s * 2
upper = combined * np.tri(n, k=-1)

data_s = data.astype(float)
t_s = (a>0.5).astype(float)
combined = data_s + t_s * 2
lower = combined * (1 - np.tri(n, k=0))
combined = upper + lower
from matplotlib import colors
# red is in crystal but not in predicted.
# blue is in predicted but not in crystal
# black is in both
cmap = colors.ListedColormap(['white', '#e6194B', '#4363d8', '#bfef45'])
# cmap = colors.ListedColormap(['white', 'red', 'blue', '#bfef45'])

bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
plt.title("Red is only in crystal, Blue is only in Predicted(upper), DMP in lower part")
plt.savefig("/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/contact.png")



In [52]:
data_s = data.astype(float)
t_s = (a>0.5).astype(float)
combined = data_s + t_s * 2
from matplotlib import colors
# red is in crystal but not in predicted.
# blue is in predicted but not in crystal
# black is in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
plt.title("Red is only in crystal, Blue is only in Predicted")
# plt.savefig("/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/contact.png")


Out[52]:
Text(0.5, 1.0, 'Red is only in crystal, Blue is only in Predicted')

In [34]:
from matplotlib import colors
# red is in crystal but not in predicted.
# blue is in predicted but not in crystal
# black is in both
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)
plt.title("Red is only in crystal, Blue is only in Predicted")
# plt.savefig("/Users/weilu/Research/server/oct_2019/draw_contact_for_DMP/contact.png")



In [ ]:
plt.imshow(a[300:,150:250], origin="bottom")
plt.colorbar()

In [9]:
plt.imshow(a)


Out[9]:
<matplotlib.image.AxesImage at 0x1a249b42e8>

In [7]:
len(a[5])


Out[7]:
51

In [22]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_A = data
seq_A = getSeqFromRaptorXContact(fileLocation)

fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
data_B = data
seq_B = getSeqFromRaptorXContact(fileLocation)

In [86]:
import textwrap
header = '''\
PFRMAT RR
TARGET {}
AUTHOR RaptorX-Contact
METHOD deep dilated residual networks (one variant of deep CNN). Consult jinboxu@gmail.com for details.
MODEL 1
'''
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/{pdb}.txt"
with open(fileLocation, "w") as out:
    out.write(header.format(pdb))
    out.write("\n".join(textwrap.wrap(seq, width=50))+"\n")
    for index, d in data_A.iterrows():
    #     print(index)
        i = int(d["i"])
        j = int(d["j"])
        p = round(d["p"], 8)
        s = int(d["s"])
        ss = int(d["ss"])
        out.write(f"{i} {j} {s} {ss} {p}\n")
    for index, d in data_B.iterrows():
    #     print(index)
        i = int(d["i"]) + len(seq_A)
        j = int(d["j"]) + len(seq_A)
        p = round(d["p"], 8)
        s = int(d["s"])
        ss = int(d["ss"])
        out.write(f"{i} {j} {s} {ss} {p}\n")
    out.write("END\n")

In [80]:
for index, d in data_B.iterrows():
#     print(index)
    i = int(d["i"]) - 1 + len(seq_A)
    j = int(d["j"]) - 1 + len(seq_A)
    p = round(d["p"], 8)
    s = int(d["s"])
    ss = int(d["ss"])

In [84]:
data_A


Out[84]:
i j s ss p
0 34 59 0.0 8.0 0.997731
1 38 56 0.0 8.0 0.980999
2 41 52 0.0 8.0 0.973823
3 38 59 0.0 8.0 0.966630
4 27 66 0.0 8.0 0.962626
5 34 62 0.0 8.0 0.961573
6 35 59 0.0 8.0 0.916884
7 41 55 0.0 8.0 0.895539
8 31 63 0.0 8.0 0.874988
9 30 62 0.0 8.0 0.842116
10 20 73 0.0 8.0 0.814816
11 27 69 0.0 8.0 0.767721
12 37 55 0.0 8.0 0.758386
13 83 160 0.0 8.0 0.728676
14 42 52 0.0 8.0 0.727045
15 23 69 0.0 8.0 0.709704
16 83 161 0.0 8.0 0.701026
17 37 276 0.0 8.0 0.693589
18 31 62 0.0 8.0 0.680381
19 31 66 0.0 8.0 0.678132
20 37 275 0.0 8.0 0.675751
21 41 48 0.0 8.0 0.668435
22 109 195 0.0 8.0 0.657830
23 217 274 0.0 8.0 0.654746
24 113 199 0.0 8.0 0.654502
25 105 192 0.0 8.0 0.653025
26 109 196 0.0 8.0 0.652981
27 102 188 0.0 8.0 0.652657
28 112 196 0.0 8.0 0.650273
29 106 192 0.0 8.0 0.650126
... ... ... ... ... ...
36555 32 53 0.0 8.0 0.000235
36556 40 63 0.0 8.0 0.000228
36557 35 53 0.0 8.0 0.000224
36558 30 56 0.0 8.0 0.000219
36559 39 54 0.0 8.0 0.000218
36560 42 62 0.0 8.0 0.000209
36561 39 53 0.0 8.0 0.000201
36562 36 65 0.0 8.0 0.000194
36563 40 58 0.0 8.0 0.000191
36564 36 56 0.0 8.0 0.000183
36565 41 61 0.0 8.0 0.000179
36566 36 64 0.0 8.0 0.000146
36567 28 60 0.0 8.0 0.000140
36568 42 61 0.0 8.0 0.000136
36569 40 60 0.0 8.0 0.000136
36570 40 62 0.0 8.0 0.000133
36571 36 53 0.0 8.0 0.000128
36572 32 61 0.0 8.0 0.000121
36573 40 57 0.0 8.0 0.000088
36574 36 60 0.0 8.0 0.000080
36575 32 57 0.0 8.0 0.000078
36576 39 62 0.0 8.0 0.000075
36577 36 57 0.0 8.0 0.000066
36578 40 61 0.0 8.0 0.000064
36579 39 57 0.0 8.0 0.000064
36580 32 60 0.0 8.0 0.000058
36581 32 56 0.0 8.0 0.000049
36582 39 58 0.0 8.0 0.000049
36583 39 61 0.0 8.0 0.000045
36584 36 61 0.0 8.0 0.000044

36585 rows × 5 columns


In [79]:
round(p, 8)


Out[79]:
0.0023355

In [23]:
seq = seq_A + seq_B

In [26]:
seq


Out[26]:
'SMKEPCFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQSMTDKYRLHLSVADLLFVITLPFWAVDAVANWYFGNFLCKAVHVIYTVNLYSSVWILAFISLDRYLAIVHATNSQRPRKLLAEKVVYVGVWIPALLLTIPDFIFANVSEADDRYICCRFYPNDLWVVVFQFQHIMVGLILPGIVILSCYCIIISKLSHSGHQKRKALKPTVILILAFFACWLPYYIGISIDSFILLEIIKQGCEFENTVHKWISITEALAFFHCCLNPILYALGASCHRPDKCCLGYQKRPLPQVLLSSWYPTSQLCSKPGVIFLTKRGRQVCADKSKDWVKKLMQQLPVTA'

In [27]:
n = len(seq)
t = np.zeros((n,n))
for index, d in data_A.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p
for index, d in data_B.iterrows():
#     print(index)
    i = int(d["i"]) - 1 + len(seq_A)
    j = int(d["j"]) - 1 + len(seq_A)
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p

In [28]:
plt.imshow((t>0.5).astype(float), origin="bottom")
plt.colorbar()


Out[28]:
<matplotlib.colorbar.Colorbar at 0x1a184bfb38>

In [29]:
def getSeqFromRaptorXContact(fileLocation):
    with open(fileLocation) as f:
        a = f.readlines()
    i = 4
    seq = ""
    assert a[i] == "MODEL 1\n"
    i += 1
    while True:
        line = a[i]
        if line[0].isdigit():
            break
        i += 1
        seq += line.strip()
    # print(i)
    return seq
def getContactMapFromPDB(pdbFile):
    cutoff = 9.5
    MAX_OFFSET = 6
    parser = PDBParser()
    structure = parser.get_structure('target', pdbFile)
    all_residues = list(structure.get_residues())
    n = len(all_residues)
    contact_table = np.zeros((n,n))
    print(pdb, n)
    for i, res1 in enumerate(all_residues):
        for j, res2 in enumerate(all_residues):
            contact_table[i][j] = res1["CA"]-res2["CA"]

    data = (contact_table < cutoff)
    remove_band = np.eye(n)
    for i in range(1, MAX_OFFSET):
        remove_band += np.eye(n, k=i)
        remove_band += np.eye(n, k=-i)
    data[remove_band==1] = 0
    return data

In [30]:
data = getContactMapFromPDB("/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb")


4rws 346

In [70]:
data_s = data.astype(int)
t_s = (t>0.55).astype(float)

In [72]:
fig, ax = plt.subplots()
# ax.imshow(data_s+1, origin="bottom")
ax.imshow(t, origin="bottom")


Out[72]:
<matplotlib.image.AxesImage at 0x1a24ae82b0>

In [60]:
combined = data_s + t_s * 2

In [63]:
from matplotlib import colors
cmap = colors.ListedColormap(['white', 'red', 'blue', 'black'])
bounds=[-1,0.1, 1.1, 2.1, 3.1]
norm = colors.BoundaryNorm(bounds, cmap.N)
plt.imshow(combined, origin="bottom", cmap=cmap, norm=norm)


Out[63]:
<matplotlib.image.AxesImage at 0x1a235698d0>

In [87]:
pdbFile = "/Users/weilu/Research/server/jul_2019/two_chains/cleaned_pdbs/4rws.pdb"
parser = PDBParser()
structure = parser.get_structure('target', pdbFile)

In [88]:
model = structure[0]

In [90]:
a = list(model.get_chains())

In [92]:
c = a[0]

In [94]:
c.id


Out[94]:
'A'

In [95]:
c = "ALL"

In [99]:
"ABC" is not "ALL"


Out[99]:
True

In [93]:
c.get_id()


Out[93]:
'A'

In [55]:
fig, ax = plt.subplots()
ax.imshow(data_s+1, origin="bottom")
ax.imshow(t_s, origin="bottom")


Out[55]:
<matplotlib.image.AxesImage at 0x1a21a9a9b0>

In [43]:
plt.imshow(data_s, origin="bottom")


Out[43]:
<matplotlib.image.AxesImage at 0x1a1cd67828>

In [44]:
# plt.imshow(-data_s, origin="bottom")
plt.imshow(t_s, origin="bottom")
# plt.colorbar()


Out[44]:
<matplotlib.image.AxesImage at 0x1a1d144e48>

In [16]:
346


Out[16]:
70

In [17]:
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/B.txt"

In [20]:
len(seq)


Out[20]:
276

In [18]:
pdb = "4rws"
fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/A.txt"
data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
data["i"] = data["i"].astype(int)
data["j"] = data["j"].astype(int)
n = int(info.query(f"Protein =='{pdb}'")["Length"])
t = np.zeros((n,n))
for index, d in data.iterrows():
#     print(index)
    i = int(d["i"]) - 1
    j = int(d["j"]) - 1 
    p = d["p"]
#     print(i,j,p)
    t[i,j] = p
    t[j,i] = p


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-f0b1e6216f4f> in <module>
      4 data["i"] = data["i"].astype(int)
      5 data["j"] = data["j"].astype(int)
----> 6 n = int(info.query(f"Protein =='{pdb}'")["Length"])
      7 t = np.zeros((n,n))
      8 for index, d in data.iterrows():

NameError: name 'info' is not defined

In [ ]:
def getContactMap(pdb):
    fileLocation = f"/Users/weilu/Research/raptorX/{pdb}/contactmap.txt"
    data = pd.read_csv(fileLocation, skiprows=6, sep="\s+", names=["i","j","s", "ss","p"]).dropna().reset_index(drop=True)
    data["i"] = data["i"].astype(int)
    data["j"] = data["j"].astype(int)
    n = int(info.query(f"Protein =='{pdb}'")["Length"])
    t = np.zeros((n,n))
    for index, d in data.iterrows():
    #     print(index)
        i = int(d["i"]) - 1
        j = int(d["j"]) - 1 
        p = d["p"]
    #     print(i,j,p)
        t[i,j] = p
        t[j,i] = p