In [1]:
import sys
sys.path.append("/projects/pw8/wl45/open3spn2")
sys.path.append("/projects/pw8/wl45/openawsem")
sys.path.append("/Users/weilu/open3spn2")
sys.path.append("/Users/weilu/openmmawsem")
sys.path.append("C:/Users/luwei/Documents/GitHub/open3spn2")
sys.path.append("C:/Users/luwei/Documents/GitHub/openawsem")


import open3SPN2
import ffAWSEM
import time
import simtk.openmm
import simtk.openmm.app

In [10]:
import scipy.spatial.distance as sdist
import pandas as pd

In [3]:
pdb_file = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a36_repetition_0_clean.pdb"
seq_file = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a36_repetition_0_protein.seq"
with open(seq_file) as ps:
    protein_sequence_one=ps.readlines()[0]
protein=ffAWSEM.Protein.fromCoarsePDB(pdb_file,sequence=protein_sequence_one)

In [6]:
data = protein.atoms

In [27]:
protein_resNames = ["NGP", "IGL", "IPR", "NTER", "CTER"]
DNA_resNames = ["DA", "DC", "DT", "DG"]
group1_index = []
group2_index = []
for i, line in data.iterrows():
    resname = line["resname"]
    index = line["serial"]
    name = line["name"]
    if resname in DNA_resNames and name == "P":
        group1_index.append(index)
    if resname in protein_resNames and name == "CA":
        group2_index.append(index)

In [11]:
#Calculate native distances
CA_atoms=protein.atoms[protein.atoms.name=='CA']
P_atoms=protein.atoms[protein.atoms.name=='P']
d=sdist.cdist(CA_atoms[['x','y','z']],P_atoms[['x','y','z']])/10 #Distance in nanometers
d_sq=pd.DataFrame(d,index=CA_atoms.index,columns=P_atoms.index)
d=d_sq.copy()
d['Protein']=d.index
d=d.melt(id_vars=['Protein'])
d=d.rename(columns={'variable': 'DNA', 'value': 'distance'})

In [23]:
d.query("distance < 1")


Out[23]:
Protein DNA distance
352 2209 3 0.993812
353 2215 3 0.982950
354 2221 3 0.909546
356 2233 3 0.853868
357 2238 3 0.817565
... ... ... ...
20757 2315 116 0.692107
20758 2321 116 0.960002
20759 2327 116 0.996391
21132 1272 119 0.958682
21683 1272 122 0.901890

268 rows × 3 columns


In [17]:
d["DNA"].unique()


Out[17]:
array([3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51,
       54, 57, 60, 63, 68, 71, 74, 77, 80, 83, 86, 89, 92, 95, 98, 101,
       104, 107, 110, 113, 116, 119, 122, 125, 128], dtype=object)

In [18]:
d.shape


Out[18]:
(23142, 3)

In [19]:
23142/40


Out[19]:
578.55

In [31]:
data = pd.read_csv("/Users/weilu/Research/server/jun_week3_2020/protein_DNA/selected.csv", index_col=0)

sampled = data.sample(6, random_state=28)
pdb_list = sampled.idcode.to_list()

In [38]:


In [41]:
pdb_list = sampled.idcode.to_list()

In [42]:
pdb_list = [a.lower() for a in pdb_list]

In [43]:
print(pdb_list)


['4y60', '5ke8', '1a1j', '5lxu', '1skn', '6a2h']

In [35]:
data.sample(6, random_state=0)


Out[35]:
Number of missing residues Resolution idcode release_date First Chain DNA1 ID Chain DNA1 Compound Chain DNA2 ID Chain DNA2 Compound Chain Protein ID Chain Protein Compound
45 11 3.10 6JHE 2020-01-01 3.10 b dna (5'-d(p*tp*tp*gp*ap*ap*ap*cp*cp*tp*tp*t)-3') c dna (5'-d(*ap*ap*ap*gp*gp*tp*tp*tp*cp*ap*a)-3') a ecf rna polymerase sigma factor sigw
59 0 1.90 6QEC 2020-02-05 1.90 u dna (5'-d(*ap*tp*tp*cp*gp*ap*ap*tp*ap*t*tp*ap*... b dna (5'-d(*ap*tp*tp*cp*gp*ap*ap*tp*ap*t*tp*ap*... a transcription factor lux
7 0 2.70 4S2Q 2016-02-17 2.70 a dna (5'-d(p*ap*gp*gp*cp*tp*tp*tp*gp*tp*tp*cp*t... b dna (5'-d(p*ap*gp*gp*ap*gp*ap*ap*cp*ap*ap*ap*g... d transcription factor sox-9
50 7 2.75 5JLX 2016-06-22 2.75 b, e dna (5'-d(*ap*gp*ap*ap*ap*gp*cp*(c7s)p*ap*tp*t... c, f dna (5'-d(*tp*cp*tp*cp*tp*ap*ap*tp*gp*gp*cp*tp... a, d homeotic protein antennapedia
92 3 2.40 3U2B 2011-12-28 2.40 a dna (5'-d(*gp*tp*cp*tp*cp*tp*ap*tp*tp*gp*tp*cp... b dna (5'-d(*cp*cp*ap*gp*gp*ap*cp*ap*ap*tp*ap*gp... c transcription factor sox-4
27 6 1.45 5KL3 2016-09-14 1.45 b dna (5'-d(*ap*gp*cp*gp*tp*gp*gp*gp*ap*gp*t)-3') c dna (5'-d(*tp*ap*cp*tp*cp*cp*cp*ap*cp*gp*c)-3') a wilms tumor protein

In [ ]: