notebook.community

Edit and run



In [1]:

    
import sys
sys.path.append("/projects/pw8/wl45/open3spn2")
sys.path.append("/projects/pw8/wl45/openawsem")
sys.path.append("/Users/weilu/open3spn2")
sys.path.append("/Users/weilu/openmmawsem")
sys.path.append("C:/Users/luwei/Documents/GitHub/open3spn2")
sys.path.append("C:/Users/luwei/Documents/GitHub/openawsem")


import open3SPN2
import ffAWSEM
import time
import simtk.openmm
import simtk.openmm.app



In [10]:

    
import scipy.spatial.distance as sdist
import pandas as pd



In [3]:

    
pdb_file = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a36_repetition_0_clean.pdb"
seq_file = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a36_repetition_0_protein.seq"
with open(seq_file) as ps:
    protein_sequence_one=ps.readlines()[0]
protein=ffAWSEM.Protein.fromCoarsePDB(pdb_file,sequence=protein_sequence_one)



In [6]:

    
data = protein.atoms



In [27]:

    
protein_resNames = ["NGP", "IGL", "IPR", "NTER", "CTER"]
DNA_resNames = ["DA", "DC", "DT", "DG"]
group1_index = []
group2_index = []
for i, line in data.iterrows():
    resname = line["resname"]
    index = line["serial"]
    name = line["name"]
    if resname in DNA_resNames and name == "P":
        group1_index.append(index)
    if resname in protein_resNames and name == "CA":
        group2_index.append(index)



In [11]:

    
#Calculate native distances
CA_atoms=protein.atoms[protein.atoms.name=='CA']
P_atoms=protein.atoms[protein.atoms.name=='P']
d=sdist.cdist(CA_atoms[['x','y','z']],P_atoms[['x','y','z']])/10 #Distance in nanometers
d_sq=pd.DataFrame(d,index=CA_atoms.index,columns=P_atoms.index)
d=d_sq.copy()
d['Protein']=d.index
d=d.melt(id_vars=['Protein'])
d=d.rename(columns={'variable': 'DNA', 'value': 'distance'})



In [23]:

    
d.query("distance < 1")









    Out[23]:







  
    
      
      Protein
      DNA
      distance
    
  
  
    
      352
      2209
      3
      0.993812
    
    
      353
      2215
      3
      0.982950
    
    
      354
      2221
      3
      0.909546
    
    
      356
      2233
      3
      0.853868
    
    
      357
      2238
      3
      0.817565
    
    
      ...
      ...
      ...
      ...
    
    
      20757
      2315
      116
      0.692107
    
    
      20758
      2321
      116
      0.960002
    
    
      20759
      2327
      116
      0.996391
    
    
      21132
      1272
      119
      0.958682
    
    
      21683
      1272
      122
      0.901890
    
  

268 rows × 3 columns



In [17]:

    
d["DNA"].unique()









    Out[17]:





array([3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51,
       54, 57, 60, 63, 68, 71, 74, 77, 80, 83, 86, 89, 92, 95, 98, 101,
       104, 107, 110, 113, 116, 119, 122, 125, 128], dtype=object)



In [18]:

    
d.shape









    Out[18]:





(23142, 3)



In [19]:

    
23142/40









    Out[19]:





578.55



In [31]:

    
data = pd.read_csv("/Users/weilu/Research/server/jun_week3_2020/protein_DNA/selected.csv", index_col=0)

sampled = data.sample(6, random_state=28)
pdb_list = sampled.idcode.to_list()



In [38]:



In [41]:

    
pdb_list = sampled.idcode.to_list()



In [42]:

    
pdb_list = [a.lower() for a in pdb_list]



In [43]:

    
print(pdb_list)









    



['4y60', '5ke8', '1a1j', '5lxu', '1skn', '6a2h']



In [35]:

    
data.sample(6, random_state=0)









    Out[35]:







  
    
      
      Number of missing residues
      Resolution
      idcode
      release_date
      First
      Chain DNA1 ID
      Chain DNA1 Compound
      Chain DNA2 ID
      Chain DNA2 Compound
      Chain Protein ID
      Chain Protein Compound
    
  
  
    
      45
      11
      3.10
      6JHE
      2020-01-01
      3.10
      b
      dna (5'-d(p*tp*tp*gp*ap*ap*ap*cp*cp*tp*tp*t)-3')
      c
      dna (5'-d(*ap*ap*ap*gp*gp*tp*tp*tp*cp*ap*a)-3')
      a
      ecf rna polymerase sigma factor sigw
    
    
      59
      0
      1.90
      6QEC
      2020-02-05
      1.90
      u
      dna (5'-d(*ap*tp*tp*cp*gp*ap*ap*tp*ap*t*tp*ap*...
      b
      dna (5'-d(*ap*tp*tp*cp*gp*ap*ap*tp*ap*t*tp*ap*...
      a
      transcription factor lux
    
    
      7
      0
      2.70
      4S2Q
      2016-02-17
      2.70
      a
      dna (5'-d(p*ap*gp*gp*cp*tp*tp*tp*gp*tp*tp*cp*t...
      b
      dna (5'-d(p*ap*gp*gp*ap*gp*ap*ap*cp*ap*ap*ap*g...
      d
      transcription factor sox-9
    
    
      50
      7
      2.75
      5JLX
      2016-06-22
      2.75
      b, e
      dna (5'-d(*ap*gp*ap*ap*ap*gp*cp*(c7s)p*ap*tp*t...
      c, f
      dna (5'-d(*tp*cp*tp*cp*tp*ap*ap*tp*gp*gp*cp*tp...
      a, d
      homeotic protein antennapedia
    
    
      92
      3
      2.40
      3U2B
      2011-12-28
      2.40
      a
      dna (5'-d(*gp*tp*cp*tp*cp*tp*ap*tp*tp*gp*tp*cp...
      b
      dna (5'-d(*cp*cp*ap*gp*gp*ap*cp*ap*ap*tp*ap*gp...
      c
      transcription factor sox-4
    
    
      27
      6
      1.45
      5KL3
      2016-09-14
      1.45
      b
      dna (5'-d(*ap*gp*cp*gp*tp*gp*gp*gp*ap*gp*t)-3')
      c
      dna (5'-d(*tp*ap*cp*tp*cp*cp*cp*ap*cp*gp*c)-3')
      a
      wilms tumor protein



In [ ]:

	Protein	DNA	distance
352	2209	3	0.993812
353	2215	3	0.982950
354	2221	3	0.909546
356	2233	3	0.853868
357	2238	3	0.817565
...	...	...	...
20757	2315	116	0.692107
20758	2321	116	0.960002
20759	2327	116	0.996391
21132	1272	119	0.958682
21683	1272	122	0.901890

	Number of missing residues	Resolution	idcode	release_date	First	Chain DNA1 ID	Chain DNA1 Compound	Chain DNA2 ID	Chain DNA2 Compound	Chain Protein ID	Chain Protein Compound
45	11	3.10	6JHE	2020-01-01	3.10	b	dna (5'-d(ptptpgpapapapcpcptptp*t)-3')	c	dna (5'-d(apapapgpgptptptpcpap*a)-3')	a	ecf rna polymerase sigma factor sigw
59	0	1.90	6QEC	2020-02-05	1.90	u	dna (5'-d(aptptpcpgpapaptpapttpap*...	b	dna (5'-d(aptptpcpgpapaptpapttpap*...	a	transcription factor lux
7	0	2.70	4S2Q	2016-02-17	2.70	a	dna (5'-d(papgpgpcptptptpgptptpcpt...	b	dna (5'-d(papgpgpapgpapapcpapapapg...	d	transcription factor sox-9
50	7	2.75	5JLX	2016-06-22	2.75	b, e	dna (5'-d(apgpapapapgpcp(c7s)paptp*t...	c, f	dna (5'-d(tpcptpcptpapaptpgpgpcptp...	a, d	homeotic protein antennapedia
92	3	2.40	3U2B	2011-12-28	2.40	a	dna (5'-d(gptpcptpcptpaptptpgptpcp...	b	dna (5'-d(cpcpapgpgpapcpapaptpapgp...	c	transcription factor sox-4
27	6	1.45	5KL3	2016-09-14	1.45	b	dna (5'-d(apgpcpgptpgpgpgpapgp*t)-3')	c	dna (5'-d(tpapcptpcpcpcpapcpgp*c)-3')	a	wilms tumor protein