In [1]:
import glob
import pandas
import seaborn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
In [2]:
%matplotlib inline
In [3]:
pd.set_option('display.max_columns', 1000)
In [19]:
plt.rcParams['figure.figsize'] = 0.5*np.array([16.18033, 10]) #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100
plt.rcParams.update({'font.size': 22})
In [5]:
pre = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/"
In [31]:
savefigPre = "/Users/weilu/Dropbox/openAWSEM/figures/DNA-Protein-Qvalues/"
In [25]:
def get_energies(protein):
# protein = "1a3q"
all_energies=[]
for i in range(5):
e = f"{pre}/energy_{protein}_{i}_jun17"
e=pandas.read_csv(e,index_col=0)
e['Frame']=e.index
e['Repetition']=i
all_energies+=[e]
all_energies=pandas.concat(all_energies)
return all_energies
In [33]:
pdb_list = ["1a3q", "1a36", "1mnn", "6cta"]
for protein in pdb_list:
all_energies = get_energies(protein)
all_energies["interface_energy"]=all_energies['ExclusionProteinDNA kJ/mol']+all_energies['ElectrostaticsProteinDNA kJ/mol']
plt.figure()
plt.scatter(all_energies["interface_energy"],all_energies['num_contact kJ/mol'],s=1,alpha=0.5)
plt.xlabel('Interface energy (kJ/mol)')
plt.ylabel('Number of native contact')
# seaborn.despine()
plt.savefig(f'{savefigPre}/{protein}_scatter.png')
In [34]:
protein = "6cta"
all_energies = get_energies(protein)
all_energies["interface_energy"]=all_energies['ExclusionProteinDNA kJ/mol']+all_energies['ElectrostaticsProteinDNA kJ/mol']
plt.figure()
plt.scatter(all_energies["interface_energy"],all_energies['num_contact kJ/mol'],s=1,alpha=0.5)
plt.xlabel('Interface energy (kJ/mol)')
plt.ylabel('Number of native contact')
# plt.suptitle(f'Q_value (DNA_symmetric) vs Interface Energy (PDBid:{pdb_name})')
# plt.ylim(0,1)
# plt.xlim(interface_energy.min()-1,interface_energy.max()+1)
seaborn.despine()
In [35]:
all_energies
Out[35]:
In [20]:
protein = "1a3q"
all_energies=[]
for i in range(5):
e = f"{pre}/energy_{protein}_{i}_jun17"
e=pandas.read_csv(e,index_col=0)
e['Frame']=e.index
e['Repetition']=i
all_energies+=[e]
all_energies=pandas.concat(all_energies)
plt.figure()
seaborn.lineplot(data=all_energies,x='Frame',y='Q_value_sigma5',hue='Repetition')
plt.legend([0,1,2,3,4])
pdb_name=pdb_file.split('/')[-1].split('_')[-4]
seaborn.lineplot(data=all_energies,x='Frame',y='Q_value_sigma5',hue='Repetition',)
seaborn.despine()
plt.legend([0,1,2,3,4],title="Repetition",loc=1)
plt.ylabel("Q")
plt.ylim(0,1)
plt.suptitle(f'Protein-DNA Q value [PDBid:{pdb_name}]')
Out[20]:
In [21]:
all_energies.columns
Out[21]:
In [22]:
plt.figure()
plt.scatter(interface_energy,all_energies['Q_value_sym_sigma5'],s=1,alpha=0.5)
plt.xlabel('Interface energy (kJ/mol)')
plt.ylabel('Q_value (DNA_symmetric)')
plt.suptitle(f'Q_value (DNA_symmetric) vs Interface Energy (PDBid:{pdb_name})')
# plt.ylim(0,1)
# plt.xlim(interface_energy.min()-1,interface_energy.max()+1)
seaborn.despine()
In [24]:
plt.figure()
plt.scatter(interface_energy,all_energies['num_contact kJ/mol'],s=1,alpha=0.5)
plt.xlabel('Interface energy (kJ/mol)')
plt.ylabel('Number of native contact')
# plt.suptitle(f'Q_value (DNA_symmetric) vs Interface Energy (PDBid:{pdb_name})')
# plt.ylim(0,1)
# plt.xlim(interface_energy.min()-1,interface_energy.max()+1)
seaborn.despine()
In [18]:
plt.figure()
interface_energy=all_energies['ExclusionProteinDNA kJ/mol']+all_energies['ElectrostaticsProteinDNA kJ/mol']
seaborn.kdeplot(interface_energy,all_energies['num_contact kJ/mol'],shade=True,shade_lowest=False,n_levels=10)
# plt.xlabel('Interface energy (kJ/mol)')
# plt.ylabel('Q_value (DNA_symmetric)')
# plt.ylim(0,1)
# plt.xlim(interface_energy.min(),interface_energy.max())
# plt.suptitle(f'Q value vs Energy (PDBid:{pdb_name})')
seaborn.despine()
In [12]:
plt.figure()
interface_energy=all_energies['ExclusionProteinDNA kJ/mol']+all_energies['ElectrostaticsProteinDNA kJ/mol']
seaborn.kdeplot(interface_energy,all_energies['Q_value_sym_sigma5'],shade=True,shade_lowest=False,n_levels=10)
plt.xlabel('Interface energy (kJ/mol)')
plt.ylabel('Q_value (DNA_symmetric)')
plt.ylim(0,1)
plt.xlim(interface_energy.min(),interface_energy.max())
plt.suptitle(f'Q value vs Energy (PDBid:{pdb_name})')
seaborn.despine()
In [11]:
plt.figure()
seaborn.lineplot(data=all_energies,x='Frame',y='num_contact kJ/mol',hue='Repetition')
Out[11]:
In [18]:
plt.figure()
seaborn.lineplot(data=all_energies,x='Frame',y='Q_value_sym_sigma5',hue='Repetition',)
seaborn.despine()
plt.legend([0,1,2,3,4],title="Repetition",loc=1)
plt.ylabel("Q (DNA symmetric)")
plt.ylim(0,1)
plt.suptitle(f'Protein-DNA Q value (DNA symmetric) [PDBid:{pdb_name}]')
Out[18]:
In [19]:
Rep0=all_energies[all_energies['Repetition']==0]
fig,ax=plt.subplots(1)
ax.plot(Rep0['Q_value_sym_sigma5'])
ax.plot(Rep0['Q_value_sigma5'])
#plt.plot(out2.sum(axis=0)/len(protein_contacts))
#plt.plot(all_energies['Q_value2 kJ/mol'])
#plt.plot(out3.sum(axis=0)/len(protein_contacts))
#plt.plot(all_energies['Q_value3 kJ/mol'])
ax.set_xlabel('Frame')
ax.set_ylabel('Q')
ax.set_ylim(-1,1)
ax.set_xlim(0,len(Rep0))
ax2=ax.twinx()
ax2.plot(Rep0['ExclusionProteinDNA kJ/mol']+Rep0['ElectrostaticsProteinDNA kJ/mol'],'r')
minimum=(Rep0['ExclusionProteinDNA kJ/mol']+Rep0['ElectrostaticsProteinDNA kJ/mol']).min()
ax2.set_ylim(minimum,-minimum)
ax.legend(['Q_interface (DNA symmetric)','Q_interface'],loc=1)
ax2.legend(['Protein - DNA interaction (kJ/mol)'],loc=4)
ax2.set_ylabel('Energy (kJ/mol)')
plt.suptitle(f'Q values and Interface Energy (PDBid:{pdb_name}, Repetition 0)')
Out[19]:
In [20]:
plt.figure()
interface_energy=all_energies['ExclusionProteinDNA kJ/mol']+all_energies['ElectrostaticsProteinDNA kJ/mol']
seaborn.kdeplot(interface_energy,all_energies['Q_value_sym_sigma5'],shade=True,shade_lowest=False,n_levels=10)
plt.xlabel('Interface energy (kJ/mol)')
plt.ylabel('Q_value (DNA_symmetric)')
plt.ylim(0,1)
plt.xlim(interface_energy.min(),interface_energy.max())
plt.suptitle(f'Q value vs Energy (PDBid:{pdb_name})')
seaborn.despine()
In [21]:
plt.figure()
plt.scatter(interface_energy,all_energies['Q_value_sym_sigma5'],s=1,alpha=0.5)
plt.xlabel('Interface energy (kJ/mol)')
plt.ylabel('Q_value (DNA_symmetric)')
plt.suptitle(f'Q_value (DNA_symmetric) vs Interface Energy (PDBid:{pdb_name})')
plt.ylim(0,1)
plt.xlim(interface_energy.min()-1,interface_energy.max()+1)
seaborn.despine()
In [ ]:
for dcd_file in glob.glob('/media/cab22/My Book/protein_DNA_selected_openmm/DNAProtein_Platform_OpenCL_date_20200226_pdb_*0_output.dcd'):
pdb_file=dcd_file[:-12]+'0_clean.pdb'
seq_file=dcd_file[:-12]+'0_protein.seq'
trajectory_files=glob.glob(dcd_file[:-12]+'*.dcd')
energy_files=glob.glob(dcd_file[:-12]+'*.csv')
trajectory_files.sort()
energy_files.sort()
all_energies=[]
for i,e in enumerate(energy_files):
e=pandas.read_csv(e,index_col=0)
e['Frame']=e.index
e['Repetition']=i
all_energies+=[e]
all_energies=pandas.concat(all_energies)
plt.figure()
seaborn.lineplot(data=all_energies,x='Frame',y='Q_value_sigma5',hue='Repetition')
plt.legend([0,1,2,3,4])
pdb_name=pdb_file.split('/')[-1].split('_')[-4]
seaborn.lineplot(data=all_energies,x='Frame',y='Q_value_sigma5',hue='Repetition',)
seaborn.despine()
plt.legend([0,1,2,3,4],title="Repetition",loc=1)
plt.ylabel("Q")
plt.ylim(0,1)
plt.suptitle(f'Protein-DNA Q value [PDBid:{pdb_name}]')
plt.savefig(f'{pdb_name}_fig1_Qvalue.png')
plt.figure()
seaborn.lineplot(data=all_energies,x='Frame',y='Q_value_sym_sigma5',hue='Repetition',)
seaborn.despine()
plt.legend([0,1,2,3,4],title="Repetition",loc=1)
plt.ylabel("Q (DNA symmetric)")
plt.ylim(0,1)
plt.suptitle(f'Protein-DNA Q value (DNA symmetric) [PDBid:{pdb_name}]')
plt.savefig(f'{pdb_name}_fig2_Qvaluesym.png')
Rep0=all_energies[all_energies['Repetition']==0]
fig,ax=plt.subplots(1)
ax.plot(Rep0['Q_value_sym_sigma5'])
ax.plot(Rep0['Q_value_sigma5'])
#plt.plot(out2.sum(axis=0)/len(protein_contacts))
#plt.plot(all_energies['Q_value2 kJ/mol'])
#plt.plot(out3.sum(axis=0)/len(protein_contacts))
#plt.plot(all_energies['Q_value3 kJ/mol'])
ax.set_xlabel('Frame')
ax.set_ylabel('Q')
ax.set_ylim(-1,1)
ax.set_xlim(0,len(Rep0))
ax2=ax.twinx()
ax2.plot(Rep0['ExclusionProteinDNA kJ/mol']+Rep0['ElectrostaticsProteinDNA kJ/mol'],'r')
minimum=(Rep0['ExclusionProteinDNA kJ/mol']+Rep0['ElectrostaticsProteinDNA kJ/mol']).min()
ax2.set_ylim(minimum,-minimum)
ax.legend(['Q_interface (DNA symmetric)','Q_interface'],loc=1)
ax2.legend(['Protein - DNA interaction (kJ/mol)'],loc=4)
ax2.set_ylabel('Energy (kJ/mol)')
plt.suptitle(f'Q values and Interface Energy (PDBid:{pdb_name}, Repetition 0)')
plt.savefig(f'{pdb_name}_fig3_QandEnergy.png')
plt.figure()
interface_energy=all_energies['ExclusionProteinDNA kJ/mol']+all_energies['ElectrostaticsProteinDNA kJ/mol']
seaborn.kdeplot(interface_energy,all_energies['Q_value_sym_sigma5'],shade=True,shade_lowest=False,n_levels=10)
plt.xlabel('Interface energy (kJ/mol)')
plt.ylabel('Q_value (DNA_symmetric)')
plt.ylim(0,1)
plt.xlim(interface_energy.min(),interface_energy.max())
plt.suptitle(f'Q value vs Energy (PDBid:{pdb_name})')
seaborn.despine()
plt.savefig(f'{pdb_name}_fig5_kde.png')
plt.figure()
plt.scatter(interface_energy,all_energies['Q_value_sym_sigma5'],s=1,alpha=0.5)
plt.xlabel('Interface energy (kJ/mol)')
plt.ylabel('Q_value (DNA_symmetric)')
plt.suptitle(f'Q_value (DNA_symmetric) vs Interface Energy (PDBid:{pdb_name})')
plt.ylim(0,1)
plt.xlim(interface_energy.min()-1,interface_energy.max()+1)
seaborn.despine()
plt.savefig(f'{pdb_name}_fig4_scatter.png')
In [33]:
import numpy as np
from Bio.PDB.PDBParser import PDBParser
parser = PDBParser()
structure = parser.get_structure('X', "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a36_repetition_0_clean.pdb")
model = structure[0]
In [34]:
chain_start = 0
count = 0
proteinResidues = ['ALA', 'ASN', 'CYS', 'GLU', 'HIS', 'LEU', 'MET', 'PRO', 'THR', 'TYR', 'ARG', 'ASP', 'GLN', 'GLY', 'ILE', 'LYS', 'PHE', 'SER', 'TRP', 'VAL']
rnaResidues = ['A', 'G', 'C', 'U', 'I']
dnaResidues = ['DA', 'DG', 'DC', 'DT', 'DI']
removeDNAchains = True
for chain in model.get_chains():
chain_start += count
count = 0
if removeDNAchains and np.alltrue([a.get_resname().strip() in dnaResidues for a in chain.get_residues()]):
print(f"chain {chain.id} is a DNA chain. will be ignored for Q evaluation")
continue
print(chain)
In [35]:
a = list(chain.get_residues())
In [36]:
b = a[0]
In [37]:
b.get_resname().strip()
Out[37]:
In [102]:
from simtk.openmm.app import *
# from simtk.openmm import *
from simtk.unit import *
In [103]:
type(AWSEM_xml) == list
Out[103]:
In [ ]:
In [106]:
def compute_dis(pos1, pos2):
dis = pos1 - pos2
dis = dis.value_in_unit(nanometer)
r = (dis[0]**2 + dis[1]**2 + dis[2]**2)**0.5
return r
# def get_het_bonds_info(fileLocation="5x2r-openmmawsem.pdb", AWSEM_xml="awsem.xml", pre="./"):
fileLocation = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a36_repetition_0_clean.pdb"
AWSEM_xml = ["/Users/weilu/openmmawsem/awsem.xml", "/Users/weilu/open3spn2/open3SPN2/3SPN2.xml"]
pdb = PDBFile(fileLocation)
res_list = list(pdb.topology.residues())
atom_list = list(pdb.topology.atoms())
protein_resNames = ["NGP", "IGL", "IPR", "NTER", "CTER"]
DNA_resNames = ["DA", "DC", "DT", "DG"]
protein_res_list = []
DNA_res_list = []
ligand_res_list = []
for res in res_list:
if res.name in protein_resNames:
protein_res_list.append(res)
elif res.name in DNA_resNames:
DNA_res_list.append(res)
else:
ligand_res_list.append(res)
protein_atom_list = []
DNA_atom_list = []
ligand_atom_list = []
for atom in atom_list:
if atom.residue.name in protein_resNames:
protein_atom_list.append(atom)
elif atom.residue.name in DNA_resNames:
DNA_atom_list.append(atom)
else:
ligand_atom_list.append(atom)
if type(AWSEM_xml) == list:
forcefield = ForceField(*AWSEM_xml)
else:
forcefield = ForceField(AWSEM_xml)
[templates, names] = forcefield.generateTemplatesForUnmatchedResidues(pdb.topology)
for a in templates:
for a1 in a.atoms:
a1.type = a1.name
forcefield.registerResidueTemplate(a)
system = forcefield.createSystem(pdb.topology)
info = []
for DNA_atom in DNA_atom_list:
pos1 = pdb.positions[DNA_atom.index]
for protein_atom in protein_atom_list:
if protein_atom.name != "CB":
continue
protein_chain = protein_atom.residue.chain.id
pos2 = pdb.positions[protein_atom.index]
r = compute_dis(pos1, pos2)
if r < 0.65:
# print(ligand_atom, protein_atom, "in contact", r)
info.append([DNA_atom.index, protein_atom.index, r, protein_chain])
data = pd.DataFrame(info, columns=["DNA_atom_index", "Protein_atom_index", "r", "Protein_chain"])
# data.to_csv(f"{pre}/het_protein_bonds.csv", index=False)
In [107]:
data
Out[107]:
In [112]:
save_name = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/1a36_DNA_protein_bonds.csv"
fileLocation = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a36_repetition_0_clean.pdb"
AWSEM_xml = ["/Users/weilu/openmmawsem/awsem.xml", "/Users/weilu/open3spn2/open3SPN2/3SPN2.xml"]
data = get_in_contact_bonds_info(fileLocation=fileLocation, AWSEM_xml=AWSEM_xml, interaction_atom="DNA", save_name=save_name)
In [113]:
data
Out[113]:
In [111]:
def get_in_contact_bonds_info(fileLocation="5x2r-openmmawsem.pdb", AWSEM_xml="awsem.xml", interaction_atom="DNA", save_name="./DNA_protein_bonds.csv", save_dir="./"):
# interaction_atom could be DNA or ligand
# save_name = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/1a36_DNA_protein_bonds.csv"
# fileLocation = "/Users/weilu/Research/server/jun_week1_2020/protein_DNA_benchmark/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a36_repetition_0_clean.pdb"
# AWSEM_xml = ["/Users/weilu/openmmawsem/awsem.xml", "/Users/weilu/open3spn2/open3SPN2/3SPN2.xml"]
pdb = PDBFile(fileLocation)
res_list = list(pdb.topology.residues())
atom_list = list(pdb.topology.atoms())
protein_resNames = ["NGP", "IGL", "IPR", "NTER", "CTER"]
DNA_resNames = ["DA", "DC", "DT", "DG"]
protein_res_list = []
DNA_res_list = []
ligand_res_list = []
for res in res_list:
if res.name in protein_resNames:
protein_res_list.append(res)
elif res.name in DNA_resNames:
DNA_res_list.append(res)
else:
ligand_res_list.append(res)
protein_atom_list = []
DNA_atom_list = []
ligand_atom_list = []
for atom in atom_list:
if atom.residue.name in protein_resNames:
protein_atom_list.append(atom)
elif atom.residue.name in DNA_resNames:
DNA_atom_list.append(atom)
else:
ligand_atom_list.append(atom)
if type(AWSEM_xml) == list:
forcefield = ForceField(*AWSEM_xml)
else:
forcefield = ForceField(AWSEM_xml)
[templates, names] = forcefield.generateTemplatesForUnmatchedResidues(pdb.topology)
for a in templates:
for a1 in a.atoms:
a1.type = a1.name
forcefield.registerResidueTemplate(a)
system = forcefield.createSystem(pdb.topology)
info = []
if interaction_atom == "DNA":
for DNA_atom in DNA_atom_list:
pos1 = pdb.positions[DNA_atom.index]
for protein_atom in protein_atom_list:
if protein_atom.name != "CB":
continue
protein_chain = protein_atom.residue.chain.id
pos2 = pdb.positions[protein_atom.index]
r = compute_dis(pos1, pos2)
if r < 0.65:
# print(ligand_atom, protein_atom, "in contact", r)
info.append([DNA_atom.index, protein_atom.index, r, protein_chain])
data = pd.DataFrame(info, columns=[f"DNA_atom_index", "Protein_atom_index", "r", "Protein_chain"])
data.to_csv(save_name, index=False)
elif interaction_atom == "Ligand":
info = []
for ligand_atom in ligand_atom_list:
pos1 = pdb.positions[ligand_atom.index]
for protein_atom in protein_atom_list:
if protein_atom.name != "CB":
continue
protein_chain = protein_atom.residue.chain.id
pos2 = pdb.positions[protein_atom.index]
r = compute_dis(pos1, pos2)
if r < 0.65:
# print(ligand_atom, protein_atom, "in contact", r)
info.append([ligand_atom.index, protein_atom.index, r, protein_chain])
data = pd.DataFrame(info, columns=["Ligand_atom_index", "Protein_atom_index", "r", "Protein_chain"])
data.to_csv(f"{save_dir}/het_protein_bonds.csv", index=False)
res_list = list(pdb.topology.residues())
atom_list = list(pdb.topology.atoms())
protein_resNames = ["NGP", "IGL", "IPR", "NTER", "CTER"]
DNA_resNames = ["DA", "DC", "DT", "DG"]
protein_res_list = []
DNA_res_list = []
ligand_res_list = []
for res in res_list:
if res.name in protein_resNames:
protein_res_list.append(res)
elif res.name in DNA_resNames:
DNA_res_list.append(res)
else:
ligand_res_list.append(res)
protein_atom_list = []
DNA_atom_list = []
ligand_atom_list = []
for atom in atom_list:
if atom.residue.name in protein_resNames:
protein_atom_list.append(atom)
elif atom.residue.name in DNA_resNames:
DNA_atom_list.append(atom)
else:
ligand_atom_list.append(atom)
ligand_res_list
info = []
for res in ligand_res_list:
atoms = list(res.atoms())
n_atoms = len(atoms)
for i in range(n_atoms):
atom1 = atoms[i]
pos1 = pdb.positions[atom1.index]
for j in range(i+1, n_atoms):
atom2 = atoms[j]
pos2 = pdb.positions[atom2.index]
dis = pos1 - pos2
dis = dis.value_in_unit(nanometer)
r = (dis[0]**2 + dis[1]**2 + dis[2]**2)**0.5
# print(atom1.index, atom2.index, round(r, 3))
info.append([res.name, atom1.index, atom2.index, atom1.name, atom2.name, atom1.element.symbol, atom2.element.symbol, round(r, 3)])
data = pd.DataFrame(info, columns=["Name", "Atom1", "Atom2", "Name1", "Name2", "Symbol1", "Symbol2", "r"])
data.to_csv(f"{save_dir}/het_frag.csv", index=False)
return data
In [ ]:
pdb_file = f"{pre}/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a3q_repetition_0_clean.pdb"
seq_file = f"{pre}/DNAProtein_Platform_OpenCL_date_20200226_pdb_1a3q_repetition_0_protein.seq"
dcd_file = f"{pre}/local_jun08_pdb_1a3q_repetition_0_output.dcd"