In [1]:
## \example pmi/setup_cross_link_ms_restraint.py
from __future__ import print_function
"""In this example we setup and evaluate a scoring function based on XL-MS data.
"""
import IMP
import IMP.pmi
import IMP.pmi.representation
import IMP.pmi.io
import IMP.pmi.io.crosslink
import IMP.pmi.restraints
import IMP.pmi.restraints.crosslinking_new
m=IMP.Model()
In [2]:
# first we create the representation using PMI
# two proteins, one ProtA with one bead, which coarse grain residues 1 to 10
# and the other is ProtB with three beads, which coarse grain residues 1 to 10, 11 to 20 and 21 to 30
r = IMP.pmi.representation.Representation(m)
r.create_component("ProtA")
r.add_component_beads("ProtA", [(1,10)],incoord=(0,0,0))
r.create_component("ProtB")
r.add_component_beads("ProtB", [(1,10)],incoord=(-40,0,0))
r.add_component_beads("ProtB", [(11,20)],incoord=(0,0,0))
r.add_component_beads("ProtB", [(21,30)],incoord=(40,0,0))
r.set_floppy_bodies()
In [3]:
# the cross-link dataset is a comma separated value (csv) file with at least
# the protein and the residue names (no spaces between commas)
xldb='''Protein 1,Protein 2,Residue 1,Residue 2,UniqueID,Score
ProtA,ProtB,1,10,1,1.0
ProtA,ProtB,1,11,2,2.0
ProtA,ProtB,1,21,3,2.0
'''
In [4]:
#let's save the text into a file, as it should be
xlf=open('xlinks.csv','w')
xlf.write(xldb)
xlf.close()
In [5]:
# Now we create a conversion map between internal keywords of xlinks features and the one in the file
cldbkc=IMP.pmi.io.crosslink.CrossLinkDataBaseKeywordsConverter()
cldbkc.set_protein1_key("Protein 1")
cldbkc.set_protein2_key("Protein 2")
cldbkc.set_residue1_key("Residue 1")
cldbkc.set_residue2_key("Residue 2")
# the unique_id_key and id_score_key are optional,
# and they add features that will explained below
cldbkc.set_unique_id_key("UniqueID")
cldbkc.set_id_score_key("Score")
In [6]:
# with this keyword interpreter,let's read the cross-link database
cldb=IMP.pmi.io.crosslink.CrossLinkDataBase(cldbkc)
cldb.create_set_from_file("xlinks.csv")
In [7]:
# let's check that the database looks ok
print(cldb)
In [8]:
# with the database we can now setup the scoring function
xl = IMP.pmi.restraints.crosslinking_new.CrossLinkingMassSpectrometryRestraint(r,
cldb,
length=21.0,
slope=0.0,
resolution=1.0,
label="XL")
# note the text generated. The program reports the nuisances particles associated to the cross-link (sigma and psi)
#
In [9]:
# can evaluate this restraint at the given system configuration
print(xl.rs.unprotected_evaluate(None))
In [10]:
# Let's plot the score while moving ProtA bead wrt ProtB
# let's first get the particle corresponding to ProtA
hierarchyA=IMP.pmi.tools.select_by_tuple(r,"ProtA")
pA=hierarchyA[0].get_particle()
# Note that we can move PartA on the x-axis
scores=[]
xs=[]
for i in range(-100,100):
xs.append(float(i))
IMP.core.XYZ(pA).set_coordinates((i,0,0))
scores.append(xl.rs.unprotected_evaluate(None))
In [11]:
%matplotlib inline
import pylab
pylab.plot(xs, scores)
# this plot shows that the system has two minima
# one when ProtA is between ProtB:1-10 and ProtB:11-20,
# and the other when ProtA is between ProtB:11-20 and ProtB:21-30
# the plot is weird, let's analyse what is going on
Out[11]:
In [12]:
# First let's simplify our dataset, by considering only the first cross-link
# Let's filter by the UniqueID, creting a new database that contains only
# the second cross-link, namely UniqueID=2
from IMP.pmi.io.crosslink import FilterOperator
import operator
fo=FilterOperator(cldb.unique_id_key,operator.eq,"2")
fcldb=cldb.filter(fo)
print(fcldb)
In [13]:
# let's create a new restraint based on this database and plot while moving ProtA
xl1 = IMP.pmi.restraints.crosslinking_new.CrossLinkingMassSpectrometryRestraint(r,
fcldb,
length=21.0,
slope=0.0,
resolution=1.0,
label="XL")
scores=[]
xs=[]
for i in range(-100,100):
xs.append(float(i))
IMP.core.XYZ(pA).set_coordinates((i,0,0))
scores.append(xl1.rs.unprotected_evaluate(None))
pylab.plot(xs, scores)
# It is clear that the restraint has a minimum when ProtA and ProtB:11-20 are close
# namely when ProtA x is around 0
# In fact, the restraint has a sigmoid shape.
Out[13]:
In [14]:
# Now let's play with the parameters sigma and psi to understand their role
# Let's get sigma first
sigma=xl1.sigma_dictionary["SIGMA"][0]
# and let's vary its value between 1 and 20 to see what happens
scores_list=[]
xs_list=[]
for s in range(1,20):
scores=[]
xs=[]
sigma.set_scale(float(s))
for i in range(-100,100):
xs.append(float(i))
IMP.core.XYZ(pA).set_coordinates((i,0,0))
scores.append(xl1.rs.unprotected_evaluate(None))
scores_list.append(scores)
xs_list.append(xs)
for n,xs in enumerate(xs_list):
pylab.plot(xs, scores_list[n])
# as one can see sigma modulates both the slope of the sigmoid and the plateau of the minimum
# that is because sigma is the structural uncertainty associated to the position of the cross-linked beads
In [15]:
# Let's get psi now, and set sigma back to 11
sigma.set_scale(11)
psi=xl1.psi_dictionary["PSI"][0]
# and let's vary its value between 0.01 and 0.5 to see what happens
import numpy as np
scores_list=[]
xs_list=[]
for s in np.linspace(0.01, 0.5, 10):
scores=[]
xs=[]
psi.set_scale(float(s))
for i in range(-100,100):
xs.append(float(i))
IMP.core.XYZ(pA).set_coordinates((i,0,0))
scores.append(xl1.rs.unprotected_evaluate(None))
scores_list.append(scores)
xs_list.append(xs)
for n,xs in enumerate(xs_list):
pylab.plot(xs, scores_list[n])
# as one can see psi modulates the plateau of the minimum and the maxima
# that is because psi is the uncertainty associated to the crosslink observation
# When psi is 0.01, (low uncertainty) there is a big score difference between a satisfied cross-link (x=0) and a violated one (x=100,x=-100)
# When psi is 0.5, (high uncertainty) the score is flat, no difference between violated and satisfied
In [16]:
# Now let's move the parameters when we consider all three cross-links
sigma=xl.sigma_dictionary["SIGMA"][0]
scores_list=[]
xs_list=[]
for s in range(1,20):
scores=[]
xs=[]
sigma.set_scale(float(s))
for i in range(-100,100):
xs.append(float(i))
IMP.core.XYZ(pA).set_coordinates((i,0,0))
scores.append(xl.rs.unprotected_evaluate(None))
scores_list.append(scores)
xs_list.append(xs)
for n,xs in enumerate(xs_list):
pylab.plot(xs, scores_list[n])
In [ ]:
# the cross-link dataset is a comma separated value (csv) file with at least
# the protein and the residue names (no spaces between commas)
xldb='''Protein 1,Protein 2,Residue 1,Residue 2,UniqueID,Score
ProtA,ProtB,1,10,1,1.0
ProtA,ProtB,1,11,2,2.0
ProtA,ProtB,1,21,3,2.0
'''
In [ ]:
In [17]:
# And for PSI
sigma.set_scale(11)
psi=xl.psi_dictionary["PSI"][0]
import numpy as np
scores_list=[]
xs_list=[]
for s in np.linspace(0.01, 0.5, 10):
scores=[]
xs=[]
psi.set_scale(float(s))
for i in range(-100,100):
xs.append(float(i))
IMP.core.XYZ(pA).set_coordinates((i,0,0))
scores.append(xl.rs.unprotected_evaluate(None))
scores_list.append(scores)
xs_list.append(xs)
for n,xs in enumerate(xs_list):
pylab.plot(xs, scores_list[n])
In [18]:
%%capture cap
# Now we can try to optimize the values of PSI and SIGMA, and see what is the best scoring value
# fixing the coordinate of ProtA to a minimum
xl.add_to_model()
xl.set_sigma_is_sampled(True)
xl.set_psi_is_sampled(True)
IMP.core.XYZ(pA).set_coordinates((-20,0,0))
# here we sample using a simulated annealing scheme
'''
import IMP.pmi.macros
mc1=IMP.pmi.macros.ReplicaExchange0(m,
r,
monte_carlo_sample_objects=[xl],
output_objects=[xl,r],
monte_carlo_temperature=1.0,
simulated_annealing=True,
simulated_annealing_minimum_temperature=1.0,
simulated_annealing_maximum_temperature=5.0,
simulated_annealing_minimum_temperature_nframes=100,
simulated_annealing_maximum_temperature_nframes=10,
replica_exchange_minimum_temperature=1.0,
replica_exchange_maximum_temperature=2.5,
number_of_best_scoring_models=0,
monte_carlo_steps=10,
number_of_frames=1000,
write_initial_rmf=True,
initial_rmf_name_suffix="initial",
stat_file_name_suffix="stat",
best_pdb_name_suffix="model",
do_clean_first=True,
do_create_directories=True,
global_output_directory="output",
rmf_dir="rmfs/",
best_pdb_dir="pdbs/",
replica_stat_file_suffix="stat_replica")
mc1.execute_macro()
rex=mc1.get_replica_exchange_object()
'''
In [19]:
# Once the sampling is done let's print the values of psi and sigma
p=IMP.pmi.output.ProcessOutput("output/stat.0.out")
fs=p.get_fields(['CrossLinkingMassSpectrometryRestraint_Psi_PSI_XL','CrossLinkingMassSpectrometryRestraint_Sigma_SIGMA_XL','SimplifiedModel_Total_Score_None'])
psi_values=[float(v) for v in fs['CrossLinkingMassSpectrometryRestraint_Psi_PSI_XL']]
sigma_values=[float(v) for v in fs['CrossLinkingMassSpectrometryRestraint_Sigma_SIGMA_XL']]
score_values=[float(v) for v in fs['SimplifiedModel_Total_Score_None']]
In [19]:
In [20]:
import matplotlib.pyplot as plt
# Generate fake data
x = psi_values
y = sigma_values
z = score_values
fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=10, edgecolor='')
plt.show()
In [21]:
# Now we can try to optimize the values of PSI and SIGMA, and see what is the best scoring value
# fixing the coordinate of ProtA to a minimum
import math
import numpy as np
xl.add_to_model()
xl.set_sigma_is_sampled(True)
xl.set_psi_is_sampled(True)
hierarchyA=IMP.pmi.tools.select_by_tuple(r,"ProtA")
pA=hierarchyA[0].get_particle()
IMP.core.XYZ(pA).set_coordinates((0,0,0))
hierarchyB1=IMP.pmi.tools.select_by_tuple(r,(1,1,"ProtB"))
pB1=hierarchyB1[0].get_particle()
hierarchyB2=IMP.pmi.tools.select_by_tuple(r,(11,11,"ProtB"))
pB2=hierarchyB2[0].get_particle()
hierarchyB3=IMP.pmi.tools.select_by_tuple(r,(21,31,"ProtB"))
pB3=hierarchyB3[0].get_particle()
IMP.core.XYZ(pB1).set_coordinates((0,20,0))
IMP.core.XYZ(pB2).set_coordinates((-20*math.sqrt(3)/2,-20/2,0))
IMP.core.XYZ(pB3).set_coordinates((20*math.sqrt(3)/2,-20/2,0))
scores=[]
psis=[]
sigmas=[]
for p in np.linspace(0.01, 0.5, 100):
psi.set_scale(p)
for s in np.linspace(1, 40, 50):
psis.append(p)
sigmas.append(s)
sigma.set_scale(s)
scores.append(xl.rs.unprotected_evaluate(None))
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots()
ax.scatter(psis, sigmas, c=scores, s=10, edgecolor='')
plt.show()
# there is a minimum when PSI is close to zero and sigma is between 0 and 10
In [22]:
# If we move ProtA away, so that any cross-link is satisfied
IMP.core.XYZ(pA).set_coordinates((100,100,100))
scores=[]
psis=[]
sigmas=[]
for p in np.linspace(0.01, 0.5, 100):
psi.set_scale(p)
for s in np.linspace(1, 40, 50):
psis.append(p)
sigmas.append(s)
sigma.set_scale(s)
scores.append(xl.rs.unprotected_evaluate(None))
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots()
ax.scatter(psis, sigmas, c=scores, s=10, edgecolor='')
plt.show()
# The minimum is at Psi=0.5, irrespectively of the value of Sigma