In [1]:
## \example pmi/setup_cross_link_ms_restraint.py
from __future__ import print_function

"""In this example we setup and evaluate a scoring function based on XL-MS data.
"""
import IMP
import IMP.pmi
import IMP.pmi.representation
import IMP.pmi.io
import IMP.pmi.io.crosslink
import IMP.pmi.restraints
import IMP.pmi.restraints.crosslinking_new

m=IMP.Model()

In [2]:
# first we create the representation using PMI
# two proteins, one ProtA with one bead, which coarse grain residues 1 to 10
# and the other is ProtB with three beads, which coarse grain residues 1 to 10, 11 to 20 and 21 to 30
r = IMP.pmi.representation.Representation(m)
r.create_component("ProtA")
r.add_component_beads("ProtA", [(1,10)],incoord=(0,0,0))
r.create_component("ProtB")
r.add_component_beads("ProtB", [(1,10)],incoord=(-40,0,0))
r.add_component_beads("ProtB", [(11,20)],incoord=(0,0,0))
r.add_component_beads("ProtB", [(21,30)],incoord=(40,0,0))
r.set_floppy_bodies()

In [3]:
# the cross-link dataset is a comma separated value (csv) file with at least
# the protein and the residue names (no spaces between commas)

xldb='''Protein 1,Protein 2,Residue 1,Residue 2,UniqueID,Score
ProtA,ProtB,1,10,1,1.0
ProtA,ProtB,1,11,2,2.0
ProtA,ProtB,1,21,3,2.0
'''

In [4]:
#let's save the text into a file, as it should be

xlf=open('xlinks.csv','w')
xlf.write(xldb)
xlf.close()

In [5]:
# Now we create a conversion map between internal keywords of xlinks features and the one in the file

cldbkc=IMP.pmi.io.crosslink.CrossLinkDataBaseKeywordsConverter()
cldbkc.set_protein1_key("Protein 1")
cldbkc.set_protein2_key("Protein 2")
cldbkc.set_residue1_key("Residue 1")
cldbkc.set_residue2_key("Residue 2")

# the unique_id_key and id_score_key are optional,
# and they add features that will explained below

cldbkc.set_unique_id_key("UniqueID")
cldbkc.set_id_score_key("Score")

In [6]:
# with this keyword interpreter,let's read the cross-link database

cldb=IMP.pmi.io.crosslink.CrossLinkDataBase(cldbkc)
cldb.create_set_from_file("xlinks.csv")

In [7]:
# let's check that the database looks ok
print(cldb)


1
--- XLUniqueID 1
--- XLUniqueSubIndex 1
--- XLUniqueSubID 1.1
--- Protein1 ProtA
--- Protein2 ProtB
--- Residue1 1
--- Residue2 10
--- IDScore 1.0
--- Redundancy 1
--- RedundancyList ['1.1']
-------------
2
--- XLUniqueID 2
--- XLUniqueSubIndex 1
--- XLUniqueSubID 2.1
--- Protein1 ProtA
--- Protein2 ProtB
--- Residue1 1
--- Residue2 11
--- IDScore 2.0
--- Redundancy 1
--- RedundancyList ['2.1']
-------------
3
--- XLUniqueID 3
--- XLUniqueSubIndex 1
--- XLUniqueSubID 3.1
--- Protein1 ProtA
--- Protein2 ProtB
--- Residue1 1
--- Residue2 21
--- IDScore 2.0
--- Redundancy 1
--- RedundancyList ['3.1']
-------------


In [8]:
# with the database we can now setup the scoring function

xl = IMP.pmi.restraints.crosslinking_new.CrossLinkingMassSpectrometryRestraint(r,
                                                                           cldb,
                                                                           length=21.0,
                                                                           slope=0.0,
                                                                           resolution=1.0,
                                                                           label="XL")

# note the text generated. The program reports the nuisances particles associated to the cross-link (sigma and psi) 
#


generating a new crosslink restraint
--------------
CrossLinkingMassSpectrometryRestraint: generating cross-link restraint between
CrossLinkingMassSpectrometryRestraint: residue 1 of chain ProtA and residue 10 of chain ProtB
CrossLinkingMassSpectrometryRestraint: with sigma1 SIGMA sigma2 SIGMA psi PSI
CrossLinkingMassSpectrometryRestraint: between particles ProtA_1-10_bead_floppy_body and ProtB_1-10_bead_floppy_body
==========================================

generating a new crosslink restraint
--------------
CrossLinkingMassSpectrometryRestraint: generating cross-link restraint between
CrossLinkingMassSpectrometryRestraint: residue 1 of chain ProtA and residue 11 of chain ProtB
CrossLinkingMassSpectrometryRestraint: with sigma1 SIGMA sigma2 SIGMA psi PSI
CrossLinkingMassSpectrometryRestraint: between particles ProtA_1-10_bead_floppy_body and ProtB_11-20_bead_floppy_body
==========================================

generating a new crosslink restraint
--------------
CrossLinkingMassSpectrometryRestraint: generating cross-link restraint between
CrossLinkingMassSpectrometryRestraint: residue 1 of chain ProtA and residue 21 of chain ProtB
CrossLinkingMassSpectrometryRestraint: with sigma1 SIGMA sigma2 SIGMA psi PSI
CrossLinkingMassSpectrometryRestraint: between particles ProtA_1-10_bead_floppy_body and ProtB_21-30_bead_floppy_body
==========================================


In [9]:
# can evaluate this restraint at the given system configuration

print(xl.rs.unprotected_evaluate(None))


3.06027079469

In [10]:
# Let's plot the score while moving ProtA bead wrt ProtB
# let's first get the particle corresponding to ProtA

hierarchyA=IMP.pmi.tools.select_by_tuple(r,"ProtA")
pA=hierarchyA[0].get_particle()

# Note that we can move PartA on the x-axis
scores=[]
xs=[]
for i in range(-100,100):
    xs.append(float(i))
    IMP.core.XYZ(pA).set_coordinates((i,0,0))
    scores.append(xl.rs.unprotected_evaluate(None))

In [11]:
%matplotlib inline
import pylab

pylab.plot(xs, scores)
# this plot shows that the system has two minima
# one when ProtA is between ProtB:1-10 and ProtB:11-20,
# and the other when ProtA is between ProtB:11-20 and ProtB:21-30
# the plot is weird, let's analyse what is going on


Out[11]:
[<matplotlib.lines.Line2D at 0xd603850>]

In [12]:
# First let's simplify our dataset, by considering only the first cross-link
# Let's filter by the UniqueID, creting a new database that contains only
# the second cross-link, namely UniqueID=2

from IMP.pmi.io.crosslink import FilterOperator
import operator

fo=FilterOperator(cldb.unique_id_key,operator.eq,"2")
fcldb=cldb.filter(fo)

print(fcldb)


2
--- DataSetName XL
--- XLUniqueID 2
--- XLUniqueSubIndex 1
--- XLUniqueSubID 2.1
--- Protein1 ProtA
--- Protein2 ProtB
--- Residue1 1
--- Residue2 11
--- IDScore 2.0
--- Redundancy 1
--- RedundancyList ['2.1']
--- State 0
--- Sigma1 SIGMA
--- Sigma2 SIGMA
--- Psi PSI
--- Particle_sigma2 1e-07 <  Scale = 2 < 1000
--- Restraint "|XL|2.1|ProtA|1|ProtB|11|0|PSI|"
--- ShortLabel |XL|2.1|ProtA|1|ProtB|11|0|PSI|
--- Particle_psi 1e-07 <  Scale = 0.25 < 0.5
--- Particle2 "ProtB_11-20_bead_floppy_body" Fragment: [11, 21)  (0 0 0: 5.9919)
--- Particle1 "ProtA_1-10_bead_floppy_body" Fragment: [1, 11)  (99 0 0: 5.9919)
--- IntraRigidBody False
--- Particle_sigma1 1e-07 <  Scale = 2 < 1000
-------------


In [13]:
# let's create a new restraint based on this database and plot while moving ProtA

xl1 = IMP.pmi.restraints.crosslinking_new.CrossLinkingMassSpectrometryRestraint(r,
                                                                           fcldb,
                                                                           length=21.0,
                                                                           slope=0.0,
                                                                           resolution=1.0,
                                                                           label="XL")

scores=[]
xs=[]
for i in range(-100,100):
    xs.append(float(i))
    IMP.core.XYZ(pA).set_coordinates((i,0,0))
    scores.append(xl1.rs.unprotected_evaluate(None))
pylab.plot(xs, scores)

# It is clear that the restraint has a minimum when ProtA and ProtB:11-20 are close
# namely when ProtA x is around 0
# In fact, the restraint has a sigmoid shape.


generating a new crosslink restraint
--------------
CrossLinkingMassSpectrometryRestraint: generating cross-link restraint between
CrossLinkingMassSpectrometryRestraint: residue 1 of chain ProtA and residue 11 of chain ProtB
CrossLinkingMassSpectrometryRestraint: with sigma1 SIGMA sigma2 SIGMA psi PSI
CrossLinkingMassSpectrometryRestraint: between particles ProtA_1-10_bead_floppy_body and ProtB_11-20_bead_floppy_body
==========================================

Out[13]:
[<matplotlib.lines.Line2D at 0xd63bb90>]

In [14]:
# Now let's play with the parameters sigma and psi to understand their role
# Let's get sigma first

sigma=xl1.sigma_dictionary["SIGMA"][0]

# and let's vary its value between 1 and 20 to see what happens

scores_list=[]
xs_list=[]
for s in range(1,20):
    scores=[]
    xs=[]
    sigma.set_scale(float(s))
    for i in range(-100,100):
        xs.append(float(i))
        IMP.core.XYZ(pA).set_coordinates((i,0,0))
        scores.append(xl1.rs.unprotected_evaluate(None))
    scores_list.append(scores)
    xs_list.append(xs)

for n,xs in enumerate(xs_list):
    pylab.plot(xs, scores_list[n])

# as one can see sigma modulates both the slope of the sigmoid and the plateau of the minimum
# that is because sigma is the structural uncertainty associated to the position of the cross-linked beads



In [15]:
# Let's get psi now, and set sigma back to 11

sigma.set_scale(11)
psi=xl1.psi_dictionary["PSI"][0]

# and let's vary its value between 0.01 and 0.5 to see what happens
import numpy as np

scores_list=[]
xs_list=[]
for s in np.linspace(0.01, 0.5, 10):
    scores=[]
    xs=[]
    psi.set_scale(float(s))
    for i in range(-100,100):
        xs.append(float(i))
        IMP.core.XYZ(pA).set_coordinates((i,0,0))
        scores.append(xl1.rs.unprotected_evaluate(None))
    scores_list.append(scores)
    xs_list.append(xs)

for n,xs in enumerate(xs_list):
    pylab.plot(xs, scores_list[n])

# as one can see psi modulates the plateau of the minimum and the maxima
# that is because psi is the uncertainty associated to the crosslink observation
# When psi is 0.01, (low uncertainty) there is a big score difference between a satisfied cross-link (x=0) and a violated one (x=100,x=-100)
# When psi is 0.5, (high uncertainty) the score is flat, no difference between violated and satisfied



In [16]:
# Now let's move the parameters when we consider all three cross-links

sigma=xl.sigma_dictionary["SIGMA"][0]

scores_list=[]
xs_list=[]
for s in range(1,20):
    scores=[]
    xs=[]
    sigma.set_scale(float(s))
    for i in range(-100,100):
        xs.append(float(i))
        IMP.core.XYZ(pA).set_coordinates((i,0,0))
        scores.append(xl.rs.unprotected_evaluate(None))
    scores_list.append(scores)
    xs_list.append(xs)

for n,xs in enumerate(xs_list):
    pylab.plot(xs, scores_list[n])


Ambiguity


In [ ]:
# the cross-link dataset is a comma separated value (csv) file with at least
# the protein and the residue names (no spaces between commas)

xldb='''Protein 1,Protein 2,Residue 1,Residue 2,UniqueID,Score
ProtA,ProtB,1,10,1,1.0
ProtA,ProtB,1,11,2,2.0
ProtA,ProtB,1,21,3,2.0
'''

In [ ]:


In [17]:
# And for PSI

sigma.set_scale(11)
psi=xl.psi_dictionary["PSI"][0]


import numpy as np

scores_list=[]
xs_list=[]
for s in np.linspace(0.01, 0.5, 10):
    scores=[]
    xs=[]
    psi.set_scale(float(s))
    for i in range(-100,100):
        xs.append(float(i))
        IMP.core.XYZ(pA).set_coordinates((i,0,0))
        scores.append(xl.rs.unprotected_evaluate(None))
    scores_list.append(scores)
    xs_list.append(xs)

for n,xs in enumerate(xs_list):
    pylab.plot(xs, scores_list[n])



In [18]:
%%capture cap 

# Now we can try to optimize the values of PSI and SIGMA, and see what is the best scoring value
# fixing the coordinate of ProtA to a minimum

xl.add_to_model()
xl.set_sigma_is_sampled(True)
xl.set_psi_is_sampled(True)
IMP.core.XYZ(pA).set_coordinates((-20,0,0))

# here we sample using a simulated annealing scheme
'''
import IMP.pmi.macros

mc1=IMP.pmi.macros.ReplicaExchange0(m,
                                    r,
                                    monte_carlo_sample_objects=[xl],
                                    output_objects=[xl,r],
                                    monte_carlo_temperature=1.0,
                 		            simulated_annealing=True,
                                    simulated_annealing_minimum_temperature=1.0,
                                    simulated_annealing_maximum_temperature=5.0,
                                    simulated_annealing_minimum_temperature_nframes=100,
                                    simulated_annealing_maximum_temperature_nframes=10,
                                    replica_exchange_minimum_temperature=1.0,
                                    replica_exchange_maximum_temperature=2.5,
                                    number_of_best_scoring_models=0,
                                    monte_carlo_steps=10,
                                    number_of_frames=1000,
                                    write_initial_rmf=True,
                                    initial_rmf_name_suffix="initial",
                                    stat_file_name_suffix="stat",
                                    best_pdb_name_suffix="model",
                                    do_clean_first=True,
                                    do_create_directories=True,
                                    global_output_directory="output",
                                    rmf_dir="rmfs/",
                                    best_pdb_dir="pdbs/",
                                    replica_stat_file_suffix="stat_replica")
mc1.execute_macro()
rex=mc1.get_replica_exchange_object()
'''

In [19]:
# Once the sampling is done let's print the values of psi and sigma

p=IMP.pmi.output.ProcessOutput("output/stat.0.out")
fs=p.get_fields(['CrossLinkingMassSpectrometryRestraint_Psi_PSI_XL','CrossLinkingMassSpectrometryRestraint_Sigma_SIGMA_XL','SimplifiedModel_Total_Score_None'])
psi_values=[float(v) for v in fs['CrossLinkingMassSpectrometryRestraint_Psi_PSI_XL']]
sigma_values=[float(v) for v in fs['CrossLinkingMassSpectrometryRestraint_Sigma_SIGMA_XL']]
score_values=[float(v) for v in fs['SimplifiedModel_Total_Score_None']]

In [19]:


In [20]:
import matplotlib.pyplot as plt

# Generate fake data
x = psi_values
y = sigma_values

z = score_values

fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=10, edgecolor='')
plt.show()



In [21]:
# Now we can try to optimize the values of PSI and SIGMA, and see what is the best scoring value
# fixing the coordinate of ProtA to a minimum

import math
import numpy as np

xl.add_to_model()
xl.set_sigma_is_sampled(True)
xl.set_psi_is_sampled(True)
hierarchyA=IMP.pmi.tools.select_by_tuple(r,"ProtA")
pA=hierarchyA[0].get_particle()
IMP.core.XYZ(pA).set_coordinates((0,0,0))

hierarchyB1=IMP.pmi.tools.select_by_tuple(r,(1,1,"ProtB"))
pB1=hierarchyB1[0].get_particle()
hierarchyB2=IMP.pmi.tools.select_by_tuple(r,(11,11,"ProtB"))
pB2=hierarchyB2[0].get_particle()
hierarchyB3=IMP.pmi.tools.select_by_tuple(r,(21,31,"ProtB"))
pB3=hierarchyB3[0].get_particle()

IMP.core.XYZ(pB1).set_coordinates((0,20,0))
IMP.core.XYZ(pB2).set_coordinates((-20*math.sqrt(3)/2,-20/2,0))
IMP.core.XYZ(pB3).set_coordinates((20*math.sqrt(3)/2,-20/2,0))

scores=[]
psis=[]
sigmas=[]
for p in np.linspace(0.01, 0.5, 100):
    psi.set_scale(p)
    for s in np.linspace(1, 40, 50):
        psis.append(p)
        sigmas.append(s)
        sigma.set_scale(s)
        scores.append(xl.rs.unprotected_evaluate(None))

import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots()
ax.scatter(psis, sigmas, c=scores, s=10, edgecolor='')
plt.show()

# there is a minimum when PSI is close to zero and sigma is between 0 and 10



In [22]:
# If we move ProtA away, so that any cross-link is satisfied

IMP.core.XYZ(pA).set_coordinates((100,100,100))
scores=[]
psis=[]
sigmas=[]
for p in np.linspace(0.01, 0.5, 100):
    psi.set_scale(p)
    for s in np.linspace(1, 40, 50):
        psis.append(p)
        sigmas.append(s)
        sigma.set_scale(s)
        scores.append(xl.rs.unprotected_evaluate(None))

import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots()
ax.scatter(psis, sigmas, c=scores, s=10, edgecolor='')
plt.show()

# The minimum is at Psi=0.5, irrespectively of the value of Sigma