In [ ]:
## \example pmi/setup_cross_link_ms_restraint.py
In [1]:
from __future__ import print_function
"""In this example we explore the ambiguity of XL-MS data.
"""
import IMP
import IMP.pmi
import IMP.pmi.representation
import IMP.pmi.io
import IMP.pmi.io.crosslink
import IMP.pmi.restraints
import IMP.pmi.restraints.crosslinking_new
# Model instance that the representation in this example is built on.
m = IMP.Model()
In [1]:
# There are several ways one can deal with ambiguity
# Let's define the possible ambiguities:
# 1) crosslink identification ambiguity
# 2) compositional ambiguity
# 3) state ambiguity
In [2]:
# Build the representation of the toy system with PMI. It has two proteins:
#   * ProtA: one bead, coarse-graining residues 1 to 10
#   * ProtB: three beads, coarse-graining residues 1 to 10, 11 to 20
#     and 21 to 30
r = IMP.pmi.representation.Representation(m)

r.create_component("ProtA")
r.add_component_beads("ProtA", [(1, 10)], incoord=(0, 0, 0))

r.create_component("ProtB")
# One (residue range, incoord) pair per ProtB bead, in creation order.
for bead_residues, bead_incoord in (
    ((1, 10), (-40, 0, 0)),
    ((11, 20), (0, 0, 0)),
    ((21, 30), (40, 0, 0)),
):
    r.add_component_beads("ProtB", [bead_residues], incoord=bead_incoord)

r.set_floppy_bodies()
In [3]:
# Identification ambiguity can be modelled in several ways. Here we use the
# UniqueID column: cross-links sharing the same UniqueID are considered
# ambiguous. In the table below, ProtA:1-ProtB:10 and ProtA:1-ProtB:11 are
# ambiguous because both rows carry UniqueID 1.
xldb = (
    "Protein 1,Protein 2,Residue 1,Residue 2,UniqueID,Score\n"
    "ProtA,ProtB,1,10,1,1.0\n"
    "ProtA,ProtB,1,11,1,2.0\n"
    "ProtA,ProtB,1,21,2,2.0\n"
)
In [4]:
# Let's save the text into a file, as it should be. The `with` block closes
# the file even if the write raises, so the handle is never leaked.
with open('xlinks.csv', 'w') as xlf:
    xlf.write(xldb)
In [5]:
# Create the conversion map between the internal keywords for cross-link
# features and the column headers used in the file.
cldbkc = IMP.pmi.io.crosslink.CrossLinkDataBaseKeywordsConverter()
for key_setter, csv_column in (
    (cldbkc.set_protein1_key, "Protein 1"),
    (cldbkc.set_protein2_key, "Protein 2"),
    (cldbkc.set_residue1_key, "Residue 1"),
    (cldbkc.set_residue2_key, "Residue 2"),
    (cldbkc.set_unique_id_key, "UniqueID"),
    (cldbkc.set_id_score_key, "Score"),
):
    key_setter(csv_column)
In [6]:
# Read the cross-link database from the CSV file, using the keyword
# converter to interpret its columns.
cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(cldbkc)
cldb.create_set_from_file("xlinks.csv")
In [7]:
# Print the database to check that it looks OK.
# There are two unique indexes, 1 and 2. The first one holds two
# identifications, with subindexes 1.1 and 1.2, corresponding to the two
# ambiguous restraints.
print(cldb)
In [15]:
# Compositional ambiguity occurs when identical copies of the same protein are present
# in the sample, and we are not able to attribute the cross-link to one or the other copy.
# Furthermore, to complicate the example, let's suppose that we already have an identification ambiguity,
# and see how the two ambiguities combine with each other. See the data below, and note that two crosslinks
# have the same UniqueID
# Cross-link table for the compositional-ambiguity example. The first two
# rows share UniqueID 1.
xldb='''Protein 1,Protein 2,Residue 1,Residue 2,UniqueID,Score
ProtA,ProtB,1,10,1,1.0
ProtA,ProtB,1,11,1,2.0
ProtB,ProtA,21,1,2,2.0
ProtA,ProtA,1,2,3,3.0
'''
# First write the table to the CSV file that the database is created from.
# The `with` block closes the file even if the write raises, so the handle
# is never leaked.
with open('xlinks.csv', 'w') as xlf:
    xlf.write(xldb)
# Map the internal cross-link keywords onto the CSV column headers, then
# build the database from the file.
cldbkc = IMP.pmi.io.crosslink.CrossLinkDataBaseKeywordsConverter()
for key_setter, csv_column in (
    (cldbkc.set_protein1_key, "Protein 1"),
    (cldbkc.set_protein2_key, "Protein 2"),
    (cldbkc.set_residue1_key, "Residue 1"),
    (cldbkc.set_residue2_key, "Residue 2"),
    (cldbkc.set_unique_id_key, "UniqueID"),
    (cldbkc.set_id_score_key, "Score"),
):
    key_setter(csv_column)

cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(cldbkc)
cldb.create_set_from_file("xlinks.csv")
import operator
from IMP.pmi.io.crosslink import FilterOperator as FO

# We know there are two copies of ProtA, called ProtA.1 and ProtA.2 in our
# IMP.Hierarchy. Start by renaming ProtA to ProtA.1 at both ends of every
# cross-link: first wherever it appears as protein 1 ...
fo1 = FO(cldb.protein1_key, operator.eq, "ProtA")
cldb.set_value(cldb.protein1_key, "ProtA.1", fo1)
# ... then wherever it appears as protein 2.
fo2 = FO(cldb.protein2_key, operator.eq, "ProtA")
cldb.set_value(cldb.protein2_key, "ProtA.1", fo2)

# Next, clone every cross-link involving ProtA.1 so that it is also
# observed for ProtA.2.
cldb.clone_protein("ProtA.1", "ProtA.2")

# Print the database to check that it looks OK.
# There are three unique indexes, 1, 2 and 3. The first contains four
# cross-links, the second two cross-links and the third four cross-links.
print(cldb)