In [ ]:
## \example pmi/setup_cross_link_ms_restraint.py

In [1]:
from __future__ import print_function

"""In this example we explore the ambiguity of XL-MS data.
"""
import IMP
import IMP.pmi
import IMP.pmi.representation
import IMP.pmi.io
import IMP.pmi.io.crosslink
import IMP.pmi.restraints
import IMP.pmi.restraints.crosslinking_new

# Model instance that is passed to the PMI representation created below.
m = IMP.Model()

How to deal with ambiguity


In [1]:
# There are several ways one can deal with ambiguity
# Let's define the possible ambiguities:
# 1) crosslink identification ambiguity
# 2) compositional ambiguity
# 3) state ambiguity

In [2]:
# Build the system representation with PMI. It holds two proteins:
#   * ProtA: one bead that coarse-grains residues 1 to 10
#   * ProtB: three beads that coarse-grain residues 1 to 10, 11 to 20 and 21 to 30
r = IMP.pmi.representation.Representation(m)

r.create_component("ProtA")
r.add_component_beads("ProtA", [(1, 10)], incoord=(0, 0, 0))

r.create_component("ProtB")
# Each entry pairs a bead's residue range with the `incoord` value it is created with;
# the beads are added in the listed order.
for residue_range, bead_incoord in (
    ((1, 10), (-40, 0, 0)),
    ((11, 20), (0, 0, 0)),
    ((21, 30), (40, 0, 0)),
):
    r.add_component_beads("ProtB", [residue_range], incoord=bead_incoord)

r.set_floppy_bodies()

In [3]:
# Identification ambiguity can be modelled in several ways.
#
# Here we use the UniqueID column: cross-links that share the same UniqueID
# are treated as ambiguous with respect to each other.

xldb = (
    "Protein 1,Protein 2,Residue 1,Residue 2,UniqueID,Score\n"
    "ProtA,ProtB,1,10,1,1.0\n"
    "ProtA,ProtB,1,11,1,2.0\n"
    "ProtA,ProtB,1,21,2,2.0\n"
)

# In the table above, cross-links ProtA:1-ProtB:10 and ProtA:1-ProtB:11
# are ambiguous because both were assigned UniqueID 1.

In [4]:
# Save the cross-link table to disk, since the database is later loaded from
# the file 'xlinks.csv'.
# The `with` block closes the file handle even if the write raises,
# so the data is flushed and the descriptor is never leaked.
with open('xlinks.csv', 'w') as xlf:
    xlf.write(xldb)

In [5]:
# Create the conversion map between the internal keywords for cross-link features
# and the column headers used in the file.

cldbkc = IMP.pmi.io.crosslink.CrossLinkDataBaseKeywordsConverter()

# (setter, column header in xlinks.csv) pairs, applied in the listed order.
for register_key, column_header in (
    (cldbkc.set_protein1_key, "Protein 1"),
    (cldbkc.set_protein2_key, "Protein 2"),
    (cldbkc.set_residue1_key, "Residue 1"),
    (cldbkc.set_residue2_key, "Residue 2"),
    (cldbkc.set_unique_id_key, "UniqueID"),
    (cldbkc.set_id_score_key, "Score"),
):
    register_key(column_header)

In [6]:
# Using this keyword converter, read the cross-link database from the CSV file.

cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(cldbkc)
cldb.create_set_from_file("xlinks.csv")

In [7]:
# Print the database to check that it looks OK.
print(cldb)
# As the output shows, there are two unique indexes, 1 and 2. The first unique index contains
# two identifications, with sub-IDs 1.1 and 1.2, corresponding to the two ambiguous cross-links.


1
--- XLUniqueID 1
--- XLUniqueSubIndex 1
--- XLUniqueSubID 1.1
--- Protein1 ProtA
--- Protein2 ProtB
--- Residue1 1
--- Residue2 10
--- IDScore 1.0
--- Redundancy 1
--- RedundancyList ['1.1']
-------------
--- XLUniqueID 1
--- XLUniqueSubIndex 2
--- XLUniqueSubID 1.2
--- Protein1 ProtA
--- Protein2 ProtB
--- Residue1 1
--- Residue2 11
--- IDScore 2.0
--- Redundancy 1
--- RedundancyList ['1.2']
-------------
2
--- XLUniqueID 2
--- XLUniqueSubIndex 1
--- XLUniqueSubID 2.1
--- Protein1 ProtA
--- Protein2 ProtB
--- Residue1 1
--- Residue2 21
--- IDScore 2.0
--- Redundancy 1
--- RedundancyList ['2.1']
-------------

2) compositional ambiguity


In [15]:
# Compositional ambiguity occurs when identical copies of the same protein are present
# in the sample and a cross-link cannot be attributed to one copy or the other.

# To complicate the example, the data below also carries an identification ambiguity
# (two cross-links share the same UniqueID), so we can see how the two kinds of
# ambiguity combine with each other.

xldb = (
    "Protein 1,Protein 2,Residue 1,Residue 2,UniqueID,Score\n"
    "ProtA,ProtB,1,10,1,1.0\n"
    "ProtA,ProtB,1,11,1,2.0\n"
    "ProtB,ProtA,21,1,2,2.0\n"
    "ProtA,ProtA,1,2,3,3.0\n"
)

# First we write the table to 'xlinks.csv', the file the database is created from.
# The `with` block closes the file handle even if the write raises,
# so the data is flushed and the descriptor is never leaked.

with open('xlinks.csv', 'w') as xlf:
    xlf.write(xldb)

# Map the internal cross-link keywords onto the column headers of the CSV file.
cldbkc = IMP.pmi.io.crosslink.CrossLinkDataBaseKeywordsConverter()

# (setter, column header in xlinks.csv) pairs, applied in the listed order.
for register_key, column_header in (
    (cldbkc.set_protein1_key, "Protein 1"),
    (cldbkc.set_protein2_key, "Protein 2"),
    (cldbkc.set_residue1_key, "Residue 1"),
    (cldbkc.set_residue2_key, "Residue 2"),
    (cldbkc.set_unique_id_key, "UniqueID"),
    (cldbkc.set_id_score_key, "Score"),
):
    register_key(column_header)

# Load the cross-links from the file using that converter.
cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(cldbkc)
cldb.create_set_from_file("xlinks.csv")

import operator
from IMP.pmi.io.crosslink import FilterOperator as FO

# We know there are two copies of ProtA, called ProtA.1 and ProtA.2 in our IMP.Hierarchy.
# Step 1: rename ProtA to ProtA.1 at both ends of every cross-link. Each filter
# selects the entries whose protein key equals "ProtA".

fo1 = FO(cldb.protein1_key, operator.eq, "ProtA")
cldb.set_value(cldb.protein1_key, "ProtA.1", fo1)

fo2 = FO(cldb.protein2_key, operator.eq, "ProtA")
cldb.set_value(cldb.protein2_key, "ProtA.1", fo2)

# Step 2: clone every cross-link involving ProtA.1 so that it is also listed for ProtA.2.

cldb.clone_protein("ProtA.1", "ProtA.2")

# Print the database to check that it looks OK.
print(cldb)
# As the output shows, there are three unique indexes: 1, 2 and 3. The first index contains four cross-links,
# the second two cross-links and the third four cross-links.


1
--- XLUniqueID 1
--- XLUniqueSubIndex 1
--- XLUniqueSubID 1.1
--- Protein1 ProtA.1
--- Protein2 ProtB
--- Residue1 1
--- Residue2 10
--- IDScore 1.0
--- Redundancy 1
--- RedundancyList ['1.1']
-------------
--- XLUniqueID 1
--- XLUniqueSubIndex 2
--- XLUniqueSubID 1.2
--- Protein1 ProtA.2
--- Protein2 ProtB
--- Residue1 1
--- Residue2 10
--- IDScore 1.0
--- Redundancy 1
--- RedundancyList ['1.2']
-------------
--- XLUniqueID 1
--- XLUniqueSubIndex 3
--- XLUniqueSubID 1.3
--- Protein1 ProtA.1
--- Protein2 ProtB
--- Residue1 1
--- Residue2 11
--- IDScore 2.0
--- Redundancy 1
--- RedundancyList ['1.3']
-------------
--- XLUniqueID 1
--- XLUniqueSubIndex 4
--- XLUniqueSubID 1.4
--- Protein1 ProtA.2
--- Protein2 ProtB
--- Residue1 1
--- Residue2 11
--- IDScore 2.0
--- Redundancy 1
--- RedundancyList ['1.4']
-------------
2
--- XLUniqueID 2
--- XLUniqueSubIndex 1
--- XLUniqueSubID 2.1
--- Protein1 ProtB
--- Protein2 ProtA.1
--- Residue1 21
--- Residue2 1
--- IDScore 2.0
--- Redundancy 1
--- RedundancyList ['2.1']
-------------
--- XLUniqueID 2
--- XLUniqueSubIndex 2
--- XLUniqueSubID 2.2
--- Protein1 ProtB
--- Protein2 ProtA.2
--- Residue1 21
--- Residue2 1
--- IDScore 2.0
--- Redundancy 1
--- RedundancyList ['2.2']
-------------
3
--- XLUniqueID 3
--- XLUniqueSubIndex 1
--- XLUniqueSubID 3.1
--- Protein1 ProtA.1
--- Protein2 ProtA.1
--- Residue1 1
--- Residue2 2
--- IDScore 3.0
--- Redundancy 1
--- RedundancyList ['3.1']
-------------
--- XLUniqueID 3
--- XLUniqueSubIndex 2
--- XLUniqueSubID 3.2
--- Protein1 ProtA.2
--- Protein2 ProtA.1
--- Residue1 1
--- Residue2 2
--- IDScore 3.0
--- Redundancy 1
--- RedundancyList ['3.2']
-------------
--- XLUniqueID 3
--- XLUniqueSubIndex 3
--- XLUniqueSubID 3.3
--- Protein1 ProtA.1
--- Protein2 ProtA.2
--- Residue1 1
--- Residue2 2
--- IDScore 3.0
--- Redundancy 1
--- RedundancyList ['3.3']
-------------
--- XLUniqueID 3
--- XLUniqueSubIndex 4
--- XLUniqueSubID 3.4
--- Protein1 ProtA.2
--- Protein2 ProtA.2
--- Residue1 1
--- Residue2 2
--- IDScore 3.0
--- Redundancy 1
--- RedundancyList ['3.4']
-------------