This notebook demonstrates how to query associations from IMPC, including evidence and provenance modeled with SEPIO.
To set up:
pip install dipper jupyter ipython
In [17]:
from dipper.graph.RDFGraph import RDFGraph
# Location of the IMPC Turtle file to load
impc_graph = "https://data.monarchinitiative.org/ttl/impc.ttl"

# Parse the RDF file into a fresh graph (this takes a minute or two)
graph = RDFGraph()
graph.parse(impc_graph, format='turtle')
Out[17]:
In [18]:
# Tally the subjects returned for the has_phenotype (RO:0002200) predicate.
# NOTE(review): presumably one subject is yielded per matching triple, so a
# subject with several phenotypes would be counted more than once -- confirm.
has_phenotype = graph._getNode("RO:0002200")
sum(1 for subject in graph.subjects(predicate=has_phenotype))
Out[18]:
In [19]:
# Tally the nodes whose rdf:type is OBAN:association
from rdflib.namespace import RDF
association = graph._getNode("OBAN:association")
sum(1 for assoc in graph.subjects(RDF.type, association))
Out[19]:
In [20]:
# Tally the nodes typed as mutant phenotype evidence (ECO:0000015)
mut_pheno_evidence = graph._getNode("ECO:0000015")
sum(1 for evidence_line in graph.subjects(RDF.type, mut_pheno_evidence))
Out[20]:
Given a genotype label and a phenotype ID (first for a single pair, then for a list of pairs), create a subgraph containing the 'has phenotype' relation and all of its evidence.
In [84]:
# Bind all namespaces in curie_map for sparql
graph.bind_all_namespaces()

# The genotype is matched by its rdfs:label, the phenotype by its CURIE.
# Both values are pasted into the query text as-is, so a label containing a
# single quote would need escaping first.
genotype = "Ankrd13a<Gt(RRH308)Byg>/Ankrd13a<Gt(RRH308)Byg> [CBA/Ca;129P2-WTSI-Ankrd13aAnkrd13a<Gt(RRH308)Byg>] (female)"
phenotype = "MP:0001399"

# Graph pattern covering the has_phenotype statement, its association node,
# and the assertion / creator / evidence line / measure / study nodes hanging
# off that association.
query = """
?genotype rdfs:label '{0}' .
?genotype RO:0002200 {1} .
?assoc a ?assoc_type ;
OBAN:association_has_object {1} ;
OBAN:association_has_predicate RO:0002200 ;
OBAN:association_has_subject ?genotype ;
RO:0002558 ECO:0000015 ;
SEPIO:0000007 ?evidenceline ;
SEPIO:0000015 ?assertion .
?assertion SEPIO:0000018 ?creator ;
SEPIO:0000111 ?evidenceline .
?assertion a ?assertion_type .
?creator ?creator_predicates ?creator_objects .
?evidenceline SEPIO:0000084 ?measure1 ;
SEPIO:0000085 ?study .
?evidenceline a ?ev_type .
?measure1 ?measure_predicates ?measure_objects .
?study ?study_predicates ?study_objects .
?study_objects ?stud_p ?stud_o .
?measure_objects ?meas_p ?meas_o .
""".format(genotype, phenotype)

# The same pattern is used as the CONSTRUCT template and the WHERE clause, so
# every matched triple is returned unchanged.
sparql_query = """
CONSTRUCT {{
{0}
}}
WHERE {{
{0}
}}
""".format(query)
sparql_output = graph.query(sparql_query)

# Copy the matched triples into a standalone graph
subGraph = RDFGraph()
for triple in sparql_output:
    subGraph.add(triple)
subGraph.bind("OBAN", "http://purl.org/oban/")

# serialize() returns bytes on older rdflib releases and str on rdflib >= 6;
# decode only when bytes come back so the cell runs on both.
turtle_output = subGraph.serialize(format='turtle')
if isinstance(turtle_output, bytes):
    turtle_output = turtle_output.decode("utf-8")
print(turtle_output)
In [91]:
# Turn this into a function

# Graph that collects the triples extracted for every genotype/phenotype pair
subGraph = RDFGraph()
subGraph.bind("OBAN", "http://purl.org/oban/")

# Graph pattern with two str.format() slots: {0} is the genotype label and
# {1} is the phenotype
template = """
?genotype rdfs:label '{0}' .
?genotype RO:0002200 {1} .
?assoc a ?assoc_type ;
OBAN:association_has_object {1} ;
OBAN:association_has_predicate RO:0002200 ;
OBAN:association_has_subject ?genotype ;
RO:0002558 ECO:0000015 ;
SEPIO:0000007 ?evidenceline ;
SEPIO:0000015 ?assertion .
?assertion SEPIO:0000018 ?creator ;
SEPIO:0000111 ?evidenceline .
?assertion a ?assertion_type .
?creator ?creator_predicates ?creator_objects .
?evidenceline SEPIO:0000084 ?measure1 ;
SEPIO:0000085 ?study .
?evidenceline a ?ev_type .
?measure1 ?measure_predicates ?measure_objects .
?study ?study_predicates ?study_objects .
?study_objects ?stud_p ?stud_o .
?measure_objects ?meas_p ?meas_o .
"""
def create_subgraph(query, graph, new_graph):
    """Copy every triple matched by a SPARQL graph pattern into ``new_graph``.

    ``query`` is placed in both the CONSTRUCT template and the WHERE clause of
    a single SPARQL query, which is then run against ``graph``.

    :param query: SPARQL graph pattern (the text that goes between the braces)
    :param graph: object whose ``query()`` method runs the SPARQL text and
        returns an iterable of triples
    :param new_graph: object whose ``add()`` method receives each triple
    :return: None; ``new_graph`` is modified in place
    """
    wrapper = "\nCONSTRUCT {{\n{pattern}\n}}\nWHERE {{\n{pattern}\n}}\n"
    construct_query = wrapper.format(pattern=query)
    for matched_triple in graph.query(construct_query):
        new_graph.add(matched_triple)
import os

# Genotype label / phenotype ID pairs whose associations and evidence are
# pulled into the subgraph
g2p_list = [
    ["Ankrd13a<Gt(RRH308)Byg>/Ankrd13a<Gt(RRH308)Byg> [CBA/Ca;129P2-WTSI-Ankrd13aAnkrd13a<Gt(RRH308)Byg>] (female)",
     "MP:0001399"
    ],
    ["Ankrd13a<Gt(RRH308)Byg>/Ankrd13a<Gt(RRH308)Byg> [CBA/Ca;129P2-WTSI-Ankrd13aAnkrd13a<Gt(RRH308)Byg>] (male)",
     "MP:0001399"
    ],
    ["Mapkap1<tm1b(EUCOMM)Wtsi>/Mapkap1<+> [C57BL/6N-BCM-Mapkap1 EPD0609_2_F05-B] (female)",
     "MP:0002753"
    ],
    ["Hbs1l<tm1a(KOMP)Wtsi>/Hbs1l<tm1a(KOMP)Wtsi> [C57BL/6N-WTSI-METC] (female)",
     "MP:0005292"
    ],
    ["Gnao1<tm1b(EUCOMM)Hmgu>/Gnao1<+> [C57BL/6NTac-MRC Harwell-H-GNAO1-G05-TM1B] (male)",
     "MP:0001399"
    ]
]

# Fill the template for each pair and add the matching triples to subGraph
for genotype_label, phenotype_id in g2p_list:
    query = template.format(genotype_label, phenotype_id)
    create_subgraph(query, graph, subGraph)

# Write to the current user's home directory instead of one developer's
# hard-coded /home path, so the cell also runs on other machines.
output_path = os.path.expanduser("~/impc_test.ttl")
subGraph.serialize(output_path, format="turtle")
In [ ]: