This notebook tries to provides a nearly complete mapping between a W3C prov standard based provenance descriptions and a Neo4j graph representation.
The approach taken is as follows:
Next steps are:
In the following PROV-JSON documents are used as a compact way to
The generic prov graph model is based on:
see the following grahical representatin (taken from https://www.w3.org/TR/prov-overview/)
In [94]:
from IPython.display import display, Image
Image(filename='key-concepts.png')
Out[94]:
In [95]:
from prov.model import ProvDocument
d1 = ProvDocument()
In [19]:
%%writefile wps-prov.json
{
"prefix": {
"enes": "http://www.enes.org/enes_entitiy/",
"workflow": "http://www.enes.org/enes/workflow/#",
"dc": "http://dublin-core.org/",
"user": "http://www.enes.org/enes_entity/user/",
"file": "http://www.enes.org/enes_entity/file/",
"esgf": "http://carbon.dkrz.de/file/cmip5/",
"enes_data": "http://enes_org/enes_data#"
},
"entity": {
"enes:input-data-set.nc": {
"dc:title": "eval_series_1",
"prov:type": "Dataset",
"prov:label": "experiment-mpi-m"
},
"enes:temporal-mean-result1-v1.nc": {
"dc:title": "eval_series_1_1"
}
},
"wasDerivedFrom": {
"enes:process-step1": {
"prov:usedEntity": "enes:input-data-set.nc",
"prov:generatedEntity": "enes:temporal-mean-result1-v1.nc"
}
},
"activity": {
"workflow:temporal-mean-cdo": {
}
},
"used": {
"enes:used-rel1": {
"prov:entity": "enes:input-data-set.nc",
"prov:activity": "workflow:temporal-mean-cdo"
}
},
"wasGeneratedBy": {
"enes:gen-rel1": {
"prov:entity": "enes:temporal-mean-result1-v1.nc",
"prov:activity": "workflow:temporal-mean-cdo"
}
},
"agent": {
"enes:Stephan Kindermann": {}
},
"wasAttributedTo": {
"enes:data-generator-rel1": {
"prov:entity": "enes:temporal-mean-result1-v1.nc",
"prov:agent": "enes:Stephan Kindermann"
}
}
}
In [96]:
d2 = ProvDocument.deserialize('wps-prov.json')
In [83]:
xml_result = d2.serialize(format='xml')
In [90]:
%%writefile wps-prov2.xml
<?xml version=1.0 encoding=ASCII?>\n<prov:document xmlns:dc="http://dublin-core.org/" xmlns:enes="http://www.enes.org/enes_entitiy/" xmlns:enes_data="http://enes_org/enes_data#" xmlns:esgf="http://carbon.dkrz.de/file/cmip5/" xmlns:file="http://www.enes.org/enes_entity/file/" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:user="http://www.enes.org/enes_entity/user/" xmlns:workflow="http://www.enes.org/enes/workflow/#" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n <prov:wasDerivedFrom prov:id="enes:process-step1">\n <prov:generatedEntity prov:ref="enes:temporal-mean-result1-v1.nc"/>\n <prov:usedEntity prov:ref="enes:input-data-set.nc"/>\n </prov:wasDerivedFrom>\n <prov:used prov:id="enes:used-rel1">\n <prov:activity prov:ref="workflow:temporal-mean-cdo"/>\n <prov:entity prov:ref="enes:input-data-set.nc"/>\n </prov:used>\n <prov:wasAttributedTo prov:id="enes:data-generator-rel1">\n <prov:entity prov:ref="enes:temporal-mean-result1-v1.nc"/>\n <prov:agent prov:ref="enes:Stephan Kindermann"/>\n </prov:wasAttributedTo>\n <prov:agent prov:id="enes:Stephan Kindermann"/>\n <prov:entity prov:id="enes:temporal-mean-result1-v1.nc">\n <dc:title>eval_series_1_1</dc:title>\n </prov:entity>\n <prov:entity prov:id="enes:input-data-set.nc">\n <prov:label>experiment-mpi-m</prov:label>\n <prov:type xsi:type="xsd:string">Dataset</prov:type>\n <dc:title>eval_series_1</dc:title>\n </prov:entity>\n <prov:activity prov:id="workflow:temporal-mean-cdo"/>\n <prov:wasGeneratedBy prov:id="enes:gen-rel1">\n <prov:entity prov:ref="enes:temporal-mean-result1-v1.nc"/>\n <prov:activity prov:ref="workflow:temporal-mean-cdo"/>\n </prov:wasGeneratedBy>\n</prov:document>
In [93]:
d_xml_test = ProvDocument.deserialize('wps-prov2.xml',format='xml')
In [97]:
print d2.serialize(indent=2)
In [98]:
def visualize_prov(prov_doc):
from prov.dot import prov_to_dot
from IPython.display import Image
dot = prov_to_dot(prov_doc)
dot.write_png('tmp1.png')
dot.write_pdf('tmp1.pdf')
return Image('tmp1.png')
In [99]:
visualize_prov(d2)
Out[99]:
The transformation code is based on the prov_to_dot() function in the dot.py package of the prov python package mentioned above ( https://github.com/trungdong/prov ). The code was simplified and modified to generate neo4j nodes and relation instead of dot nodes and relations.
In [100]:
## d2 graph is input parameter for this cell ..
import six
from py2neo import Graph, Node, Relationship, authenticate
node_map = {}
count = [0, 0, 0, 0] # counters for node ids
records = d2.get_records()
relations = []
use_labels = True
show_relation_attributes = True
other_attributes = True
show_nary = True
def _add_node(record):
count[0] += 1
node_id = 'n%d' % count[0]
if use_labels:
if record.label == record.identifier:
node_label = '"%s"' % six.text_type(record.label)
else:
# Fancier label if both are different. The label will be
# the main node text, whereas the identifier will be a
# kind of suptitle.
node_label = six.text_type(record.label)+','+six.text_type(record.identifier)
else:
node_label = six.text_type(record.identifier)
uri = record.identifier.uri
node = Node(node_id, label=node_label, URL=uri)
node_map[uri] = node
## create Node ... ##dot.add_node(node)
return node
def _get_node(qname):
if qname is None:
print "ERROR: _get_node called for empty node"
#return _get_bnode()
uri = qname.uri
if uri not in node_map:
_add_generic_node(qname)
return node_map[uri]
In [103]:
from prov.model import (
PROV_ACTIVITY, PROV_AGENT, PROV_ALTERNATE, PROV_ASSOCIATION,
PROV_ATTRIBUTION, PROV_BUNDLE, PROV_COMMUNICATION, PROV_DERIVATION,
PROV_DELEGATION, PROV_ENTITY, PROV_GENERATION, PROV_INFLUENCE,
PROV_INVALIDATION, PROV_END, PROV_MEMBERSHIP, PROV_MENTION,
PROV_SPECIALIZATION, PROV_START, PROV_USAGE, Identifier,
PROV_ATTRIBUTE_QNAMES, sorted_attributes, ProvException
)
for rec in records:
if rec.is_element():
_add_node(rec)
else:
# Saving the relations for later processing
relations.append(rec)
neo_rels = []
for rec in relations:
args = rec.args
# skipping empty records
if not args:
continue
# picking element nodes
nodes = [
value for attr_name, value in rec.formal_attributes
if attr_name in PROV_ATTRIBUTE_QNAMES
]
other_attributes = [
(attr_name, value) for attr_name, value in rec.attributes
if attr_name not in PROV_ATTRIBUTE_QNAMES
]
add_attribute_annotation = (
show_relation_attributes and other_attributes
)
add_nary_elements = len(nodes) > 2 and show_nary
if len(nodes) < 2: # too few elements for a relation?
continue # cannot draw this
if add_nary_elements or add_attribute_annotation:
# a blank node for n-ary relations or the attribute annotation
# the first segment
rel = Relationship(_get_node(nodes[0]), rec.get_type()._str,_get_node(nodes[1]))
print "relationship: ",rel
neo_rels.append(rel)
if add_nary_elements:
for node in nodes[2:]:
if node is not None:
relx = Relationshipdot.ad(_get_node(nodes[0]), "...rel_name",_get_node(node))
neo_rels.append(rel)
else:
# show a simple binary relations with no annotation
rel = Relationship(_get_node(nodes[0]), rec.get_type()._str,_get_node(nodes[1]))
neo_rels.append(rel)
In [102]:
print node_map
print neo_rels
In [61]:
authenticate("localhost:7474", "neo4j", "prolog16")
# connect to authenticated graph database
graph = Graph("http://localhost:7474/db/data/")
graph.delete_all()
In [62]:
for rel in neo_rels:
graph.create(rel)
In [ ]:
%load_ext cypher
In [68]:
results = %cypher http://neo4j:prolog16@localhost:7474/db/data MATCH (a)-[r]-(b) RETURN a,r, b
%matplotlib inline
results.get_graph()
Out[68]:
In [69]:
results.draw()
Out[69]:
In [ ]:
# example info calls on nodes and relations ..
der = d2.get_records()[0]
print der.get_type()._str + "tst"
print der.attributes
print der.is_relation()
print der.label
print der.value
print der.args
print der.is_element()
print der.formal_attributes
print der.get_asserted_types()
print der.get_provn