Representation of data submission workflow components based on W3C-PROV


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from prov.model import ProvDocument
d1 = ProvDocument()

Model is along the concept described in https://www.w3.org/TR/prov-primer/


In [3]:
from IPython.display import display, Image
Image(filename='key-concepts.png')


Out[3]:

In [7]:
import sys

#sys.path.append('/home/stephan/Repos/ENES-EUDAT/submission_forms')
sys.path.append('C:\\Users\\Stephan Kindermann\\Documents\\GitHub\\submission_forms')
from dkrz_forms import form_handler
from dkrz_forms.config import *

name_space = project_config.NAME_SPACES
cordex_dict = project_config.PROJECT_DICT['test']
# add namespaces for submission provenance capture

for key,value in name_space.iteritems():
    d1.add_namespace(key,value)

#d1.add_namespace()
# to do: look into some predefined vocabs, e.g. dublin core, iso19139,foaf  etc.

d1.add_namespace("enes_entity",'http://www.enes.org/enes_entitiy#')
d1.add_namespace('enes_agent','http://www.enes.org/enes_agent#')
d1.add_namespace('data_collection','http://www.enes.org/enes_entity/file_collection')
d1.add_namespace('data_manager','http://www.enes.org/enes_agent/data_manager')
d1.add_namespace('data_provider','http://www.enes.org/enes_agent/data_provider')
d1.add_namespace('subm','http://www.enes.org/enes_entity/data_submsission')
d1.add_namespace('foaf','http://xmlns.com/foaf/0.1/')


Out[7]:
<Namespace: foaf {http://xmlns.com/foaf/0.1/}>

Example name spaces

(from DOI: 10.3390/ijgi5030038 , mehr unter https://github.com/tsunagun/vocab/blob/master/all_20130125.csv)

owl      Web Ontology Language    http://www.w3.org/2002/07/owl#
dctype   DCMI Type Vocabulary     http://purl.org/dc/dcmitype/
dco      DCO  Ontology            http://info.deepcarbon.net/schema#
prov     PROV Ontology            http://www.w3.org/ns/prov#
skos     Simple Knowledge
         Organization System      http://www.w3.org/2004/02/skos/core#
foaf     FOAF Ontology            http://xmlns.com/foaf/0.1/
vivo     VIVO Ontology            http://vivoweb.org/ontology/core#
bibo     Bibliographic Ontology   http://purl.org/ontology/bibo/
xsd      XML Schema Datatype      http://www.w3.org/2001/XMLSchema#
rdf      Resource Description
         Framework                http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs     Resource Description
         Framework Schema         http://www.w3.org/2000/01/rdf-schema#    

In [10]:
# later: organize things in bundles
data_manager_ats = {'foaf:givenName':'Peter','foaf:mbox':'lenzen@dkzr.de'}

d1.entity('sub:empty')
def add_stage(agent,activity,in_state,out_state):
    # in_stage exists, out_stage is generated
    d1.agent(agent, data_manager_ats)
    d1.activity(activity)
    d1.entity(out_state)
   
    d1.wasGeneratedBy(out_state,activity)
    d1.used(activity,in_state)
    d1.wasAssociatedWith(activity,agent)
    d1.wasDerivedFrom(out_state,in_state)

In [11]:
import json
form_file = open('/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.json',"r")
json_info = form_file.read()
#json_info["__type__"] = "sf",
form_file.close()
sf_dict = json.loads(json_info)

In [22]:
form_handler.FForm(sf_dict)


Out[22]:
<dkrz_forms.form_handler.FForm at 0x7f73e04f0cd0>

In [13]:
sf = form_handler.FForm(sf_dict)

In [14]:
print sf.__dict__


{u'project': u'test', u'qua': {u'comment': u'on lizzard ', u'status': u'checked', u'timestamp_started': u'', u'follow_up_ticket': u'', u'target_directory': u'/work/KD0956/qa_results/cmip5/mpi-m/test', u'qua_status': u'', u'ticket_id': u'', u'qa_tool_version': u'dkrz_qa_v09', u'responsible_person': u'hdh', u'timestamp_finished': u'', u'qua_comment': u''}, u'ing': {u'comment': u' copying data from ... to ... using ...  time: about 2 hours, volume: about .. GB ', u'status': u'ingested', u'timestamp_started': u'', u'target_directory': u'/work/kd0956/cmip5/ingest/cmip5/mpi-m/test', u'drsdir_file_pattern': u'', u'drs_file_pattern': u'project:cmip5 | experiment:test| variables: tua,uav', u'responsible_person': u'lenzen', u'timestamp_finished': u'', u'ticket_id': u''}, u'sub': {u'comment': u'terms of use clarified', u'last_name': u'ki', u'package_name': u'test_ki_sk1.json', u'review_comment': u'', u'form_path': u'/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.ipynb', u'package_path': u'/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.json', u'check_status': u'consistency_checked', u'responsible_person': u'pl', u'form_version': u'', u'report_ticket_subject': u'', u'first_name': u'', u'id': u'a257480a-b4d0-11e6-8358-080027f178b4', u'ticket_id': 22949, u'email': u'snkinder@freenet.de', u'status': u'submission_processing', u'key_word': u'', u'timestamp': u'2016-11-27 19:38:05.110823', u'ticket_url': u'https://dm-rt.dkrz.de/Ticket/Display.html?id=', u'repo': u'/home/stephan/tmp/Repos/submission_forms_repo/test', u'subform_path': u'/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.ipynb', u'submission_comment': u'', u'commit_hash': u'e6d6633ff6f4d904e3a54af891f6fe7f9b742508', u'keyword': u'sk1', u'source_path': u'/home/stephan/Repos/ENES-EUDAT/submission_forms/dkrz_forms/Templates/test_submission_form.ipynb', u'checks_done': u'none', u'submission_method': u'', u'form_name': u'test_ki_sk1', u'review_summary': u''}, u'pub': {u'comment': u'', u'status': u'published', u'pid_collections': u'', u'timestamp': u'2016-05-20 18:34:28.934536', u'facet_string': u'', u'search_string': u'&model=cmip5&experiment=test', u'publish_date': u'2016-05-20 19', u'ticket_id': u'', u'responsible_person': u'berger', u'map_file': u'host://path_to_mapfile'}}

In [15]:
print sf.sub


{u'comment': u'terms of use clarified', u'last_name': u'ki', u'package_name': u'test_ki_sk1.json', u'review_comment': u'', u'form_path': u'/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.ipynb', u'package_path': u'/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.json', u'check_status': u'consistency_checked', u'responsible_person': u'pl', u'form_version': u'', u'report_ticket_subject': u'', u'first_name': u'', u'id': u'a257480a-b4d0-11e6-8358-080027f178b4', u'ticket_id': 22949, u'email': u'snkinder@freenet.de', u'status': u'submission_processing', u'key_word': u'', u'timestamp': u'2016-11-27 19:38:05.110823', u'ticket_url': u'https://dm-rt.dkrz.de/Ticket/Display.html?id=', u'repo': u'/home/stephan/tmp/Repos/submission_forms_repo/test', u'subform_path': u'/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.ipynb', u'submission_comment': u'', u'commit_hash': u'e6d6633ff6f4d904e3a54af891f6fe7f9b742508', u'keyword': u'sk1', u'source_path': u'/home/stephan/Repos/ENES-EUDAT/submission_forms/dkrz_forms/Templates/test_submission_form.ipynb', u'checks_done': u'none', u'submission_method': u'', u'form_name': u'test_ki_sk1', u'review_summary': u''}

In [16]:
data_provider = sf.sub['first_name']+'_'+sf.sub['last_name']
submission_manager = sf.sub['responsible_person']
ingest_manager = sf.ing['responsible_person']
qa_manager = sf.ing['responsible_person']
publication_manager =  sf.pub['responsible_person']

add_stage(agent='data_provider:test_user_id',activity='subm:submit',in_state="subm:empty",out_state='subm:out1_sub')
add_stage(agent='data_manager:peter_lenzen_id',activity='subm:review',in_state="subm:out1_sub",out_state='subm:out1_rev')
add_stage(agent='data_manager:peter_lenzen_id',activity='subm:ingest',in_state="subm:out1_rev",out_state='subm:out1_ing')
add_stage(agent='data_manager:hdh_id',activity='subm:check',in_state="subm:out1_ing",out_state='subm:out1_che')
add_stage(agent='data_manager:katharina_b_id',activity='subm:publish',in_state="subm:out1_che",out_state='subm:out1_pub')
add_stage(agent='data_manager:lta_id',activity='subm:archive',in_state="subm:out1_pub",out_state='subm:out1_arch')

In [23]:
mylist = [a]
a = {'1':'2'}


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-23-ca4ab9224e27> in <module>()
----> 1 mylist = [a]
      2 a = {'1':'2'}

NameError: name 'a' is not defined

assign information to provenance graph nodes and edges


In [8]:
%matplotlib inline
d1.plot()



InvocationExceptionTraceback (most recent call last)
<ipython-input-8-1a8386be8e2d> in <module>()
      1 get_ipython().magic(u'matplotlib inline')
----> 2 d1.plot()
      3 

C:\Anaconda\envs\py2\lib\site-packages\prov\model.pyc in plot(self, filename, show_nary, use_labels, show_element_attributes, show_relation_attributes)
   1423             raise ValueError("Format '%s' cannot be saved." % format)
   1424         with io.BytesIO() as buf:
-> 1425             buf.write(getattr(d, method)())
   1426 
   1427             buf.seek(0, 0)

C:\Anaconda\envs\py2\lib\site-packages\pydotplus\graphviz.pyc in <lambda>(f, prog)
   1795             self.__setattr__(
   1796                 'create_' + frmt,
-> 1797                 lambda f=frmt, prog=self.prog: self.create(format=f, prog=prog)
   1798             )
   1799             f = self.__dict__['create_' + frmt]

C:\Anaconda\envs\py2\lib\site-packages\pydotplus\graphviz.pyc in create(self, prog, format)
   1958             if self.progs is None:
   1959                 raise InvocationException(
-> 1960                     'GraphViz\'s executables not found')
   1961 
   1962         if prog not in self.progs:

InvocationException: GraphViz's executables not found

In [44]:
d1.serialize()


Out[44]:
u'{"wasAssociatedWith": {"_:id7": {"prov:agent": "data_manager:peter_lenzen_id", "prov:activity": "subm:review"}, "_:id3": {"prov:agent": "data_provider:test_user_id", "prov:activity": "subm:submit"}, "_:id15": {"prov:agent": "data_manager:hdh_id", "prov:activity": "subm:check"}, "_:id23": {"prov:agent": "data_manager:lta_id", "prov:activity": "subm:archive"}, "_:id11": {"prov:agent": "data_manager:peter_lenzen_id", "prov:activity": "subm:ingest"}, "_:id19": {"prov:agent": "data_manager:katharina_b_id", "prov:activity": "subm:publish"}}, "wasDerivedFrom": {"_:id4": {"prov:usedEntity": "subm:empty", "prov:generatedEntity": "subm:out1_sub"}, "_:id8": {"prov:usedEntity": "subm:out1_sub", "prov:generatedEntity": "subm:out1_rev"}, "_:id20": {"prov:usedEntity": "subm:out1_che", "prov:generatedEntity": "subm:out1_pub"}, "_:id16": {"prov:usedEntity": "subm:out1_ing", "prov:generatedEntity": "subm:out1_che"}, "_:id24": {"prov:usedEntity": "subm:out1_pub", "prov:generatedEntity": "subm:out1_arch"}, "_:id12": {"prov:usedEntity": "subm:out1_rev", "prov:generatedEntity": "subm:out1_ing"}}, "used": {"_:id6": {"prov:entity": "subm:out1_sub", "prov:activity": "subm:review"}, "_:id2": {"prov:entity": "subm:empty", "prov:activity": "subm:submit"}, "_:id14": {"prov:entity": "subm:out1_ing", "prov:activity": "subm:check"}, "_:id22": {"prov:entity": "subm:out1_pub", "prov:activity": "subm:archive"}, "_:id10": {"prov:entity": "subm:out1_rev", "prov:activity": "subm:ingest"}, "_:id18": {"prov:entity": "subm:out1_che", "prov:activity": "subm:publish"}}, "agent": {"data_manager:lta_id": {"foaf:givenName": "Peter", "foaf:mbox": "lenzen@dkzr.de"}, "data_manager:katharina_b_id": {"foaf:givenName": "Peter", "foaf:mbox": "lenzen@dkzr.de"}, "data_manager:peter_lenzen_id": [{"foaf:givenName": "Peter", "foaf:mbox": "lenzen@dkzr.de"}, {"foaf:givenName": "Peter", "foaf:mbox": "lenzen@dkzr.de"}], "data_provider:test_user_id": {"foaf:givenName": "Peter", "foaf:mbox": "lenzen@dkzr.de"}, "data_manager:hdh_id": {"foaf:givenName": "Peter", "foaf:mbox": "lenzen@dkzr.de"}}, "entity": {"subm:out1_arch": {}, "subm:out1_sub": {"qua:id": "c6b4fab6-d5b0-11e6-9fc8-080027f178b4", "qua:repo": "/home/stephan/tmp/Repos/submission_forms_repo/test", "qua:form_path": "/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.ipynb", "qua:review_comment": "", "qua:package_path": "/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.json", "qua:key_word": "", "qua:review_summary": "", "qua:timestamp": "2017-01-08 15:43:10.608675", "qua:ticket_url": "https://dm-rt.dkrz.de/Ticket/Display.html?id=", "qua:checks_done": "none", "qua:form_version": "", "qua:check_status": "consistency_checked", "qua:comment": "terms of use clarified", "qua:first_name": "", "qua:form_name": "test_ki_sk1", "qua:ticket_id": {"type": "xsd:int", "$": 22949}, "qua:subform_path": "/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.ipynb", "qua:report_ticket_subject": "", "qua:submission_method": "", "qua:email": "snkinder@freenet.de", "qua:responsible_person": "lenzen", "qua:keyword": "sk1", "qua:source_path": "/home/stephan/Repos/ENES-EUDAT/submission_forms/dkrz_forms/Templates/test_submission_form.ipynb", "qua:status": "fertig", "qua:package_name": "test_ki_sk1.json", "qua:commit_hash": "be98119c3f5dbb4c85cdad230f323e417155246c", "qua:submission_comment": "", "qua:last_name": "ki"}, "subm:out1_che": {"qua:ticket_id": "", "qua:timestamp_started": "", "qua:status": "checked", "qua:qa_tool_version": "dkrz_qa_v09", "qua:timestamp_finished": "", "qua:follow_up_ticket": "", "qua:qua_status": "", "qua:target_directory": "/work/KD0956/qa_results/cmip5/mpi-m/test", "qua:responsible_person": "hdh", "qua:comment": "on lizzard ", "qua:qua_comment": ""}, "subm:out1_rev": {}, "qua:empty": {}, "subm:out1_pub": {"qua:ticket_id": "", "qua:timestamp": "2016-05-20 18:34:28.934536", "qua:status": "published", "qua:search_string": "&model=cmip5&experiment=test", "qua:facet_string": "", "qua:publish_date": "2016-05-20 19", "qua:pid_collections": "", "qua:map_file": "host://path_to_mapfile", "qua:comment": "", "qua:responsible_person": "berger"}, "subm:out1_ing": {"qua:ticket_id": "", "qua:drs_file_pattern": "project:cmip5 | experiment:test| variables: tua,uav", "qua:status": "ingested", "qua:timestamp_finished": "", "qua:timestamp_started": "", "qua:drsdir_file_pattern": "", "qua:comment": " copying data from ... to ... using ...  time: about 2 hours, volume: about .. GB ", "qua:target_directory": "/work/kd0956/cmip5/ingest/cmip5/mpi-m/test", "qua:responsible_person": "lenzen"}}, "prefix": {"foaf": "http://xmlns.com/foaf/0.1/", "data_collection": "http://www.enes.org/enes_entity/file_collection", "enes_agent": "http://www.enes.org/enes_agent#", "enes_entity": "http://www.enes.org/enes_entitiy#", "qua": "http://enes.org/entities/ingest-workflow#", "data_manager": "http://www.enes.org/enes_agent/data_manager", "data_provider": "http://www.enes.org/enes_agent/data_provider", "subm": "http://www.enes.org/enes_entity/data_submsission"}, "activity": {"subm:review": {}, "subm:ingest": {}, "subm:submit": {}, "subm:check": {}, "subm:publish": {}, "subm:archive": {}}, "wasGeneratedBy": {"_:id5": {"prov:entity": "subm:out1_rev", "prov:activity": "subm:review"}, "_:id1": {"prov:entity": "subm:out1_sub", "prov:activity": "subm:submit"}, "_:id9": {"prov:entity": "subm:out1_ing", "prov:activity": "subm:ingest"}, "_:id21": {"prov:entity": "subm:out1_arch", "prov:activity": "subm:archive"}, "_:id17": {"prov:entity": "subm:out1_pub", "prov:activity": "subm:publish"}, "_:id13": {"prov:entity": "subm:out1_che", "prov:activity": "subm:check"}}}'

In [47]:
import json
ingest_prov_file = open('ingest_prov_1.json','w')

prov_data = d1.serialize()
prov_data_json = json.dumps(prov_data)

ingest_prov_file.write(prov_data)

ingest_prov_file.close()

#d1.wasAttributedTo(data_submission,'????')

Transform submission object to a provenance graph


In [18]:
#d1.get_records()
submission = d1.get_record('subm:out1_sub')[0]
review = d1.get_record('subm:out1_rev')[0]
ingest = d1.get_record('subm:out1_ing')[0]
check = d1.get_record('subm:out1_che')[0]
publication = d1.get_record('subm:out1_pub')[0]
lta = d1.get_record('subm:out1_arch')[0]

In [19]:
res = form_handler.prefix_dict(sf.sub,'sub',sf.sub.keys())
res['sub:status']="fertig"
print res
ing = form_handler.prefix_dict(sf.ing,'ing',sf.ing.keys())
qua = form_handler.prefix_dict(sf.qua,'qua',sf.qua.keys())
pub = form_handler.prefix_dict(sf.pub,'pub',sf.pub.keys())


{u'sub:ticket_url': u'https://dm-rt.dkrz.de/Ticket/Display.html?id=', u'sub:package_name': u'test_ki_sk1.json', u'sub:repo': u'/home/stephan/tmp/Repos/submission_forms_repo/test', u'sub:timestamp': u'2016-11-27 19:38:05.110823', u'sub:email': u'snkinder@freenet.de', u'sub:package_path': u'/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.json', u'sub:form_version': u'', u'sub:subform_path': u'/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.ipynb', u'sub:key_word': u'', u'sub:submission_method': u'', u'sub:last_name': u'ki', u'sub:id': u'a257480a-b4d0-11e6-8358-080027f178b4', u'sub:comment': u'terms of use clarified', u'sub:responsible_person': u'pl', u'sub:review_comment': u'', u'sub:first_name': u'', u'sub:review_summary': u'', u'sub:source_path': u'/home/stephan/Repos/ENES-EUDAT/submission_forms/dkrz_forms/Templates/test_submission_form.ipynb', u'sub:checks_done': u'none', u'sub:report_ticket_subject': u'', u'sub:keyword': u'sk1', u'sub:status': 'fertig', u'sub:commit_hash': u'e6d6633ff6f4d904e3a54af891f6fe7f9b742508', u'sub:form_path': u'/home/stephan/tmp/Repos/submission_forms_repo/test/test_ki_sk1.ipynb', u'sub:form_name': u'test_ki_sk1', u'sub:submission_comment': u'', u'sub:ticket_id': 22949, u'sub:check_status': u'consistency_checked'}

In [20]:
submission.add_attributes(res)
ingest.add_attributes(ing)
check.add_attributes(qua)
publication.add_attributes(pub)

In [29]:
che_act = d1.get_record('subm:check') 
tst = che_act[0]
test_dict = {'subm:test':'test'}
tst.add_attributes(test_dict)

In [32]:
print tst
tst.FORMAL_ATTRIBUTES
tst.


activity(subm:check, -, -, [subm:test="test"])
Out[32]:
(<QualifiedName: prov:startTime>, <QualifiedName: prov:endTime>)

In [ ]:
che_act = d1.get_record('subm:check') 
#tst.formal_attributes
#tst.FORMAL_ATTRIBUTES
tst.add_attributes({'foaf:name':'tst'})
print tst.attributes
#for i in tst:
 #   print i
#tst.insert([('subm:givenName','sk')])

In [ ]:
import sys
sys.path.append('/home/stephan/Repos/ENES-EUDAT/submission_forms')
from dkrz_forms import form_handler
sf,repo = form_handler.init_form("CORDEX")



init_dict = sf.__dict__ 
sub_form = form_handler.prefix(sf,'subm',sf.__dict__.keys())            

sub_dict = sub_form.__dict__

#init_state = d1.get_record('subm:empty')[0]
#init_state.add_attributes(init_dict)

sub_state = d1.get_record('subm:out1_sub')[0]
init_state.add_attributes(sub_dict)

In [ ]:
tst_dict = {'test1':'val1','test2':'val2'}
tst = form_handler.submission_form(tst_dict) 
print tst.__dict__

In [ ]:
print result.__dict__

In [ ]:
dict_from_class(sf)

In [ ]: