Background:
Integration aspects:
In [ ]:
# Client for the B2Find catalogue, addressed through its CKAN API (version 3 endpoint).
# pprint is imported for ad-hoc inspection; none of the cells shown here call it.
import ckanclient
from pprint import pprint
ckan = ckanclient.CkanClient('http://b2find.eudat.eu/api/3/')
In [ ]:
# Restrict the search to a few results (rows=6) for the purpose of this notebook.
# q selects entries tagged "IPCC"; d holds the response of the 'package_search' action.
q = 'tags:IPCC'
d = ckan.action('package_search', q=q, rows=6)
In [ ]:
# Show the fields of every search hit that matter for the collection hierarchy:
#   'title' - aggregation info for the data collection
#   'url'   - DOI of the data collection
#   'notes' - explains how to interpret the aggregation info string in 'title'
# (To dump all fields of a hit instead, loop over its keys and print each value.)
for hit in d['results']:
    title = hit['title']
    print(title)
    print(title.split())
    print(hit['url'])
    print(hit['notes'])
    print("----------------------------------------------------------------")
In the harvested B2Find metadata, an indication is given of how to derive the hierarchy information: "Entry name/title of data are specified according to the Data Reference Syntax (http://cmip-pcmdi.llnl.gov/cmip5/docs/cmip5_data_reference_syntax.pdf) as activity/product/institute/model/experiment/frequency/modeling realm/MIP table/ensemble member/version number/variable name/CMOR filename.nc"
In [ ]:
# Collection hierarchy encoded in a B2Find title (one Neo4j node per part).
# Full Data Reference Syntax path:
#   <activity>/<product>/<institute>/<model>/<experiment>/<frequency>/
#   <modeling realm>/<mip table>/<ensemble member>/
#   <version number>/<variable name>/<CMORfilename.nc>
# Example title:  cmip5    output1 LASG-CESS FGOALS-g2 historicalNat
# Facet per word: activity product institute model     experiment
pattern = ['activity', 'product', 'institute', 'model', 'experiment']


def parse_collection_info(info_string, pattern):
    """Map the whitespace-separated words of a title onto facet names.

    info_string -- title whose words are the facet values, in order
    pattern     -- facet names, in the same order as the words

    Returns a dict {facet name: word}. Pairing stops at the shorter of the
    two sequences, so surplus words or surplus facet names are dropped.
    """
    words = info_string.split()
    return {facet: word for facet, word in zip(pattern, words)}
# Turn the title of every search hit into its {facet: value} dictionary.
parsed_results = [
    parse_collection_info(hit['title'], pattern)
    for hit in d['results']
]
print(parsed_results)
The ESGF metadata harvesting and Neo4j graph generation are done in the script ENES-Neo4J-fill1.py. Each component of the collection hierarchy is assigned to a node connected by the "belongs_to" relationship, and each component has a property named "name" corresponding to the values extracted from the B2Find result records (see above). Additionally, each collection has a level attribute:
experiment(6) -- belongs_to --> model(7) -- belongs_to --> institute(8) -- belongs_to --> product(9) -- belongs_to --> activity(10)
The B2Find metadata aggregate all collection levels below 6; thus the level-6 node has to be identified in the Neo4j ESGF graph and related to the corresponding B2Find information.
In [ ]:
# py2neo connection to the Neo4j server on localhost:7474.
# NOTE(review): user name and password are hard-coded here - consider reading them from the environment.
from py2neo import authenticate, Node, Relationship, Graph
authenticate("localhost:7474", 'neo4j', 'prolog16')
graph = Graph("http://localhost:7474/db/data/")
# Shortcut to the graph's `cypher` attribute.
cypher = graph.cypher
In [ ]:
# Second client for the same Neo4j endpoint, this time through neo4jrestclient.
# NOTE(review): credentials are hard-coded; Q and gdb are not used in the cells shown here.
from neo4jrestclient.client import GraphDatabase
from neo4jrestclient.query import Q
gdb = GraphDatabase("http://localhost:7474/db/data/",username="neo4j",password="prolog16")
In [ ]:
# Load the `cypher` IPython extension; the %%cypher cell magic used in the next cell presumably comes from it.
%load_ext cypher
In [ ]:
%%cypher http://neo4j:prolog16@localhost:7474/db/data
// Return every pair of nodes joined by a relationship, regardless of its direction or type.
MATCH (a)-[]-(b) RETURN a, b
In [ ]:
# Load the `icypher` IPython extension.
%load_ext icypher
In [ ]:
# Install the diagmagic extension from the given URL.
# NOTE(review): %install_ext is not available in recent IPython releases - confirm the IPython version in use.
%install_ext https://bitbucket.org/vladf/ipython-diags/raw/default/diagmagic.py
In [ ]:
# Install the gvmagic extension from the given URL.
# NOTE(review): %install_ext is not available in recent IPython releases - confirm the IPython version in use.
%install_ext https://raw.github.com/cjdrake/ipython-magic/master/gvmagic.py
In [ ]:
# Load the gvmagic extension installed in the previous cell.
%load_ext gvmagic
In [ ]:
# Smoke test for the %dot magic: a tiny digraph with the edges a->b and a->c.
%dot digraph G { a -> b; a -> c }
In [ ]:
# Facet names from the top of the hierarchy (activity) down to the experiment.
pattern = ['activity', 'product', 'institute', 'model', 'experiment']
# nodes[0] is the root that every collection chain hangs off;
# add_collection() appends to both lists.
nodes = [Node("Collection", name="ENES-data", level=0)]
rels = []


def add_collection(collection_info):
    """Append the node chain for one parsed collection to `nodes` and `rels`.

    collection_info -- dict mapping the facet names in `pattern` to the values
                       parsed from a B2Find title, e.g.
                       {'activity': 'cmip5', 'product': 'output1', ...}

    For each facet in `pattern` order a "Collection" node named after the
    facet's value is created and linked with a "belongs_to" relationship to
    the node one step higher in the hierarchy; the first facet is linked to
    the root node nodes[0]. Processing stops at the first facet missing from
    collection_info. Returns None; the results are appended to the
    module-level lists.

    NOTE(review): `level` keeps the original 0-based facet index, so the
    first facet shares level 0 with the root, and the numbering differs from
    the 10 (activity) .. 6 (experiment) scheme described in the notebook
    text - confirm which numbering is wanted.
    NOTE(review): every call creates fresh nodes, so collections sharing an
    activity/product/... produce duplicate nodes - confirm whether they
    should be merged.
    """
    # Track the parent explicitly: `nodes` keeps growing across calls, so a
    # position computed from the facet index does not identify the parent.
    parent = nodes[0]
    for index, facet in enumerate(pattern):
        if facet not in collection_info:
            # A title with fewer words than facets yields a shorter dict.
            break
        new_node = Node("Collection", name=collection_info[facet], level=index)
        nodes.append(new_node)
        # "belongs_to" matches the relationship type used by the notebook
        # text and by the Cypher query in the last cell.
        rels.append(Relationship(new_node, "belongs_to", parent))
        parent = new_node
In [ ]:
# Install the idb extension from the given URL.
# NOTE(review): %install_ext is not available in recent IPython releases - confirm the IPython version in use.
%install_ext https://raw.githubusercontent.com/dongweiming/idb/master/idb.py
In [ ]:
%load_ext idb ## database interaction
In [1]:
# Draw a small sample graph with jgraph; the argument is a list of node-id pairs (presumably one pair per edge).
import jgraph
jgraph.draw([(1, 2), (2, 3), (3, 4), (4, 1), (4, 5), (5, 2)])
In [ ]:
# Feed every parsed B2Find record into the node and relationship lists.
for collection_info in parsed_results:
    add_collection(collection_info)
In [ ]:
# Inspect the nodes and relationships collected so far.
print(nodes)
print(rels)
In [ ]:
In [ ]:
// Find the experiment node (n1) by following its belongs_to chain up to the activity.
// The %experiment, %model, ... tokens are placeholders to be substituted before the query is run.
// Relationships are left anonymous: one variable cannot be bound to four different relationships.
MATCH (n1:Collection {name:%experiment})-[:belongs_to]->(n2:Collection {name:%model})-[:belongs_to]
->(n3:Collection {name:%institute})-[:belongs_to]->(n4:Collection {name:%product})-[:belongs_to]
->(n5:Collection {name:%activity})
RETURN n1