Test
Try to extract paths between classes in an RDF graph and count the number of occurrences of each of those paths.
Using DBpedia as an example. Start with the DBpedia ontology 3.9, unzipped into a datafiles directory.
In [2]:
import rdflib
import itertools
import time
def build_schema_maps(rows):
    """Index (domain, property, range) rows into two lookup dictionaries.

    Parameters
    ----------
    rows : iterable
        Rows that support lookup by the keys 'prop', 'domain' and 'range'
        (the variables selected by the query below).

    Returns
    -------
    tuple of (dict, dict)
        ``(prop_to_class, class_to_prop)``: the first maps a property URI
        string to the list of its range class URI strings, the second maps
        a domain class URI string to the list of property URI strings
        declared on it. Entries are appended in row order; duplicates are
        kept.
    """
    prop_map = {}
    class_map = {}
    for row in rows:
        prop_uri = str(row['prop'])
        domain_uri = str(row['domain'])
        # Deliberately not called `range`: that would shadow the builtin
        # for every cell that runs afterwards.
        range_uri = str(row['range'])
        prop_map.setdefault(prop_uri, []).append(range_uri)
        class_map.setdefault(domain_uri, []).append(prop_uri)
    return prop_map, class_map


g = rdflib.Graph()
g.parse("./datafiles/dbpedia_3.9.owl")

# To experiment on a small subset, add a filter such as the following inside
# the where clause of the query:
#FILTER (?domain = <http://dbpedia.org/ontology/Island> || ?domain = <http://dbpedia.org/ontology/Place>)
qres = g.query(
"""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select ?domain ?prop ?range where {
?prop rdfs:domain ?domain .
?prop rdfs:range ?range .
}
""")

# Plain dicts (not defaultdict) so that later lookups of unknown keys do not
# silently create empty entries.
prop_to_class, class_to_prop = build_schema_maps(qres)
# All-paths search, modified from http://www.python.org/doc/essays/graphs/
# Extended to record the property label of every hop and to cap path length.
def find_all_paths(start, end, path=[], prop="", len=0, max_len=3):
    """Return every property-labelled path from class `start` to class `end`.

    The search follows the module-level maps ``class_to_prop`` (class ->
    properties) and ``prop_to_class`` (property -> classes) depth first.

    Parameters
    ----------
    start : str
        Class to start (or continue) the search from.
    end : str
        Class the path has to reach.
    path : list, optional
        Path accumulated so far. It is never mutated; a new list is built on
        every call, so the shared default list stays empty.
    prop : str, optional
        Property that was followed to reach `start`; the empty string means
        `start` is appended without a preceding property.
    len : int, optional
        Number of hops already taken. (The name shadows the builtin inside
        this function only; it is kept for compatibility with callers.)
    max_len : int, optional
        Maximum number of hops a path may contain (default 3).

    Returns
    -------
    list of list
        Each path alternates class, property, class, ... and begins with
        whatever `path` already held. Empty when `end` is unreachable within
        `max_len` hops. A class is never visited twice on the same path.
    """
    if prop == "":
        path = path + [start]
    else:
        path = path + [prop, start]

    if start == end:
        return [path]
    # `>=` instead of `==` so that a caller-supplied depth above the cap still
    # terminates instead of searching without a limit.
    if len >= max_len:
        return []
    # `in` replaces dict.has_key(), which no longer exists in Python 3.
    if start not in class_to_prop:
        return []

    paths = []
    for next_prop in class_to_prop[start]:
        for node in prop_to_class[next_prop]:
            if node not in path:
                paths.extend(
                    find_all_paths(node, end, path, next_prop,
                                   len=len + 1, max_len=max_len)
                )
    return paths
# Search for paths between every pair of classes that declare at least one
# property, and time the whole sweep.
#
# Example of a single lookup:
#   find_all_paths("http://dbpedia.org/ontology/Island",
#                  "http://dbpedia.org/ontology/Place")
print("Total number of classes: " + str(len(class_to_prop)))

# time.clock() was removed in Python 3.8; time.time() exists everywhere.
st = time.time()

# NOTE(review): combinations_with_replacement yields each unordered pair only
# once, so for a pair (a, b) only paths a -> b are searched, never b -> a.
# Confirm this is intended, since find_all_paths follows properties in one
# direction only.
combos = itertools.combinations_with_replacement(class_to_prop.keys(), 2)

# Maps "<start> <end>" to the non-empty list of paths found for that pair.
path_results = {}
numberOfCombos = 0
for start, end in combos:
    numberOfCombos += 1
    res = find_all_paths(start, end)
    if res:
        path_results[start + " " + end] = res

elapsed = time.time() - st
print("Time to find paths between " + str(numberOfCombos) + " combinations " + str(elapsed))
In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON
# Opening of every count query; the graph pattern is appended per path.
COUNT_QUERY_HEADER = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
select (count(*) as ?ct) where {
'''


def build_count_query(path):
    """Build a SPARQL query counting the matches of one class/property path.

    Parameters
    ----------
    path : list of str
        URIs in which even positions are treated as classes and odd
        positions as the property linking the neighbouring positions.

    Returns
    -------
    str
        A ``select (count(*) as ?ct)`` query. The element at even index i
        adds ``?v<i> rdf:type <class>``; the element at odd index i adds
        ``?v<i-1> <property> ?v<i+1>``.
    """
    clauses = []
    for index, elem in enumerate(path):
        if index % 2 == 0:
            clauses.append(" ?v" + str(index) + " rdf:type <" + elem + "> . ")
        else:
            clauses.append(" ?v" + str(index - 1) + " <" + elem + "> ?v" + str(index + 1) + " . ")
    return COUNT_QUERY_HEADER + "".join(clauses) + " }"


sparql = SPARQLWrapper("http://dbpedia.org/sparql")
# The return format does not depend on the query, so set it once.
sparql.setReturnFormat(JSON)

# `with` guarantees the file is flushed and closed even if a query raises
# part-way through the run.
with open("test.csv", 'w') as out:
    for res in path_results:
        for path in path_results[res]:
            # A single-element path has no property to count; skip it.
            if len(path) <= 1:
                continue
            sparql.setQuery(build_count_query(path))
            spres = sparql.query().convert()
            val = spres["results"]["bindings"][0]['ct']['value']
            # One line per path: the count, then every path element, each
            # followed by a comma (the trailing comma is kept on purpose so
            # the file format stays unchanged).
            stringpath = "".join(elem + ',' for elem in path)
            out.write(str(val) + ',' + stringpath + '\n')
In [ ]: