Test

Try to extract paths between classes in an RDF graph and count the number of occurrences of those paths.

Using DBpedia as an example: start with the DBpedia ontology 3.9 unzipped into a `datafiles` directory.

Phase 1: Generate

  1. Input: A given t-box
  2. Extract all properties and domain and range classes
  3. From the domain classes, generate all pairs of classes
  4. Find all paths between all pairs of classes (with a given depth)
  5. Output: a hash: pairs of classes --> paths between those classes

In [2]:
import rdflib
import itertools
import time

# Load the ontology (T-Box) that the class/property graph is extracted from.
g = rdflib.Graph()
g.parse("./datafiles/dbpedia_3.9.owl")

# Debugging aid: paste this line into the query below to restrict the domains.
#FILTER (?domain = <http://dbpedia.org/ontology/Island> || ?domain = <http://dbpedia.org/ontology/Place>)

# Every property that declares both a domain and a range, with those classes.
qres = g.query(
               """
               PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
               select ?domain ?prop ?range where {
                   ?prop rdfs:domain ?domain .
                   ?prop rdfs:range ?range .
               }

               """)

# prop_to_class: property URI     -> list of range class URIs
# class_to_prop: domain class URI -> list of property URIs
prop_to_class = {}
class_to_prop = {}

for row in qres:
    prop = str(row['prop'])
    domain = str(row['domain'])
    # Deliberately not named `range`: a module-level `range` would shadow the
    # builtin for every later cell run in the same kernel.
    range_class = str(row['range'])

    # setdefault creates the list on first sight of a key, then we append.
    prop_to_class.setdefault(prop, []).append(range_class)
    class_to_prop.setdefault(domain, []).append(prop)

    
        
        
#print class_to_prop
#print "-----"
#print prop_to_class

#All path algorithm
# modified from http://www.python.org/doc/essays/graphs/
# supports property labels and path length max

def find_all_paths(start, end, path=[], prop="", len=0, max_len=3):
    """Return every labelled path from class `start` to class `end`.

    Edges are read from the module-level dicts `class_to_prop`
    (class -> properties leaving it) and `prop_to_class`
    (property -> classes it leads to).

    Parameters:
        start:   class to search from.
        end:     class to reach.
        path:    path walked so far, alternating class, property, class, ...
                 The default list is never mutated (a new list is built on
                 every call), so sharing it between calls is safe.
        prop:    property used to arrive at `start`; "" on the initial call.
        len:     number of properties followed so far. The name shadows the
                 builtin inside this function and is kept only because the
                 recursive call passes it by keyword.
        max_len: maximum number of properties a path may contain (default 3,
                 the limit that was previously hard-coded).

    Returns:
        A list of paths, each a list [class, prop, class, ..., end].
        `start == end` yields the single trivial path [[start]].
        An empty list means no path exists within `max_len` hops.
    """
    if prop == "":
        path = path + [start]
    else:
        path = path + [prop] + [start]

    if start == end:
        return [path]
    # Depth limit reached without arriving at `end`: abandon this branch.
    if len >= max_len:
        return []
    # `in` replaces dict.has_key(), which no longer exists in Python 3.
    if start not in class_to_prop:
        return []

    paths = []
    for prop in class_to_prop[start]:
        for node in prop_to_class[prop]:
            # Skip anything already on the path so the walk cannot loop.
            if node not in path:
                paths.extend(
                    find_all_paths(node, end, path, prop,
                                   len=len + 1, max_len=max_len))

    return paths
    
#start = "http://dbpedia.org/ontology/Island"
#end = "http://dbpedia.org/ontology/Place"

    
#find_all_paths(start,end)

# Number of classes that are the domain of at least one property.
# Single-argument print(...) produces the same output on Python 2.7 and 3.x.
print("Total number of classes: " + str(len(class_to_prop)))

# Wall-clock timing; time.clock() was removed in Python 3.8.
st = time.time()

# Each unordered pair of domain classes once, including (c, c).
# NOTE(review): find_all_paths follows properties from `start` to `end` only,
# so for a pair (a, b) paths leading from b back to a are not searched here.
# Confirm that this is intended.
combos = itertools.combinations_with_replacement(class_to_prop.keys(), 2)

# "<start> <end>" -> list of paths found from start to end
path_results = {}

numberOfCombos = 0
for start, end in combos:
    numberOfCombos += 1
    res = find_all_paths(start, end)
    # Only pairs with at least one path are recorded.
    if res:
        path_results[start + " " + end] = res

elapsed = time.time() - st
print("Time to find paths between " + str(numberOfCombos) + " combinations " + str(elapsed))


Total number of classes: 227
Time to find paths between 25878 combinations 40.50793
/Users/pgroth/.virtualenvs/ipy/lib/python2.7/site-packages/rdflib/plugin.py:108: UserWarning: Module readline was already imported from /Users/pgroth/.virtualenvs/ipy/lib/python2.7/lib-dynload/readline.so, but /Users/pgroth/.virtualenvs/ipy/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import iter_entry_points

Phase 2: Test


In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
# The return format is the same for every query, so it is set once up front.
sparql.setReturnFormat(JSON)

# Fixed opening of every count query; the triple patterns are appended per path.
query_header = (
    "\n"
    "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
    "select (count(*) as ?ct) where {\n"
)

# `with` closes test.csv even when a remote query raises part-way through.
with open("test.csv", 'w') as out:
    for res in path_results:
        for path in path_results[res]:
            # A one-element path is the trivial start == end case: no property
            # to count, so nothing is queried for it.
            if len(path) <= 1:
                continue

            patterns = []
            for index, elem in enumerate(path):
                if index % 2 == 0:
                    # Even positions hold classes: type the variable at this position.
                    patterns.append(" ?v" + str(index) + " rdf:type <" + elem + "> . ")
                else:
                    # Odd positions hold properties: link the two neighbouring variables.
                    patterns.append(" ?v" + str(index - 1) + " <" + elem + "> ?v" + str(index + 1) + " . ")
            query = query_header + "".join(patterns) + " }"

            sparql.setQuery(query)
            spres = sparql.query().convert()
            val = spres["results"]["bindings"][0]['ct']['value']

            # Line format: <count>,<elem>,<elem>,...,  (trailing comma kept as before)
            stringpath = "".join(elem + ',' for elem in path)
            out.write(str(val) + ',' + stringpath + '\n')

In [ ]: