In [1]:
from rdflib import *
# Remote corpus resources: the manually-curated gold-standard annotations (TSV)
# and the list of dataset identifiers in the evaluation corpus.
manual_annotation_url = 'https://raw.githubusercontent.com/DataONEorg/semantic-query/master/lib/test_corpus_F_dev/manual_annotations.tsv.txt'
data_url = 'https://raw.githubusercontent.com/DataONEorg/semantic-query/master/lib/test_corpus_E_id_list.txt'
# DataONE Solr query endpoint; %s is filled with a dataset identifier.
dataset_service_url = 'https://cn.dataone.org/cn/v1/query/solr/?wt=json&fl=title,abstract,attribute&q=identifier:"%s"'
# Local Linkipedia-style annotation service used by extract_mentions().
service_url = 'http://localhost:8080/annotate/annotate/'
# RDF namespaces used throughout the notebook.
oboe = Namespace('http://ecoinformatics.org/oboe/oboe.1.1/oboe-core.owl#')
cmo = Namespace('http://purl.org/twc/ontologies/cmo.owl#')
skos = Namespace('http://www.w3.org/2004/02/skos/core#')
_prefix = Namespace('http://purl.dataone.org/odo/ECSTRA_')
In [4]:
import csv, urllib, json, urllib2
import pandas as pd
from rdflib.extras.infixowl import *
import collections
import json
import base64
import random
import datetime
import requests
from io import StringIO
from rdflib.compare import to_isomorphic
def get_manual_annotations():
    """Fetch the manual-annotation TSV and map each package id to its ECSO classes.

    Returns:
        collections.defaultdict(set): pkg_id -> set of rdflib.URIRef ECSO
        class URIs. Rows whose 'class_id_int' field is blank are skipped.
    """
    resp = requests.get(manual_annotation_url)
    reader = csv.DictReader(StringIO(resp.text, newline=None), delimiter="\t")
    rows = list(reader)
    resp.close()
    by_package = collections.defaultdict(set)
    for row in rows:
        class_id = row['class_id_int'].strip()
        if not class_id:
            continue
        uri = URIRef('http://purl.dataone.org/odo/ECSO_%08d' % int(class_id))
        by_package[row['pkg_id']].add(uri)
    return by_package
#datasets = urllib2.urlopen(data_url).read().split("\n")[1:]
def get_dataset_columns(identifier):
    """Query the DataONE Solr service for one dataset and return its
    'attribute' list (the column descriptions of the first matching doc).

    Raises KeyError/IndexError if the response has no matching document.
    """
    query_url = dataset_service_url % identifier
    payload = json.loads(urllib2.urlopen(query_url).read())
    first_doc = payload['response']['docs'][0]
    return first_doc['attribute']
# NOTE(review): hardcoded absolute local path — breaks for anyone else; make
# this a configurable constant near the top of the notebook.
nt_file = '/Users/jimmccusker/src/linkipedia/dataone-index/NTriple/merged.nt'
from rdflib import *  # NOTE(review): duplicate wildcard import (already imported above).
graph = ConjunctiveGraph()
# Load the merged N-Triples ontology/index dump into memory; used by
# by_super_class() for rdfs:subClassOf traversal.
graph.load(open(nt_file),format="nt")
def extract_mentions(text, context):
    """POST text to the annotation service and return candidate class URIs.

    Scores for the same URL appearing in several results are summed, and the
    URIs are returned best-first (highest aggregate score first).

    Reads module-level globals: service_url, numResult, minScore,
    contentWeight, relationWeight (defined in the parameter cell below).

    Args:
        text: the query string to annotate (here, a column description).
        context: additional context passed to the service.
    Returns:
        list of rdflib.URIRef, sorted by descending aggregate score.
    """
    url_scores = collections.defaultdict(float)
    params = {
        'numResult': numResult,
        'minScore': minScore,
        'contentWeight': contentWeight,
        'relationWeight': relationWeight,
    }
    data = json.dumps({
        'query': text,
        'context': context
    })
    # BUG FIX: the header name must be "Content-Type" (was "ContentType"),
    # otherwise the service does not see the body as JSON.
    response = requests.post(service_url, params=params, data=data,
                             headers={"Content-Type": "application/json"}).json()
    for result in response['results']:
        for annotation in result['annotations']:
            url_scores[annotation['url']] += float(annotation['score'])
    # BUG FIX: sort descending so the highest-scoring URIs come first; the
    # caller truncates with [:topHits] and expects the best hits, but the
    # original ascending sort kept the worst ones.
    ranked = sorted(url_scores.items(), key=lambda item: item[1], reverse=True)
    return [URIRef(url) for url, score in ranked]
import ipywidgets
from IPython.display import display
def by_super_class(resources):
    """Group resources under each of their (transitive) rdfs:subClassOf ancestors.

    Uses the module-level `graph`. Note transitive_objects yields the class
    itself as well, so every resource is also grouped under itself.

    Returns:
        collections.defaultdict(list): superclass URIRef -> list of input
        resources that fall under it.
    """
    grouped = collections.defaultdict(list)
    for resource in resources:
        ancestors = graph.transitive_objects(URIRef(resource), RDFS.subClassOf)
        for ancestor in ancestors:
            grouped[ancestor].append(resource)
    return grouped
def get_ir_tuples(annotations):
    """Flatten a {dataset: classes} mapping into a set of (dataset, class)
    information-retrieval tuples for precision/recall computation."""
    return {(dataset, cls)
            for dataset, classes in annotations.items()
            for cls in classes}
# Progress bar (0..1) and percent readout, updated by extract() as it walks
# the dataset list.
f = ipywidgets.FloatProgress(min=0, max=1)
ftext = ipywidgets.Text(value='0', description='%')
display(f)
display(ftext)
def extract(datasets):
i = 0
automated = collections.defaultdict(set)
for dataset in datasets:
try:
columns = get_dataset_columns(dataset)
except:
print "Problem processing the dataset '", dataset, "'."
continue
for column in columns:
#try:
resources = list(extract_mentions(column, column))
by_super = by_super_class(resources)
classes = by_super[oboe.MeasurementType]
classes = classes[:topHits]
automated[dataset] |= set(classes)
#except:
# print 'Error processing "', column, '".'
i += 1
ftext.value = str(100 * float(i)/len(datasets))
f.value = float(i)/len(datasets)
return automated
In [5]:
# Annotation-service tuning parameters; read as globals by extract_mentions()
# and extract(), and overwritten by the grid-search loop below.
numResult = 20
minScore = 1
topHits = 20
contentWeight = 6
relationWeight = 6
# One row per parameter combination tried: IR metrics plus the settings used.
scores = pd.DataFrame(columns=['precision','recall','fmeasure',
                               'numResult','minScore','topHits',
                               'contentWeight','relationWeight'])
# Gold-standard annotations, fetched once and reused for every run().
manual_annotations = get_manual_annotations()
manual_tuples = get_ir_tuples(manual_annotations)
def run():
    """Execute one extraction pass with the current global parameters and
    score it against the manual gold standard.

    Reads module-level globals: manual_annotations, manual_tuples, and the
    parameter settings (numResult, minScore, topHits, contentWeight,
    relationWeight).

    Returns:
        dict: precision, recall, fmeasure, plus the parameter settings —
        one row for the `scores` DataFrame.
    """
    automated_annotations = extract(manual_annotations.keys())
    automated_tuples = get_ir_tuples(automated_annotations)
    hits = manual_tuples & automated_tuples
    # BUG FIX: guard against ZeroDivisionError when the service returns
    # nothing (empty automated_tuples) or when precision + recall == 0.
    precision = float(len(hits)) / len(automated_tuples) if automated_tuples else 0.0
    recall = float(len(hits)) / len(manual_tuples) if manual_tuples else 0.0
    denom = precision + recall
    fmeasure = 2 * (precision * recall) / denom if denom else 0.0
    row = dict(precision=precision, recall=recall, fmeasure=fmeasure,
               numResult=numResult, minScore=minScore, topHits=topHits,
               contentWeight=contentWeight, relationWeight=relationWeight)
    return row
# Grid search over service parameters (each list currently has one value, so
# this performs a single run; widen the lists to sweep).
for cw in [5]:
    contentWeight = cw
    for rw in [5]:
        relationWeight = rw
        for ms in [3]:
            minScore = ms
            row = run()
            print '\t'.join([str(row[h]) for h in scores.columns])
            # NOTE(review): DataFrame.append was removed in pandas >= 2.0;
            # when upgrading, switch to
            # scores = pd.concat([scores, pd.DataFrame([row])], ignore_index=True).
            scores = scores.append(row,
                                   ignore_index=True)
scores
In [26]:
scores  # Display the accumulated results table via rich repr.
Out[26]:
In [24]:
len(misses)  # NOTE(review): `misses` is a local of run(); this cell relies on hidden kernel state and will NameError on Restart & Run All.
Out[24]:
In [23]:
len(hits)  # NOTE(review): `hits` is a local of run(); this cell relies on hidden kernel state and will NameError on Restart & Run All.
Out[23]:
In [ ]: