In [1]:
from rdflib import *

manual_annotation_url = 'https://raw.githubusercontent.com/DataONEorg/semantic-query/master/lib/test_corpus_F_dev/manual_annotations.tsv.txt'
data_url = 'https://raw.githubusercontent.com/DataONEorg/semantic-query/master/lib/test_corpus_E_id_list.txt'
dataset_service_url = 'https://cn.dataone.org/cn/v1/query/solr/?wt=json&fl=title,abstract,attribute&q=identifier:"%s"'
service_url = 'http://localhost:8080/annotate/annotate/'

oboe = Namespace('http://ecoinformatics.org/oboe/oboe.1.1/oboe-core.owl#')
cmo = Namespace('http://purl.org/twc/ontologies/cmo.owl#')
skos = Namespace('http://www.w3.org/2004/02/skos/core#')
_prefix = Namespace('http://purl.dataone.org/odo/ECSTRA_')

In [4]:
import csv, urllib2, json
import pandas as pd
from rdflib.extras.infixowl import *
import collections
import base64
import random
import datetime
import requests
from io import StringIO
from rdflib.compare import to_isomorphic

def get_manual_annotations():
    resp = requests.get(manual_annotation_url)
    annotations = list(csv.DictReader(StringIO(resp.text, newline=None), delimiter="\t"))
    resp.close()
    result = collections.defaultdict(set)
    for annotation in annotations:
        if len(annotation['class_id_int'].strip()) == 0:
            continue
        package = annotation['pkg_id']
        uri = 'http://purl.dataone.org/odo/ECSO_%08d'%int(annotation['class_id_int'].strip())
        result[package].add(URIRef(uri))
    return result
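# get_manual_annotations() returns a dict keyed by data package identifier,
# mapping each package to the set of ECSO class URIRefs (of the form
# http://purl.dataone.org/odo/ECSO_XXXXXXXX) assigned by the human annotators.
# This is the gold standard that the automated annotations are scored against below.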

#datasets = urllib2.urlopen(data_url).read().split("\n")[1:]

def get_dataset_columns(identifier):
    url = dataset_service_url%identifier
    result = json.loads(urllib2.urlopen(url).read())['response']['docs'][0]['attribute']
    return result

nt_file = '/Users/jimmccusker/src/linkipedia/dataone-index/NTriple/merged.nt'

# Load the merged N-Triples ontology index used below for rdfs:subClassOf
# lookups in by_super_class().
graph = ConjunctiveGraph()
graph.parse(open(nt_file), format="nt")


def extract_mentions(text, context):
    '''Query the local annotation service and accumulate a score per candidate class URI.'''
    urls = collections.defaultdict(float)
    params = {
        'numResult': numResult,
        'minScore': minScore,
        'contentWeight': contentWeight,
        'relationWeight': relationWeight,
    }
    data = json.dumps({
        'query': text,
        'context': context
    })
    response = requests.post(service_url, params=params, data=data,
                             headers={"Content-Type": "application/json"}).json()
    for r in response['results']:
        for annotation in r['annotations']:
            urls[annotation['url']] += float(annotation['score'])
    # Rank candidate classes from highest to lowest accumulated score.
    urls = [URIRef(url) for url, score in sorted(urls.items(), key=lambda x: x[1], reverse=True)]
    return urls
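# Hypothetical usage sketch (the column name below is made up; it assumes the local
# annotate service at service_url is running and returns the
# {'results': [{'annotations': [{'url': ..., 'score': ...}, ...]}, ...]} shape parsed above):
#
#   extract_mentions('dissolved oxygen concentration', 'dissolved oxygen concentration')
#
# The result is a list of candidate ontology class URIRefs, ranked by accumulated score.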

import ipywidgets 
from IPython.display import display

def by_super_class(resources):
    result = collections.defaultdict(list)
    for r in resources:
        for s in graph.transitive_objects(URIRef(r),RDFS.subClassOf):
            result[s].append(r)
    return result
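# by_super_class() groups the candidate classes returned by extract_mentions() under
# every superclass reachable via rdfs:subClassOf in the merged ontology graph
# (rdflib's transitive_objects includes the class itself), so looking up
# by_super[oboe.MeasurementType] below keeps only candidates that are measurement types.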

def get_ir_tuples(annotations):
    result = set()
    for dataset, classes in annotations.items():
        for c in classes:
            result.add((dataset, c))
    return result

f = ipywidgets.FloatProgress(min=0, max=1)
ftext = ipywidgets.Text(value='0', description='%')
display(f)
display(ftext)

def extract(datasets):
    i = 0
    automated = collections.defaultdict(set)
    for dataset in datasets:
        try:
            columns = get_dataset_columns(dataset)
        except:
            print "Problem processing the dataset '", dataset, "'."
            continue
        for column in columns:
            #try:
            resources = list(extract_mentions(column, column))
            by_super = by_super_class(resources)
            classes = by_super[oboe.MeasurementType]
            classes = classes[:topHits]
            automated[dataset] |= set(classes)
            #except:
            #    print 'Error processing "', column, '".'
        i += 1
        ftext.value = str(100 * float(i)/len(datasets))
        f.value = float(i)/len(datasets)
    return automated
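# extract() mirrors the shape of get_manual_annotations(): a dict mapping each dataset
# identifier to the set of measurement-type classes proposed by the annotation service
# for its attribute (column) names, capped at topHits candidates per column.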

In [5]:
numResult = 20
minScore = 1
topHits = 20
contentWeight = 6
relationWeight = 6

scores = pd.DataFrame(columns=['precision','recall','fmeasure',
                               'numResult','minScore','topHits',
                               'contentWeight','relationWeight'])
manual_annotations = get_manual_annotations()
manual_tuples = get_ir_tuples(manual_annotations)

def run():
    automated_annotations = extract(manual_annotations.keys())
    automated_tuples = get_ir_tuples(automated_annotations)
    hits = manual_tuples & automated_tuples
    misses = manual_tuples - automated_tuples

    precision = float(len(hits))/len(automated_tuples)
    recall = float(len(hits))/len(manual_tuples)
    fmeasure = 2 * (precision * recall)/(precision + recall)
    row = dict(precision=precision, recall=recall, fmeasure=fmeasure, 
               numResult=numResult, minScore=minScore, topHits=topHits, 
               contentWeight=contentWeight, relationWeight=relationWeight)
    return row
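# Standard IR evaluation over (dataset, class) tuples:
#   precision = |manual & automated| / |automated|
#   recall    = |manual & automated| / |manual|
#   F-measure = harmonic mean of precision and recall.
# Illustrative (made-up) numbers: 50 shared tuples out of 200 automated and 100 manual
# ones give precision 0.25, recall 0.50, and F-measure 2*(0.25*0.5)/(0.25+0.5) = 1/3.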

for cw in [5]:
    contentWeight = cw
    for rw in [5]:
        relationWeight = rw
        for ms in [3]:
            minScore = ms
            row = run()
            print '\t'.join([str(row[h]) for h in scores.columns])
            scores = scores.append(row,
                                   ignore_index=True)

scores


0.170085979399	0.361432706223	0.231316931983	20	3	20	5	5
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-c6d0d92a7aa1> in <module>()
     33             row = run()
     34             print '\t'.join([str(row[h]) for h in scores.columns])
---> 35             scores = scores.append(dict(precision=precision, recall=recall, fmeasure=fmeasure, 
     36                                         numResult=numResult, minScore=minScore, topHits=topHits,
     37                                         contentWeight=contentWeight, relationWeight=relationWeight),

NameError: name 'precision' is not defined

In [26]:
scores


Out[26]:
   precision   recall  fmeasure  numResult  minScore  topHits
0   0.198395  0.20568  0.201972         20         2        1

In [24]:
len(misses)


Out[24]:
4956

In [23]:
len(hits)


Out[23]:
572

In [ ]: