In [ ]:
from __future__ import division
import codecs
import pickle
import networkx as nx
from collections import Counter
# Figure defaults for the whole notebook. `rcParams` is a matplotlib global
# brought in by the notebook's %pylab/pylab setup (not imported in this file).
rcParams['figure.figsize'] = (12.0, 10.0)
rcParams['font.family'] = 'Times New Roman'
In [ ]:
from os.path import abspath

# Workspace root: the parent of the current working directory
# (everything up to, but excluding, the last path component).
_cwd_parts = abspath('.').split('/')
workspace = "/".join(_cwd_parts[:-1])
Note: make sure that `workspace` points to the root directory of openie_eval before running the cells below.
In [ ]:
# Project modules; `reload` (a Python 2 builtin) picks up local edits to the
# modules without restarting the notebook kernel.
from openie_eval.openie_eval import semantic_parsing as sp
from openie_eval.openie_eval import ontologization
reload(sp)
reload(ontologization)
# WordNet lemmatizer, used below to normalize relation verbs (pos='v').
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
In [ ]:
keyword = 'carnatic_music'

# Ground-truth Wikipedia page titles for the current keyword, lowercased.
_gt_file = workspace + '/data/ground-truth/'+keyword+'_pages.txt'
wiki_entities = [line.strip().lower()
                 for line in codecs.open(_gt_file, encoding='utf-8').readlines()]

# The extraction systems compared throughout, with display labels and colors.
methods = ['reverb', 'openie', 'semantic-parsing']
labels = {'reverb': 'ReVerb', 'openie': 'OpenIE 4.0', 'semantic-parsing': 'Sem. Parsing'}
colors = ['#990033', '#006600', '#330066']

# File-name suffixes selecting which relation dumps to load
# (coreference-resolved and filtered variants).
#coref_suffix = ''
coref_suffix = '-coref'
#filtered_suffix = ''
filtered_suffix = '-filtered'
In [ ]:
# Entity-identification coverage per method: load each method's relation
# triples, lowercase the arguments and lemmatize the relation verb, then
# compare the set of subject entities (arg1) against the Wikipedia pages.
print len(wiki_entities)
for method in methods:
    relations = pickle.load(file(workspace + '/data/'+method+'/'+keyword+'/relations'+coref_suffix+filtered_suffix+'.pickle'))
    # Normalize each triple to [arg1, lemmatized relation, arg2], lowercased.
    relations = [[i['arg1'].lower(), lemmatizer.lemmatize(i['rel'].lower(), pos='v'), i['arg2'].lower()] for i in relations]
    candidate_entities = [i[0] for i in relations]
    overlap = set(candidate_entities).intersection(wiki_entities)
    residual = set(candidate_entities)-set(wiki_entities)
    # method, |overlap|, |residual|, overlap as a fraction of the ground
    # truth, residual as a fraction of the distinct candidates.
    print method, len(overlap), len(residual), round(len(overlap)/len(wiki_entities), 2), round(len(residual)/len(set(candidate_entities)), 2)
In [ ]:
#carnatic
# Hand-written rules: for each target concept, the head terms that signal
# membership (e.g. "X is a raga" -> carnatic_ragas).
class_terms = {}
class_terms['carnatic_ragas'] = ['raga', 'raaga', 'scale']
class_terms['carnatic_singers'] = ['vocalist', 'singer']
class_terms['carnatic_composers'] = ['composer', 'poet']
class_terms['carnatic_instrumentalists'] = ['instrumentalist', 'player', 'violonist']
class_terms['carnatic_compositions'] = ['composition', 'song']
# Musicians = singers + composers + instrumentalists, plus the generic
# 'artist'. A flat list comprehension replaces numpy's concatenate(): it
# needs no pylab global and keeps the terms as ordinary strings.
class_terms['carnatic_musicians'] = [term
                                     for key in ['carnatic_singers', 'carnatic_composers', 'carnatic_instrumentalists']
                                     for term in class_terms[key]]
class_terms['carnatic_musicians'].append('artist')
out_file = workspace + '/data/results/qualitative/entity-identification/rule-based/carnatic_music/rules.pickle'
# Binary mode + guaranteed close: pickle files should always be written 'wb'
# (the original used the deprecated file(..., 'w') and never closed it).
with open(out_file, 'wb') as rules_fp:
    pickle.dump(class_terms, rules_fp)
In [ ]:
#hindustani
# Hand-written rules for the Hindustani concepts (note the transliterated
# u'rāga' variant among the raga terms).
class_terms = {}
class_terms['hindustani_ragas'] = ['raga', 'raaga', 'raag', 'rag', 'scale', u'rāga']
class_terms['hindustani_singers'] = ['vocalist', 'singer']
class_terms['hindustani_composers'] = ['composer', 'poet']
class_terms['hindustani_instrumentalists'] = ['instrumentalist', 'player', 'violonist']
#class_terms['carnatic_compositions'] = ['composition', 'song']
# Musicians = singers + composers + instrumentalists, plus the generic
# 'artist'. A flat list comprehension replaces numpy's concatenate(): it
# needs no pylab global and keeps the terms as ordinary strings.
class_terms['hindustani_musicians'] = [term
                                       for key in ['hindustani_singers', 'hindustani_composers', 'hindustani_instrumentalists']
                                       for term in class_terms[key]]
class_terms['hindustani_musicians'].append('artist')
out_file = workspace + '/data/results/qualitative/entity-identification/rule-based/hindustani_music/rules.pickle'
# Binary mode + guaranteed close: pickle files should always be written 'wb'
# (the original used the deprecated file(..., 'w') and never closed it).
with open(out_file, 'wb') as rules_fp:
    pickle.dump(class_terms, rules_fp)
In [ ]:
keyword = 'hindustani_music'
coverage = {}
labelled_class_instances = {}
# Load the hand-written rules once. The original loaded the very same pickle
# twice, into `rules` and `class_terms`; both names are kept for the cells
# below, bound to a single object.
rules_path = workspace + '/data/results/qualitative/entity-identification/rule-based/'+keyword+'/rules.pickle'
with open(rules_path, 'rb') as rules_fp:
    rules = pickle.load(rules_fp)
class_terms = rules
groundtruth = ontologization.load_groundtruth(keyword, rules.keys())
for method in methods:
    # Relation triples for this method (coref-resolved, filtered dump).
    with open(workspace + '/data/'+method+'/'+keyword+'/relations'+coref_suffix+filtered_suffix+'.pickle', 'rb') as rel_fp:
        relations = pickle.load(rel_fp)
    # Normalize each triple to [arg1, lemmatized relation, arg2], lowercased.
    relations = [[i['arg1'].lower(), lemmatizer.lemmatize(i['rel'].lower(), pos='v'), i['arg2'].lower()] for i in relations]
    class_instances = ontologization.class_instances_by_rules(relations, rules)
    res = ontologization.analyze_coverage(class_instances, groundtruth)
    coverage[method] = res['coverage']
    labelled_class_instances[method] = res['labelled_class_instances']
In [ ]:
def label_numbers(rects, numbers):
    """Annotate each bar with its count, just below the bar top.

    Zero counts are skipped. Draws on the module-level axes `ax`;
    text is white, centered on the bar, anchored at the bottom.
    """
    for idx, rect in enumerate(rects):
        text_label = str(numbers[idx])
        if text_label == '0':
            continue
        bar_top = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2., bar_top - 0.03,
                '%s' % (text_label),
                fontsize=22, ha='center', va='bottom', color='w')
In [ ]:
# Grouped bar chart: per-concept overlap (O) with the Wikipedia reference for
# each method, with false-positive ratios overlaid as a black stem plot.
# Plotting functions (bar, stem, xticks, ...) are pylab globals.
rcParams['figure.figsize'] = (12.0, 10.0)
fig, ax = plt.subplots()
bar_width = 0.2
index = arange(len(class_terms))
count = 0
all_fp_ratios = []
for method in methods:
    overlap_scores = [i[0] for i in coverage[method]]
    rects = bar(index, overlap_scores, width=bar_width, color=colors[count], label=labels[method])
    # Annotate each bar with the absolute number of true positives.
    label_numbers(rects, [len(labelled_class_instances[method][i]['tp']) for i in class_terms.keys()])
    fp_ratios = [i[1] for i in coverage[method]]
    # Place each stem marker at the horizontal center of its bar.
    all_fp_ratios.extend(zip(index+bar_width/2.0, fp_ratios))
    index = index+bar_width  # shift the next method's bars to the right
    count += 1
all_fp_ratios = array(sorted(all_fp_ratios, key=lambda x:x[0]))
stem(all_fp_ratios[:, 0], all_fp_ratios[:, 1], linefmt='k--', markerfmt='ko')
fontsize=30
xlabel('Concepts', fontsize=fontsize+2)
ylabel('Overlap ($O$) with reference data', fontsize=fontsize+2)
# Strip the 'carnatic_' (9 chars) or 'hindustani_' (11 chars) prefix from
# the concept names used as tick labels.
if keyword == 'carnatic_music':
    xticks(index-1.5*bar_width, [i[9:] for i in class_terms.keys()])
else:
    xticks(index-1.5*bar_width, [i[11:] for i in class_terms.keys()])
legend(prop={'size': fontsize}, loc='upper left',
       fancybox=True)
xticks(fontsize=fontsize, rotation=14)
yticks(fontsize=fontsize)
In [ ]:
ylim(0, 0.74)
In [ ]:
fname = workspace + '/data/results/qualitative/entity-identification/rule-based/'+keyword+'/class-agreement-with-wikipedia'
# Save the current figure in both vector (.pdf) and raster (.png) form,
# with identical rendering options.
for extension in ('.pdf', '.png'):
    savefig(fname + extension, dpi=200, facecolor='w', edgecolor='w',
            orientation='landscape', papertype=None, format=None,
            transparent=False, bbox_inches='tight', pad_inches=0.1)
In [ ]:
close('all')
In [ ]:
agreement_scores = ontologization.compute_agreement(labelled_class_instances, methods)
In [ ]:
# Grouped bar chart: per-concept inter-system agreement for each method pair,
# each bar annotated with the absolute number of shared instances.
rcParams['figure.figsize'] = (12.0, 10.0)
inter_labels = {'reverb-openie': 'ReVerb-OpenIE 4.0', 'openie-semantic-parsing': 'OpenIE 4.0-Sem. Parsing',
                'reverb-semantic-parsing': 'Sem. Parsing-ReVerb'}
fig, ax = plt.subplots()
bar_width = 0.2
index = arange(len(class_terms))
count = 0
for method, res in agreement_scores.items():
    scores = [i[0] for i in res]
    abs_numbers = [len(i[1]) for i in res]
    rects = bar(index, scores, bar_width, color=colors[count], label=inter_labels[method])
    label_numbers(rects, abs_numbers)
    index = index+bar_width  # shift the next pair's bars to the right
    count += 1
fontsize=30
xlabel('Concepts', fontsize=fontsize+2)
ylabel('Inter-system agreement over $R$', fontsize=fontsize+2)
# Strip the 'carnatic_' (9 chars) or 'hindustani_' (11 chars) prefix from
# the concept names used as tick labels.
if keyword == 'carnatic_music':
    xticks(index-1.5*bar_width, [i[9:] for i in class_terms.keys()])
else:
    xticks(index-1.5*bar_width, [i[11:] for i in class_terms.keys()])
legend(prop={'size': fontsize}, loc='upper center',
       bbox_to_anchor=(0.5, 1.2), fancybox=True)
xticks(fontsize=fontsize, rotation=10)
yticks(fontsize=fontsize)
In [ ]:
ylim(0, 1.05)
In [ ]:
fname = workspace + '/data/results/qualitative/entity-identification/rule-based/'+keyword+'/class-agreement-inter-method'
# Save the current figure in both vector (.pdf) and raster (.png) form,
# with identical rendering options.
for extension in ('.pdf', '.png'):
    savefig(fname + extension, dpi=200, facecolor='w', edgecolor='w',
            orientation='landscape', papertype=None, format=None,
            transparent=False, bbox_inches='tight', pad_inches=0.1)
Distance measure: cosine similarity between the split-object vectors of the seed set and those of the candidate entity. For the seed set, only split objects that occur more than once are considered.
Variables to play with (set in the cells below): the seed-set size, the number of seed sets, and the iteration step.
In [ ]:
from random import shuffle
# Pick up local edits to ontologization without restarting the kernel.
reload(ontologization)
#NOTE: Run the rule-based section first; this section reuses its class_instances and groundtruth!
In [ ]:
def get_seedset(class_instances, n=3):
    """Draw a random seed set of up to `n` instances per class.

    Args:
        class_instances: dict mapping class name -> list of instances.
        n: number of seed instances per class (fewer if a class has
           fewer than `n` instances).

    Returns:
        dict mapping each class name to a list of randomly chosen
        instances.

    Bug fix: the original called shuffle() on the caller's lists,
    silently reordering `class_instances` in place. Shuffling a copy
    keeps the input untouched.
    """
    seedset = {}
    for class_type, instances in class_instances.items():
        pool = list(instances)  # copy, so the caller's ordering survives
        shuffle(pool)
        seedset[class_type] = pool[:n]
    return seedset
In [ ]:
# Bootstrapped entity identification: for each method, seed the LSA
# bootstrapper with small random seed sets and record, every
# `iteration_step` entities, the overlap with the ground truth and the
# false-positive ratio, averaged over `n_seedsets` runs.
coverage = {}
iteration_step = 5
for method in methods:
    coverage[method] = {}
    relations = pickle.load(file(workspace + '/data/'+method+'/'+keyword+'/relations'+coref_suffix+filtered_suffix+'.pickle'))
    # Normalize each triple to [arg1, lemmatized relation, arg2], lowercased.
    relations = [[i['arg1'].lower(), lemmatizer.lemmatize(i['rel'].lower(), pos='v'), i['arg2'].lower()] for i in relations]
    predicates = ontologization.get_predicates(relations, normalization=False)
    objects = ontologization.get_objects(relations, split=True, normalization=True)
    class_instances = ontologization.class_instances_by_rules(relations, rules)
    n_seedsets = 5
    for n_seedset in xrange(n_seedsets):
        seedset = get_seedset(class_instances, 3)
        total_iterations = 0
        for class_type in seedset.keys():
            if class_type not in coverage[method].keys():
                coverage[method][class_type] = []
            # Generator that yields the bootstrapped entity set every
            # `yield_step` iterations, up to the ground-truth size.
            bootstrap_iterator = ontologization.bootstrap_lsa(seedset[class_type], objects, predicates,
                                                              expansion=1, iterations=len(groundtruth[class_type]), yield_step=iteration_step)
            iter_count = 1
            while True:
                try:
                    res = bootstrap_iterator.next()
                    overlap_score = ontologization.overlap(res, groundtruth[class_type])
                    fp_ratio = len(set(res)-set(groundtruth[class_type]))/len(res)
                    if len(coverage[method][class_type]) <= iter_count:
                        # First seed set reaching this checkpoint: record raw scores.
                        coverage[method][class_type].append([overlap_score, fp_ratio])
                    else:
                        # NOTE(review): repeated "+= x; /= 2.0" is a pairwise
                        # running average, not an exact mean over seed sets —
                        # later seed sets are weighted more heavily. Confirm
                        # this is intended before changing it.
                        coverage[method][class_type][iter_count-1][0] += overlap_score
                        coverage[method][class_type][iter_count-1][0] /= 2.0
                        coverage[method][class_type][iter_count-1][1] += fp_ratio
                        coverage[method][class_type][iter_count-1][1] /= 2.0
                    iter_count += 1
                except StopIteration:
                    break
In [ ]:
coverage
In [ ]:
import itertools
def flip(items, ncol):
    """Reorder row-major legend entries into column-major order.

    Splits `items` into `ncol` interleaved slices and chains them, so a
    legend laid out in `ncol` columns reads top-to-bottom per column.
    """
    columns = (items[start::ncol] for start in range(ncol))
    return itertools.chain.from_iterable(columns)
In [ ]:
# One figure per concept: overlap (solid) and residual (dashed) curves vs.
# the number of bootstrapped entities, one color per method. Each figure is
# saved as .pdf and .png, then closed. Plotting functions are pylab globals.
rcParams['figure.figsize'] = (12.0, 10.0)
styles = ['-', '--']
for class_type in seedset.keys():
    fig = figure()
    ax = fig.add_subplot(1,1,1)
    count = 0
    for method in methods:
        # y1: overlap scores, y2: false-positive ratios, per checkpoint.
        y1 = [i[0] for i in coverage[method][class_type]]
        y2 = [i[1] for i in coverage[method][class_type]]
        # x axis: number of entities bootstrapped at each checkpoint.
        x = arange(1, len(y1)+1)*iteration_step
        plot(x, y1, styles[0], color=colors[count], label=labels[method], linewidth=2.5)
        plot(x, y2, styles[1], color=colors[count], linewidth=2.5)
        count += 1
    fontsize=30
    xlabel('No. of entities bootstrapped', fontsize=fontsize+2)
    #ylabel('Num. of instances bootstrapped', fontsize=fontsize+2)
    # Get artists and labels for the legend.
    handles, _labels = ax.get_legend_handles_labels()
    # Two extra black line artists stand in for the solid/dashed styles.
    custom_artists = []
    custom_artists.append(plt.Line2D((0,1),(0,0), color='k', linestyle='-'))
    custom_artists.append(plt.Line2D((0,1),(0,0), color='k', linestyle='--'))
    # flip() reorders the entries so the 3-column legend reads column-wise.
    ax.legend(flip(handles+custom_artists, 3),
              flip(_labels + ['Overlap ($O$)', 'Residual ($R$)'], 3),
              ncol=3, prop={'size': fontsize-6},
              loc='upper center', bbox_to_anchor=(0.5, 1.1),
              fancybox=True)
    xticks(fontsize=fontsize)
    yticks(fontsize=fontsize)
    # Pad the axes slightly so curves do not touch the frame.
    ylim_down, ylim_up = ax.get_ylim()
    ylim(ylim_down, ylim_up*1.05)
    xlim_down, xlim_up = ax.get_xlim()
    xlim(xlim_down, xlim_up*0.95)
    grid(True)
    # Soften the grid lines to light gray.
    xgridlines = getp(gca(), 'xgridlines')
    ygridlines = getp(gca(), 'ygridlines')
    setp(xgridlines, 'color', '0.6')
    setp(ygridlines, 'color', '0.6')
    fname = workspace + '/data/results/qualitative/entity-identification/bootstrapping/'+keyword+'/'+class_type
    savefig(fname+'.pdf', dpi=200, facecolor='w', edgecolor='w', orientation='landscape',
            papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
    savefig(fname+'.png', dpi=200, facecolor='w', edgecolor='w', orientation='landscape',
            papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
    close()
In [ ]:
close('all')