In [1]:
import sys
sys.path.append("../utils")
import pdtb_utils
import pandas as pd
import json
from os.path import join
# Root directory of the shared-task data release.
base_dir = '../resources/conll16st-en-zh-dev-train-test_LDC2016E50/'

# Short split name -> name of that split's sub-directory.
# Note that the blind-test directory carries a "conll15st" prefix.
test_types = {
    'test': 'conll16st-en-03-29-16-test',
    'train': 'conll16st-en-03-29-16-train',
    'blind_test': 'conll15st-en-03-29-16-blind-test',
    'dev': 'conll16st-en-03-29-16-dev',
}
def get_relations(test_type, separate_dual_classes=False):
    """Lazily yield the discourse relations stored for one data split.

    Reads ``relations.json`` (one JSON document per line) from the directory
    that ``test_types[test_type]`` names under ``base_dir``.

    Args:
        test_type: Key of ``test_types`` selecting the split to read.
        separate_dual_classes: When true, a relation is yielded once per
            sense, each time restricted to that single sense via
            ``set_senses``. When false, each relation is yielded once as-is.

    Yields:
        ``pdtb_utils.DiscourseRelation`` objects.

    Raises:
        KeyError: If ``test_type`` is not a key of ``test_types``.
    """
    relations_path = join(base_dir, test_types[test_type], 'relations.json')
    with open(relations_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a JSON document.
                continue
            rel = pdtb_utils.DiscourseRelation(json.loads(line))
            if not separate_dual_classes:
                yield rel
                continue
            # Build a fresh object for every sense. Yielding one shared object
            # and mutating it between yields makes all previously yielded
            # references show the last sense once the generator is
            # materialised (e.g. with list()).
            for sense in list(rel.senses()):
                single_sense_rel = pdtb_utils.DiscourseRelation(json.loads(line))
                single_sense_rel.set_senses([sense])
                yield single_sense_rel
In [11]:
# Share of blind-test relations whose senses include 'EntRel'.
rels = list(get_relations('blind_test'))
total = len(rels)
correct = sum(1 for rel in rels if 'EntRel' in rel.senses())
print(total)
print(correct)
print(correct / total)
In [23]:
def output_relation_for_type(test_type):
    """Tabulate how often each relation type occurs in one data split.

    Args:
        test_type: Split name forwarded to ``get_relations``.

    Returns:
        DataFrame with one row per relation type plus a 'Total' row, and the
        columns 'count' and 'ratio' (count divided by the total), sorted by
        'count' in descending order.
    """
    records = []
    for rel in get_relations(test_type, separate_dual_classes=False):
        records.append({
            'id': rel.relation_id(),
            'senses': tuple(rel.senses()),
            'relation_type': rel.relation_type(),
            'connective_token': rel.connective_token(),
        })
    df = pd.DataFrame(records).set_index('id')

    # Per-type counts followed by a grand-total entry.
    per_type = df.groupby('relation_type').size()
    total_row = pd.Series(df.count()['relation_type'], index=['Total'])
    counts = pd.concat([per_type, total_row])

    # The two unnamed series become columns 0 and 1 before being renamed.
    table = pd.concat([counts, counts / counts['Total']], axis=1)
    table = table.rename(columns={0: 'count', 1: 'ratio'})
    return table.sort_values('count', ascending=False)
types_dist = pd.DataFrame()
types = ['train', 'dev', 'test', 'blind_test']

# One relation-type table per split, placed side by side under the split name.
dfs = [output_relation_for_type(split) for split in types]
type_distribution = pd.concat(dfs, keys=types, axis=1)
type_distribution = type_distribution.rename(columns={'blind_test': 'blind test'})
import sys
def formatter(x):
    """Render a table cell as text, wrapping string values in LaTeX bold.

    Args:
        x: Cell value.

    Returns:
        ``\\textbf{<x>}`` when ``x`` is a string, otherwise ``str(x)``.
    """
    if isinstance(x, str):
        # The backslash is escaped so "\t" is not read as a TAB character, and
        # the doubled braces are literal braces for str.format.
        return "\\textbf{{{}}}".format(x)
    return str(x)
# Write the relation-type table as LaTeX (floats with two decimals), then
# leave the frame as the cell's displayed value.
type_distribution.to_latex(
    buf='../../master-paper/tables/type_distribution.tex',
    float_format="{:.2f}".format,
)
type_distribution
Out[23]:
In [19]:
def output_sense_for_type(test_type):
    """Tabulate the sense distribution of EntRel/Implicit relations in a split.

    Relations are loaded with ``separate_dual_classes=True``, so a relation
    is counted once per sense it is yielded with.

    Args:
        test_type: Split name forwarded to ``get_relations``.

    Returns:
        DataFrame with one row per sense plus a 'Total' row, and the columns
        'count' and 'ratio' (count divided by the number of kept rows),
        sorted by 'count' in descending order.
    """
    records = []
    for rel in get_relations(test_type, separate_dual_classes=True):
        records.append({
            'id': rel.relation_id(),
            'senses': tuple(rel.senses()),
            'relation_type': rel.relation_type(),
            'connective_token': rel.connective_token(),
        })
    df = pd.DataFrame(records).set_index('id')

    # Keep the first element of each senses tuple (presumably the only one,
    # since dual senses are requested separately -- confirm in pdtb_utils).
    df['senses'] = df['senses'].apply(lambda x: x[0])
    kept = df[df['relation_type'].isin(['EntRel', 'Implicit'])]
    n_kept = kept.count()['senses']

    # Per-sense counts followed by a grand-total entry.
    counts = pd.concat([kept.groupby('senses').size(),
                        pd.Series(n_kept, index=['Total'])])

    # The two unnamed series become columns 0 and 1 before being renamed.
    table = pd.concat([counts, counts / n_kept], axis=1)
    table = table.rename(columns={0: 'count', 1: 'ratio'})
    return table.sort_values('count', ascending=False)
senses_dist = pd.DataFrame()
types = ['train', 'dev', 'test', 'blind_test']

# One sense-distribution table per split, in the order given by `types`.
dfs = [output_sense_for_type(split) for split in types]
import math
def shannon(col):
    """Return the base-2 Shannon entropy ``-sum(p * log2(p))`` of ``col``.

    Args:
        col: Iterable of numbers, normally probabilities.

    Returns:
        The entropy in bits; 0 for an empty iterable.

    Raises:
        ValueError: If a value is negative (its logarithm is undefined).
    """
    # A zero probability contributes nothing (0 * log 0 is taken as 0);
    # skipping it also avoids the ValueError math.log2(0) would raise.
    return sum(-p * math.log2(p) for p in col if p != 0)
def float_format(x):
    """Format ``x`` as fixed-point text with two digits after the point."""
    return format(x, ".2f")
def find_max(test_type):
    """Build a cell formatter that bolds the largest value of one column.

    Args:
        test_type: Column label looked up in the module-level ``senses_dist``
            frame, e.g. ``('train', 'ratio')``.

    Returns:
        A function mapping a cell value to its two-decimal text, wrapped in
        ``\\textbf{...}`` when the value equals the column maximum taken over
        all rows except 'Total' and 'Entropy'.
    """
    def find_max_in_df(x):
        # senses_dist is resolved on every call, so the formatter uses
        # whichever frame that name is bound to at formatting time.
        column_max = senses_dist.drop(['Total', 'Entropy']).max()[test_type]
        text = "{:.2f}".format(x)
        if x == column_max:
            text = '\\textbf{' + text + '}'
        # Kept from the original: blank out a cell whose text is exactly '0'.
        # NOTE(review): '.2f' text is never the bare string '0', so this looks
        # unreachable -- confirm the intent before changing it.
        if text == '0':
            text = ""
        return text
    return find_max_in_df
def _downcast_numeric(column):
    """Convert a column to numeric, downcasting to an integer dtype if possible."""
    return pd.to_numeric(column, downcast='integer')


# Per-split tables side by side; rows with a missing value in any column are
# dropped, and the rest are ordered by the training-set ratio.
senses_dist = (
    pd.concat(dfs, axis=1, keys=types)
    .dropna()
    .sort_values([('train', 'ratio')], ascending=False)
)

# shannon is applied to every column of the table without its 'Total' row,
# and the result is appended as a single 'Entropy' row.
entropy_row = pd.DataFrame(
    senses_dist.drop('Total').apply(_downcast_numeric).apply(shannon),
    columns=['Entropy'],
).T

# Negative cells are clamped to 0 before the final numeric downcast.
senses_dist = (
    pd.concat([senses_dist, entropy_row])
    .applymap(lambda x: 0 if x < 0 else x)
    .apply(_downcast_numeric)
    .rename(columns={'blind_test': 'blind test'})
)

# Bold the largest ratio in each split's 'ratio' column.
formatters = {
    (split, 'ratio'): find_max((split, 'ratio'))
    for split in ['train', 'dev', 'test', 'blind test']
}
senses_dist.to_latex(
    '../../master-paper/tables/implicit_sense_distribution.tex',
    na_rep='',
    float_format=float_format,
    formatters=formatters,
    escape=False,
)
senses_dist
Out[19]: