In [1]:
import sys
sys.path.append("../utils")
import pdtb_utils
import pandas as pd
import json
from os.path import join

# Root directory of the shared-task data, relative to this notebook.
base_dir = '../resources/conll16st-en-zh-dev-train-test_LDC2016E50/'

# Maps the short split name used throughout the notebook to the
# sub-directory of `base_dir` that holds that split's `relations.json`.
test_types = {
    'test': 'conll16st-en-03-29-16-test',
    'train': 'conll16st-en-03-29-16-train',
    'blind_test': 'conll15st-en-03-29-16-blind-test',
    'dev': 'conll16st-en-03-29-16-dev',
}

def get_relations(test_type, separate_dual_classes=False):
    """Yield the discourse relations stored for one dataset split.

    Reads ``relations.json`` (one JSON object per line) from the directory
    that ``test_types`` maps ``test_type`` to, below ``base_dir``, and wraps
    each record in a ``pdtb_utils.DiscourseRelation``.

    Args:
        test_type: Key of ``test_types`` selecting the split to read.
        separate_dual_classes: When true, a relation is yielded once per
            sense it carries, each time as its own object whose senses are
            set to that single sense. When false, every relation is yielded
            exactly once, unchanged.

    Yields:
        ``pdtb_utils.DiscourseRelation`` objects.

    Raises:
        KeyError: If ``test_type`` is not a key of ``test_types`` (raised
            when iteration starts, because this is a generator).
    """
    path = join(base_dir, test_types[test_type], 'relations.json')
    # Decode explicitly as UTF-8 (the standard JSON encoding) instead of
    # relying on the platform's default locale encoding.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing in json.loads.
                continue
            rel = pdtb_utils.DiscourseRelation(json.loads(line))
            if not separate_dual_classes:
                yield rel
                continue
            for sense in list(rel.senses()):
                # Build a fresh object for every sense. Re-yielding one
                # mutated object would make all collected copies end up
                # with whichever sense was set last.
                single_sense_rel = pdtb_utils.DiscourseRelation(json.loads(line))
                single_sense_rel.set_senses([sense])
                yield single_sense_rel

In [11]:
# Count how many blind-test relations list 'EntRel' among their senses
# and report that count as a share of all blind-test relations.
rels = list(get_relations('blind_test'))
total = len(rels)
correct = 0
for rel in rels:
    correct += 1 if 'EntRel' in rel.senses() else 0
print(total)
print(correct)
print(correct / total)


1209
200
0.1654259718775848

In [23]:
def output_relation_for_type(test_type):
    """Tabulate the relation-type distribution of one dataset split.

    Args:
        test_type: Split name passed on to ``get_relations``.

    Returns:
        A DataFrame with one row per relation type plus a ``'Total'`` row,
        and the columns ``'count'`` and ``'ratio'`` (count divided by the
        total), sorted by ``'count'`` in descending order.
    """
    rows = []
    for rel in get_relations(test_type, separate_dual_classes=False):
        rows.append({'id': rel.relation_id(),
                     'senses': tuple(rel.senses()),
                     'relation_type': rel.relation_type(),
                     'connective_token': rel.connective_token()})
    df = pd.DataFrame(rows).set_index('id')

    # Per-type counts followed by a single 'Total' entry.
    per_type = df.groupby('relation_type').size()
    total = pd.Series(df.count()['relation_type'], index=['Total'])
    counts = pd.concat([per_type, total])

    # Placing counts and ratios side by side yields the columns 0 and 1,
    # which are then given readable names.
    ratios = counts / counts['Total']
    table = pd.concat([counts, ratios], axis=1)
    table = table.rename(columns={0: 'count', 1: 'ratio'})
    return table.sort_values('count', ascending=False)

# Build one relation-type table per split and join them side by side,
# using the split name as the outer column level.
types = ['train', 'dev', 'test', 'blind_test']
types_dist = pd.DataFrame()
dfs = []
for test_type in types:
    dfs.append(output_relation_for_type(test_type))

type_distribution = (
    pd.concat(dfs, keys=types, axis=1)
      .rename(columns={'blind_test': 'blind test'})
)
import sys
def formatter(x):
    """Render a table cell as text, setting string cells in LaTeX bold.

    Args:
        x: The cell value.

    Returns:
        ``\\textbf{<x>}`` when ``x`` is a string, otherwise ``str(x)``.
    """
    if isinstance(x, str):
        # The backslash must be escaped: a bare "\t" is a tab character.
        # Doubled braces emit the literal braces that \textbf requires.
        return "\\textbf{{{}}}".format(x)
    return str(x)
# Write the table as LaTeX with floats rendered to two decimals, then
# leave the frame as the cell's last expression so it is displayed.
type_distribution.to_latex(
    buf='../../master-paper/tables/type_distribution.tex',
    float_format="{:.2f}".format,
)
type_distribution


Out[23]:
train dev test blind test
count ratio count ratio count ratio count ratio
Total 32535 1.000000 1436 1.000000 1939 1.000000 1209 1.000000
Explicit 14722 0.452497 680 0.473538 923 0.476019 556 0.459884
Implicit 13156 0.404365 522 0.363510 769 0.396596 425 0.351530
EntRel 4133 0.127032 215 0.149721 217 0.111913 200 0.165426
AltLex 524 0.016106 19 0.013231 30 0.015472 28 0.023160

In [19]:
def output_sense_for_type(test_type):
    """Tabulate the sense distribution of EntRel/Implicit relations in a split.

    Relations are requested with ``separate_dual_classes=True``, so each
    row is reduced to the first (and, in that mode, only) sense it carries.

    Args:
        test_type: Split name passed on to ``get_relations``.

    Returns:
        A DataFrame with one row per sense plus a ``'Total'`` row, and the
        columns ``'count'`` and ``'ratio'`` (count divided by the number of
        kept rows), sorted by ``'count'`` in descending order.
    """
    rows = []
    for rel in get_relations(test_type, separate_dual_classes=True):
        rows.append({'id': rel.relation_id(),
                     'senses': tuple(rel.senses()),
                     'relation_type': rel.relation_type(),
                     'connective_token': rel.connective_token()})
    df = pd.DataFrame(rows).set_index('id')

    # Unwrap the one-element sense tuples into plain values.
    df['senses'] = df['senses'].apply(lambda sense_tuple: sense_tuple[0])

    # Only relations of type EntRel or Implicit are kept.
    kept = df[df['relation_type'].isin(['EntRel', 'Implicit'])]

    per_sense = kept.groupby('senses').size()
    total = pd.Series(kept.count()['senses'], index=['Total'])
    counts = pd.concat([per_sense, total])

    ratios = counts / kept.count()['senses']
    table = pd.concat([counts, ratios], axis=1)
    table = table.rename(columns={0: 'count', 1: 'ratio'})
    return table.sort_values('count', ascending=False)


# Collect one sense table per split; they are combined further down.
types = ['train', 'dev', 'test', 'blind_test']
senses_dist = pd.DataFrame()
dfs = []
for test_type in types:
    dfs.append(output_sense_for_type(test_type))
    
import math
def shannon(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    Computes ``-sum(p * log2(p))`` over the values. Zero entries contribute
    nothing (the usual ``0 * log 0 = 0`` convention) instead of raising a
    math domain error.

    Args:
        col: Iterable of numbers, normally probabilities.

    Returns:
        The entropy as a number; ``0`` for an empty input.

    Raises:
        ValueError: If a value is negative (logarithm undefined).
    """
    return -sum(p * math.log2(p) for p in col if p != 0)
def float_format(x):
    """Format ``x`` as fixed-point text with two decimal places."""
    return f"{x:.2f}"
def find_max(test_type):
    """Build a cell formatter that bolds the maximum of one column.

    Args:
        test_type: Column key looked up in the module-level ``senses_dist``
            (the notebook passes ``(split, 'ratio')`` tuples).

    Returns:
        A one-argument function. It renders a value with two decimals and
        wraps it in ``\\textbf{...}`` when the value equals the maximum of
        that column, ignoring the ``'Total'`` and ``'Entropy'`` rows.
        ``senses_dist`` is read each time the returned function is called.
    """
    def find_max_in_df(x):
        column_max = senses_dist.drop(['Total', 'Entropy']).max()[test_type]
        is_max = x == column_max
        text = "{:.2f}".format(x)
        if is_max:
            text = '\\textbf{' + text + '}'
        # NOTE(review): a two-decimal rendering does not look like it can
        # ever be exactly '0', so this branch appears unreachable; confirm
        # the intent before changing it.
        if text == '0':
            return ""
        return text
    return find_max_in_df
    
# Join the per-split sense tables side by side, drop every row that has a
# missing value in any split, and order rows by the training-set ratio.
senses_dist = (
    pd.concat(dfs, axis=1, keys=types)
      .dropna()
      .sort_values([('train', 'ratio')], ascending=False)
)

# Append an 'Entropy' row: shannon() is applied to every column (all rows
# except 'Total'). Negative cells are then replaced by 0, columns are
# downcast to integers where possible, and the 'blind_test' label is
# renamed for display.
senses_dist = (
    pd.concat([
        senses_dist,
        pd.DataFrame(
            senses_dist.drop('Total')
                       .apply(lambda x: pd.to_numeric(x, downcast='integer'))
                       .apply(shannon),
            columns=['Entropy'],
        ).T,
    ])
    .applymap(lambda x: 0 if x < 0 else x)
    .apply(lambda x: pd.to_numeric(x, downcast='integer'))
    .rename(columns={'blind_test': 'blind test'})
)
#senses_dist.to_latex('../../master-paper/tables/implicit_sense_distribution.table', na_rep='', float_format=float_format, formatters={('train', 'ratio'): find_max(('train', 'ratio'))})

# One bold-the-maximum formatter per split's 'ratio' column.
formatters = {
    (split, 'ratio'): find_max((split, 'ratio'))
    for split in ('train', 'dev', 'test', 'blind test')
}

# escape=False keeps the \textbf{...} markup produced by the formatters
# intact in the written file.
senses_dist.to_latex(
    '../../master-paper/tables/implicit_sense_distribution.tex',
    na_rep='',
    float_format=float_format,
    formatters=formatters,
    escape=False,
)
senses_dist


Out[19]:
train dev test blind test
count ratio count ratio count ratio count ratio
Total 17577 1.000000 763 1.000000 996 1.000000 633 1.000000
EntRel 4133 0.235137 215 0.281782 217 0.217871 200 0.315956
Expansion.Conjunction 3308 0.188200 122 0.159895 147 0.147590 106 0.167457
Expansion.Restatement 2514 0.143028 103 0.134993 190 0.190763 141 0.222749
Contingency.Cause.Reason 2092 0.119019 73 0.095675 116 0.116466 42 0.066351
Comparison.Contrast 1657 0.094271 88 0.115334 127 0.127510 27 0.042654
Contingency.Cause.Result 1389 0.079024 52 0.068152 89 0.089357 33 0.052133
Expansion.Instantiation 1134 0.064516 48 0.062910 69 0.069277 37 0.058452
Temporal.Asynchronous.Precedence 433 0.024634 26 0.034076 8 0.008032 10 0.015798
Temporal.Synchrony 212 0.012061 19 0.024902 5 0.005020 3 0.004739
Comparison.Concession 196 0.011151 5 0.006553 5 0.005020 30 0.047393
Entropy 0 2.857801 0 2.872594 0 2.793237 0 2.694550