In [ ]:
import sh
import numpy as np
import pandas as pd

In [ ]:
input_file = '../../resources/corpora/sensem.conll'

metadata_cols = ['META', 'sentence', 'corpus', 'main_lemma', 'main_lemma_index',
                 'resource_sentence', 'sense', 'wn', 'wn16', 'wn30']

# Column names used by the per-lemma / per-sense statistics below
lemma_column, sense_column = 'main_lemma', 'sense'

In [ ]:
input_file = '../../resources/corpora/semeval.conll'

metadata_cols = ['META', 'sentence', 'corpus', 'doc', 'lemma_tag', 'main_lemma',
                 'main_lemma_index', 'resource_sentence', 'sense']

# Column names used by the per-lemma / per-sense statistics below
lemma_column, sense_column = 'main_lemma', 'sense'

In [ ]:
# Collect the metadata (META) lines from the CoNLL file; every whitespace-separated
# token on a META line is a key:value pair
metadata = []
for mdata in sh.grep('^META', input_file):
    metadata.append(dict(md.split(':', 1) for md in mdata.strip().split()))

metadata = pd.DataFrame(metadata, columns=metadata_cols)
metadata.head()
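
The metadata parser above assumes that every whitespace-separated token on a META line, including the leading META marker itself, splits into a key:value pair (which is why 'META' appears in metadata_cols). A minimal sketch of that assumed format, with made-up field values:

In [ ]:
# Hypothetical META line: field names follow metadata_cols above, values are illustrative only
example_line = 'META:s0 sentence:0 corpus:train main_lemma:some_lemma main_lemma_index:3 resource_sentence:42 sense:1'
dict(md.split(':', 1) for md in example_line.strip().split())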

In [ ]:
# Variant restricted to verb lemmas (SemEval: the POS column is lemma_tag)
not_filter_condition = (metadata.corpus != 'filtered') & (metadata.lemma_tag == 'v')
train_condition = (metadata.corpus == 'train') & (metadata.lemma_tag == 'v')
test_condition = (metadata.corpus == 'test') & (metadata.lemma_tag == 'v')

In [ ]:
# Variant without the verb filter
not_filter_condition = (metadata.corpus != 'filtered')
train_condition = (metadata.corpus == 'train')
test_condition = (metadata.corpus == 'test')

In [ ]:
print("Sentences: %d" % metadata.sentence.count())
print("Filtered: %d" % metadata[~not_filter_condition].sentence.count())
print("Training: %d" % metadata[train_condition].sentence.count())
print("Test: %d" % metadata[test_condition].sentence.count())

In [ ]:
print("Avg. sentences per lemma: %.2f" % metadata[not_filter_condition]
      .groupby([lemma_column]).count().sentence.mean())
print("Median sentences per lemma: %.2f" % metadata[not_filter_condition]
      .groupby([lemma_column]).count().sentence.median())
print("Avg. sentences per sense: %.2f" % metadata[not_filter_condition]
      .groupby([lemma_column, sense_column]).count().sentence.mean())
print("Median sentences per sense: %.2f" % metadata[not_filter_condition]
      .groupby([lemma_column, sense_column]).count().sentence.median())

print("Avg. sentences per lemma for train: %.2f" % metadata[train_condition]
      .groupby([lemma_column]).count().sentence.mean())
print("Avg. sentences per lemma for test: %.2f" % metadata[test_condition]
      .groupby([lemma_column]).count().sentence.mean())
print("Avg. sentences per sense for train: %.2f" % metadata[train_condition]
      .groupby([lemma_column, sense_column]).count().sentence.mean())
print("Avg. sentences per sense for test: %.2f" % metadata[test_condition]
      .groupby([lemma_column, sense_column]).count().sentence.mean())

In [ ]:
print("No. of lemmas: %d" % metadata[not_filter_condition][lemma_column].nunique())
print("No. of senses: %d" % metadata[not_filter_condition]
      .groupby([lemma_column, sense_column])[sense_column].nunique().sum())
print("Avg. no. of senses per lemma: %.2f" % metadata[not_filter_condition]
      .groupby([lemma_column])[sense_column].nunique().mean())
print("Median no. of senses per lemma: %.2f" % metadata[not_filter_condition]
      .groupby([lemma_column])[sense_column].nunique().median())

Statistics for SemEval


In [ ]:
input_file = '../../resources/corpora/semeval.conll'

sentences = []
with open(input_file, 'r') as fin:
    for line in fin:
        if line.startswith('META'):
            sentences.append(dict(w.split(':', 1) for w in line.strip().split()))

sentences = pd.DataFrame(sentences)

In [ ]:
# Number of sentences per (lemma, sense) pair
sentences['sense_sentence_count'] = sentences\
    .groupby(['main_lemma', 'sense'])['sentence'].transform('count')

# Number of sentences per lemma
sentences['lemma_sentence_count'] = sentences\
    .groupby(['main_lemma'])['sentence'].transform('count')

# Number of distinct senses per lemma
sentences['sense_count'] = sentences\
    .groupby(['main_lemma'])['sense']\
    .transform(lambda x: x.nunique())

# Number of the lemma's senses that have at least 3 sentences
sentences['senses_over_threshold'] = sentences['main_lemma']\
    .map(sentences.groupby('main_lemma')
    .apply(lambda x: x.loc[x.sense_sentence_count >= 3, 'sense'].nunique()))

# A sentence is valid if its lemma is a verb with more than one sense over the
# threshold and its own sense has at least 3 sentences
sentences['is_valid'] = (sentences['senses_over_threshold'] > 1) &\
    (sentences['sense_sentence_count'] >= 3) &\
    (sentences['lemma_tag'] == 'v')
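
The validity rule above keeps a sentence only when its own sense has at least 3 sentences and its lemma is a verb with more than one such sense. A small sketch on hypothetical data showing how senses_over_threshold and is_valid behave (lemma and sense values are made up; the verb check is omitted here for brevity):

In [ ]:
# Toy data: 'run' has two senses with 3 sentences each plus one rare sense,
# 'walk' has a single sense with 3 sentences, so only 'run' rows can be valid
toy = pd.DataFrame({
    'main_lemma': ['run'] * 7 + ['walk'] * 4,
    'sense': ['1', '1', '1', '2', '2', '2', '3', '1', '1', '1', '2'],
})
toy['sense_sentence_count'] = toy.groupby(['main_lemma', 'sense'])['sense'].transform('count')
toy['senses_over_threshold'] = toy['main_lemma'].map(
    toy.groupby('main_lemma').apply(lambda x: x.loc[x.sense_sentence_count >= 3, 'sense'].nunique()))
toy['is_valid'] = (toy['senses_over_threshold'] > 1) & (toy['sense_sentence_count'] >= 3)
toy.drop_duplicates(['main_lemma', 'sense'])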

In [ ]:
print("Statistics per lemma")
print(sentences[sentences.is_valid].groupby(['main_lemma']).first()['lemma_sentence_count'].describe())

print("\nStatistics per sense")
print(sentences[sentences.is_valid].groupby(['main_lemma', 'sense']).first()['sense_sentence_count'].describe())

Statistics for Google


In [ ]:
input_file = '../../resources/corpora/google_wsd.conll.new'

sentences = []
with open(input_file, 'r') as fin:
    for line in fin:
        if line.startswith('META'):
            sentences.append(dict(w.split(':', 1) for w in line.strip().split()))

sentences = pd.DataFrame(sentences)

In [ ]:
# Number of sentences per (lemma, sense) pair
sentences['sense_sentence_count'] = sentences\
    .groupby(['main_lemma', 'sense'])['sentence'].transform('count')

# Number of sentences per lemma
sentences['lemma_sentence_count'] = sentences\
    .groupby(['main_lemma'])['sentence'].transform('count')

# Number of distinct senses per lemma
sentences['sense_count'] = sentences\
    .groupby(['main_lemma'])['sense']\
    .transform(lambda x: x.nunique())

# Number of the lemma's senses that have at least 3 sentences
sentences['senses_over_threshold'] = sentences['main_lemma']\
    .map(sentences.groupby('main_lemma')
    .apply(lambda x: x.loc[x.sense_sentence_count >= 3, 'sense'].nunique()))

# A sentence is valid if its lemma has more than one sense over the threshold
# and its own sense has at least 3 sentences
sentences['is_valid'] = (sentences['senses_over_threshold'] > 1) & (sentences['sense_sentence_count'] >= 3)

In [ ]:
print("Statistics per lemma")
print(sentences[sentences.is_valid].groupby(['main_lemma']).first()['lemma_sentence_count'].describe())

print("\nStatistics per sense")
print(sentences[sentences.is_valid].groupby(['main_lemma', 'sense']).first()['sense_sentence_count'].describe())

In [ ]: