In [ ]:
import numpy as np
import pandas as pd
In [ ]:
# SenSem corpus: run this cell (or the SemEval cell below, not both)
input_file = '../../resources/corpora/sensem.conll'
metadata_cols = ['META', 'sentence', 'corpus', 'main_lemma', 'main_lemma_index',
                 'resource_sentence', 'sense', 'wn', 'wn16', 'wn30']
In [ ]:
# SemEval corpus: alternative to the SenSem cell above
input_file = '../../resources/corpora/semeval.conll'
metadata_cols = ['META', 'sentence', 'corpus', 'doc', 'lemma_tag', 'main_lemma',
                 'main_lemma_index', 'resource_sentence', 'sense']
In [ ]:
# Collect the META header of every sentence as a dict of 'key:value' pairs
metadata = []
with open(input_file, 'r') as fin:
    for line in fin:
        if line.startswith('META'):
            metadata.append(dict(md.split(':', 1) for md in line.strip().split()))
metadata = pd.DataFrame(metadata, columns=metadata_cols)
metadata.head()
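In [ ]:
# Sanity check of the META parsing above on a single line. The sample below is
# a hypothetical illustration of the expected 'key:value' format, not a line
# taken from the actual corpus files.
sample = 'META:sensem sentence:42 corpus:train main_lemma:abrir main_lemma_index:3 sense:1'
dict(md.split(':', 1) for md in sample.strip().split())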
In [ ]:
# Conditions for the SemEval metadata: keep only verbs ('lemma_tag' == 'v')
lemma_column, sense_column = 'main_lemma', 'sense'
not_filter_condition = (metadata.corpus != 'filtered') & (metadata.lemma_tag == 'v')
train_condition = (metadata.corpus == 'train') & (metadata.lemma_tag == 'v')
test_condition = (metadata.corpus == 'test') & (metadata.lemma_tag == 'v')
In [ ]:
# Conditions for the SenSem metadata (no POS filtering)
lemma_column, sense_column = 'main_lemma', 'sense'
not_filter_condition = (metadata.corpus != 'filtered')
train_condition = (metadata.corpus == 'train')
test_condition = (metadata.corpus == 'test')
In [ ]:
print("Sentences: %d" % metadata.sentence.count())
print("Filtered: %d" % metadata[~not_filter_condition].sentence.count())
print("Training: %d" % metadata[train_condition].sentence.count())
print("Test: %d" % metadata[test_condition].sentence.count())
In [ ]:
print("Avg. sentences per lemma: %.2f" % metadata[not_filter_condition]
.groupby([lemma_column]).count().sentence.mean())
print("Median sentences per lemma: %.2f" % metadata[not_filter_condition]
.groupby([lemma_column]).count().sentence.median())
print("Avg. sentences per sense: %.2f" % metadata[not_filter_condition]
.groupby([lemma_column, sense_column]).count().sentence.mean())
print("Median sentences per sense: %.2f" % metadata[not_filter_condition]
.groupby([lemma_column, sense_column]).count().sentence.median())
print("Avg. sentences per lemma for train: %.2f" % metadata[train_condition]
.groupby([lemma_column]).count().sentence.mean())
print("Avg. sentences per lemma for test: %.2f" % metadata[test_condition]
.groupby([lemma_column]).count().sentence.mean())
print("Avg. sentences per sense for train: %.2f" % metadata[train_condition]
.groupby([lemma_column, sense_column]).count().sentence.mean())
print("Avg. sentences per sense for test: %.2f" % metadata[test_condition]
.groupby([lemma_column, sense_column]).count().sentence.mean())
In [ ]:
print("No. of lemmas: %d" % metadata[not_filter_condition][lemma_column].nunique())
print("No. of senses: %d" % metadata[not_filter_condition]
.groupby([lemma_column, sense_column])[sense_column].nunique().sum())
print("Avg. no. of senses per lemma: %.2f" % metadata[not_filter_condition]
.groupby([lemma_column])[sense_column].nunique().mean())
print("Median no. of senses per lemma: %.2f" % metadata[not_filter_condition]
.groupby([lemma_column])[sense_column].nunique().median())
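In [ ]:
# Toy check of the 'No. of senses' expression above: every (lemma, sense)
# group has exactly one distinct sense, so summing nunique() counts the
# distinct (lemma, sense) pairs. Toy data with assumed column names.
toy = pd.DataFrame({'main_lemma': ['run', 'run', 'run', 'walk'],
                    'sense': ['1', '1', '2', '1']})
print(toy.groupby(['main_lemma', 'sense'])['sense'].nunique().sum())  # 3
print(toy.groupby('main_lemma')['sense'].nunique().sum())  # same result: 3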
In [ ]:
# Re-read the SemEval corpus, this time keeping every META field
input_file = '../../resources/corpora/semeval.conll'
sentences = []
with open(input_file, 'r') as fin:
    for line in fin:
        if line.startswith('META'):
            sentences.append(dict(w.split(':', 1) for w in line.strip().split()))
sentences = pd.DataFrame(sentences)
In [ ]:
# Per-sentence counts: sentences per (lemma, sense) pair, sentences per
# lemma, and distinct senses per lemma
sentences['sense_sentence_count'] = sentences\
    .groupby(['main_lemma', 'sense'])['sentence'].transform('count')
sentences['lemma_sentence_count'] = sentences\
    .groupby(['main_lemma'])['sentence'].transform('count')
sentences['sense_count'] = sentences\
    .groupby(['main_lemma'])['sense']\
    .transform(lambda x: x.nunique())
# Number of senses of the lemma backed by at least 3 sentences
sentences['senses_over_threshold'] = sentences['main_lemma']\
    .map(sentences.groupby('main_lemma')
         .apply(lambda x: x.loc[x.sense_sentence_count >= 3, 'sense'].nunique()))
# Valid: verb lemmas with more than one sense over the threshold, keeping
# only the senses that themselves have at least 3 sentences
sentences['is_valid'] = (sentences['senses_over_threshold'] > 1) & \
    (sentences['sense_sentence_count'] >= 3) & \
    (sentences['lemma_tag'] == 'v')
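In [ ]:
# Equivalent derivation of 'senses_over_threshold' without map + apply: blank
# out senses below the threshold, then count distinct senses per lemma with a
# single transform. A sketch assuming the columns built above; it should match
# the map-based column row for row.
mask = sentences['sense_sentence_count'] >= 3
alt = sentences['sense'].where(mask)\
    .groupby(sentences['main_lemma']).transform('nunique')
(alt == sentences['senses_over_threshold']).all()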
In [ ]:
print("Statistics per lemma")
print(sentences[sentences.is_valid].groupby(['main_lemma']).first()['lemma_sentence_count'].describe())
print("\nStatistics per sense")
print(sentences[sentences.is_valid].groupby(['main_lemma', 'sense']).first()['sense_sentence_count'].describe())
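In [ ]:
# Quick look at which SemEval lemmas survive the validity filter; a
# convenience sketch using the 'is_valid' flag computed above.
valid_lemmas = sentences.loc[sentences.is_valid, 'main_lemma'].unique()
print("Valid lemmas: %d" % len(valid_lemmas))
sorted(valid_lemmas)[:20]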
In [ ]:
# Same analysis for the Google WSD corpus
input_file = '../../resources/corpora/google_wsd.conll.new'
sentences = []
with open(input_file, 'r') as fin:
    for line in fin:
        if line.startswith('META'):
            sentences.append(dict(w.split(':', 1) for w in line.strip().split()))
sentences = pd.DataFrame(sentences)
In [ ]:
sentences['sense_sentence_count'] = sentences\
    .groupby(['main_lemma', 'sense'])['sentence'].transform('count')
sentences['lemma_sentence_count'] = sentences\
    .groupby(['main_lemma'])['sentence'].transform('count')
sentences['sense_count'] = sentences\
    .groupby(['main_lemma'])['sense']\
    .transform(lambda x: x.nunique())
sentences['senses_over_threshold'] = sentences['main_lemma']\
    .map(sentences.groupby('main_lemma')
         .apply(lambda x: x.loc[x.sense_sentence_count >= 3, 'sense'].nunique()))
# Same validity rule as above, but without the verb-only restriction
sentences['is_valid'] = (sentences['senses_over_threshold'] > 1) & (sentences['sense_sentence_count'] >= 3)
In [ ]:
print("Statistics per lemma")
print(sentences[sentences.is_valid].groupby(['main_lemma']).first()['lemma_sentence_count'].describe())
print("\nStatistics per sense")
print(sentences[sentences.is_valid].groupby(['main_lemma', 'sense']).first()['sense_sentence_count'].describe())
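In [ ]:
# Distribution of retained senses per valid lemma; a sketch building on the
# 'senses_over_threshold' column computed above.
sentences[sentences.is_valid].groupby('main_lemma')['senses_over_threshold'].first().describe()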
In [ ]: