Train/Test split for SenSem and SemEval


In [ ]:
import numpy as np
import pandas as pd
import sh  # run shell commands (e.g. grep) as Python functions

from tqdm import tqdm_notebook

In [ ]:
# Configuration for the SenSem corpus (run this cell or the SemEval one below, not both)
input_file = '../../resources/sensem.conll'
output_file = input_file + '.new'  # assumed: same name with a .new suffix, matching the Google WSD cells below

metadata_cols = ['META', 'sentence', 'corpus', 'main_lemma', 'main_lemma_index',
                 'resource_sentence', 'sense', 'wn', 'wn16', 'wn30']

# Columns used for the filtering/splitting below; 'sense' is assumed here
# (SenSem also carries wn/wn16/wn30 sense tags)
lemma_column, sense_column = 'main_lemma', 'sense'

In [ ]:
# Configuration for the SemEval corpus (alternative to the SenSem cell above)
input_file = '../../resources/corpora/semeval.conll'
output_file = input_file + '.new'

metadata_cols = ['META', 'sentence', 'corpus', 'doc', 'lemma_tag', 'main_lemma',
                 'main_lemma_index', 'resource_sentence', 'sense']

lemma_column, sense_column = 'main_lemma', 'sense'

In [ ]:
# Collect every META line of the CoNLL file and parse its whitespace-separated
# key:value pairs, splitting each pair only on the first colon so that values
# containing colons survive intact
metadata = []
for mdata in sh.grep('^META', input_file):
    metadata.append(dict(md.split(':', 1) for md in mdata.strip().split()))

metadata = pd.DataFrame(metadata, columns=metadata_cols)
metadata.head()
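
A minimal sketch of that parse on a made-up META line (the keys and values here are illustrative, not taken from the corpus):

In [ ]:
# Hypothetical META line, for illustration only
sample = 'META:sensem\tsentence:12\tmain_lemma:abrir\tsense:abrir-01'
parsed = dict(field.split(':', 1) for field in sample.strip().split())
parsed['sense']  # -> 'abrir-01'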

In [ ]:
# Mark as 'filtered' every sentence whose (lemma, sense) pair occurs fewer
# than two times or carries no sense annotation ('-')
filtered = metadata.groupby([lemma_column, sense_column]).filter(
    lambda x: len(x) < 2 or x[sense_column].values[0] == '-').index
metadata.loc[filtered, 'corpus'] = 'filtered'

# The remaining sentences are the candidates for the train/test split
non_filtered = metadata.groupby([lemma_column, sense_column]).filter(
    lambda x: len(x) >= 2 and x[sense_column].values[0] != '-').index
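
Since non_filtered is exactly the complement of filtered, the second groupby pass could also be skipped; a sketch of an equivalent shortcut:

In [ ]:
# non_filtered as the set complement of filtered (index order may differ)
non_filtered_alt = metadata.index.difference(filtered)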

In [ ]:
test_size = 0.2

labels = metadata.loc[non_filtered][sense_column]

# Per-sense stratified split: reserve roughly test_size of each sense for
# the test set, but always at least one sentence per sense
classes, y_counts = np.unique(labels, return_counts=True)
n_cls = classes.shape[0]
n_test = int(labels.shape[0] * test_size)
n_train = labels.shape[0] - n_test

assert n_train >= n_cls and n_test >= n_cls

test_count = np.maximum(np.round(y_counts * test_size), np.ones(n_cls)).astype(np.int32)
train_count = (y_counts - test_count).astype(np.int32)

train_indices = []
test_indices = []

# No shuffling: the first train_count[idx] occurrences of each sense go to
# train, the next test_count[idx] to test
for idx, cls in enumerate(classes):
    labels_for_class = labels[labels == cls]

    train_indices.extend(labels_for_class[:train_count[idx]].index)
    test_indices.extend(labels_for_class[train_count[idx]:train_count[idx] + test_count[idx]].index)

train_indices = np.array(train_indices, dtype=np.int32)
test_indices = np.array(test_indices, dtype=np.int32)

metadata.loc[train_indices, 'corpus'] = 'train'
metadata.loc[test_indices, 'corpus'] = 'test'
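
After the assignment every candidate row belongs to one of three tags; a quick sanity check on the resulting distribution:

In [ ]:
# Every row should now be tagged 'filtered', 'train' or 'test'
metadata['corpus'].value_counts()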

In [ ]:
# Serialize each metadata row back into a tab-separated META line and swap it
# in for the corresponding META line of the original file
meta_lines = ('\t'.join(":".join(r) for r in zip(row.index, row)) for _, row in metadata.iterrows())

with open(input_file, 'r') as fin, open(output_file, 'w') as fout:
    for line in tqdm_notebook(fin, total=840705):  # total: hard-coded line count, for the progress bar only
        if line.startswith("META"):
            print(next(meta_lines), file=fout)
        else:
            print(line.strip(), file=fout)
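
The rewrite relies on the META lines appearing in the same order, and in the same number, as the rows of metadata; if the counts disagree, next(meta_lines) drifts or raises StopIteration. A cheap guard before rewriting (a sketch, reusing the grep pattern from above):

In [ ]:
# One parsed metadata row per META line in the input file
n_meta = sum(1 for _ in sh.grep('^META', input_file))
assert n_meta == len(metadata)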

Train/Test split for Google WSD


In [ ]:
input_file = '../../resources/corpora/google_wsd.conll'
output_file = '../../resources/corpora/google_wsd.conll.new'

# One record per annotated sentence: the parsed META fields plus a flag telling
# whether the lemma column of the target token matches the annotated lemma
sentences = []
last_meta = {}
with open(input_file, 'r') as fin:
    for line in fin:
        if line.startswith('META'):
            last_meta = dict(w.split(':', 1) for w in line.strip().split())
            # Keep only the last path component of the sense identifier
            last_meta['sense'] = last_meta['sense'].split('/')[-1]
        try:
            if line.strip().split()[0] == last_meta['main_lemma_index']:
                last_meta['correctly_lemmatized'] = last_meta['main_lemma'] == line.strip().split()[2]
                sentences.append(last_meta)
        except IndexError:
            # Blank separator lines have no tokens to inspect
            continue

sentences = pd.DataFrame(sentences, columns=['META', 'sentence', 'doc', 'domain', 'main_lemma',
                                             'main_lemma_index', 'main_token', 'sense', 'correctly_lemmatized'])
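
With the correctly_lemmatized flag in place, the share of target tokens whose lemma column agrees with the annotation is one line away:

In [ ]:
# Mean of a boolean column == fraction of correctly lemmatized targets
sentences['correctly_lemmatized'].mean()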

In [ ]:
# Sentence counts at three granularities: per (lemma, sense, domain),
# per (lemma, sense) and per lemma
sentences['domain_sentence_count'] = sentences\
    .groupby(['main_lemma', 'sense', 'domain'])['sentence'].transform('count')

sentences['sense_sentence_count'] = sentences\
    .groupby(['main_lemma', 'sense'])['sentence'].transform('count')

sentences['lemma_sentence_count'] = sentences\
    .groupby(['main_lemma'])['sentence'].transform('count')

# Number of distinct senses per lemma
sentences['sense_count'] = sentences\
    .groupby(['main_lemma'])['sense']\
    .transform(lambda x: x.nunique())

# Number of senses of each lemma that have at least 3 sentences
sentences['senses_over_threshold'] = sentences['main_lemma']\
    .map(sentences.groupby('main_lemma')\
    .apply(lambda x: x.loc[x.sense_sentence_count >= 3, 'sense'].nunique()))

# A sentence enters the split only if its lemma stays ambiguous under the
# threshold (two or more senses with 3+ sentences) and its own sense has 3+
sentences['is_valid'] = (sentences['senses_over_threshold'] > 1) & (sentences['sense_sentence_count'] >= 3)

# Everything starts as 'filtered'; the split below overwrites the valid rows
sentences.insert(2, 'corpus', 'filtered')
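
A quick look at how much data survives the threshold:

In [ ]:
# Lemmas that keep at least two senses with 3 or more sentences each
sentences.groupby('main_lemma')['is_valid'].any().sum()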

In [ ]:
test_size = 0.2

labels = sentences.loc[sentences.is_valid, 'sense']

# Same per-sense stratified split as for SenSem/SemEval above
classes, y_counts = np.unique(labels, return_counts=True)
n_cls = classes.shape[0]
n_test = int(labels.shape[0] * test_size)
n_train = labels.shape[0] - n_test

assert n_train >= n_cls and n_test >= n_cls

test_count = np.maximum(np.round(y_counts * test_size), np.ones(n_cls)).astype(np.int32)
train_count = (y_counts - test_count).astype(np.int32)

train_indices = []
test_indices = []

for idx, cls in enumerate(classes):
    labels_for_class = labels[labels == cls]

    train_indices.extend(labels_for_class[:train_count[idx]].index)
    test_indices.extend(labels_for_class[train_count[idx]:train_count[idx] + test_count[idx]].index)

train_indices = np.array(train_indices, dtype=np.int32)
test_indices = np.array(test_indices, dtype=np.int32)

sentences.loc[train_indices, 'corpus'] = 'train'
sentences.loc[test_indices, 'corpus'] = 'test'
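
As before, the distribution of corpus tags gives a quick sanity check on the split:

In [ ]:
sentences['corpus'].value_counts()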

In [ ]:
# Serialize only the first 10 columns (the original META fields plus the updated
# 'corpus' tag), dropping the derived count columns added above
meta_lines = ('\t'.join(":".join(r) for r in zip(row.index, row.astype(str)))
              for _, row in sentences.iloc[:, :10].iterrows())

with open(input_file, 'r') as fin, open(output_file, 'w') as fout:
    for line in tqdm_notebook(fin):
        if line.startswith("META"):
            print(next(meta_lines), file=fout)
        else:
            print(line.strip(), file=fout)
