In [ ]:
import numpy as np
import pandas as pd
import sh  # shell-command wrapper, used below to grep META lines out of the CoNLL file
from tqdm import tqdm_notebook
In [ ]:
input_file = '../../resources/sensem.conll'
# NOTE: output_file was never defined in the original for this corpus; the
# '.new' suffix is assumed here, mirroring the Google corpus cell below.
output_file = input_file + '.new'
metadata_cols = ['META', 'sentence', 'corpus', 'main_lemma', 'main_lemma_index',
                 'resource_sentence', 'sense', 'wn', 'wn16', 'wn30']
In [ ]:
input_file = '../../resources/corpora/semeval.conll'
# NOTE: as above, output_file is assumed to follow the '.new' convention.
output_file = input_file + '.new'
metadata_cols = ['META', 'sentence', 'corpus', 'doc', 'lemma_tag', 'main_lemma',
                 'main_lemma_index', 'resource_sentence', 'sense']
In [ ]:
# Collect every META line and parse its whitespace-separated key:value pairs.
metadata = []
for mdata in sh.grep('^META', input_file):
    metadata.append(dict(md.split(':', 1) for md in mdata.strip().split()))
metadata = pd.DataFrame(metadata, columns=metadata_cols)
metadata.head()
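A quick sanity check on the parse is cheap here; a minimal sketch, assuming the cell above ran and the META lines carry the columns listed in metadata_cols:
In [ ]:
# Hedged check: every META line should have produced a row, and columns
# missing from a given META line show up as NaN.
print(len(metadata), 'META lines parsed')
print(metadata.isna().sum())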
In [ ]:
# NOTE: lemma_column and sense_column were undefined in the original; the
# metadata_cols above suggest these values.
lemma_column = 'main_lemma'
sense_column = 'sense'

# Mark (lemma, sense) groups with fewer than 2 sentences or no sense
# annotation as filtered; everything else takes part in the train/test split.
filtered = metadata.groupby([lemma_column, sense_column]).filter(
    lambda x: len(x) < 2 or x[sense_column].values[0] == '-').index
metadata.loc[filtered, 'corpus'] = 'filtered'
non_filtered = metadata.groupby([lemma_column, sense_column]).filter(
    lambda x: len(x) >= 2 and x[sense_column].values[0] != '-').index
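Before splitting, it is worth seeing how much of the corpus survives the frequency filter; a minimal check reusing the indices computed above:
In [ ]:
# Hedged check: sentences dropped by the filter vs. those entering the split,
# plus the distribution of group sizes among the survivors.
print('filtered:', len(filtered), '| kept:', len(non_filtered))
print(metadata.loc[non_filtered].groupby([lemma_column, sense_column]).size().describe())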
In [ ]:
test_size = 0.2
labels = metadata.loc[non_filtered, sense_column]
classes, y_counts = np.unique(labels, return_counts=True)
n_cls = classes.shape[0]
n_test = int(labels.shape[0] * test_size)
n_train = labels.shape[0] - n_test
assert n_train >= n_cls and n_test >= n_cls

# Per-class test counts: roughly test_size of each class, but at least one sentence.
test_count = np.maximum(np.round(y_counts * test_size), np.ones(n_cls)).astype(np.int32)
train_count = (y_counts - test_count).astype(np.int32)

# Manual stratified split: for each sense, the first train_count[idx] sentences
# (in corpus order; no shuffling) go to train and the next test_count[idx] to test.
train_indices = []
test_indices = []
for idx, cls in enumerate(classes):
    labels_for_class = labels[labels == cls]
    train_indices.extend(labels_for_class[:train_count[idx]].index)
    test_indices.extend(labels_for_class[train_count[idx]:train_count[idx] + test_count[idx]].index)
train_indices = np.array(train_indices, dtype=np.int32)
test_indices = np.array(test_indices, dtype=np.int32)

metadata.loc[train_indices, 'corpus'] = 'train'
metadata.loc[test_indices, 'corpus'] = 'test'
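A split like this is easy to get subtly wrong, so a verification cell helps; a sketch assuming the cells above ran in order:
In [ ]:
# Hedged check: split sizes, and every test sense should also occur in train
# (expected here because filtered-out groups had fewer than 2 sentences).
print(metadata['corpus'].value_counts())
assert set(metadata.loc[test_indices, sense_column]) <= set(metadata.loc[train_indices, sense_column])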
In [ ]:
# Replace each original META line with the updated metadata row (now carrying
# the corpus assignment), leaving token lines untouched.
meta_lines = ('\t'.join(':'.join(r) for r in zip(row.index, row)) for _, row in metadata.iterrows())
with open(input_file, 'r') as fin, open(output_file, 'w') as fout:
    for line in tqdm_notebook(fin, total=840705):
        if line.startswith('META'):
            print(next(meta_lines), file=fout)
        else:
            print(line.strip(), file=fout)
In [ ]:
input_file = '../../resources/corpora/google_wsd.conll'
output_file = '../../resources/corpora/google_wsd.conll.new'

sentences = []
last_meta = {}
with open(input_file, 'r') as fin:
    for line in fin:
        if line.startswith('META'):
            last_meta = dict(w.split(':', 1) for w in line.strip().split())
            # Keep only the final component of the sense URI.
            last_meta['sense'] = last_meta['sense'].split('/')[-1]
        try:
            # On the main token's line, record whether the lemmatizer agrees
            # with the annotated lemma, then store the sentence metadata.
            if line.strip().split()[0] == last_meta['main_lemma_index']:
                last_meta['correctly_lemmatized'] = last_meta['main_lemma'] == line.strip().split()[2]
                sentences.append(last_meta)
        except IndexError:  # blank separator lines have no columns
            continue
sentences = pd.DataFrame(sentences, columns=['META', 'sentence', 'doc', 'domain', 'main_lemma',
                                             'main_lemma_index', 'main_token', 'sense',
                                             'correctly_lemmatized'])
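Since correctly_lemmatized is computed while scanning, a one-line summary shows how reliable the lemmatization is; a minimal check:
In [ ]:
# Hedged check: share of sentences whose tagger lemma matches the annotation.
print(sentences['correctly_lemmatized'].value_counts(normalize=True))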
In [ ]:
# Sentence counts at increasing granularity, plus per-lemma sense statistics.
sentences['domain_sentence_count'] = sentences\
    .groupby(['main_lemma', 'sense', 'domain'])['sentence'].transform('count')
sentences['sense_sentence_count'] = sentences\
    .groupby(['main_lemma', 'sense'])['sentence'].transform('count')
sentences['lemma_sentence_count'] = sentences\
    .groupby(['main_lemma'])['sentence'].transform('count')
sentences['sense_count'] = sentences\
    .groupby(['main_lemma'])['sense']\
    .transform('nunique')
# Number of senses per lemma with at least 3 sentences; a sentence is usable
# only if its lemma keeps more than one sense over that threshold and its own
# sense clears it too.
sentences['senses_over_threshold'] = sentences['main_lemma']\
    .map(sentences.groupby('main_lemma')
         .apply(lambda x: x.loc[x.sense_sentence_count >= 3, 'sense'].nunique()))
sentences['is_valid'] = (sentences['senses_over_threshold'] > 1) & (sentences['sense_sentence_count'] >= 3)
sentences.insert(2, 'corpus', 'filtered')
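The is_valid criterion does the real filtering here, so it pays to inspect its effect; a sketch assuming the counts above:
In [ ]:
# Hedged check: how many lemmas keep more than one sense over the 3-sentence
# threshold, and how many sentences remain usable overall.
print(sentences.groupby('main_lemma')['is_valid'].any().sum(), 'usable lemmas')
print(sentences['is_valid'].value_counts())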
In [ ]:
test_size = 0.2
labels = sentences.loc[sentences.is_valid, 'sense']
classes, y_counts = np.unique(labels, return_counts=True)
n_cls = classes.shape[0]
n_test = int(labels.shape[0] * test_size)
n_train = labels.shape[0] - n_test
assert n_train >= n_cls and n_test >= n_cls

# Same manual stratified split as above: per sense, roughly test_size of the
# sentences (at least one) go to test and the rest to train.
test_count = np.maximum(np.round(y_counts * test_size), np.ones(n_cls)).astype(np.int32)
train_count = (y_counts - test_count).astype(np.int32)
train_indices = []
test_indices = []
for idx, cls in enumerate(classes):
    labels_for_class = labels[labels == cls]
    train_indices.extend(labels_for_class[:train_count[idx]].index)
    test_indices.extend(labels_for_class[train_count[idx]:train_count[idx] + test_count[idx]].index)
train_indices = np.array(train_indices, dtype=np.int32)
test_indices = np.array(test_indices, dtype=np.int32)

sentences.loc[train_indices, 'corpus'] = 'train'
sentences.loc[test_indices, 'corpus'] = 'test'
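The same verification as for the previous corpora applies here; a minimal check of the resulting split:
In [ ]:
# Hedged check: split sizes for the Google corpus after the stratified split.
print(sentences['corpus'].value_counts())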
In [ ]:
# Write the updated META lines (the first 10 columns: the original metadata
# plus the inserted corpus split) back alongside the untouched token lines.
meta_lines = ('\t'.join(':'.join(r) for r in zip(row.index, row.astype(str)))
              for _, row in sentences.iloc[:, :10].iterrows())
with open(input_file, 'r') as fin, open(output_file, 'w') as fout:
    for line in tqdm_notebook(fin):
        if line.startswith('META'):
            print(next(meta_lines), file=fout)
        else:
            print(line.strip(), file=fout)