In [ ]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook
In [ ]:
# Build, for each corpus, an initial/unlabeled split of the training indices
# computed lemma by lemma, and store both index arrays in a compressed .npz.
for corpus in ('sensem', 'semeval'):
    input_file = '../../resources/hashed/%s/train_dataset.npz' % corpus
    output_file = '../../resources/active_learning/%s_indices.npz' % corpus

    # Use the archive as a context manager so its file handle is released;
    # the arrays are fully read into memory when they are indexed.
    with np.load(input_file) as dataset:
        target = dataset['target']
        lemmas = dataset['lemmas']

    initial_indices = []
    unlabeled_indices = []

    for lemma in np.unique(lemmas):
        # Positions of every instance of this lemma.
        indices = np.where(lemmas == lemma)[0]

        # First half (rounded down) goes to the initial set. Plain integer
        # division replaces `np.int`, which was removed in NumPy 1.24.
        initial_size = indices.shape[0] // 2
        lemma_initial_indices = indices[:initial_size]
        lemma_unlabeled_indices = indices[initial_size:]

        # Shuffling does not change which target values the lemma has, so
        # this count is computed once instead of on every iteration.
        lemma_class_count = np.unique(target[indices]).shape[0]

        # Reshuffle until the initial half contains at least two distinct
        # target values. This is only attempted when it is achievable: the
        # lemma must have two or more distinct values AND the initial half
        # must hold at least two items. Without the size guard, a lemma with
        # 2 or 3 instances (initial_size == 1) would loop forever.
        while (
            initial_size >= 2
            and lemma_class_count >= 2
            and np.unique(target[lemma_initial_indices]).shape[0] < 2
        ):
            np.random.shuffle(indices)
            lemma_initial_indices = indices[:initial_size]
            lemma_unlabeled_indices = indices[initial_size:]

        initial_indices.extend(lemma_initial_indices)
        unlabeled_indices.extend(lemma_unlabeled_indices)

    initial_indices = np.array(initial_indices, dtype=np.int32)
    unlabeled_indices = np.array(unlabeled_indices, dtype=np.int32)

    np.savez_compressed(
        output_file,
        initial_indices=initial_indices,
        unlabeled_indices=unlabeled_indices,
    )
In [ ]:
# Build, for each corpus, an initial/unlabeled split of the training indices
# computed per distinct target value, and store both index arrays in a
# compressed .npz. NOTE(review): this writes to the same output path as the
# per-lemma split cell, so whichever cell runs last wins - confirm intended.
for corpus in ('sensem', 'semeval'):
    input_file = '../../resources/hashed/%s/train_dataset.npz' % corpus
    output_file = '../../resources/active_learning/%s_indices.npz' % corpus

    # Use the archive as a context manager so its file handle is released;
    # the array is fully read into memory when it is indexed.
    with np.load(input_file) as dataset:
        target = dataset['target']

    # Fraction of each class that goes to the initial set.
    initial_size = 0.5

    classes, y_counts = np.unique(target, return_counts=True)
    n_cls = classes.shape[0]
    n_initial = target.shape[0] * initial_size
    n_unlabel = target.shape[0] - n_initial

    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if n_initial < n_cls or n_unlabel < n_cls:
        raise ValueError(
            'Not enough instances in %r to split %d classes with '
            'initial_size=%s' % (corpus, n_cls, initial_size)
        )

    # Per-class size of the initial set: the rounded share, but never less
    # than one instance per class.
    initial_count = np.maximum(
        np.round(y_counts * initial_size), np.ones(n_cls)
    ).astype(np.int32)

    initial_indices = []
    unlabeled_indices = []

    for cls, cls_initial_count in zip(classes, initial_count):
        labels_for_class = np.where(target == cls)[0]
        initial_indices.extend(labels_for_class[:cls_initial_count])
        # Everything after the initial slice is unlabeled; the original upper
        # bound (initial + unlabeled count) always equalled the class size.
        unlabeled_indices.extend(labels_for_class[cls_initial_count:])

    initial_indices = np.array(initial_indices, dtype=np.int32)
    unlabeled_indices = np.array(unlabeled_indices, dtype=np.int32)

    np.savez_compressed(
        output_file,
        initial_indices=initial_indices,
        unlabeled_indices=unlabeled_indices,
    )
In [ ]: