In [ ]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook

In [ ]:
# Per-lemma split for active learning.
#
# For every corpus, the instances of each lemma are divided into an "initial"
# half (first floor(n/2) positions) and an "unlabeled" remainder. When a lemma
# has at least two target classes, its instances are reshuffled until the
# initial half also contains at least two classes. The resulting index arrays
# are stored in a compressed .npz under the keys `initial_indices` and
# `unlabeled_indices`.
#
# NOTE(review): the class-stratified cell further down in this notebook writes
# to the same output path, so running both cells leaves only the later result
# on disk -- confirm which split is the intended one.

# Private generator with an arbitrary fixed seed: the split is reproducible
# across kernel restarts and NumPy's global random state is left untouched.
rng = np.random.RandomState(0)

for corpus in ('sensem', 'semeval'):
    input_file = '../../resources/hashed/%s/train_dataset.npz' % corpus
    output_file = '../../resources/active_learning/%s_indices.npz' % corpus

    # An NpzFile keeps the archive open until it is closed; the arrays are
    # read into memory on access, so they remain valid after the `with` block.
    with np.load(input_file) as dataset:
        target = dataset['target']
        lemmas = dataset['lemmas']

    initial_indices = []
    unlabeled_indices = []

    for lemma in np.unique(lemmas):
        # np.where returns a fresh array, so shuffling it in place is safe.
        indices = np.where(lemmas == lemma)[0]
        # Floor division replaces np.int(...), which no longer exists in
        # current NumPy releases.
        initial_size = indices.shape[0] // 2

        # Two classes can only fit in the initial half if it has at least two
        # slots; without this guard a lemma with 2 or 3 instances and two
        # classes would make the loop below spin forever.
        needs_two_classes = (
            initial_size >= 2
            and np.unique(target[indices]).shape[0] >= 2
        )

        lemma_initial_indices = indices[:initial_size]
        while needs_two_classes and np.unique(target[lemma_initial_indices]).shape[0] < 2:
            rng.shuffle(indices)
            lemma_initial_indices = indices[:initial_size]
        lemma_unlabeled_indices = indices[initial_size:]

        initial_indices.extend(lemma_initial_indices)
        unlabeled_indices.extend(lemma_unlabeled_indices)

    initial_indices = np.array(initial_indices, dtype=np.int32)
    unlabeled_indices = np.array(unlabeled_indices, dtype=np.int32)

    np.savez_compressed(output_file, initial_indices=initial_indices, unlabeled_indices=unlabeled_indices)

In [ ]:
# Class-stratified split for active learning.
#
# For every corpus, each target class contributes round(count * initial_size)
# of its instances (but never fewer than one) to the initial set, taken in
# dataset order; the rest of that class goes to the unlabeled set. No
# shuffling is performed. The index arrays are saved in a compressed .npz
# under the keys `initial_indices` and `unlabeled_indices`.
#
# NOTE(review): this writes to the same output path as the per-lemma split
# cell earlier in the notebook, so it overwrites that cell's result -- confirm
# which split is the intended one.
for corpus in ('sensem', 'semeval'):
    input_file = '../../resources/hashed/%s/train_dataset.npz' % corpus
    output_file = '../../resources/active_learning/%s_indices.npz' % corpus

    dataset = np.load(input_file)
    target = dataset['target']

    # Fraction of every class that is placed in the initial set.
    initial_size = 0.5

    classes, y_counts = np.unique(target, return_counts=True)

    n_cls = len(classes)
    n_initial = len(target) * initial_size
    n_unlabel = len(target) - n_initial

    # Both partitions must be large enough to hold one instance per class.
    assert min(n_initial, n_unlabel) >= n_cls

    # Per-class quota for the initial set, floored at one instance.
    initial_count = np.maximum(np.round(y_counts * initial_size), 1.0).astype(np.int32)
    unlabeled_count = (y_counts - initial_count).astype(np.int32)

    initial_indices = []
    unlabeled_indices = []

    for cls, n_first, n_rest in zip(classes, initial_count, unlabeled_count):
        class_positions = (target == cls).nonzero()[0]
        cut = n_first + n_rest

        initial_indices.extend(class_positions[:n_first])
        unlabeled_indices.extend(class_positions[n_first:cut])

    initial_indices = np.array(initial_indices, dtype=np.int32)
    unlabeled_indices = np.array(unlabeled_indices, dtype=np.int32)

    np.savez_compressed(output_file, initial_indices=initial_indices, unlabeled_indices=unlabeled_indices)

In [ ]: