In [ ]:
import numpy as np
import pickle

from tqdm import tqdm_notebook

In [ ]:
# Export every unlabeled instance as one line of text with its target token
# highlighted, writing "<instance id>\t<sentence>" rows to
# ../../resources/active_learning/spanish_sentences.txt.
#
# Each instance id is a ':'-separated string with five fields:
#   corpus:filename:id_sentence:verb:id_line
# The corpus file is scanned for a line starting with 'META' whose second
# whitespace-separated field carries the sentence number after a ':'.
# The lines that follow, up to the first blank line, are that sentence's
# tokens; the second whitespace-separated column of each is the word form.

# NpzFile keeps the archive open until closed, so load the array inside a
# context manager instead of leaking the handle.
with np.load('../../resources/hashed/unlabeled_spanish/dataset.npz') as dataset:
    unlabeled_instances_id = dataset['instances_id']

basefilename = '../../resources/corpora/unlabeled_spanish/%(corpus)s/%(filename)s'
last_filename = ''
file = None

try:
    with open('../../resources/active_learning/spanish_sentences.txt', 'w') as fout:
        for iid in tqdm_notebook(unlabeled_instances_id):
            # `verb` is not used below, but unpacking five names enforces
            # the expected id layout.
            corpus, filename, id_sentence, verb, id_line = iid.split(':')
            id_sentence = int(id_sentence)
            id_line = int(id_line)
            filename = basefilename % {'corpus': corpus, 'filename': filename}

            if last_filename != filename:
                if file is not None:
                    file.close()
                # NOTE(review): opened with the platform default encoding;
                # confirm the corpus encoding and pass it explicitly if needed.
                file = open(filename, 'r')
                # Remember which file is open so consecutive instances from
                # the same corpus file reuse the handle instead of reopening.
                last_filename = filename
            else:
                # Same file as the previous instance: rewind and rescan.
                file.seek(0)

            for line in file:
                if line.startswith('META'):
                    sentence = int(line.strip().split()[1].split(':')[1])
                    if sentence == id_sentence:
                        tokens = []
                        # Keep consuming the same file iterator: the lines
                        # right after the META header are the tokens,
                        # numbered from 1 so they can be matched to id_line.
                        for idx, token in enumerate(file, start=1):
                            if token.strip() == '':
                                # A blank line ends the sentence.
                                break
                            else:
                                token = token.strip().split()[1]
                                # Wrap the target token in underscores.
                                tokens.append('_%s_' % token if idx == id_line else token)
                        print('%s\t%s' % (iid, ' '.join(tokens)), file=fout)
                        break
finally:
    # Close the last corpus file even if the loop raised, and tolerate an
    # empty instance list (no file was ever opened).
    if file is not None:
        file.close()

In [ ]: