In [ ]:
import numpy as np
import pickle
from tqdm import tqdm_notebook
In [ ]:
# Write one line per unlabeled instance to spanish_sentences.txt, formatted as
# "<instance id>\t<space-joined sentence tokens>". The token whose 1-based
# position within the sentence equals the instance's line id is wrapped in
# underscores.
unlabeled_instances_id = np.load('../../resources/hashed/unlabeled_spanish/dataset.npz')['instances_id']
basefilename = '../../resources/corpora/unlabeled_spanish/%(corpus)s/%(filename)s'
last_filename = ''  # path of the corpus file currently held open in `file`
file = None
try:
    with open('../../resources/active_learning/spanish_sentences.txt', 'w') as fout:
        for iid in tqdm_notebook(unlabeled_instances_id):
            # An instance id has five ':'-separated fields; `verb` is not
            # needed to rebuild the sentence.
            corpus, filename, id_sentence, verb, id_line = iid.split(':')
            id_sentence = int(id_sentence)
            id_line = int(id_line)
            filename = basefilename % {'corpus': corpus, 'filename': filename}

            if last_filename != filename:
                if file is not None:
                    file.close()
                file = open(filename, 'r')
                # Remember which file is open so that consecutive instances
                # from the same corpus file reuse the handle instead of
                # closing and reopening it every time.
                last_filename = filename
            else:
                # Same file as the previous instance: rewind and rescan,
                # since the wanted sentence may lie before the current
                # position.
                file.seek(0)

            for line in file:
                if not line.startswith('META'):
                    continue
                # The sentence number is taken from the second
                # whitespace-separated field of the META line, after its ':'.
                sentence = int(line.strip().split()[1].split(':')[1])
                if sentence != id_sentence:
                    continue
                tokens = []
                # Keep consuming the same file iterator: the lines after the
                # META header are the sentence's tokens, up to a blank line.
                for idx, token in enumerate(file, start=1):
                    if token.strip() == '':
                        break
                    # The word form is the second whitespace-separated column.
                    token = token.strip().split()[1]
                    tokens.append('_%s_' % token if idx == id_line else token)
                print('%s\t%s' % (iid, ' '.join(tokens)), file=fout)
                break
finally:
    # Close the last corpus file even when the loop raised, and tolerate the
    # case where no instance was processed (file is still None).
    if file is not None:
        file.close()
In [ ]: