In [ ]:
import sys
from tqdm import tqdm_notebook
In [ ]:
sys.path.append('../')
from thesis.parsers import ColumnCorpusParser
In [ ]:
# Build the corpus parser over the re-parsed SenSem file. The strings after
# the path are presumably the column names, in file order -- confirm against
# ColumnCorpusParser.
parser = ColumnCorpusParser(
    '../../resources/sensem.newparse.conll',
    'idx',
    'token',
    'lemma',
    'tag',
    'short_tag',
    'morpho_info',
    'ner',
    'dep_head',
    'dep_rel',
)
In [ ]:
# Re-emit the whole corpus, filling in the main lemma index of a sentence
# whenever exactly one of its words carries the sentence's main lemma.
# Sentences with zero or several candidates are reported to the log file
# and written out unchanged.
with open('../../resources/sensem.rediscover.2.conll', 'w') as corpus_out, \
        open('../../resources/notfound.2.log', 'w') as log_out:
    for sentence in tqdm_notebook(parser.sentences, total=24153):
        # '-' is presumably the placeholder for an unset index; only those
        # sentences are searched.
        if sentence.main_lemma_index == '-':
            # Positions of every word whose lemma equals the main lemma.
            matches = [word.idx for word in sentence
                       if word.lemma == sentence.main_lemma]
            match_count = len(matches)

            if match_count == 1:
                # Unambiguous: store the index (as a string) on the sentence.
                sentence['main_lemma_index'] = str(matches[0])
            elif match_count == 0:
                print('Lemma not found for sentence %05d' % sentence.sentence_index,
                      file=log_out)
            else:
                print('Multiple lemmas found for sentence %05d: %s' %
                      (sentence.sentence_index, ' '.join(map(str, matches))),
                      file=log_out)

        # Every sentence is written: metadata line first, then the sentence
        # body followed by a blank separator line.
        print(sentence.metadata_string, file=corpus_out)
        print(sentence, file=corpus_out, end='\n\n')
In [ ]: