In [ ]:
import sys

from tqdm import tqdm_notebook

In [ ]:
# Add the parent directory (relative to the current working directory) to the
# module search path before importing the project-local parser.
# NOTE(review): presumably `thesis` lives one level above the notebook —
# confirm against the repository layout.
sys.path.append('../')
from thesis.parsers import ColumnCorpusParser

In [ ]:
# Labels handed to the parser, in positional order.
# NOTE(review): presumably these name the per-token columns of the input
# file — confirm against ColumnCorpusParser.
column_names = (
    'idx', 'token', 'lemma', 'tag', 'short_tag',
    'morpho_info', 'ner', 'dep_head', 'dep_rel',
)
parser = ColumnCorpusParser('../../resources/sensem.newparse.conll', *column_names)

In [ ]:
# Re-emit every sentence of the corpus. For sentences whose `main_lemma_index`
# is still the placeholder '-', look for words whose lemma equals the
# sentence's main lemma:
#   * exactly one match  -> store its index (as a string) on the sentence;
#   * no match / several -> leave the sentence as is and report it in the log.
# Every sentence is written to the output file regardless of the outcome.
with open('../../resources/sensem.rediscover.2.conll', 'w') as fout:
    with open('../../resources/notfound.2.log', 'w') as flog:
        # NOTE(review): the progress-bar total is hard-coded; presumably it is
        # the number of sentences in the input corpus — confirm.
        for sentence in tqdm_notebook(parser.sentences, total=24153):
            if sentence.main_lemma_index == '-':
                # Indexes of all words sharing the sentence's main lemma.
                candidates = [word.idx for word in sentence
                              if word.lemma == sentence.main_lemma]

                if len(candidates) == 1:
                    sentence['main_lemma_index'] = str(candidates[0])
                elif not candidates:
                    print('Lemma not found for sentence %05d' % sentence.sentence_index,
                          file=flog)
                else:
                    joined_indexes = ' '.join(map(str, candidates))
                    print('Multiple lemmas found for sentence %05d: %s' %
                          (sentence.sentence_index, joined_indexes), file=flog)

            # Metadata line first, then the sentence body followed by a blank line.
            print(sentence.metadata_string, file=fout)
            print(sentence, file=fout, end='\n\n')

In [ ]: