# In[ ]:
import numpy as np
import os
import pandas as pd
import sh
from collections import Counter
from lxml import etree
from tqdm import tqdm_notebook
# In[ ]:
# Convert the Google word-sense-disambiguation corpora (the MASC and SemCor
# XML releases) into a single column-format file, `google_wsd.col`.
#
# Output format per sentence: one META line for every sense-annotated verb
# (corpus, global sentence id, document, domain, lemma, token index, token,
# sense), followed by the sentence itself with one token per line as
# "index<TAB>token<TAB>lemma<TAB>-", then a blank line.
base_directory = '../resources/word_sense_disambigation_corpora/'
sentences = 1  # global 1-based sentence counter across all documents


def _emit_sentence(fout, sentence, verb_senses, corpus, docname, domain, sentence_id):
    """Write one sentence block (META lines, then token lines) to `fout`.

    Returns the next sentence id. A no-op (id unchanged) when `sentence` is
    empty, which guards against a break at the very start of a document.
    """
    if not sentence:
        return sentence_id
    for vidx, token, lemma, sense in verb_senses:
        meta_string = 'META:%s\tsentence:%05d\t' % (corpus, sentence_id)
        meta_string += 'doc:%s\t' % docname
        meta_string += 'domain:%s\t' % domain
        meta_string += 'main_lemma:%s\t' % lemma
        meta_string += 'main_lemma_index:%d\t' % vidx
        meta_string += 'main_token:%s\t' % token
        meta_string += 'sense:%s' % sense
        print(meta_string, file=fout)
    print('\n'.join(sentence), file=fout, end='\n\n')
    return sentence_id + 1


with open('google_wsd.col', 'w') as fout:
    for file in tqdm_notebook(sh.find('%smasc' % base_directory, '%ssemcor' % base_directory,
                                      '-type', 'f', '-name', '*.xml')):
        path = file.strip()
        root = etree.parse(path).getroot()
        # Per-document constants, hoisted out of the word loop (they were
        # previously recomputed for every single token).
        docname = path.replace(base_directory, '')
        corpus = 'masc' if docname.startswith('masc') else 'semcor'
        # For MASC the domain is the subdirectory after the 'masc/' prefix;
        # SemCor has no per-document domain.
        domain = 'semcor' if corpus == 'semcor' else os.path.dirname(docname)[5:]
        sentence = []
        verb_senses = []
        for word in root.findall('word'):
            if word.attrib['text'].strip() == '':
                continue
            # `break_level` describes the boundary BEFORE this word, so a
            # sentence/paragraph break means: flush what we accumulated.
            if word.attrib['break_level'] in ('PARAGRAPH_BREAK', 'SENTENCE_BREAK'):
                sentences = _emit_sentence(fout, sentence, verb_senses,
                                           corpus, docname, domain, sentences)
                sentence = []
                verb_senses = []
            sidx = len(sentence) + 1  # 1-based token index within the sentence
            if word.attrib.get('pos') == 'VERB':
                # NOTE(review): assumes every VERB carries 'lemma' and 'sense'
                # attributes, as in the original — confirm against the corpus.
                verb_senses.append((sidx, word.attrib['text'],
                                    word.attrib['lemma'], word.attrib['sense']))
                sentence.append('%d\t%s\t%s\t-' % (sidx, word.attrib['text'],
                                                   word.attrib['lemma']))
            else:
                sentence.append('%d\t%s\t-\t-' % (sidx, word.attrib['text']))
        # Bug fix: flush the trailing sentence of each document. No break
        # follows the last word, so this material was previously dropped.
        sentences = _emit_sentence(fout, sentence, verb_senses,
                                   corpus, docname, domain, sentences)