In [ ]:
%matplotlib inline
import io
import os
import unicodedata
from os import path

import matplotlib
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Apply the ggplot style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')
# Default figure size for this interactive session.
matplotlib.rcParams['figure.figsize'] = (16, 10)
In [ ]:
# Result directories and lemma lists for the two corpora.
#
# Both lemma files are opened with io.open(..., encoding="utf-8") so the text
# is decoded on read. This behaves the same on Python 2 and Python 3, unlike
# calling .decode() on the value returned by a text-mode read, and it makes
# both corpora yield unicode strings.
sensem_results = "../resources/results/results_semisupervised_sensem_7k/"
with io.open("../resources/sensem/lemmas", "r", encoding="utf-8") as f:
    # Whitespace-separated lemmas, normalised to NFC so composed and
    # decomposed accented characters compare equal.
    sensem_lemmas = unicodedata.normalize("NFC", f.read()).strip().split()

semeval_results = "../resources/results/results_semisupervised_semeval_7k/"
with io.open("../resources/semeval/lexelts/lemmas", "r", encoding="utf-8") as f:
    # One lemma per line; line positions are kept as-is because lemmas are
    # looked up by integer index.
    semeval_lemmas = [lemma.strip() for lemma in f.readlines()]

# Per-corpus lookup tables keyed by corpus name.
lemmas = {
    "sensem": sensem_lemmas,
    "semeval": semeval_lemmas,
}
results = {
    "sensem": sensem_results,
    "semeval": semeval_results,
}

# Expected number of result rows (a "before" and an "after" entry per
# experiment, across both corpora).
results_count = 1770
In [ ]:
# Build the results table from the on-disk layout
#   <corpus results dir>/<lemma index>/<experiment>/test_accuracy
# Each experiment contributes two rows: "<corpus>_before" with the first
# accuracy in the file and "<corpus>_after" with the second.
#
# Rows are accumulated in a list and the DataFrame is created once at the end.
# The table therefore always has exactly as many rows as were found on disk:
# no preallocation from a hard-coded count, no uninitialised filler rows, and
# no chained-indexing assignment (df[col][i] = value), which pandas does not
# guarantee to write through to the frame.
rows = []
for corpus in ["sensem", "semeval"]:
    corpus_lemmas = lemmas[corpus]
    corpus_results = results[corpus]
    for lemma_idx in os.listdir(corpus_results):
        # Skip entries ending in .yaml; every other entry name is parsed as
        # an integer position into the corpus lemma list.
        if lemma_idx.endswith(".yaml"):
            continue
        lemma = corpus_lemmas[int(lemma_idx)]
        lemma_dir = path.join(corpus_results, lemma_idx)
        for experiment in os.listdir(lemma_dir):
            # Parenthesised single-argument print is valid on Python 2 and 3.
            print(u"Getting info from experiment {} of lemma {} of corpus {}".format(
                experiment, lemma, corpus
            ))
            experiment_dir = path.join(lemma_dir, experiment)
            with open(path.join(experiment_dir, "test_accuracy"), "r") as f:
                accuracies = [float(x) for x in f.readlines()]
            # Before
            rows.append(("{}_before".format(corpus), experiment, accuracies[0]))
            # After
            rows.append(("{}_after".format(corpus), experiment, accuracies[1]))

semisupervised_data = pd.DataFrame(
    rows, columns=["corpus", "experiment", "accuracy"]
)
# Keep the accuracy column as float32, as in the preallocated version.
semisupervised_data["accuracy"] = semisupervised_data["accuracy"].astype(np.float32)
In [ ]:
# Write the collected accuracies to CSV: fixed column order, UTF-8 text,
# two-decimal floats, and no index column.
semisupervised_data.to_csv(
    path_or_buf="data/semisupervised_results.csv",
    index=False,
    encoding='utf-8',
    float_format="%.2f",
    columns=["corpus", "experiment", "accuracy"],
)
In [ ]: