In [ ]:
%matplotlib inline

import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import unicodedata
from os import path

# Plot appearance for this interactive session: ggplot style sheet and a
# default figure size of 16 x 10.
plt.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = (16, 10)

In [ ]:
# Per-corpus configuration: the directory holding each corpus' results and
# the list of lemmas read from that corpus' lemma file.

# SenSem: the whole file is decoded as UTF-8, normalised to NFC and split on
# whitespace.
sensem_results = "../resources/results/results_semisupervised_sensem_7k/"
with open("../resources/sensem/lemmas", "r") as f:
    sensem_raw = f.read()
sensem_text = unicodedata.normalize("NFC", sensem_raw.decode("utf-8"))
sensem_lemmas = sensem_text.strip().split()

# SemEval: one lemma per line with surrounding whitespace removed; no
# decoding or normalisation is applied here.
semeval_results = "../resources/results/results_semisupervised_semeval_7k/"
with open("../resources/semeval/lexelts/lemmas", "r") as f:
    semeval_lemmas = [line.strip() for line in f]

# Lookup tables keyed by corpus name.
lemmas = dict(sensem=sensem_lemmas, semeval=semeval_lemmas)
results = dict(sensem=sensem_results, semeval=semeval_results)

# Hard-coded total number of result rows expected across both corpora.
results_count = 1770

In [ ]:
# Build a long-format table of test accuracies: two rows per experiment
# directory, labelled "<corpus>_before" and "<corpus>_after".
#
# Rows are accumulated in a list and the frame is created once at the end.
# This way the table holds exactly the rows that were read (no uninitialised
# filler rows when fewer results exist than a pre-computed count, and no
# IndexError when more exist), and it avoids chained-indexing assignment
# (frame['col'][i] = value), which pandas does not guarantee to write through.
records = []

for corpus in ["sensem", "semeval"]:
    corpus_lemmas = lemmas[corpus]
    corpus_results = results[corpus]

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the exported file) stable between runs.
    for lemma_idx in sorted(os.listdir(corpus_results)):
        if lemma_idx.endswith(".yaml"):
            continue

        # Entry names are integer positions into the corpus' lemma list.
        lemma = corpus_lemmas[int(lemma_idx)]
        lemma_dir = path.join(corpus_results, lemma_idx)

        for experiment in sorted(os.listdir(lemma_dir)):
            print(u"Getting info from experiment {} of lemma {} of corpus {}".format(
                experiment, lemma, corpus
            ))

            experiment_dir = path.join(lemma_dir, experiment)
            accuracy_path = path.join(experiment_dir, "test_accuracy")

            # One accuracy value per line; blank lines are ignored.
            with open(accuracy_path, "r") as f:
                accuracies = [float(line) for line in f if line.strip()]

            if len(accuracies) < 2:
                raise ValueError(
                    "Expected at least two accuracy values in {}, found {}".format(
                        accuracy_path, len(accuracies)
                    )
                )

            # Before
            records.append(("{}_before".format(corpus), experiment, accuracies[0]))

            # After
            # NOTE(review): this takes the second line of the file as the
            # "after" value -- confirm that the last line is not what is wanted.
            records.append(("{}_after".format(corpus), experiment, accuracies[1]))

semisupervised_data = pd.DataFrame(
    records, columns=["corpus", "experiment", "accuracy"]
)
# Keep single precision, as before, so formatted output is unchanged.
semisupervised_data["accuracy"] = semisupervised_data["accuracy"].astype(np.float32)

# Number of rows written (kept for any later cell that reads it).
idx = len(records)

In [ ]:
# Export the table as CSV: explicit column order, no index column, floats
# written with two decimals.
semisupervised_data.to_csv(
    "data/semisupervised_results.csv",
    columns=["corpus", "experiment", "accuracy"],
    index=False,
    float_format="%.2f",
    encoding='utf-8',
)

In [ ]: