In [0]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import os
import numpy as np
from collections import defaultdict
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Mount Google Drive so the thesis data and results persist across Colab
# sessions (interactive: prompts for an OAuth authorization code).
from google.colab import drive
drive.mount('/content/drive/')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/

In [0]:
# Base folder of the thesis project on the mounted Google Drive.
# NOTE(review): Colab-specific relative path — assumes drive.mount() ran above.
FOLDER_thesis = "./drive/My Drive/00Tesis/"

In [0]:
# Make the project's helper module importable, then load the JSON dataset loader.
import sys
sys.path.insert(0,f"{FOLDER_thesis}/helper/")
from functions import get_dataset_from_json

In [0]:
# Folder holding the per-corpus feature files (JSON).
FOLDER = f"{FOLDER_thesis}/auxfiles/json/"

In [0]:
# All filenames in the JSON folder (feature files plus anything else present).
file_list = os.listdir(FOLDER)

In [0]:
# Keep only the feature files, i.e. names like "features<Corpus>_<set>.json".
features_files = [filename for filename in file_list if filename.startswith("features")]

In [0]:
# indexes_Ibsen = [" ".join(file[:-5].split("_")[1:]) for file in features_files if file.split("_")[0].endswith("Ibsen")]
# indexes_Quixote = [" ".join(file[:-5].split("_")[1:]) for file in features_files if file.split("_")[0].endswith("Quixote")]

In [0]:
results_all_corpora = defaultdict(pd.DataFrame)

# For each translator corpus: load every feature set, run 10-fold CV with four
# classifiers, and collect the mean accuracies into one DataFrame per corpus.
for translator in ["Quixote","Ibsen"]:
    indexes = []
    cols = ["Dimension", "SVC", "Naïve Bayes", "Decision Tree", "Logistic Regression"]
    results = []
    for file in [file for file in features_files if file.split("_")[0].endswith(translator)]:
        # Log progress to Drive so it survives Colab disconnects.
        with open(f"{FOLDER_thesis}/auxfiles/logs/experiment_colab.log", "a") as f:
            print(file, file=f)
        X_dict, y_str = get_dataset_from_json(file, folder=FOLDER)
        v = DictVectorizer(sparse=True)
        encoder = LabelEncoder()

        X = v.fit_transform(X_dict)
        y = encoder.fit_transform(y_str)

        # Number of features after vectorization, reported per feature set.
        dimension = X.shape[1]

        kf = KFold(n_splits=10, shuffle=True, random_state=42)

        try:
            # LinearSVC benefits from scaling; with_mean=False keeps X sparse.
            svm_model = Pipeline([("scaler", StandardScaler(with_mean=False)),
                                  ("scv", LinearSVC(random_state=42)),
                                 ])
            cv_svm = cross_val_score(svm_model, X, y, cv=kf, n_jobs=-1)

            # GaussianNB requires a dense matrix, hence toarray(); run it
            # single-process since the dense copy is the memory hot spot.
            nb_model = GaussianNB()
            cv_nb = cross_val_score(nb_model, X.toarray(), y, cv=kf)

            dt_model = DecisionTreeClassifier(random_state=24)
            cv_dt = cross_val_score(dt_model, X, y, cv=kf, n_jobs=-1)

            log_model = LogisticRegression()
            cv_log = cross_val_score(log_model, X, y, cv=kf, n_jobs=-1)

        except MemoryError:
            # BUG FIX: the original branch set cv_svm, an unused cv_rf, and
            # cv_log, but left cv_nb and cv_dt untouched — a NameError on a
            # first-iteration MemoryError, and silently stale scores from the
            # previous feature set otherwise. Set ALL four to the sentinel so
            # every classifier column reads -1.0 (.mean() of -1*ones is -1.0)
            # when this feature set could not be scored.
            cv_svm = -1*np.ones((1,4))
            cv_nb = cv_svm
            cv_dt = cv_svm
            cv_log = cv_svm

        result_per_featureset = [dimension, cv_svm.mean(), cv_nb.mean(), cv_dt.mean(), cv_log.mean()]

        results.append(result_per_featureset)
        # Row label: feature-set name without the "features<Corpus>_" prefix
        # and without the ".json" extension.
        indexes.append(" ".join(file[:-5].split("_")[1:]))

    results_all_corpora[translator] = pd.DataFrame(np.array(results), index=indexes, columns = cols)

In [0]:
# Plain dict copy so a missing translator key raises KeyError downstream
# instead of silently creating an empty DataFrame (defaultdict behavior).
final = dict(results_all_corpora)

Save results to CSV, $\LaTeX$, and HTML


In [0]:
# Output folder for the CSV/LaTeX/HTML result tables.
RESULTS_FOLDER = f"{FOLDER_thesis}/results/"

In [0]:
# Export each corpus' results table (rows sorted by feature-set name) as
# CSV, LaTeX and HTML, all with 4-decimal floats.
for translator in ["Quixote", "Ibsen"]:
    df = final[translator].sort_index()
    # Shared filename stem; the date suffix tags this experiment run.
    base_path = f"{RESULTS_FOLDER}{translator}_scaled_20200219_"

    df.to_csv(f"{base_path}.csv", float_format='%.4f')

    with open(f"{base_path}.tex", "w") as f:
        f.write(df.to_latex(float_format=lambda x: '%.4f' % x))

    with open(f"{base_path}.html", "w") as f:
        f.write(df.to_html(float_format='%.4f'))