In [0]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import os
import numpy as np
from collections import defaultdict
import pandas as pd
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

In [0]:
# Mount Google Drive so the thesis folder is reachable from this Colab runtime.
# Side effect only: prompts for an OAuth code on first run; no return value used.
from google.colab import drive
drive.mount('/content/drive/')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/

In [0]:
# Root of the thesis material on the mounted Drive.
# Path() normalizes away "./" and the trailing slash, so the raw-string
# form used previously resolves to the exact same path object.
FOLDER_thesis = Path("drive/My Drive/00Tesis")

In [0]:
# Make the thesis helper package on the mounted Drive importable.
import sys
sys.path.insert(0,f"{FOLDER_thesis}/helper/")
# Project helper; judging from its use below it returns a (list-of-feature-dicts,
# list-of-labels) pair for one JSON file.
# NOTE(review): behavior inferred from the call site — confirm against analysis.py.
from analysis import get_dataset_from_json

In [0]:
# Feature JSON files and experiment logs both live under auxfiles/.
# Joining segment by segment yields the same Path as the single-string form.
FOLDER = FOLDER_thesis / "auxfiles" / "json"
LOG_PATH = FOLDER_thesis / "auxfiles" / "logs"

In [0]:
# NOTE(review): out-of-order execution — `features_files` is defined in the
# cell BELOW this one; on a fresh "Restart & Run All" this cell raises
# NameError. Move the definition cell above this preview cell.
features_files[:4]


Out[0]:
[PosixPath('drive/My Drive/00Tesis/auxfiles/json/featuresQuixote_unigrams.json'),
 PosixPath('drive/My Drive/00Tesis/auxfiles/json/featuresQuixote_bigrams.json'),
 PosixPath('drive/My Drive/00Tesis/auxfiles/json/featuresQuixote_trigrams.json'),
 PosixPath('drive/My Drive/00Tesis/auxfiles/json/featuresQuixote_unigrams_punct.json')]

In [0]:
# Collect every feature file (names like "featuresQuixote_unigrams.json").
features_files = [path for path in FOLDER.iterdir() if path.name.startswith("features")]

In [0]:
# Run 10-fold cross-validation for four classifiers over every feature set
# of each translator corpus, collecting one results DataFrame per corpus.
results_all_corpora = defaultdict(pd.DataFrame)

for translator in ["Quixote", "Ibsen"]:
    indexes = []
    cols = ["Dimension", "SVC", "Naïve Bayes", "Decision Tree", "Logistic Regression"]
    results = []
    # Keep only this translator's files, e.g. "featuresQuixote_unigrams.json"
    # has stem prefix "featuresQuixote".
    translator_files = [f for f in features_files
                        if f.name.split("_")[0].endswith(translator)]
    for file in translator_files:
        # Append the file being processed to the experiment log.
        with open(LOG_PATH / "experiment_colab.log", "a") as log_file:
            print(file, file=log_file)

        X_dict, y_str = get_dataset_from_json(file)
        vectorizer = DictVectorizer(sparse=True)
        encoder = LabelEncoder()

        X = vectorizer.fit_transform(X_dict)
        y = encoder.fit_transform(y_str)

        # Number of distinct features for this representation.
        dimension = X.shape[1]

        kf = KFold(n_splits=10, shuffle=True, random_state=42)

        try:
            # LinearSVC needs scaling; with_mean=False keeps the matrix sparse.
            svm_model = Pipeline([
                ("scaler", StandardScaler(with_mean=False)),
                ("scv", LinearSVC(random_state=42)),
            ])
            cv_svm = cross_val_score(svm_model, X, y, cv=kf, n_jobs=-1)

            # GaussianNB requires a dense array — this is the memory-hungry step.
            nb_model = GaussianNB()
            cv_nb = cross_val_score(nb_model, X.toarray(), y, cv=kf)

            dt_model = DecisionTreeClassifier(random_state=24)
            cv_dt = cross_val_score(dt_model, X, y, cv=kf, n_jobs=-1)

            log_model = LogisticRegression()
            cv_log = cross_val_score(log_model, X, y, cv=kf, n_jobs=-1)

        except MemoryError:
            # Sentinel scores (-1) mark a failed feature set without aborting
            # the run. BUG FIX: the original assigned cv_rf (never read) and
            # left cv_nb and cv_dt undefined, so the row-building line below
            # raised NameError whenever a MemoryError occurred.
            cv_svm = -1 * np.ones((1, 4))
            cv_nb = cv_svm
            cv_dt = cv_svm
            cv_log = cv_svm

        result_per_featureset = [dimension, cv_svm.mean(), cv_nb.mean(),
                                 cv_dt.mean(), cv_log.mean()]
        results.append(result_per_featureset)
        # Row label: the feature-set description, e.g. "unigrams punct".
        indexes.append(" ".join(file.stem.split("_")[1:]))

    results_all_corpora[translator] = pd.DataFrame(np.array(results),
                                                   index=indexes, columns=cols)

In [0]:
# Freeze the defaultdict into a plain dict for downstream lookups.
final = {corpus: table for corpus, table in results_all_corpora.items()}

Save results to CSV, $\LaTeX$, and HTML


In [0]:
# Output directory for the CSV / LaTeX / HTML result tables
# (trailing slash is redundant — Path normalizes it away).
RESULTS_FOLDER = FOLDER_thesis / "results"

In [0]:
# Persist each translator's (index-sorted) results table in three formats.
# The shared filename suffix was previously repeated as a magic string three
# times; hoisting it prevents the copies from drifting apart.
STAMP = "_scaled_20200316_"

for translator in ["Quixote", "Ibsen"]:
    df = final[translator].sort_index()

    # pandas accepts Path objects directly — no f-string/str() wrapping needed.
    df.to_csv(RESULTS_FOLDER / f"{translator}{STAMP}.csv", float_format="%.4f")

    latex = df.to_latex(float_format=lambda x: "%.4f" % x)
    (RESULTS_FOLDER / f"{translator}{STAMP}.tex").write_text(latex)

    html = df.to_html(float_format="%.4f")
    (RESULTS_FOLDER / f"{translator}{STAMP}.html").write_text(html)