In [0]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import os
import numpy as np
from collections import defaultdict
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive/. The call is interactive: it prints an
# authorization URL and waits for a code (see the captured output below).
drive.mount('/content/drive/')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/

In [0]:
# Base folder on the mounted Drive; the helper code, auxiliary files and
# results are all located under it. The path is relative, so it only resolves
# when the working directory is the parent of the 'drive' mount point
# (/content) — NOTE(review): confirm the working directory.
FOLDER_thesis = "./drive/My Drive/00Tesis/"

In [0]:
import sys

# Put the project's helper directory first on the module search path so the
# local `functions` module can be imported.
sys.path.insert(0, FOLDER_thesis + "/helper/")
from functions import get_dataset_from_json

In [0]:
# Folder the feature files are listed from and loaded from (it is passed as
# `folder=` to get_dataset_from_json).
FOLDER = FOLDER_thesis + "/auxfiles/json/"

In [0]:
# Names (not full paths) of every entry found in the feature folder.
file_list = os.listdir(FOLDER)

In [0]:
# Keep only the entries whose name begins with "features".
features_files = [name for name in file_list if name.startswith("features")]

In [0]:
# indexes_Ibsen = [" ".join(file[:-5].split("_")[1:]) for file in features_files if file.split("_")[0].endswith("Ibsen")]
# indexes_Quixote = [" ".join(file[:-5].split("_")[1:]) for file in features_files if file.split("_")[0].endswith("Quixote")]

In [0]:
# Cross-validate four classifiers on every feature file of each corpus and
# store, per corpus, a DataFrame of mean cross-validation scores
# (one row per feature file, one column per classifier).
results_all_corpora = defaultdict(pd.DataFrame)

# Column labels, in the same order as the scores appended for each file.
cols = ["SVC", "Naïve Bayes", "Random Forest", "Logistic Regression"]

# Stand-in score array used when a feature file cannot be processed because
# of a MemoryError; its mean is -1.0, which marks the whole row as failed.
failed = -1 * np.ones((1, 4))

for translator in ["Quixote", "Ibsen"]:
    indexes = []
    results = []
    # A file belongs to a corpus when the first "_"-separated token of its
    # name ends with the corpus name.
    corpus_files = [file for file in features_files
                    if file.split("_")[0].endswith(translator)]
    for file in corpus_files:
        # Append the file name to the run log before working on it.
        with open(f"{FOLDER_thesis}/auxfiles/logs/experiment_colab.log", "a") as f:
            print(file, file=f)

        # presumably X_dict is a sequence of feature dicts and y_str the
        # matching string labels — verify against the helper module.
        X_dict, y_str = get_dataset_from_json(file, folder=FOLDER)
        v = DictVectorizer(sparse=True)
        encoder = LabelEncoder()

        X = v.fit_transform(X_dict)
        y = encoder.fit_transform(y_str)

        # Same seeded 10-fold split for every classifier.
        kf = KFold(n_splits=10, shuffle=True, random_state=42)

        try:
            # with_mean=False lets the scaler work on the sparse matrix
            # without densifying it.
            svm_model = Pipeline([("scaler", StandardScaler(with_mean=False)),
                                  ("scv", LinearSVC(random_state=42)),
                                 ])
            cv_svm = cross_val_score(svm_model, X, y, cv=kf, n_jobs=-1)

            # GaussianNB is given a dense copy of the matrix.
            nb_model = GaussianNB()
            cv_nb = cross_val_score(nb_model, X.toarray(), y, cv=kf)

            # Seeded like the other components so that reruns give the same
            # scores.
            rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
            cv_rf = cross_val_score(rf_model, X, y, cv=kf, n_jobs=-1)

            log_model = LogisticRegression()
            cv_log = cross_val_score(log_model, X, y, cv=kf, n_jobs=-1)

        except MemoryError:
            # Reset all four scores. Leaving any of them unassigned would
            # either raise NameError on the first file or silently reuse the
            # previous file's score.
            cv_svm = cv_nb = cv_rf = cv_log = failed

        result_per_featureset = [cv_svm.mean(), cv_nb.mean(), cv_rf.mean(), cv_log.mean()]

        results.append(result_per_featureset)
        # Row label: the file name without its last five characters
        # (presumably the ".json" extension) and without its first
        # "_"-separated token, with the remaining tokens joined by spaces.
        indexes.append(" ".join(file[:-5].split("_")[1:]))

    # Passing the list of rows directly also yields an empty, correctly
    # labelled frame when a corpus has no matching files.
    results_all_corpora[translator] = pd.DataFrame(results, index=indexes, columns=cols)

In [0]:


In [0]:
# Build the plain-dict snapshot in this cell before displaying it, so the cell
# does not depend on `final` having been created by a cell further down
# (which fails with NameError when the notebook is run top to bottom).
final = dict(results_all_corpora)
final


Out[0]:
{'Ibsen':                                  SVC  ...  Logistic Regression
 bigrams                     0.646739  ...             0.965580
 unigrams                    0.846558  ...             0.982790
 trigrams                    0.540399  ...             0.761232
 unigrams punct              0.863587  ...             0.995652
 trigrams punct              0.561594  ...             0.936413
 bigrams punct               0.838043  ...             0.991304
 trigrams pos                0.764855  ...             0.795833
 bigrams pos                 0.787319  ...             0.817210
 trigrams pos punct          0.961232  ...             0.995652
 bigrams pos punct           0.974638  ...             0.995833
 cohesive punctuation        0.838043  ...             0.910507
 cohesive                    0.694384  ...             0.740399
 syntactic n2                0.803442  ...             0.991304
 syntactic n3                0.485145  ...             0.927355
 bigrams tf                  0.715036  ...             0.544565
 bigrams tfidf               0.715036  ...             0.544565
 bigrams pos punct tf        0.974638  ...             0.544565
 bigrams pos punct tfidf     0.974638  ...             0.544565
 bigrams pos tfidf           0.804529  ...             0.544565
 bigrams pos tf              0.804529  ...             0.544565
 bigrams punct tf            0.842210  ...             0.544565
 cohesive tf                 0.677536  ...             0.544565
 cohesive punctuation tf     0.838043  ...             0.544565
 bigrams punct tfidf         0.842210  ...             0.544565
 cohesive tfidf              0.677536  ...             0.544565
 cohesive punctuation tfidf  0.838043  ...             0.544565
 syntactic n2 tf             0.709239  ...             0.539855
 syntactic n2 tfidf          0.709239  ...             0.539855
 syntactic n3 tf             0.485145  ...             0.539855
 syntactic n3 tfidf          0.485145  ...             0.539855
 trigrams tf                 0.544565  ...             0.544565
 trigrams tfidf              0.544565  ...             0.544565
 trigrams pos tfidf          0.808333  ...             0.544565
 trigrams pos tf             0.808333  ...             0.544565
 trigrams pos punct tfidf    0.969746  ...             0.544565
 trigrams pos punct tf       0.969746  ...             0.544565
 trigrams punct tf           0.561594  ...             0.544565
 trigrams punct tfidf        0.561594  ...             0.544565
 unigrams tfidf              0.867935  ...             0.544565
 unigrams tf                 0.867935  ...             0.544565
 unigrams punct tf           0.893116  ...             0.544565
 unigrams punct tfidf        0.893116  ...             0.544565
 
 [42 rows x 4 columns],
 'Quixote':                                  SVC  ...  Logistic Regression
 unigrams                    0.960384  ...             0.981508
 bigrams                     0.629587  ...             0.976316
 trigrams                    0.124253  ...             0.878307
 unigrams punct              0.960384  ...             0.986700
 bigrams punct               0.812091  ...             0.984139
 trigrams punct              0.471266  ...             0.976316
 bigrams pos                 0.828094  ...             0.846373
 trigrams pos                0.841323  ...             0.849075
 bigrams pos punct           0.986771  ...             0.992034
 trigrams pos punct          0.994737  ...             0.986842
 cohesive                    0.862233  ...             0.917923
 cohesive punctuation        0.965789  ...             0.973542
 syntactic n2                0.674680  ...             0.968350
 syntactic n3                0.216856  ...             0.910171
 bigrams tf                  0.632290  ...             0.269915
 bigrams tfidf               0.632290  ...             0.269915
 bigrams pos tf              0.820199  ...             0.269915
 bigrams pos tfidf           0.820199  ...             0.269915
 bigrams pos punct tfidf     0.986771  ...             0.269915
 bigrams pos punct tf        0.986771  ...             0.269915
 bigrams punct tf            0.812091  ...             0.269915
 cohesive punctuation tf     0.963158  ...             0.269915
 cohesive tfidf              0.870128  ...             0.269915
 bigrams punct tfidf         0.812091  ...             0.269915
 cohesive punctuation tfidf  0.963158  ...             0.269915
 cohesive tf                 0.870128  ...             0.269915
 syntactic n2 tf             0.666785  ...             0.269915
 syntactic n2 tfidf          0.666785  ...             0.269915
 syntactic n3 tf             0.211451  ...             0.269915
 syntactic n3 tfidf          0.211451  ...             0.269915
 trigrams tf                 0.126956  ...             0.269915
 trigrams tfidf              0.126956  ...             0.269915
 trigrams pos tf             0.849218  ...             0.269915
 trigrams pos tfidf          0.849218  ...             0.269915
 trigrams pos punct tf       0.994737  ...             0.269915
 trigrams pos punct tfidf    0.994737  ...             0.269915
 trigrams punct tf           0.476529  ...             0.269915
 trigrams punct tfidf        0.476529  ...             0.269915
 unigrams tf                 0.960384  ...             0.269915
 unigrams tfidf              0.960384  ...             0.269915
 unigrams punct tf           0.960384  ...             0.269915
 unigrams punct tfidf        0.960384  ...             0.269915
 
 [42 rows x 4 columns]}

In [0]:
# Plain-dict snapshot of the per-corpus result tables.
final = {corpus: table for corpus, table in results_all_corpora.items()}

Save results to CSV, $\LaTeX$, and HTML


In [0]:
# Folder the exported result tables are written to.
RESULTS_FOLDER = FOLDER_thesis + "/results/"

In [0]:
# Export each corpus' result table, sorted by index label, as CSV, LaTeX and
# HTML with floats written to four decimals.

# Create the output folder up front so the first write cannot fail because
# the folder is missing.
os.makedirs(RESULTS_FOLDER, exist_ok=True)

for translator in ["Quixote", "Ibsen"]:
    df = final[translator].sort_index()

    df.to_csv(f"{RESULTS_FOLDER}{translator}_scaled.csv", float_format='%.4f')

    # The column headers contain non-ASCII text ("Naïve Bayes"), so the text
    # files are written as UTF-8 instead of the platform-default encoding.
    latex = df.to_latex(float_format=lambda x: '%.4f' % x)
    with open(f"{RESULTS_FOLDER}{translator}_scaled.tex", "w", encoding="utf-8") as f:
        f.write(latex)

    html = df.to_html(float_format='%.4f')
    with open(f"{RESULTS_FOLDER}{translator}_scaled.html", "w", encoding="utf-8") as f:
        f.write(html)