In [0]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import os
import numpy as np
from collections import defaultdict
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
In [0]:
# Mount the user's Google Drive inside the Colab VM at /content/drive/ so the
# project files can be read and written as ordinary paths.
from google.colab import drive
drive.mount('/content/drive/')
In [0]:
# Root folder of the thesis project on the mounted Drive.
# NOTE(review): the path is relative, so it only resolves when the working
# directory is the parent of the mount point (presumably /content) - confirm.
# The value already ends in "/"; the f-strings that extend it prepend another
# "/", giving "//" in the resulting paths (POSIX treats it as one separator).
FOLDER_thesis = "./drive/My Drive/00Tesis/"
In [0]:
# Make the project's helper modules importable: the helper folder is put first
# on sys.path, so `functions` resolves to the module stored there.
import sys
sys.path.insert(0,f"{FOLDER_thesis}/helper/")
# Loader used by the experiment loop as get_dataset_from_json(file, folder=...);
# it is unpacked there into a feature collection and a label collection
# (presumably a list of feature dicts and a list of label strings - confirm).
from functions import get_dataset_from_json
In [0]:
# Folder that holds the feature-set files read by the experiment loop.
FOLDER = f"{FOLDER_thesis}/auxfiles/json/"
In [0]:
# Names (not full paths) of every entry in FOLDER. os.listdir returns them in
# arbitrary order, so nothing downstream should rely on this ordering.
file_list = os.listdir(FOLDER)
In [0]:
# Keep only the entries whose name begins with "features"; everything else in
# the folder is ignored by the experiment.
features_files = [name for name in file_list if name.startswith("features")]
In [0]:
# indexes_Ibsen = [" ".join(file[:-5].split("_")[1:]) for file in features_files if file.split("_")[0].endswith("Ibsen")]
# indexes_Quixote = [" ".join(file[:-5].split("_")[1:]) for file in features_files if file.split("_")[0].endswith("Quixote")]
In [0]:
# Experiment: for each corpus ("Quixote", "Ibsen") evaluate four classifiers
# with 10-fold cross-validation on every matching feature-set file and keep the
# mean cross-validation score per classifier.
#
# Result: results_all_corpora[corpus] is a DataFrame with one row per
# feature-set file and one column per classifier (see `cols`). The row label is
# the file name minus its last five characters and minus its first
# "_"-separated token. A row of -1.0 marks a feature set whose evaluation
# raised MemoryError.
results_all_corpora = defaultdict(pd.DataFrame)
cols = ["SVC", "Naïve Bayes", "Random Forest", "Logistic Regression"]
log_path = f"{FOLDER_thesis}/auxfiles/logs/experiment_colab.log"
# The fixed seed makes every split() call produce the same shuffled folds, so
# one splitter can be shared by all files and classifiers.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for translator in ["Quixote", "Ibsen"]:
    indexes = []
    results = []
    # A file belongs to this corpus when its first "_"-separated token ends
    # with the corpus name.
    for file in [file for file in features_files if file.split("_")[0].endswith(translator)]:
        # The log is opened and closed per file, so the entry is written out
        # before the expensive evaluation of that file starts.
        with open(log_path, "a") as f:
            print(file, file=f)

        X_dict, y_str = get_dataset_from_json(file, folder=FOLDER)
        v = DictVectorizer(sparse=True)
        encoder = LabelEncoder()
        X = v.fit_transform(X_dict)
        y = encoder.fit_transform(y_str)

        try:
            # with_mean=False keeps the scaler usable on the sparse matrix.
            svm_model = Pipeline([
                ("scaler", StandardScaler(with_mean=False)),
                ("scv", LinearSVC(random_state=42)),
            ])
            cv_svm = cross_val_score(svm_model, X, y, cv=kf, n_jobs=-1)

            # Naive Bayes is the only model fed a dense copy of X.
            nb_model = GaussianNB()
            cv_nb = cross_val_score(nb_model, X.toarray(), y, cv=kf)

            # Seeded like the other estimators so reruns reproduce the table.
            rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
            cv_rf = cross_val_score(rf_model, X, y, cv=kf, n_jobs=-1)

            log_model = LogisticRegression()
            cv_log = cross_val_score(log_model, X, y, cv=kf, n_jobs=-1)
        except MemoryError:
            # Mark the whole feature set as failed. All four scores must be
            # reset here: leaving cv_nb untouched would either raise NameError
            # (first file) or silently reuse the previous file's score.
            cv_svm = cv_nb = cv_rf = cv_log = -1 * np.ones((1, 4))

        result_per_featureset = [cv_svm.mean(), cv_nb.mean(), cv_rf.mean(), cv_log.mean()]
        results.append(result_per_featureset)
        indexes.append(" ".join(file[:-5].split("_")[1:]))

    # reshape keeps the array two-dimensional even when no file matched, so an
    # empty corpus yields an empty table instead of a shape error.
    results_all_corpora[translator] = pd.DataFrame(
        np.array(results, dtype=float).reshape(-1, len(cols)),
        index=indexes,
        columns=cols,
    )
In [0]:
In [0]:
# Show the per-corpus result tables. The expression is built directly from
# `results_all_corpora` because `final` is only assigned in a later cell, so
# referencing it here fails on a fresh top-to-bottom run.
dict(results_all_corpora)
Out[0]:
In [0]:
# Plain-dict snapshot of the results. Unlike the defaultdict it is copied from,
# looking up a missing corpus raises KeyError instead of silently creating an
# empty DataFrame.
final = dict(results_all_corpora)
In [0]:
# Output folder for the exported result tables (CSV, LaTeX and HTML).
RESULTS_FOLDER = f"{FOLDER_thesis}/results/"
In [0]:
# Export each corpus' results table, sorted by feature-set label, to
# RESULTS_FOLDER as CSV, LaTeX and HTML with four decimal places.

# None of the writers below creates missing folders.
os.makedirs(RESULTS_FOLDER, exist_ok=True)

for translator in ["Quixote", "Ibsen"]:
    df = final[translator].sort_index()

    df.to_csv(f"{RESULTS_FOLDER}{translator}_scaled.csv", float_format='%.4f')

    # The tables contain non-ASCII text (the "Naïve Bayes" column), so the
    # encoding is fixed to UTF-8 - the same encoding to_csv uses by default -
    # instead of depending on the platform's locale.
    latex = df.to_latex(float_format=lambda x: '%.4f' % x)
    with open(f"{RESULTS_FOLDER}{translator}_scaled.tex", "w", encoding="utf-8") as f:
        f.write(latex)

    html = df.to_html(float_format='%.4f')
    with open(f"{RESULTS_FOLDER}{translator}_scaled.html", "w", encoding="utf-8") as f:
        f.write(html)