In [0]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import os
import numpy as np
from collections import defaultdict
import pandas as pd
from pathlib import Path
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the estimators
# used further down (e.g. about convergence) are hidden as well — confirm that is intended.
warnings.filterwarnings("ignore")
In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
# Presumably every path used later in the notebook lives below this mount point.
from google.colab import drive
drive.mount('/content/drive/')
In [0]:
# Root folder of the thesis material on the mounted Drive.
# Anchored to the absolute mount point used by drive.mount('/content/drive/') so the
# path keeps resolving even if the working directory is no longer /content.
FOLDER_thesis = Path("/content/drive/My Drive/00Tesis/")
In [0]:
import sys

# Make the modules stored in <FOLDER_thesis>/helper/ importable (presumably this is
# where analysis.py lives). The membership test keeps the cell idempotent: re-running
# it no longer prepends a duplicate entry to sys.path each time.
HELPER_DIR = f"{FOLDER_thesis}/helper/"
if HELPER_DIR not in sys.path:
    sys.path.insert(0, HELPER_DIR)
from analysis import get_dataset_from_json
In [0]:
# Input directory (FOLDER) and log directory (LOG_PATH), both below <FOLDER_thesis>/auxfiles.
FOLDER, LOG_PATH = (FOLDER_thesis / "auxfiles" / leaf for leaf in ("json", "logs"))
In [0]:
# Every entry of FOLDER whose file name starts with "features".
# Defined *before* it is previewed so the notebook survives Restart & Run All, and
# sorted because Path.iterdir() yields entries in arbitrary order.
features_files = sorted(file for file in FOLDER.iterdir() if file.name.startswith("features"))
# Preview the first few entries as the cell output.
features_files[:4]
In [0]:
# Cross-validate four classifiers on every feature-set file of each corpus.
# Outcome: results_all_corpora[translator] is a DataFrame with one row per feature set
# (index = the file stem without its first "_"-separated token) and the columns in `cols`.

FAILED_SCORE = -1.0  # sentinel stored when a model could not be evaluated (out of memory)


def _mean_cv_score(model, X, y, cv, n_jobs=None, dense=False):
    """Return the mean cross-validation score of ``model``.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    X : feature matrix produced by the vectorizer (sparse).
    y : encoded labels.
    cv : cross-validation splitter.
    n_jobs : forwarded to ``cross_val_score``.
    dense : if True, ``X.toarray()`` is used instead of ``X``.

    Returns
    -------
    float
        Mean of the per-fold scores, or ``FAILED_SCORE`` if a ``MemoryError``
        is raised while densifying or evaluating.
    """
    try:
        # Densify inside the try block: the conversion itself may run out of memory.
        features = X.toarray() if dense else X
        return float(cross_val_score(model, features, y, cv=cv, n_jobs=n_jobs).mean())
    except MemoryError:
        return FAILED_SCORE


cols = ["Dimension", "SVC", "Naïve Bayes", "Decision Tree", "Logistic Regression"]
# One splitter with a fixed seed, shared by all models and feature sets.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

results_all_corpora = defaultdict(pd.DataFrame)
for translator in ["Quixote", "Ibsen"]:
    indexes = []
    results = []
    # Keep the files whose first "_"-separated name token ends with the corpus name.
    corpus_files = [file for file in features_files
                    if file.name.split("_")[0].endswith(translator)]
    for file in corpus_files:
        # Append the file being processed to the log so progress survives a crash.
        with open(LOG_PATH/"experiment_colab.log", "a") as f:
            print(file, file=f)

        X_dict, y_str = get_dataset_from_json(file)
        v = DictVectorizer(sparse=True)
        encoder = LabelEncoder()
        X = v.fit_transform(X_dict)
        y = encoder.fit_transform(y_str)
        dimension = X.shape[1]

        svm_model = Pipeline([("scaler", StandardScaler(with_mean=False)),
                              ("scv", LinearSVC(random_state=42)),
                              ])
        nb_model = GaussianNB()
        dt_model = DecisionTreeClassifier(random_state=24)
        log_model = LogisticRegression()

        # Each model is evaluated on its own, so a MemoryError in one of them yields
        # FAILED_SCORE for that column only. Previously the fallback left the
        # Naïve Bayes / Decision Tree scores undefined (or stale from the previous file)
        # and overwrote the scores of models that had already succeeded.
        cv_svm = _mean_cv_score(svm_model, X, y, kf, n_jobs=-1)
        cv_nb = _mean_cv_score(nb_model, X, y, kf, dense=True)
        cv_dt = _mean_cv_score(dt_model, X, y, kf, n_jobs=-1)
        cv_log = _mean_cv_score(log_model, X, y, kf, n_jobs=-1)

        results.append([dimension, cv_svm, cv_nb, cv_dt, cv_log])
        indexes.append(" ".join(file.stem.split("_")[1:]))

    # Built from the row list directly (float dtype, as before) so the frame is also
    # valid when no file matched this corpus.
    results_all_corpora[translator] = pd.DataFrame(results, index=indexes, columns=cols, dtype=float)
In [0]:
# Plain-dict snapshot of the per-corpus result tables (no defaultdict auto-creation on lookup).
final = {corpus: table for corpus, table in results_all_corpora.items()}
In [0]:
# Directory that receives the exported result tables.
RESULTS_FOLDER = FOLDER_thesis.joinpath("results")
In [0]:
# Export the result table of each corpus as CSV, LaTeX and HTML into RESULTS_FOLDER.

def _four_decimals(value):
    """Render a float with four decimal places."""
    return '%.4f' % value


# Create the target directory if it is missing instead of failing on the first write.
RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)

for translator in ["Quixote", "Ibsen"]:
    df = final[translator].sort_index()
    stem = translator + "_scaled_20200316_"

    df.to_csv(RESULTS_FOLDER / (stem + ".csv"), float_format='%.4f')

    # to_latex and to_html get the same callable formatter; to_html documents
    # float_format as a one-parameter function rather than a format string.
    latex = df.to_latex(float_format=_four_decimals)
    html = df.to_html(float_format=_four_decimals)

    # Explicit UTF-8: the column header "Naïve Bayes" is not ASCII, so the files must
    # not depend on the locale's default encoding.
    (RESULTS_FOLDER / (stem + ".tex")).write_text(latex, encoding="utf-8")
    (RESULTS_FOLDER / (stem + ".html")).write_text(html, encoding="utf-8")