In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime
from baselines import load_comments_and_labels, assemble_data, one_hot
from ngram import test_cross,calibration_curve_plotter,two_class_roc_plotter, two_class_precision_recall_plotter
from ngram import roc_scorer, spearman_scorer
import matplotlib.pyplot as plt
import joblib
import os
from serialization import save_pipeline, load_pipeline
import pandas as pd
In [2]:
def eval_n_save_model(task, model_name, debug=False):
    """Fit a pre-built pipeline on one task, evaluate it, and persist artifacts.

    Loads the pipeline architecture saved under ../../models/cv, fits it on the
    task's train split, writes ROC / Spearman metric tables and calibration,
    ROC, and precision-recall plots to ../../models/<task>/<model_name>/, then
    refits on all splits and saves that final model too.

    Parameters
    ----------
    task : str
        Task name used to load data and to build the results directory
        (e.g. 'attack', 'recipient', 'aggression').
    model_name : str
        Name of the saved pipeline. Must contain 'ed' (empirical-distribution
        labels) or 'oh' (one-hot labels) so the label type can be inferred.
    debug : bool, optional
        If True, truncate train/test/all data to the first 1000 examples for
        a quick smoke run.

    Raises
    ------
    ValueError
        If model_name encodes neither an 'ed' nor an 'oh' label type.
    """
    cv_path = '../../models/cv'
    results_path = '../../models/%s/%s' % (task, model_name)
    if not os.path.exists(results_path):
        os.makedirs(results_path)

    # Load data.
    data = load_comments_and_labels(task)

    # Infer the training label representation from the model name.
    # BUG FIX: the original left label_type unbound for unrecognized names,
    # causing a NameError far from the cause; fail fast and clearly instead.
    if 'ed' in model_name:
        label_type = 'empirical_dist'
    elif 'oh' in model_name:
        label_type = 'one_hot'
    else:
        raise ValueError(
            "model_name %r must contain 'ed' (empirical_dist) or 'oh' "
            "(one_hot) so the label type can be inferred" % model_name)

    X_train, y_train = assemble_data(data, 'comments', label_type, splits=['train'])
    # Test labels are always empirical distributions, regardless of how the
    # model was trained, so the evaluation metrics are comparable across models.
    X_test, y_test = assemble_data(data, 'comments', 'empirical_dist', splits=['test'])
    if debug:
        X_train = X_train[:1000]
        y_train = y_train[:1000]
        X_test = X_test[:1000]
        y_test = y_test[:1000]

    # Load architecture, fit on train, and checkpoint the train-only model.
    model = load_pipeline(cv_path, model_name)
    model.fit(X_train, y_train)
    save_pipeline(model, results_path, model_name + "_train")

    # Metric tables.
    roc_df, spearman_df = test_cross(model, data, 'comments')
    roc_df.to_csv(os.path.join(results_path, 'roc.csv'))
    spearman_df.to_csv(os.path.join(results_path, 'spearman.csv'))

    # Diagnostic plots. Close each figure after saving so repeated calls
    # (this function runs in a tasks x models loop) do not accumulate open
    # matplotlib figures and exhaust memory.
    y_test_pred = model.predict_proba(X_test)
    calibration_curve_plotter(one_hot(y_test)[:, 1], y_test_pred[:, 1])
    plt.savefig(os.path.join(results_path, 'calibration.png'))
    plt.close()
    two_class_roc_plotter(y_test, y_test_pred)
    plt.savefig(os.path.join(results_path, 'roc_curve.png'))
    plt.close()
    two_class_precision_recall_plotter(y_test, y_test_pred)
    plt.savefig(os.path.join(results_path, 'precision_recall_curve.png'))
    plt.close()

    # Refit on all splits and save the final model.
    X, y = assemble_data(data, 'comments', label_type)
    if debug:
        X = X[:1000]
        y = y[:1000]
    save_pipeline(model.fit(X, y), results_path, model_name + "_all")
In [3]:
# Evaluation grid: every (task, model) pair below is fit and scored.
tasks = [
    'attack',
    'recipient',
    'aggression',
]
models = [
    'linear_char_ed',
    'mlp_char_ed',
    'linear_char_oh',
    'mlp_char_oh',
]
In [4]:
# Run the full task x model evaluation grid.
# Flip debug to True for a fast 1000-sample smoke test of the whole pipeline.
for current_task in tasks:
    for current_model in models:
        eval_n_save_model(current_task, current_model, debug=False)
In [ ]: