In [96]:
import pandas as pd
import os
from baselines import remove_na, tidy_labels, map_aggression_score_to_2class, plurality
from serialization import load_pipeline
from sklearn.metrics import roc_auc_score
In [46]:
path = '../../data/annotations/raw/nda'
In [76]:
df_nda_1 = pd.read_csv(os.path.join(path, 'nda_2015_raters_10.csv'))
df_nda_2 = pd.read_csv(os.path.join(path, 'nda_onion_layer_5_raters_10.csv'))
In [77]:
df_nda_1.index = df_nda_1.rev_id
df_nda_2.index = df_nda_2.rev_id
In [104]:
# replicate annotations cleaning
def clean(df):
    df = tidy_labels(df)
    df['aggression'] = df['aggression_score'].apply(map_aggression_score_to_2class)
    # drop gold (test) questions and rows the annotator marked as not applicable
    df = df.query('_golden == False')
    df = remove_na(df)
    df = df.query('na == False')
    df = df.dropna(subset=['aggression_score', 'is_harassment_or_attack'])

    # an annotation is ambivalent if it mixes 'not_attack' with an attack label
    def ambivalent(s):
        return 'not_attack' in s and s != 'not_attack'

    df['ambivalent'] = df['is_harassment_or_attack'].apply(ambivalent)
    # keep only workers who give ambivalent annotations less than 10% of the time
    non_ambivalent_workers = df.groupby('_worker_id', as_index=False)['ambivalent'].mean().query('ambivalent < 0.1')
    df = df.merge(non_ambivalent_workers[['_worker_id']], how='inner', on='_worker_id')
    df = df.query('ambivalent == False')
    # one annotation per (revision, worker) and one revision per unique diff text
    df = df.drop_duplicates(subset=['rev_id', '_worker_id'])
    comments = df.drop_duplicates(subset=['rev_id'])
    u_comments = comments.drop_duplicates(subset=['clean_diff'])
    df = df.merge(u_comments[['rev_id']], how='inner', on='rev_id')
    # keep only revisions with at least 8 annotations
    counts = df['rev_id'].value_counts().to_frame()
    counts.columns = ['n']
    counts['rev_id'] = counts.index
    counts_enough = counts.query('n >= 8')
    df = df.merge(counts_enough[['rev_id']], how='inner', on='rev_id')
    return df
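
As a sanity check (a sketch not in the original notebook; it only assumes the columns produced above), we can verify that every revision kept by clean() has at least 8 annotations and no ambivalent labels remain:
In [ ]:
# hypothetical check of the cleaning invariants
def check_clean(df):
    per_rev = df.groupby('rev_id').size()
    assert per_rev.min() >= 8, 'found a revision with fewer than 8 annotations'
    assert not df['ambivalent'].any(), 'found an ambivalent annotation after cleaning'
    return per_rev.describe()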
In [79]:
df_nda_1_clean = clean(df_nda_1)
df_nda_2_clean = clean(df_nda_2)
In [83]:
df_nda_1_clean.index = df_nda_1_clean.rev_id
df_nda_2_clean.index = df_nda_2_clean.rev_id
In [84]:
print(df_nda_1.shape)
print(df_nda_1_clean.shape)
In [85]:
print(df_nda_2.shape)
print(df_nda_2_clean.shape)
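
The raw vs. cleaned shapes above count annotation rows; to compare at the comment level, a small sketch (assuming only the rev_id column) counts unique revisions before and after cleaning:
In [ ]:
# unique revisions surviving the cleaning step, per dataset (illustrative)
for name, raw, cleaned in [('random', df_nda_1, df_nda_1_clean),
                           ('blocked', df_nda_2, df_nda_2_clean)]:
    print(name, raw['rev_id'].nunique(), '->', cleaned['rev_id'].nunique())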
In [106]:
tasks = ['attack', 'aggression', 'recipient']
models = ['linear_tfidf', 'edp_tfidf']
In [111]:
for task in tasks:
    # plurality label and one comment text per rev_id, aligned on the rev_id index
    labels_1 = plurality(df_nda_1_clean[task]).sort_index()
    labels_2 = plurality(df_nda_2_clean[task]).sort_index()
    comments_1 = df_nda_1_clean.drop_duplicates(subset=['rev_id'])['clean_diff'].sort_index()
    comments_2 = df_nda_2_clean.drop_duplicates(subset=['rev_id'])['clean_diff'].sort_index()
    for model_name in models:
        model = load_pipeline('../../models/%s/%s' % (task, model_name), model_name)
        roc_1 = roc_auc_score(labels_1, model.predict_proba(comments_1)[:, 1])
        roc_2 = roc_auc_score(labels_2, model.predict_proba(comments_2)[:, 1])
        print('Task: %s Model: %s Data: random ROC: %0.3f' % (task, model_name, roc_1))
        print('Task: %s Model: %s Data: blocked ROC: %0.3f' % (task, model_name, roc_2))
        print('\n')
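
If we wanted the scores as a table rather than printed lines, a sketch along the same lines (same variables and helpers as above, just collected into a DataFrame) might look like this:
In [ ]:
# collect (task, model, dataset, ROC) rows for easier comparison (illustrative)
rows = []
for task in tasks:
    labels_1 = plurality(df_nda_1_clean[task]).sort_index()
    labels_2 = plurality(df_nda_2_clean[task]).sort_index()
    comments_1 = df_nda_1_clean.drop_duplicates(subset=['rev_id'])['clean_diff'].sort_index()
    comments_2 = df_nda_2_clean.drop_duplicates(subset=['rev_id'])['clean_diff'].sort_index()
    for model_name in models:
        model = load_pipeline('../../models/%s/%s' % (task, model_name), model_name)
        for data_name, labels, comments in [('random', labels_1, comments_1),
                                            ('blocked', labels_2, comments_2)]:
            roc = roc_auc_score(labels, model.predict_proba(comments)[:, 1])
            rows.append({'task': task, 'model': model_name, 'data': data_name, 'roc': roc})
results = pd.DataFrame(rows)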
In [ ]: