In [96]:
import pandas as pd
import os
from baselines import remove_na, tidy_labels, map_aggression_score_to_2class, plurality
from serialization import load_pipeline
from sklearn.metrics import roc_auc_score


Using TensorFlow backend.

In [46]:
path = '../../data/annotations/raw/nda'

In [76]:
df_nda_1 = pd.read_csv(os.path.join(path, 'nda_2015_raters_10.csv'))
df_nda_2 = pd.read_csv(os.path.join(path, 'nda_onion_layer_5_raters_10.csv'))

In [77]:
df_nda_1.index = df_nda_1.rev_id
df_nda_2.index = df_nda_2.rev_id

In [104]:
# replicate the annotation cleaning pipeline
def clean(df):
    df = tidy_labels(df)
    
    # binarize the aggression score
    df['aggression'] = df['aggression_score'].apply(map_aggression_score_to_2class)
    
    # drop golden (quality-control) questions
    df = df.query('_golden == False')
    
    # drop rows with missing labels
    df = remove_na(df)
    df = df.query('na == False')
    df = df.dropna(subset=['aggression_score', 'is_harassment_or_attack'])
    
    # a label is ambivalent if the worker selected 'not_attack' together with an attack type
    def ambivalent(s):
        return 'not_attack' in s and s != 'not_attack'
    df['ambivalent'] = df['is_harassment_or_attack'].apply(ambivalent)
    # keep only workers who give ambivalent labels less than 10% of the time
    non_ambivalent_workers = df.groupby('_worker_id', as_index=False)['ambivalent'].mean().query('ambivalent < 0.1')
    df = df.merge(non_ambivalent_workers[['_worker_id']], how='inner', on='_worker_id')
    df = df.query('ambivalent == False')
    
    # keep one annotation per (revision, worker) pair
    df = df.drop_duplicates(subset=['rev_id', '_worker_id'])
    
    # drop revisions whose diff text duplicates another revision's
    comments = df.drop_duplicates(subset=['rev_id'])
    u_comments = comments.drop_duplicates(subset=['clean_diff'])
    df = df.merge(u_comments[['rev_id']], how='inner', on='rev_id')
    
    # keep only revisions with at least 8 annotations
    counts = df['rev_id'].value_counts().to_frame()
    counts.columns = ['n']
    counts['rev_id'] = counts.index
    counts_enough = counts.query('n >= 8')
    df = df.merge(counts_enough[['rev_id']], how='inner', on='rev_id')
    
    return df
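
For reference, a quick illustration of what the ambivalent filter above catches: a worker's is_harassment_or_attack string counts as ambivalent when it contains not_attack alongside some attack type. The label strings below are hypothetical, chosen only to exercise the predicate:

def ambivalent(s):
    return 'not_attack' in s and s != 'not_attack'

ambivalent('not_attack')                 # False: a clean "not an attack" label
ambivalent('recipient_attack')           # False: an unambiguous attack label
ambivalent('not_attack quoting_attack')  # True: contradictory selections from one worker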

In [79]:
df_nda_1_clean = clean(df_nda_1)
df_nda_2_clean = clean(df_nda_2)

In [83]:
df_nda_1_clean.index = df_nda_1_clean.rev_id
df_nda_2_clean.index = df_nda_2_clean.rev_id

In [84]:
print(df_nda_1.shape)
print(df_nda_1_clean.shape)


(100000, 41)
(75597, 42)

In [85]:
print(df_nda_2.shape)
print(df_nda_2_clean.shape)


(100444, 41)
(72862, 42)

In [106]:
tasks = ['attack', 'aggression', 'recipient']
models = ['linear_tfidf', 'edp_tfidf']
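
The evaluation below collapses each revision's annotations to a single label with plurality from baselines. As a rough sketch of that idea only (assuming plurality is essentially a per-rev_id majority vote over the binary labels; the real helper may break ties and handle dtypes differently):

# hypothetical sketch of a per-revision majority vote over binary annotations;
# labels is a pd.Series of 0/1 values indexed by rev_id
def plurality_sketch(labels):
    return labels.groupby(level=0).agg(lambda x: x.value_counts().idxmax())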

In [111]:
for task in tasks:
    for model_name in models:
        labels_1 = plurality(df_nda_1_clean[task]).sort_index()
        labels_2 = plurality(df_nda_2_clean[task]).sort_index()
        comments_1 = df_nda_1_clean.drop_duplicates(subset=['rev_id'])['clean_diff'].sort_index()
        comments_2 = df_nda_2_clean.drop_duplicates(subset=['rev_id'])['clean_diff'].sort_index()
        model = load_pipeline('../../models/%s/%s' % (task, model_name), model_name)
        roc_1 = roc_auc_score(labels_1, model.predict_proba(comments_1)[:, 1])
        roc_2 = roc_auc_score(labels_2, model.predict_proba(comments_2)[:, 1])
        print('Task: %s Model: %s Data: random ROC: %0.3f' % (task, model_name, roc_1))
        print('Task: %s Model: %s Data: blocked ROC: %0.3f' % (task, model_name, roc_2))
        print('\n')


Task: attack Model: linear_tfidf Data: random ROC: 0.954
Task: attack Model: linear_tfidf Data: blocked ROC: 0.961


7922/7922 [==============================] - 0s     
Task: attack Model: edp_tfidf Data: random ROC: 0.957
Task: attack Model: edp_tfidf Data: blocked ROC: 0.964


Task: aggression Model: linear_tfidf Data: random ROC: 0.964
Task: aggression Model: linear_tfidf Data: blocked ROC: 0.961


7922/7922 [==============================] - 0s     
Task: aggression Model: edp_tfidf Data: random ROC: 0.967
Task: aggression Model: edp_tfidf Data: blocked ROC: 0.966


Task: recipient Model: linear_tfidf Data: random ROC: 0.992
Task: recipient Model: linear_tfidf Data: blocked ROC: 0.949


8109/8109 [==============================] - 0s     
7922/7922 [==============================] - 0s     
Task: recipient Model: edp_tfidf Data: random ROC: 0.995
Task: recipient Model: edp_tfidf Data: blocked ROC: 0.957



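One assumption baked into the loop above is that sort_index() leaves the label and comment Series aligned on the same rev_id order before roc_auc_score is called. A cheap sanity check along these lines (hypothetical, not run here) would catch a mismatch before scoring:

# hypothetical sanity check: both Series should share the same rev_id index after sorting
assert labels_1.index.equals(comments_1.index)
assert labels_2.index.equals(comments_2.index)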
In [ ]: