We want our model to accurately estimate the fraction of comments in a group that are attacks. In this notebook we explore both model calibration and threshold selection on the dev set, and we measure the reliability of the resulting estimates on the test set.


In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from serialization import load_pipeline
from baselines import load_comments_and_labels, assemble_data
from sklearn.metrics import roc_curve, brier_score_loss, log_loss, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV


Using TensorFlow backend.

In [2]:
# load data
task = 'attack'
data = load_comments_and_labels(task)

In [3]:
# load model trained on the train split
model_name = 'linear_char_ed_train'
model_type = 'linear_char_ed'
path = '../../models/%s/%s' % (task, model_type)
model = load_pipeline(path, model_name)

In [4]:
# train calibrators on the dev set; evaluation is done on the test set
# all data comes from the randomly sampled comments

from sklearn.linear_model import LogisticRegression as LR
from sklearn.isotonic import IsotonicRegression as IR

X_dev, y_dev = assemble_data(data, 'comments', 'plurality', splits=['dev'], samples=['random'])
X_test, y_test = assemble_data(data, 'comments', 'plurality', splits=['test'], samples=['random'])

y_dev_pred = model.predict_proba(X_dev)[:, 1]
y_test_pred = model.predict_proba(X_test)[:, 1]

Model calibration


In [5]:
# Platt scaling: logistic regression on the raw model scores
lr_calibrator = LR().fit(y_dev_pred.reshape(-1, 1), y_dev)
# isotonic regression: monotone, non-parametric recalibration
ir_calibrator = IR(out_of_bounds='clip').fit(y_dev_pred, y_dev)

In [6]:
y_test_pred_lr_calibrated = lr_calibrator.predict_proba(y_test_pred.reshape(-1, 1))[:, 1]
y_test_pred_ir_calibrated = ir_calibrator.transform(y_test_pred)

In [7]:
# plot calibration transformations

In [8]:
# logistic (Platt) calibration transform of the raw scores
plt.scatter(y_test_pred, y_test_pred_lr_calibrated)


Out[8]:
<matplotlib.collections.PathCollection at 0x13df179b0>

In [9]:
# isotonic calibration transform: a monotone step function
plt.scatter(y_test_pred, y_test_pred_ir_calibrated)


Out[9]:
<matplotlib.collections.PathCollection at 0x13d390f60>
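The two scatter plots above show each calibrator's transform of the raw model scores (Platt is smooth and monotone, isotonic is a monotone step function). A reliability diagram is a complementary check; a minimal sketch using scikit-learn's calibration_curve (not part of the original run):

from sklearn.calibration import calibration_curve

# bin the test predictions and compare the mean predicted probability in each
# bin to the empirical attack fraction; a well calibrated model hugs y = x
for name, probs in [('raw', y_test_pred),
                    ('isotonic', y_test_pred_ir_calibrated),
                    ('platt', y_test_pred_lr_calibrated)]:
    frac_pos, mean_pred = calibration_curve(y_test, probs, n_bins=10)
    plt.plot(mean_pred, frac_pos, marker='o', label=name)
plt.plot([0, 1], [0, 1], 'k--', label='perfectly calibrated')
plt.xlabel('mean predicted probability')
plt.ylabel('empirical attack fraction')
plt.legend()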

In [10]:
df_brier = pd.DataFrame()
df_brier['predicted_prob'] = y_test_pred
df_brier['raw_error'] = (y_test - y_test_pred)**2
df_brier['ir_error'] = (y_test - y_test_pred_ir_calibrated)**2
df_brier['lr_error'] = (y_test - y_test_pred_lr_calibrated)**2
df_brier = df_brier.sort_values('predicted_prob')
df_brier.index = df_brier['predicted_prob']
# rolling mean of squared errors over a 200-comment window, ordered by score
# (pd.rolling_mean has been removed from pandas; .rolling().mean() is the current API)
df_brier = df_brier[['raw_error', 'ir_error', 'lr_error']].rolling(200).mean()
df_brier.plot()
plt.ylabel('smoothed local brier score loss')


Out[10]:
<matplotlib.text.Text at 0x13deb80f0>

In [11]:
# see how well model scores can be used to infer attack rates for each calibrator

def plot_attack_rates(y, y_pred):

    print('brier_score_loss: ', brier_score_loss(y, y_pred))
    print('log_loss: ', log_loss(y, y_pred))
    print('roc_auc_score: ', roc_auc_score(y, y_pred))

    true_rate = y.mean()
    # the mean predicted probability is the model's estimate of the attack rate
    expected_rate = y_pred.mean()
    print('true_rate: ', true_rate, ' expected_rate: ', expected_rate)
    fpr, tpr, thresholds = roc_curve(y, y_pred)
    # fraction of comments classified as attacks at each threshold:
    # P(score > t) = true_rate * TPR(t) + (1 - true_rate) * FPR(t)
    rates = [true_rate * tpr[i] + (1 - true_rate) * fpr[i] for i in range(len(thresholds))]

    plt.plot(thresholds, rates, label='threshold rate')
    plt.plot(thresholds, [true_rate] * len(rates), label='true rate')
    plt.plot(thresholds, [expected_rate] * len(rates), label='expected rate')
    plt.ylabel('attack rate')
    plt.xlabel('threshold')
    plt.ylim((true_rate - 2.0 * true_rate, true_rate + 2.0 * true_rate))
    plt.legend()
    plt.xlim(0.0, 0.6)
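The threshold-rate curve relies on the identity P(score > t) = pi * TPR(t) + (1 - pi) * FPR(t), where pi is the true attack rate: the predicted-attack rate at any threshold is the true-positive mass plus the false-positive mass. A quick check of the identity on the test scores (a sketch, not part of the original run; note that roc_curve counts score >= threshold as positive):

# verify the ROC decomposition of the positive-prediction rate
pi = y_test.mean()
fpr, tpr, ts = roc_curve(y_test, y_test_pred)
i = len(ts) // 2                            # an arbitrary interior threshold
direct = (y_test_pred >= ts[i]).mean()      # empirical predicted-attack rate
via_roc = pi * tpr[i] + (1 - pi) * fpr[i]   # TP mass + FP mass
print(direct, via_roc)                      # the two should agree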

In [12]:
#no calibration
plot_attack_rates(y_test, y_test_pred)


brier_score_loss:  0.0104372378311
log_loss:  0.0749894085529
roc_auc_score:  0.981287335638
true_rate:  0.00948196114709  expected_rate:  0.0690152

In [13]:
# isotonic regression calibration
plot_attack_rates(y_test, y_test_pred_ir_calibrated)


brier_score_loss:  0.00471483816757
log_loss:  0.0368245004985
roc_auc_score:  0.978369532411
true_rate:  0.00948196114709  expected_rate:  0.0108583374526

In [14]:
# logistic regression calibration
plot_attack_rates(y_test, y_test_pred_lr_calibrated)


brier_score_loss:  0.00485288803301
log_loss:  0.0228516631178
roc_auc_score:  0.981287335638
true_rate:  0.00948196114709  expected_rate:  0.0115423619033
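The uncalibrated model overestimates the overall attack rate by roughly a factor of seven (expected 6.9% vs. a true 0.95%). Both calibrators largely repair this: isotonic regression brings the expected rate to 1.09% and logistic (Platt) calibration to 1.15%, with little or no loss in ROC AUC (Platt is a monotone transform, so its AUC is unchanged; isotonic's ties cost a fraction of a point).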

What about picking a threshold on the dev set?


In [15]:
true_rate = y_dev.mean()
# use every observed dev score as a candidate threshold
thresholds = sorted(y_dev_pred)

In [16]:
# fraction of dev comments classified as attacks at each candidate threshold
rates = [(y_dev_pred > t).mean() for t in thresholds]

In [17]:
plt.plot(thresholds, rates)
plt.ylim((true_rate - 0.1*true_rate, true_rate + 0.1*true_rate))
plt.xlim((0.4, 0.42))

plt.plot(thresholds, [true_rate] * len(rates), label='true rate')


Out[17]:
[<matplotlib.lines.Line2D at 0x13da58828>]

In [18]:
# equal-error threshold, read off the plot above
dev_er_t = 0.408
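Matching the predicted attack rate to the true rate is equivalent to equalizing the counts of false positives and false negatives, hence "equal error". A programmatic version of the read-off might look like this (a sketch using the rates computed above):

# pick the threshold whose predicted-positive rate on dev is closest
# to the true dev attack rate
i = int(np.abs(np.asarray(rates) - true_rate).argmin())
dev_er_t = thresholds[i]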

In [19]:
y_test.mean()


Out[19]:
0.0094819611470860314

In [20]:
(y_test_pred > dev_er_t).mean()


Out[20]:
0.010638297872340425

In [22]:
from stats_utils import bayesian_rate_interval
# 95% credible interval for the overall test-set attack rate
num_attacks = y_test.sum()
num_comments = y_test.shape[0]
bayesian_rate_interval(num_attacks, num_comments, conf=95, plot=True)


Out[22]:
(0.007649017918883348, 0.011764587797954223)
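For reference, assuming bayesian_rate_interval places a uniform Beta(1, 1) prior on the rate (an assumption about the helper, whose source is not shown here), an equivalent interval can be sketched with scipy:

from scipy.stats import beta

# Beta(1, 1) prior + binomial likelihood -> Beta(attacks + 1, non-attacks + 1)
# posterior over the attack rate; assumes the helper does something similar
lo, hi = beta.interval(0.95, num_attacks + 1, num_comments - num_attacks + 1)
print(lo, hi)  # comparable to the (0.0076, 0.0118) interval above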

Random Subset Rate Estimation


In [29]:
df_sub = pd.DataFrame()
df_sub['true'] = y_test                                     # crowd labels
df_sub['threshold'] = (y_test_pred > dev_er_t).astype(int)  # thresholded predictions
df_sub['sum'] = y_test_pred_ir_calibrated                   # isotonic-calibrated scores

In [30]:
n = df_sub.shape[0]

In [31]:
# subset sizes, from a single comment up to the full test set
ks = [1, 8, 64, int(n / 64), int(n / 8), n]

In [32]:
results_list = []

for k in ks:
    for _ in range(1000):
        # draw a random subset of k comments (with replacement)
        idxs = np.random.choice(n, k)
        df_temp = df_sub.loc[idxs]
        results_list.append({
            'true': df_temp['true'].mean(),
            'sum': df_temp['sum'].mean(),
            'threshold': df_temp['threshold'].mean(),
            'k': k,
        })

df_results = pd.DataFrame(results_list)
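Each sampled subset yields three estimates of its attack rate: the mean of the crowd labels (true), the mean of the thresholded model predictions (threshold), and the mean of the isotonic-calibrated scores (sum).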

In [33]:
# convert rates to percentages; note this also scales the k column,
# so the k values in the outputs below are 100x the true subset size
df_results = df_results * 100

In [34]:
# k == 100 after the scaling above, i.e. subsets of a single comment
df_results.query('k==100')['true'].hist()


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x134361cf8>

In [35]:
df_results.groupby(['k']).mean()


Out[35]:
             sum  threshold      true
k
100     1.166583   1.100000  1.200000
800     1.016613   1.037500  0.825000
6400    1.124038   1.096875  1.007812
13500   1.080150   1.050370  0.950370
108100  1.079148   1.053747  0.940241
864800  1.086390   1.063263  0.951735
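Across subset sizes all three estimators land near the true rate, with threshold and sum sitting slightly above the crowd-label mean (about 1.05-1.09% vs. about 0.95%), consistent with the expected rates computed on the full test set above.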

In [36]:
df_results.groupby(['k']).std()


Out[36]:
             sum  threshold       true
k
100     8.242614  10.435464  10.893974
800     2.523244   3.539756   3.105079
6400    0.967050   1.255692   1.219936
13500   0.649441   0.864682   0.826235
108100  0.230559   0.312717   0.306673
864800  0.084183   0.111802   0.105094
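As expected for i.i.d. subsampling, the spread of every estimator shrinks roughly as 1/sqrt(k): each 8x increase in subset size cuts the standard deviation by about sqrt(8), roughly 2.8x. The calibrated sum estimator is consistently the least variable of the three.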

More Subsets


In [25]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))
from analysis.load_utils import load_diffs

In [27]:
d = load_diffs()

In [32]:
df_test = pd.read_csv('../../data/annotations/split/test/annotations.tsv', sep='\t')
df_test = df_test.query("sample=='random'").drop_duplicates(subset='rev_id')[['rev_id', 'clean_diff']]
df_test = d['annotated'].merge(df_test, how='inner', on='rev_id')
# binarize model scores at the dev equal-error threshold
df_test['pred_attack'] = (df_test['pred_attack_score_uncalibrated'] > dev_er_t).astype(int)

In [168]:
# stack crowd labels and model predictions into one long frame
d1 = df_test[['ns', 'year', 'author_anon', 'user_id', 'user_text', 'clean_diff']].copy()
d1['source'] = 'crowd'
d1['attack'] = df_test.attack * 100


d2 = df_test[['ns', 'year', 'author_anon', 'user_id', 'user_text', 'clean_diff']].copy()
d2['source'] = 'model'
d2['attack'] = df_test.pred_attack * 100

d = pd.concat([d1, d2])

d['logged_in'] = d['author_anon'].apply(lambda x: not x)
d['thank'] = d['clean_diff'].apply(lambda x: 'thank' in x.lower())

In [170]:
sns.set(font_scale=4)

g = sns.pointplot(data=d, x='source', y='attack', dodge=0.15, capsize=.1, linestyles=[""])
plt.ylabel('Attack percentage')
plt.xlabel('Label source')
plt.savefig('../../paper/figs/sanity_check.png', bbox_inches='tight')



In [171]:
d_tmp1 = d.query("source == 'crowd'").copy()
d_tmp1['x'] = 'crowd'
d_tmp2 = d.query("source == 'model'").copy()
d_tmp2['x'] = 'model'

In [172]:
# two dummy rows at attack = 0.7 (below the y-limits set later) so that
# both sources appear at both x positions and the hue offsets stay aligned
d_tmp3 = d[:2].copy()
d_tmp3['attack'] = 0.7
d_tmp3['x'] = ['model', 'crowd']

In [173]:
d_temp = pd.concat([d_tmp1, d_tmp2, d_tmp3])

In [174]:
d_temp['x'].value_counts()


Out[174]:
crowd    8649
model    8649
Name: x, dtype: int64

In [176]:
ax = sns.pointplot(data=d_temp, x='x', y='attack', hue='source', capsize=.1, linestyles=[" ", ""])
ax.legend_.remove()
plt.ylim(0.8, 1.4)
plt.ylabel('Attack percentage')
plt.xlabel('Label source')
plt.savefig('../../paper/figs/sanity_check.png', bbox_inches='tight')



In [177]:
ax = sns.pointplot(data=d, x='ns', y='attack', hue='source', linestyles=[" ", ""], dodge=0.15, capsize=.1)
plt.xlabel('Namespace')
ax.legend_.remove()
plt.ylabel('Attack percentage')
plt.savefig('../../paper/figs/sanity_check_ns.png', bbox_inches='tight')



In [178]:
ax = sns.pointplot(data=d, x='logged_in', y='attack', hue='source', linestyles=[" ", ""], dodge=0.15, capsize=.1)
ax.legend_.remove()
plt.ylabel('Attack percentage')
plt.xlabel('Author Logged-in')
plt.savefig('../../paper/figs/sanity_check_anon.png', bbox_inches='tight')



In [179]:
def map_year(y):
    # note: 2010 itself falls into the 'after 2010' bucket
    if y < 2010:
        return 'before 2010'
    else:
        return 'after 2010'

d['com_year'] = d.year.apply(map_year)
ax = sns.pointplot(data=d, x='com_year', y='attack', hue='source', dodge=0.15, capsize=.1, linestyles=[" ", ""], order=['before 2010', 'after 2010'])
ax.legend_.remove()
plt.ylabel('Attack percentage')
plt.xlabel('Year')

plt.savefig('../../paper/figs/sanity_check_year.png', bbox_inches='tight')



In [180]:
d_gender = pd.read_csv('../../data/genders.tsv', sep='\t')
d_gender = d.merge(d_gender, how='left', on='user_id').query('not author_anon')
d_gender.gender.fillna('unknown', inplace=True)
ax = sns.pointplot(data=d_gender, x='gender', y='attack', hue='source', dodge=0.15, capsize=.1, linestyles=[" ", ""])
ax.legend_.remove()
plt.ylabel('Attack percentage')
plt.xlabel('Gender')
plt.savefig('../../paper/figs/sanity_check_gender.png', bbox_inches='tight')



In [181]:
d_activity = pd.read_csv('../../data/long_term_users.tsv', sep='\t')
d_activity = d.merge(d_activity, how='left', on='user_text').query('not author_anon')
d_activity.num_days.fillna(0, inplace=True)


def map_activity(a):
    # bucket users by days of activity; labels match the branch boundaries
    if a < 8:
        return '0-7'
    elif a < 31:
        return '8-30'
    else:
        return '31+'

d_activity['Days active'] = d_activity.num_days.apply(map_activity)


ax = sns.pointplot(data=d_activity, x='Days active', y='attack', hue='source', dodge=0.15, capsize=.1, linestyles=[" ", ""], order=['0-7', '8-30', '31+'])
ax.legend_.remove()
plt.ylabel('Attack percentage')
plt.savefig('../../paper/figs/sanity_check_activity.png', bbox_inches='tight')



In [182]:
ax = sns.pointplot(data=d_activity, x='thank', y='attack', hue='source', dodge=0.15, capsize=.1, linestyles=[" ", ""])
ax.legend_.remove()
plt.ylabel('Attack percentage')
plt.xlabel("Contains word 'thank'")
plt.savefig('../../paper/figs/sanity_check_thank.png', bbox_inches='tight')


Get Dev + Test Equal Error Threshold


In [195]:
# pool the dev and test splits to estimate the equal-error threshold
X_test, y_test_oh = assemble_data(data, 'comments', 'empirical_dist', splits=['test', 'dev'], samples=['random'])
y_test_pred = model.predict_proba(X_test)

In [204]:
from ngram import two_class_combo_plotter
sns.set(font_scale=1.5)
two_class_combo_plotter(y_test_oh, y_test_pred)
plt.xlim(0.41, 0.445)
plt.ylim(0.6, 0.7)


0.424823 0.00342010260308 0.635135135135
0.429155 0.632653061224 0.628378378378
Out[204]:
(0.6, 0.7)
