In [130]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from baselines import plurality

In [92]:
df = pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t')
df.index = df.rev_id

Explore ambivalent is_harassment_or_attack labels

It is incorrect to give a revision both an attack label and a not_attack label. Let's see how often this occurs and who makes this error.


In [17]:
df['is_harassment_or_attack'].value_counts(dropna=False)


Out[17]:
not_attack                                            1029013
recipient                                              135358
other                                                   34796
third_party                                             29702
recipient\nthird_party                                   8988
other\nnot_attack                                        8300
recipient\nnot_attack                                    6326
quoting                                                  5733
recipient\nthird_party\nquoting\nother\nnot_attack       5510
recipient\nother                                         3939
recipient\nthird_party\nnot_attack                       2209
recipient\nthird_party\nquoting\nother                   2182
third_party\nother                                       1669
recipient\nthird_party\nother                            1649
quoting\nnot_attack                                      1427
recipient\nthird_party\nquoting                          1385
third_party\nnot_attack                                  1110
quoting\nother\nnot_attack                               1022
recipient\nthird_party\nquoting\nnot_attack               969
recipient\nother\nnot_attack                              744
quoting\nother                                            537
third_party\nquoting                                      494
recipient\nquoting                                        415
third_party\nquoting\nother                               325
recipient\nquoting\nnot_attack                            179
recipient\nquoting\nother                                 108
third_party\nquoting\nnot_attack                           99
third_party\nother\nnot_attack                             69
recipient\nthird_party\nother\nnot_attack                  59
third_party\nquoting\nother\nnot_attack                    35
recipient\nquoting\nother\nnot_attack                      19
Name: is_harassment_or_attack, dtype: int64

In [27]:
def attack_and_not_attack(s):
    # ambivalent: the label string contains 'not_attack' together with at least one attack label
    return 'not_attack' in s and s != 'not_attack'

In [44]:
df[df['is_harassment_or_attack'].apply(attack_and_not_attack)]['_worker_id'].value_counts().head()


Out[44]:
33939537    841
35210143    785
9232361     707
13318404    674
31317272    666
Name: _worker_id, dtype: int64

Ok, there are a few users who do this a lot.


In [43]:
y = df[df['is_harassment_or_attack'].apply(attack_and_not_attack)]['_worker_id'].value_counts().cumsum()
y = y/y.iloc[-1]
x = list(range(len(y)))
plt.plot(x, y)
plt.xlabel('N')
plt.ylabel('Fraction of ambivalent labels coming from N users')


Out[43]:
<matplotlib.text.Text at 0x12e6549b0>

It looks like 90% of ambivalent labels come from 200 users. We might consider dropping all annotations from these users. We can also check what fraction of a user's labels are ambivalent and select a threshold based on that.


In [136]:
# number of ambivalent labels per worker
counts = df[df['is_harassment_or_attack'].apply(attack_and_not_attack)]['_worker_id'].value_counts()
# fraction of each worker's labels that are ambivalent
fraction = (counts / df['_worker_id'].value_counts()).dropna()
d_ambi = pd.DataFrame({'counts': counts, 'fraction': fraction}).sort_values('fraction', ascending=False)
# N will count the number of workers per distinct fraction after the groupby-sum
d_ambi['N'] = 1
d_ambi = d_ambi.groupby('fraction', as_index=False).sum()
d_ambi = d_ambi.sort_values('fraction', ascending=False)
# cumulative share of ambivalent labels and of workers, ordered from most to least ambivalent
d_ambi['cum_counts'] = d_ambi['counts'].cumsum() / d_ambi['counts'].sum()
d_ambi['cum_N'] = d_ambi['N'].cumsum() / d_ambi['N'].sum()
# convert to the fraction of a worker's labels that are not ambivalent
d_ambi['fraction'] = 1 - d_ambi['fraction']

In [137]:
d_ambi.head()


Out[137]:
      fraction  counts  N  cum_counts     cum_N
838   0.000000    1953  7    0.069559  0.006233
837   0.002801     356  1    0.082238  0.007124
836   0.002817     354  1    0.094846  0.008014
835   0.002959     674  1    0.118852  0.008905
834   0.003891     256  1    0.127970  0.009795

In [142]:
plt.plot(d_ambi['fraction'], d_ambi['cum_counts'])


Out[142]:
[<matplotlib.lines.Line2D at 0x1195826d8>]

In [143]:
plt.plot(d_ambi['fraction'], d_ambi['cum_N'])


Out[143]:
[<matplotlib.lines.Line2D at 0x11975d780>]

In [144]:
plt.plot(d_ambi['cum_N'], d_ambi['cum_counts'])


Out[144]:
[<matplotlib.lines.Line2D at 0x11b496358>]

Consider dropping all annotations from users who label 1 in 5 or more of their comments ambivalently.
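
A minimal sketch of that filter, assuming a 0.2 cutoff; the threshold and the names bad_workers and df_clean are illustrative only, not part of the pipeline.


In [ ]:
# fraction of each worker's is_harassment_or_attack labels that are ambivalent
ambi_counts = df[df['is_harassment_or_attack'].apply(attack_and_not_attack)]['_worker_id'].value_counts()
ambi_fraction = (ambi_counts / df['_worker_id'].value_counts()).fillna(0)

# hypothetical cutoff: drop workers who label 1 in 5 or more comments ambivalently
bad_workers = ambi_fraction[ambi_fraction >= 0.2].index
df_clean = df[~df['_worker_id'].isin(bad_workers)]
print('dropped %d of %d annotations' % (len(df) - len(df_clean), len(df)))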

Explore consistently deviant users


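The plurality helper from baselines is not shown here. A rough sketch of what it is assumed to do: return, for every annotation row, the most common label given to that row's rev_id, so each worker's own label can be compared against it.


In [ ]:
# Assumed behavior of baselines.plurality (illustration only, not the actual implementation)
def plurality_sketch(labels):
    # labels: a Series indexed by rev_id, one row per annotation
    most_common = labels.groupby(level=0).agg(lambda x: x.value_counts().idxmax())
    # broadcast the per-rev_id plurality label back to one entry per annotation
    return most_common.loc[labels.index]
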
In [146]:
col = 'recipient'
# plurality label for each comment, from baselines.plurality
pl = plurality(df[col])
df['plurality'] = pl
# an annotation is deviant if it disagrees with the plurality label
df['deviant'] = df[col] != df['plurality']
# per-worker rate of disagreement with the plurality
deviance_scores = df.groupby('_worker_id')['deviant'].mean()
deviance_scores.sort_values(ascending=False).hist(bins=100)


Out[146]:
<matplotlib.axes._subplots.AxesSubplot at 0x11bea6b38>

In [147]:
col = 'attack'
pl = plurality(df[col])
df['plurality'] = pl
df['deviant'] = df[col] != df['plurality']
deviance_scores = df.groupby('_worker_id')['deviant'].mean()
deviance_scores.sort_values(ascending=False).hist(bins=100)


Out[147]:
<matplotlib.axes._subplots.AxesSubplot at 0x11bd136d8>

In [148]:
col = 'aggression'
pl = plurality(df[col])
df['plurality'] = pl
df['deviant'] = df[col] != df['plurality']
deviance_scores = df.groupby('_worker_id')['deviant'].mean()
deviance_scores.sort_values(ascending=False).hist(bins=100)


Out[148]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c7cc668>
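
As with the ambivalent labels, a natural follow-up (not run here) is to flag the most deviant workers; a minimal sketch, with the 0.5 cutoff chosen purely for illustration:


In [ ]:
# hypothetical cutoff: workers who disagree with the plurality on more than half their labels
deviant_workers = deviance_scores[deviance_scores > 0.5].index
print('%d of %d workers exceed the cutoff' % (len(deviant_workers), len(deviance_scores)))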