In [130]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from baselines import plurality
In [92]:
# Read the cleaned, tab-separated annotation table and index its rows by
# the `rev_id` column.
df = pd.read_csv(
    '../../data/annotations/clean/annotations.tsv',
    sep='\t',
)
df.index = df['rev_id']
It is incorrect to give a revision both an attack label and a not-attack label. Let's see how often this occurs and who makes this error.
In [17]:
# Frequency of every distinct value in the label column; missing values are
# counted as their own bucket rather than dropped.
df.loc[:, 'is_harassment_or_attack'].value_counts(dropna=False)
Out[17]:
In [27]:
def attack_and_not_attack(s):
    """Return True when a label value mixes 'not_attack' with something else.

    A value is considered ambivalent when it contains 'not_attack' but is not
    exactly equal to 'not_attack'.

    Parameters
    ----------
    s : object
        A single label value. In this notebook it is applied to the entries of
        the `is_harassment_or_attack` column (presumably strings that may hold
        several labels at once -- TODO confirm the exact encoding).

    Returns
    -------
    bool
        True for an ambivalent value, False otherwise. Values that do not
        support membership tests (e.g. NaN or None for a missing label) are
        treated as not ambivalent instead of raising TypeError.
    """
    try:
        return 'not_attack' in s and s != 'not_attack'
    except TypeError:
        # Missing labels (float NaN / None) are not containers; a missing
        # label cannot be ambivalent, so report False rather than crash.
        return False
In [44]:
# Count ambivalent annotations per worker and show the five workers with the
# most of them.
is_ambivalent = df['is_harassment_or_attack'].apply(attack_and_not_attack)
df[is_ambivalent]['_worker_id'].value_counts().head()
Out[44]:
Ok, there are a few users who do this a lot.
In [43]:
# Cumulative share of ambivalent annotations as workers are added one at a
# time, starting with the worker who produced the most (value_counts sorts
# in descending order by default).
y = df[df['is_harassment_or_attack'].apply(attack_and_not_attack)]['_worker_id'].value_counts().cumsum()
y = y / y.iloc[-1]
# y's first entry already covers ONE worker, so N must start at 1 (not 0)
# for the axis label "coming from N users" to be accurate.
x = list(range(1, len(y) + 1))
plt.plot(x, y)
plt.xlabel('N')
plt.ylabel('Fraction of ambivalent labels coming from N users')
Out[43]:
It looks like 90% of ambivalent labels come from 200 users. We might consider dropping all annotations from these users. We can also check what fraction of a user's labels are ambivalent and select a threshold based on that.
In [136]:
# Per worker: number of ambivalent annotations, and that number as a share of
# all annotations the worker made (workers with no ambivalent annotation drop
# out via dropna).
counts = df[df['is_harassment_or_attack'].apply(attack_and_not_attack)]['_worker_id'].value_counts()
fraction = (counts / df['_worker_id'].value_counts()).dropna()

# Collapse workers that share the same ambivalent fraction (summing their
# ambivalent counts and the number of workers N), order from the most to the
# least ambivalent, and accumulate normalised totals in that order. Finally
# the fraction column is flipped to 1 - fraction.
d_ambi = (
    pd.DataFrame({'counts': counts, 'fraction': fraction})
    .assign(N=1)
    .groupby('fraction', as_index=False)
    .sum()
    .sort_values('fraction', ascending=False)
    .assign(cum_counts=lambda t: t['counts'].cumsum() / t['counts'].sum())
    .assign(cum_N=lambda t: t['N'].cumsum() / t['N'].sum())
    .assign(fraction=lambda t: 1 - t['fraction'])
)
In [137]:
# Preview the first five rows of the aggregated table.
d_ambi.head(n=5)
Out[137]:
In [142]:
# x: 1 minus the share of a worker's annotations that are ambivalent.
# y: cumulative share of all ambivalent annotations, accumulated starting
#    from the workers with the highest ambivalent share.
plt.plot(d_ambi['fraction'], d_ambi['cum_counts'])
plt.xlabel("1 - fraction of a worker's labels that are ambivalent")
plt.ylabel('Cumulative fraction of ambivalent labels')
Out[142]:
In [143]:
# x: 1 minus the share of a worker's annotations that are ambivalent.
# y: cumulative share of workers (among those with at least one ambivalent
#    annotation), accumulated starting from the highest ambivalent share.
plt.plot(d_ambi['fraction'], d_ambi['cum_N'])
plt.xlabel("1 - fraction of a worker's labels that are ambivalent")
plt.ylabel('Cumulative fraction of workers with ambivalent labels')
Out[143]:
In [144]:
# x: cumulative share of workers (among those with at least one ambivalent
#    annotation), ordered from the most to the least ambivalent.
# y: cumulative share of all ambivalent annotations produced by those workers.
plt.plot(d_ambi['cum_N'], d_ambi['cum_counts'])
plt.xlabel('Cumulative fraction of workers with ambivalent labels')
plt.ylabel('Cumulative fraction of ambivalent labels')
Out[144]:
Consider dropping all annotations from users who score roughly 1 in 5 or more of their comments as ambivalent.
In [146]:
# How often does each worker's 'recipient' answer differ from the value
# returned by `plurality` for that column? (presumably the most common label
# per revision -- `plurality` comes from the project's `baselines` module,
# TODO confirm). Plot the distribution of the per-worker disagreement rates.
col = 'recipient'
pl = plurality(df[col])
df['plurality'] = pl
df['deviant'] = df[col].ne(df['plurality'])
deviance_scores = df['deviant'].groupby(df['_worker_id']).mean()
deviance_scores.sort_values(ascending=False).hist(bins=100)
Out[146]:
In [147]:
# How often does each worker's 'attack' answer differ from the value
# returned by `plurality` for that column? (presumably the most common label
# per revision -- `plurality` comes from the project's `baselines` module,
# TODO confirm). Plot the distribution of the per-worker disagreement rates.
col = 'attack'
pl = plurality(df[col])
df['plurality'] = pl
df['deviant'] = df[col].ne(df['plurality'])
deviance_scores = df['deviant'].groupby(df['_worker_id']).mean()
deviance_scores.sort_values(ascending=False).hist(bins=100)
Out[147]:
In [148]:
# How often does each worker's 'aggression' answer differ from the value
# returned by `plurality` for that column? (presumably the most common label
# per revision -- `plurality` comes from the project's `baselines` module,
# TODO confirm). Plot the distribution of the per-worker disagreement rates.
col = 'aggression'
pl = plurality(df[col])
df['plurality'] = pl
df['deviant'] = df[col].ne(df['plurality'])
deviance_scores = df['deviant'].groupby(df['_worker_id']).mean()
deviance_scores.sort_values(ascending=False).hist(bins=100)
Out[148]: