In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from ngram import *
from baselines import *
import seaborn as sns
In [2]:
# Load the cleaned annotations and index the rows by rev_id; drop=False keeps
# rev_id available as a regular column as well.
d = (
    pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t')
    .set_index('rev_id', drop=False)
)
In [3]:
# Columns kept for plotting; a row missing a value in any of them is dropped.
cols = ['rev_id', 'clean_diff', 'aggression', 'recipient', 'attack']

# User-namespace rows from the 'blocked' sample.
b = (
    d.query("ns=='user' and sample=='blocked'")
    .dropna(subset=cols)
    .loc[:, cols]
)

# User-namespace rows from the 'random' sample.
r = (
    d.query("ns=='user' and sample=='random'")
    .dropna(subset=cols)
    .loc[:, cols]
)
In [4]:
def plot(d, label):
    """Plot the distribution of `label` annotations and their per-revision spread.

    Five plots are drawn, in this order:

    1. fraction histogram of the raw `label` values (on the current figure),
    2. fraction histogram of the per-`rev_id` mean of `label`,
    3. fraction histogram of `plurality(d[label])`,
    4. lowess regression of the per-`rev_id` std against the per-`rev_id` mean,
    5. point plot of the per-`rev_id` std against the plurality value.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame holding a `label` column and `rev_id` as a column and/or an
        index level.
    label : str
        Name of the annotation column to plot.
    """
    # Category order for the final point plot: a 0.1-step grid whose upper
    # end depends on the label.
    # NOTE(review): the histograms below use np.histogram's default binning,
    # not this grid; the grid is only used as the pointplot `order` -- confirm
    # that plurality values are expected to fall on it.
    upper = 2.1 if label == 'aggression' else 1.1
    bins = np.arange(0, upper, 0.1)

    def nice_hist(myarray):
        """Draw a bar histogram whose bar heights are fractions of samples."""
        # `density=True` replaces the `normed` keyword, which has been removed
        # from NumPy; density * bin width is the fraction falling in each bin.
        results, edges = np.histogram(myarray, density=True)
        binWidth = edges[1] - edges[0]
        # align='edge' anchors each bar at its bin's left edge; newer
        # Matplotlib versions centre bars on x by default, which would shift
        # every bar half a bin to the left.
        plt.bar(edges[:-1], results * binWidth, binWidth, align='edge')

    # pandas raises "'rev_id' is both an index level and a column label" when
    # grouping by a name that is ambiguous; drop the index in that case so the
    # column is used (this is what older pandas versions defaulted to).
    frame = d
    if 'rev_id' in d.columns and 'rev_id' in d.index.names:
        frame = d.reset_index(drop=True)
    per_rev = frame.groupby('rev_id')[label]
    rev_mean = per_rev.mean()
    rev_std = per_rev.std()
    rev_plurality = plurality(d[label])

    nice_hist(d[label].values)
    plt.xlabel('%s raw annotations' % label)
    plt.ylabel('fraction')

    plt.figure()
    nice_hist(rev_mean.values)
    plt.xlabel('%s mean' % label)
    plt.ylabel('fraction')

    plt.figure()
    nice_hist(rev_plurality.values)
    plt.xlabel('%s plurality' % label)
    plt.ylabel('fraction')

    # Disagreement (std) as a function of the per-revision mean.
    plt.figure()
    d_temp = pd.DataFrame({'mean': rev_mean, 'std': rev_std})
    ax = sns.regplot(x='mean', y='std', data=d_temp, lowess=True)

    # Disagreement (std) as a function of the plurality value.
    plt.figure()
    d_temp = pd.DataFrame({'plurality': rev_plurality, 'std': rev_std})
    ax = sns.pointplot(x='plurality', y='std', data=d_temp, order=bins)
In [12]:
# paper special: histogram of the per-rev_id mean 'attack' annotation,
# saved as a figure for the paper.

# NOTE(review): `bins` is not passed to nice_hist below (np.histogram uses its
# default binning); the name is kept in case other cells read it.
bins = np.arange(0, 1.1, 0.1)


def nice_hist(myarray):
    """Draw a bar histogram of `myarray` whose bar heights are fractions of samples."""
    # `density=True` replaces the `normed` keyword, which has been removed from
    # NumPy; density * bin width is the fraction falling in each bin.
    results, edges = np.histogram(myarray, density=True)
    binWidth = edges[1] - edges[0]
    # align='edge' anchors each bar at its bin's left edge; newer Matplotlib
    # versions centre bars on x by default, which would shift the plot.
    plt.bar(edges[:-1], results * binWidth, binWidth, align='edge')


# `d` is indexed by rev_id and also has a rev_id column; pandas rejects
# groupby('rev_id') as ambiguous in that case, so drop the index and group on
# the column.
x = d.reset_index(drop=True).groupby('rev_id')['attack'].mean()

plt.figure()
nice_hist(x)
plt.xlabel("Annotator agreement")
plt.ylabel("Fraction of comments")
plt.savefig("../../paper/figs/agreement.png")
In [7]:
# Number of rows per rev_id, then how many rev_ids share each row count.
annotations_per_rev = d['rev_id'].value_counts()
annotations_per_rev.value_counts()
Out[7]:
In [5]:
# 'recipient' annotation plots for the user-namespace 'blocked' sample.
plot(b, 'recipient')
In [6]:
# 'recipient' annotation plots for the user-namespace 'random' sample.
plot(r, 'recipient')
In [7]:
# 'aggression' annotation plots for the user-namespace 'blocked' sample.
plot(b, 'aggression')
In [8]:
# 'aggression' annotation plots for the user-namespace 'random' sample.
plot(r, 'aggression')