In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from ngram import *
from baselines import *
import seaborn as sns

In [2]:
# Read the tab-separated annotations file and index its rows by rev_id.
# drop=False keeps 'rev_id' available as an ordinary column too.
d = pd.read_csv(
    '../../data/annotations/clean/annotations.tsv',
    sep='\t',
).set_index('rev_id', drop=False)

In [3]:
# Columns kept for the analysis; a row missing any of them is discarded.
cols = ['rev_id', 'clean_diff', 'aggression', 'recipient', 'attack']

# b: user-namespace rows from the 'blocked' sample.
# r: user-namespace rows from the 'random' sample.
# After selecting `cols`, a plain dropna() drops exactly the rows that have a
# missing value in one of those columns.
b = d.query("ns=='user' and sample=='blocked'")[cols].dropna()
r = d.query("ns=='user' and sample=='random'")[cols].dropna()

In [4]:
def plot(d, label):
    """Draw distribution and spread diagnostics for one annotation column.

    Five plots are produced, in this order:

    1. histogram of the raw values in ``d[label]``
    2. histogram of the per-``rev_id`` mean of ``d[label]``
    3. histogram of ``plurality(d[label])``
    4. LOWESS regression of per-``rev_id`` std against per-``rev_id`` mean
    5. point plot of per-``rev_id`` std against the plurality value

    The first histogram is drawn on whatever figure is current; every later
    plot opens a new figure.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain a 'rev_id' column and a column named ``label``.
        NOTE(review): ``plurality`` receives only ``d[label]``, so it
        presumably groups on the index, which would then have to carry the
        rev_id values as well -- confirm against its definition.
    label : str
        Column to plot. 'aggression' selects 0..2 in steps of 0.1 as the
        category order of the final point plot; any other label uses 0..1.

    Returns
    -------
    None
    """
    # NOTE(review): `bins` is only used as the category order of the final
    # point plot. The histograms below use numpy's default binning.
    if label == 'aggression':
        bins = np.arange(0, 2.1, 0.1)
    else:
        bins = np.arange(0, 1.1, 0.1)

    def nice_hist(myarray):
        """Bar-plot a histogram whose bar heights are per-bin fractions."""
        # `density=True` replaces the `normed` keyword, which current NumPy
        # releases no longer accept.
        results, edges = np.histogram(myarray, density=True)
        binWidth = edges[1] - edges[0]
        # density * bin width = fraction of observations in the bin.
        # align='edge' anchors each bar at its bin's left edge; without it,
        # matplotlib >= 2.0 centres bars on x and shifts the histogram left
        # by half a bin.
        plt.bar(edges[:-1], results * binWidth, binWidth, align='edge')

    # Group on the rev_id *values* instead of the name 'rev_id': when the
    # frame's index is also named 'rev_id', grouping by name is ambiguous and
    # raises ValueError on current pandas.
    per_rev = d.groupby(d['rev_id'].values)[label]
    rev_mean = per_rev.mean()
    rev_std = per_rev.std()
    # Computed once and reused by the histogram and the point plot.
    rev_plurality = plurality(d[label])

    nice_hist(d[label].values)
    plt.xlabel('%s raw annotations' % label)
    plt.ylabel('fraction')

    plt.figure()
    nice_hist(rev_mean.values)
    plt.xlabel('%s mean' % label)
    plt.ylabel('fraction')

    plt.figure()
    nice_hist(rev_plurality.values)
    plt.xlabel('%s plurality' % label)
    plt.ylabel('fraction')

    # Spread of the annotations as a function of their per-rev_id mean.
    plt.figure()
    d_temp = pd.DataFrame({'mean': rev_mean, 'std': rev_std})
    sns.regplot(x='mean', y='std', data=d_temp, lowess=True)

    # Spread of the annotations for each plurality value.
    plt.figure()
    d_temp = pd.DataFrame({'plurality': rev_plurality, 'std': rev_std})
    sns.pointplot(x='plurality', y='std', data=d_temp, order=bins)

In [12]:
# paper special

# NOTE(review): `bins` is defined here but not passed to the histogram below.
bins = np.arange(0, 1.1, 0.1)

def nice_hist(myarray):
    """Bar-plot a histogram whose bar heights are per-bin fractions."""
    # `density=True` replaces the `normed` keyword, which current NumPy
    # releases no longer accept.
    results, edges = np.histogram(myarray, density=True)
    binWidth = edges[1] - edges[0]
    # density * bin width = fraction of observations in the bin.
    # align='edge' anchors each bar at its bin's left edge; without it,
    # matplotlib >= 2.0 centres bars on x and shifts the histogram left by
    # half a bin.
    plt.bar(edges[:-1], results * binWidth, binWidth, align='edge')


# Mean 'attack' annotation per rev_id. Grouping on the rev_id values (not the
# name 'rev_id') avoids the index-level/column-label ambiguity that current
# pandas rejects when the index is also named 'rev_id'.
x = d.groupby(d['rev_id'].values)['attack'].mean()
plt.figure()
nice_hist(x)

plt.xlabel("Annotator agreement")
plt.ylabel("Fraction of comments")
plt.savefig("../../paper/figs/agreement.png")



In [7]:
# Inner value_counts: number of rows for each rev_id.
# Outer value_counts: number of rev_ids having each of those row counts.
d.rev_id.value_counts().value_counts()


Out[7]:
10    55156
9     29478
8      7856
19     6905
20     6197
18     3603
29     1378
28     1336
17     1222
27      776
30      717
26      371
16      328
25      135
15       92
24       40
14       37
23       25
13       19
22       18
12       14
11       11
31        4
36        4
39        4
21        3
37        3
45        1
34        1
35        1
38        1
46        1
Name: rev_id, dtype: int64

Recipient


In [5]:
# 'recipient' diagnostics for the blocked sample.
plot(d=b, label='recipient')



In [6]:
# 'recipient' diagnostics for the random sample.
plot(d=r, label='recipient')


/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/numpy/lib/function_base.py:3569: RuntimeWarning: Invalid value encountered in median
  RuntimeWarning)

Aggression


In [7]:
# 'aggression' diagnostics for the blocked sample.
plot(d=b, label='aggression')



In [8]:
# 'aggression' diagnostics for the random sample.
plot(d=r, label='aggression')


/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/numpy/lib/function_base.py:3569: RuntimeWarning: Invalid value encountered in median
  RuntimeWarning)