In [3]:
import pandas as pd

In [26]:
# Load the train-split annotations (tab-separated; one row per annotation).
path = '../../data/annotations/split/train/annotations.tsv'
# `sep` must be passed by keyword: a positional separator is deprecated since
# pandas 1.3 and raises TypeError on pandas >= 2.0.
# Index the rows by rev_id while keeping rev_id as a regular column too
# (drop=False), since later cells use both the index and the column.
df = pd.read_csv(path, sep='\t').set_index('rev_id', drop=False)
print('# of annotations' , df.shape[0])


# of annotations 764050

In [27]:
# Text to classify: one 'clean_diff' per revision. Rows repeat each rev_id
# (one row per annotation), so keep only the first row seen for every rev_id.
comments = df.loc[~df.duplicated(subset='rev_id'), 'clean_diff']
print('# of revs/comments' , len(comments))


# of revs/comments 69708

In [28]:
# Preview the first five comments (indexed by rev_id).
comments.head(n=5)


Out[28]:
rev_id
379377106     Social media campaign \n\nHi Bod!!!! great to...
560366988     General Brotherton \n\nHi,\n\nAny chance you ...
537948       Hi Alex\n\nJust thought I ought to mention tha...
533417386    "\n\n AfC help desk \n\nCould you please take ...
74926009     I see you have been experimenting with Wikiped...
Name: clean_diff, dtype: object

In [ ]:
# get annotations for a certain task
# valid tasks are 'attack', 'recipient', 'aggression'
# note that each comment/revision has multiple annotations

In [29]:
# Per-annotation 'attack' judgements. The rev_id index repeats because each
# revision carries several annotations.
attack_annotations = df.loc[:, 'attack']
attack_annotations.head(n=5)


Out[29]:
rev_id
379377106    0.0
379377106    0.0
379377106    0.0
379377106    0.0
379377106    0.0
Name: attack, dtype: float64

In [ ]:
# get attack labels
# label a comment as an attack if more than half of its annotations say it's an attack

In [30]:
# Majority vote per revision: average the annotations that share a rev_id and
# label the comment 1 (attack) only when that mean is strictly above 0.5,
# otherwise 0.
attack_labels = (
    attack_annotations
    .groupby(level=0)
    .mean()
    .gt(0.5)
    .astype(int)
)
# Class balance of the resulting labels.
attack_labels.value_counts()


Out[30]:
0    61625
1     8083
Name: attack, dtype: int64