In [3]:
import pandas as pd
In [26]:
# Load the annotations from the train split (tab-separated file).
# `sep` must be passed by keyword: the positional form was deprecated in
# pandas 1.3 and raises TypeError in pandas 2.0+.
path = '../../data/annotations/split/train/annotations.tsv'
# Index rows by rev_id while keeping rev_id as a regular column (drop=False),
# so later cells can both group on the index and de-duplicate on the column.
df = pd.read_csv(path, sep='\t').set_index('rev_id', drop=False)
print('# of annotations' , df.shape[0])
In [27]:
# Text used for classification: keep only the first row seen for each rev_id
# and take its 'clean_diff' value, giving one entry per revision.
comments = df.loc[~df.duplicated(subset='rev_id'), 'clean_diff']
print('# of revs/comments' , len(comments))
In [28]:
# Preview the first five entries of the de-duplicated comment series.
comments.head(n=5)
Out[28]:
In [ ]:
# get annotations for a certain task
# valid tasks are 'attack', 'recipient', 'aggression'
# note that each comment/revision has multiple annotations
In [29]:
# Select the 'attack' column for every annotation row; the frame's index is
# carried over, so the same revision may occur on several rows.
attack_annotations = df.loc[:, 'attack']
attack_annotations.head(n=5)
Out[29]:
In [ ]:
# get attack labels
# label a comment as an attack if more than half of its annotations say it's an attack
In [30]:
# Majority vote per index value: average the annotations that share an index
# entry, mark the entry 1 when that mean is strictly above 0.5, else 0.
attack_labels = (
    attack_annotations
    .groupby(attack_annotations.index)
    .mean()
    .gt(0.5)
    .astype(int)
)
# Show how many entries fall into each label.
attack_labels.value_counts()
Out[30]: