notebook.community

Edit and run



In [3]:

    
import pandas as pd



In [26]:

    
# Load the annotations of the train split (one row per annotation).
# The file is tab-separated. `sep` is passed by keyword: pandas >= 2.0 accepts
# only the path positionally, so the old `pd.read_csv(path, '\t')` form raises
# a TypeError there.
path = '../../data/annotations/split/train/annotations.tsv'
df = pd.read_csv(path, sep='\t')
# Use the revision id as the row index (the 'rev_id' column is kept as well),
# so annotations belonging to the same revision share an index label.
df.index = df.rev_id
print('# of annotations' , df.shape[0])









    



# of annotations 764050



In [27]:

    
# Text used for classification: keep the first row seen for every rev_id
# and take its 'clean_diff' column, giving one entry per revision.
comments = df.drop_duplicates(subset=['rev_id']).loc[:, 'clean_diff']
print('# of revs/comments', len(comments))









    



# of revs/comments 69708



In [28]:

    
# Preview the first five comment texts.
comments.head(n=5)









    Out[28]:





rev_id
379377106     Social media campaign \n\nHi Bod!!!! great to...
560366988     General Brotherton \n\nHi,\n\nAny chance you ...
537948       Hi Alex\n\nJust thought I ought to mention tha...
533417386    "\n\n AfC help desk \n\nCould you please take ...
74926009     I see you have been experimenting with Wikiped...
Name: clean_diff, dtype: object



In [ ]:

    
# get annotations for a certain task
# valid tasks are 'attack', 'recipient', 'aggression'
# note that each comment/revision has multiple annotations



In [29]:

    
# Select the 'attack' column; rows are still one per annotation, so the same
# rev_id appears several times in the index.
attack_annotations = df.loc[:, 'attack']
attack_annotations.head(n=5)









    Out[29]:





rev_id
379377106    0.0
379377106    0.0
379377106    0.0
379377106    0.0
379377106    0.0
Name: attack, dtype: float64



In [ ]:

    
# get attack labels
# label a comment as an attack if more than half of its annotations say it's an attack



In [30]:

    
# Majority vote per revision: average the attack annotations that share an
# index label and mark the revision 1 when that mean is strictly above 0.5,
# otherwise 0.
attack_labels = (
    attack_annotations
    .groupby(level=0)
    .mean()
    .gt(0.5)
    .astype(int)
)
attack_labels.value_counts()









    Out[30]:





0    61625
1     8083
Name: attack, dtype: int64