notebook.community

Edit and run



In [1]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import os
import sys, os
sys.path.insert(0, os.path.abspath('..'))
import data_generation.diff_utils
import data_generation.mwdiff.mwdiffs_to_tsv
import numpy as np



In [2]:

    
# Load the per-split annotation files and stack them into a single frame.
# Every row is tagged with the split ("train" / "dev" / "test") it was read from.
out_dir = "../../data/figshare"
in_dir = "../../data/annotations/split"
splits = ["train", "dev", "test"]

dfs = [
    pd.read_csv(os.path.join(in_dir, split_name, 'annotations.tsv'), sep='\t')
      .assign(split=split_name)  # new column is appended after the existing ones
    for split_name in splits
]
df = pd.concat(dfs)
df.shape









    Out[2]:





(1368958, 24)



In [3]:

    
# Anonymise annotators: every distinct `_worker_id` gets a sequential integer
# `anon_id`, numbered in the order the ids first appear in `df`.
df_workers = (
    df[['_worker_id']]
    .drop_duplicates()
    .assign(anon_id=lambda workers: range(workers.shape[0]))
)

# Attach the anonymous id to every annotation row, then give the attack-type
# columns explicit `*_attack` names.
# NOTE: `df` is overwritten here, so this cell is meant to run once per load.
df = (
    df
    .merge(df_workers, how='inner', on='_worker_id')
    .rename(columns={
        'other': 'other_attack',
        'quoting': 'quoting_attack',
        'recipient': 'recipient_attack',
        'third_party': 'third_party_attack',
    })
)

df.shape









    Out[3]:





(1368958, 25)



In [5]:

    
# save worker id mapping
# The same `_worker_id` -> `anon_id` table is written once per dataset
# (attack and aggression). `out_dir` is the output directory set in the
# data-loading cell; using it keeps every writer pointed at one location.
# NOTE(review): this file maps the anonymous ids back to the raw worker ids;
# confirm it is meant to sit in the same directory as the other output files.
for file_name in ('attack_annotations_worker_id_map.tsv',
                  'aggression_annotations_worker_id_map.tsv'):
    df_workers.to_csv(os.path.join(out_dir, file_name), sep='\t', index=False)



In [4]:

    
# get set of labeled comments
# Keep one row per revision (the first occurrence of each rev_id). The .copy()
# makes the column assignments below act on an independent frame.
df_comments = df.drop_duplicates(subset=['rev_id']).copy()
# A comment is flagged as logged-in when its user_id is not null.
df_comments['logged_in'] = df_comments['user_id'].notnull()
# Extract the year with the vectorised .dt accessor rather than a per-row lambda.
df_comments['year'] = pd.to_datetime(df_comments['rev_timestamp']).dt.year



In [5]:

    
# Fix legacy special-token issues in the raw diff text.
# First run the project's replace_special_chars helper over every diff, then
# apply the plain substring substitutions below, in this order.
# NOTE(review): these are literal replacements, so running this cell again on
# already-converted text would turn TAB_TOKEN into TAB_TOKEN_TOKEN (likewise
# for NEWLINE_TOKEN).
legacy_token_fixes = (
    ('TAB', 'TAB_TOKEN'),
    ('NEWLINE', 'NEWLINE_TOKEN'),
    ('"', '`'),
)

diff_text = df_comments['diff'].apply(data_generation.mwdiff.mwdiffs_to_tsv.replace_special_chars)
for legacy, current in legacy_token_fixes:
    # Bind the pair as lambda defaults so each pass uses its own values.
    diff_text = diff_text.apply(
        lambda text, legacy=legacy, current=current: text.replace(legacy, current)
    )
df_comments['diff'] = diff_text

# Re-run the current clean_and_filter implementation over the comments.
df_comments = data_generation.diff_utils.clean_and_filter(df_comments)
# clean_and_filter drops some comments; the inner join on rev_id removes the
# annotation rows whose comment no longer exists.
df = pd.merge(df, df_comments[['rev_id']], how='inner', on='rev_id')



In [6]:

    
# Display the column labels currently present in `df`.
df.columns









    Out[6]:





Index(['rev_id', '_worker_id', 'ns', 'sample', 'src', 'clean_diff', 'diff',
       'insert_only', 'page_id', 'page_title', 'rev_comment', 'rev_timestamp',
       'user_id', 'user_text', 'not_attack', 'other_attack', 'quoting_attack',
       'recipient_attack', 'third_party_attack', 'attack', 'aggression',
       'aggression_score', 'counts', 'split', 'anon_id'],
      dtype='object')



In [7]:

    
# Shape the comment table: rename columns, keep only the columns listed in
# `order` (in that order), and sort rows by revision id.
# `rev_timestamp` is renamed to `timestamp`, but it is not in `order`, so it
# does not survive the column selection.
order = ['rev_id', 'comment', 'year', 'logged_in', 'ns', 'sample', 'split']
df_comments = (
    df_comments
    .rename(columns={'clean_diff': 'comment', 'rev_timestamp': 'timestamp'})[order]
    .sort_values('rev_id')
)
df_comments.shape









    Out[7]:





(115864, 7)



In [15]:

    
# Build the two per-annotation label tables from `df`.
# Both expose the anonymous annotator id as `worker_id` and are sorted by
# revision id.

# Attack labels: the four attack-type columns plus the overall `attack` column.
df_attack_labels = (
    df[['rev_id', 'anon_id', 'quoting_attack', 'recipient_attack',
        'third_party_attack', 'other_attack', 'attack']]
    .rename(columns={'anon_id': 'worker_id'})
    .sort_values('rev_id')
)

# Aggression labels: the `aggression` column and its `aggression_score`.
df_aggression_labels = (
    df[['rev_id', 'anon_id', 'aggression', 'aggression_score']]
    .rename(columns={'anon_id': 'worker_id'})
    .sort_values('rev_id')
)



In [16]:

    
# save dfs
# All outputs go to `out_dir`, the output directory set in the data-loading
# cell, instead of repeating the path literal on every line.

# The same comment table is written under both the attack and aggression names.
for file_name in ('attack_annotated_comments.tsv',
                  'aggression_annotated_comments.tsv'):
    df_comments.to_csv(os.path.join(out_dir, file_name), sep='\t', index=False)

# The label tables differ per task, so each is written to its own file.
df_attack_labels.to_csv(os.path.join(out_dir, 'attack_annotations.tsv'), sep='\t', index=False)
df_aggression_labels.to_csv(os.path.join(out_dir, 'aggression_annotations.tsv'), sep='\t', index=False)



In [3]:

    
# Sanity check: read the saved comment table back and show (rows, columns).
saved_comments_path = os.path.join("../../data/figshare", 'attack_annotated_comments.tsv')
pd.read_csv(saved_comments_path, sep='\t').shape









    Out[3]:





(115864, 7)



In [4]:

    
# Sanity check: read the saved attack annotations back, keep the first row per
# rev_id, and show the shape. The row count is the number of distinct
# revisions that have at least one annotation.
saved_annotations_path = os.path.join("../../data/figshare", 'attack_annotations.tsv')
pd.read_csv(saved_annotations_path, sep='\t').drop_duplicates(subset='rev_id').shape









    Out[4]:





(115864, 7)



In [19]:

    
# Count how many comments have each value of the `logged_in` flag.
df_comments['logged_in'].value_counts()









    Out[19]:





True     84918
False    30946
Name: logged_in, dtype: int64



In [20]:

    
# Preview the first rows of the comment table.
df_comments.head()









    Out[20]:






  
    
      
      rev_id
      comment
      year
      logged_in
      ns
      sample
      split
    
  
  
    
      135608
      37675
      `-NEWLINE_TOKENThis is not ``creative``.  Thos...
      2002
      False
      article
      random
      train
    
    
      600662
      44816
      `NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...
      2002
      False
      article
      random
      train
    
    
      323593
      49851
      NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...
      2002
      False
      article
      random
      train
    
    
      487679
      89320
      Next, maybe you could work on being less cond...
      2002
      True
      article
      random
      dev
    
    
      126387
      93890
      This page will need disambiguation.
      2002
      True
      article
      random
      train



In [21]:

    
# Preview the first rows of the attack label table.
df_attack_labels.head()









    Out[21]:






  
    
      
      rev_id
      worker_id
      quoting_attack
      recipient_attack
      third_party_attack
      other_attack
      attack
    
  
  
    
      854386
      37675
      1362
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      854389
      37675
      2408
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      854388
      37675
      1493
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      854387
      37675
      1439
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      854380
      37675
      170
      0.0
      0.0
      0.0
      0.0
      0.0

	rev_id	comment	year	logged_in	ns	sample	split
135608	37675	`-NEWLINE_TOKENThis is not ``creative``. Thos...	2002	False	article	random	train
600662	44816	`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...	2002	False	article	random	train
323593	49851	NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...	2002	False	article	random	train
487679	89320	Next, maybe you could work on being less cond...	2002	True	article	random	dev
126387	93890	This page will need disambiguation.	2002	True	article	random	train

	rev_id	worker_id
854386	37675	1362
854389	37675	2408
854388	37675	1493
854387	37675	1439
854380	37675	170