In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import os
import sys, os
sys.path.insert(0, os.path.abspath('..'))
import data_generation.diff_utils
import data_generation.mwdiff.mwdiffs_to_tsv
import numpy as np

In [2]:
# Read the annotation file of every split and stack them into one frame,
# tagging each row with the split it was read from.
out_dir = "../../data/figshare"
in_dir = "../../data/annotations/split"
splits = ["train", "dev", "test"]

dfs = [
    pd.read_csv(os.path.join(in_dir, split, 'annotations.tsv'), sep='\t').assign(split=split)
    for split in splits
]
df = pd.concat(dfs)
df.shape


Out[2]:
(1368958, 24)

In [3]:
# Give every distinct _worker_id a sequential integer id (anon_id), numbered
# in the order the workers first appear in df, then attach it to every row.
df_workers = (
    df[['_worker_id']]
    .drop_duplicates()
    .assign(anon_id=lambda workers: range(workers.shape[0]))
)
df = pd.merge(df, df_workers, on='_worker_id', how='inner')

# Suffix the attack sub-type label columns with "_attack".
attack_subtypes = ['other', 'quoting', 'recipient', 'third_party']
df = df.rename(columns={name: name + '_attack' for name in attack_subtypes})

df.shape


Out[3]:
(1368958, 25)

In [5]:
# Save the worker id mapping (_worker_id -> anon_id) once per dataset name,
# writing into the shared `out_dir` instead of repeating the path literal.
# NOTE(review): these files contain the original _worker_id next to anon_id;
# confirm they are not published together with the anonymized annotations.
for dataset in ('attack', 'aggression'):
    df_workers.to_csv(
        os.path.join(out_dir, dataset + '_annotations_worker_id_map.tsv'),
        sep='\t',
        index=False,
    )

In [4]:
# Get the set of labeled comments: keep one row per rev_id (the first one
# encountered) and copy it so the new columns below are added to an
# independent frame rather than to a slice of df.
df_comments = df.drop_duplicates(subset=['rev_id']).copy()
# logged_in is True wherever a user_id is present.
# NOTE(review): presumably a missing user_id means an anonymous edit; confirm.
df_comments['logged_in'] = df_comments['user_id'].notnull()
# Vectorised year extraction via the .dt accessor instead of a per-row lambda.
df_comments['year'] = pd.to_datetime(df_comments['rev_timestamp']).dt.year

In [5]:
# Fix legacy special token issues in the raw diff text.
def _legacy_tokens_to_current(text):
    """Rewrite the old TAB / NEWLINE markers and double quotes in one pass.

    Not idempotent: 'TAB' is a substring of 'TAB_TOKEN' (likewise NEWLINE), so
    applying this twice to the same text yields 'TAB_TOKEN_TOKEN'.
    """
    text = text.replace('TAB', 'TAB_TOKEN')
    text = text.replace('NEWLINE', 'NEWLINE_TOKEN')
    return text.replace('"', '`')


df_comments['diff'] = (
    df_comments['diff']
    .apply(data_generation.mwdiff.mwdiffs_to_tsv.replace_special_chars)
    .apply(_legacy_tokens_to_current)
)

# Re-run the current clean_and_filter over the comments.
df_comments = data_generation.diff_utils.clean_and_filter(df_comments)
# clean_and_filter drops some comments; keep only the annotation rows whose
# rev_id survived.
df = pd.merge(df, df_comments[['rev_id']], on='rev_id', how='inner')

In [6]:
# Inspect the columns currently available on the annotation frame.
df.columns


Out[6]:
Index(['rev_id', '_worker_id', 'ns', 'sample', 'src', 'clean_diff', 'diff',
       'insert_only', 'page_id', 'page_title', 'rev_comment', 'rev_timestamp',
       'user_id', 'user_text', 'not_attack', 'other_attack', 'quoting_attack',
       'recipient_attack', 'third_party_attack', 'attack', 'aggression',
       'aggression_score', 'counts', 'split', 'anon_id'],
      dtype='object')

In [7]:
# Shape the comment table for release: rename columns, keep only the columns
# listed in `order` (in that order), and sort by rev_id.
# rev_timestamp is renamed to 'timestamp' but is not part of `order`, so it
# does not appear in the resulting frame.
order = ['rev_id', 'comment', 'year', 'logged_in', 'ns', 'sample', 'split']
df_comments = (
    df_comments
    .rename(columns={'clean_diff': 'comment', 'rev_timestamp': 'timestamp'})
    .loc[:, order]
    .sort_values('rev_id')
)
df_comments.shape


Out[7]:
(115864, 7)

In [15]:
# Build the per-annotation label tables (one row per rev_id / worker pair),
# exposing the anonymized worker id under the name 'worker_id'.
attack_columns = ['rev_id', 'anon_id', 'quoting_attack', 'recipient_attack',
                  'third_party_attack', 'other_attack', 'attack']
aggression_columns = ['rev_id', 'anon_id', 'aggression', 'aggression_score']
worker_id_rename = {'anon_id': 'worker_id'}

df_attack_labels = (
    df[attack_columns]
    .rename(columns=worker_id_rename)
    .sort_values('rev_id')
)

df_aggression_labels = (
    df[aggression_columns]
    .rename(columns=worker_id_rename)
    .sort_values('rev_id')
)

In [16]:
# Save the released tables into the shared `out_dir`.
# The same comment table is written under both dataset names; each dataset
# additionally gets its own annotation (label) table.
for dataset, labels in (('attack', df_attack_labels),
                        ('aggression', df_aggression_labels)):
    df_comments.to_csv(
        os.path.join(out_dir, dataset + '_annotated_comments.tsv'),
        sep='\t',
        index=False,
    )
    labels.to_csv(
        os.path.join(out_dir, dataset + '_annotations.tsv'),
        sep='\t',
        index=False,
    )

In [3]:
# Sanity check: read the saved comment table back from disk and show its shape.
saved_comments_path = os.path.join("../../data/figshare", 'attack_annotated_comments.tsv')
pd.read_csv(saved_comments_path, sep='\t').shape


Out[3]:
(115864, 7)

In [4]:
# Sanity check: read the saved attack annotations back and count distinct
# rev_ids (one row kept per rev_id) for comparison with the comment table.
saved_annotations_path = os.path.join("../../data/figshare", 'attack_annotations.tsv')
pd.read_csv(saved_annotations_path, sep='\t').drop_duplicates(subset='rev_id').shape


Out[4]:
(115864, 7)

In [19]:
# Count the comments by their logged_in flag.
df_comments['logged_in'].value_counts()


Out[19]:
True     84918
False    30946
Name: logged_in, dtype: int64

In [20]:
# Preview the first rows of the comment table.
df_comments.head()


Out[20]:
rev_id comment year logged_in ns sample split
135608 37675 `-NEWLINE_TOKENThis is not ``creative``. Thos... 2002 False article random train
600662 44816 `NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand... 2002 False article random train
323593 49851 NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s... 2002 False article random train
487679 89320 Next, maybe you could work on being less cond... 2002 True article random dev
126387 93890 This page will need disambiguation. 2002 True article random train

In [21]:
# Preview the first rows of the attack label table.
df_attack_labels.head()


Out[21]:
rev_id worker_id quoting_attack recipient_attack third_party_attack other_attack attack
854386 37675 1362 0.0 0.0 0.0 0.0 0.0
854389 37675 2408 0.0 0.0 0.0 0.0 0.0
854388 37675 1493 0.0 0.0 0.0 0.0 0.0
854387 37675 1439 0.0 0.0 0.0 0.0 0.0
854380 37675 170 0.0 0.0 0.0 0.0 0.0