In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import os
import sys, os
sys.path.insert(0, os.path.abspath('..'))
import data_generation.diff_utils
import data_generation.mwdiff.mwdiffs_to_tsv
import numpy as np
In [2]:
# load split data
# Directories are relative to the notebook location.
out_dir = "../../data/figshare"
in_dir = "../../data/annotations/split"
splits = ["train", "dev", "test"]

# Read each split's annotations.tsv and tag every row with the split it came from.
dfs = [
    pd.read_csv(os.path.join(in_dir, split_name, 'annotations.tsv'), sep='\t').assign(split=split_name)
    for split_name in splits
]

# Stack the per-split frames into a single annotation frame.
df = pd.concat(dfs)
df.shape
Out[2]:
In [3]:
# rename workers
# Build a mapping from each distinct _worker_id to a sequential anon_id
# (0, 1, 2, ... in order of first appearance in df).
df_workers = df[['_worker_id']].drop_duplicates()
df_workers = df_workers.assign(anon_id=range(len(df_workers)))

# Suffix the four attack-type columns with "_attack".
attack_type_renames = {
    name: name + '_attack'
    for name in ('other', 'quoting', 'recipient', 'third_party')
}

# Attach anon_id to every annotation row, then apply the column renames.
df = (
    pd.merge(df, df_workers, on='_worker_id', how='inner')
    .rename(columns=attack_type_renames)
)
df.shape
Out[3]:
In [5]:
# save worker id mapping
# The same mapping is written twice, once under each file prefix.
# Uses out_dir (defined in the "load split data" cell) instead of repeating
# the output directory literal.
for prefix in ('attack', 'aggression'):
    df_workers.to_csv(
        os.path.join(out_dir, prefix + '_annotations_worker_id_map.tsv'),
        sep='\t',
        index=False,
    )
In [4]:
# get set of labeled comments
# One row per rev_id (drop_duplicates keeps the first occurrence); .copy() so
# the column assignments below write to an independent frame.
df_comments = df.drop_duplicates(subset=['rev_id']).copy()
# logged_in is True wherever user_id is non-null.
df_comments['logged_in'] = df_comments['user_id'].notnull()
# Extract the year with the vectorized .dt accessor rather than a per-row lambda.
df_comments['year'] = pd.to_datetime(df_comments['rev_timestamp']).dt.year
In [5]:
# fix legacy special token issues
def _fix_legacy_tokens(text):
    """Run replace_special_chars, then rewrite legacy token names and double quotes.

    The replacement order matters and matches the original step-by-step passes:
    TAB -> TAB_TOKEN, NEWLINE -> NEWLINE_TOKEN, then double quote -> backtick.
    """
    text = data_generation.mwdiff.mwdiffs_to_tsv.replace_special_chars(text)
    text = text.replace('TAB', 'TAB_TOKEN')
    text = text.replace('NEWLINE', 'NEWLINE_TOKEN')
    return text.replace('"', '`')


df_comments['diff'] = df_comments['diff'].apply(_fix_legacy_tokens)

# apply latest version of clean and filter
df_comments = data_generation.diff_utils.clean_and_filter(df_comments)

# clean and filter drops some comments, so drop associated labels:
# the inner join keeps only label rows whose rev_id survived.
surviving_rev_ids = df_comments[['rev_id']]
df = pd.merge(df, surviving_rev_ids, on='rev_id', how='inner')
In [6]:
# inspect the columns currently present in the label frame df
df.columns
Out[6]:
In [7]:
# rename some columns
column_renames = {
    'clean_diff': 'comment',
    'rev_timestamp': 'timestamp',
}
# Columns to keep, in output order. 'timestamp' is not listed, so it is
# dropped by the selection below.
order = ['rev_id', 'comment', 'year', 'logged_in', 'ns', 'sample', 'split']

# Rename, select the ordered columns, and sort rows by rev_id.
df_comments = df_comments.rename(columns=column_renames)[order].sort_values('rev_id')
df_comments.shape
Out[7]:
In [15]:
# get set of human labels
# Both label frames expose the anonymized id under the name worker_id.
worker_id_rename = {'anon_id': 'worker_id'}

attack_columns = [
    'rev_id', 'anon_id', 'quoting_attack',
    'recipient_attack', 'third_party_attack', 'other_attack', 'attack',
]
df_attack_labels = (
    df[attack_columns]
    .rename(columns=worker_id_rename)
    .sort_values('rev_id')
)

aggression_columns = ['rev_id', 'anon_id', 'aggression', 'aggression_score']
df_aggression_labels = (
    df[aggression_columns]
    .rename(columns=worker_id_rename)
    .sort_values('rev_id')
)
In [16]:
# save dfs
# df_comments is written under both file prefixes; each label frame is written
# under its own. Uses out_dir (defined in the "load split data" cell) instead
# of repeating the output directory literal.
outputs = [
    (df_comments, 'attack_annotated_comments.tsv'),
    (df_comments, 'aggression_annotated_comments.tsv'),
    (df_attack_labels, 'attack_annotations.tsv'),
    (df_aggression_labels, 'aggression_annotations.tsv'),
]
for frame, filename in outputs:
    frame.to_csv(os.path.join(out_dir, filename), sep='\t', index=False)
In [3]:
# read the saved comments file back from out_dir and show its shape
pd.read_csv(os.path.join(out_dir, 'attack_annotated_comments.tsv'), sep='\t').shape
Out[3]:
In [4]:
# read the saved attack labels back from out_dir and show the shape after
# keeping one row per rev_id
saved_attack_labels = pd.read_csv(os.path.join(out_dir, 'attack_annotations.tsv'), sep='\t')
saved_attack_labels.drop_duplicates(subset='rev_id').shape
Out[4]:
In [19]:
# count comments by the logged_in flag
df_comments['logged_in'].value_counts()
Out[19]:
In [20]:
# preview the first rows of the comments frame
df_comments.head()
Out[20]:
In [21]:
# preview the first rows of the attack label frame
df_attack_labels.head()
Out[21]: