In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime

import numpy as np
import pandas as pd
import joblib

import datetime
import os
import numpy as np
import time
import multiprocessing as mp
import re

In [2]:
import inspect, os, sys

# Make the parent package importable when running this notebook from its
# subdirectory, so `data_generation` resolves without installing the project.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
# Fixed: was `os.sys.path.insert(...)` — `os.sys` is an accidental re-export
# of the sys module; import and use sys directly.
sys.path.insert(0, parentdir)
from data_generation.diff_utils import clean_and_filter


time: 78.4 ms

Build Models


In [3]:
from baselines import plurality, average
from serialization import load_pipeline


Using TensorFlow backend.
time: 1.9 s

Load Training Data


In [4]:
all_annotations = pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t')
all_annotations.index = all_annotations.rev_id


time: 13.2 s

Load models


In [10]:
model_name = 'linear_char_ed_train'
model_type = 'linear_char_ed'

tasks = ['aggression', 'attack', 'recipient']

model_dict = {}
calibrator_dict = {}

for task in tasks:
    path = '../../models/%s/%s' % (task, model_type)
    model_dict[task] = load_pipeline(path, model_name)
    calibrator_dict[task] = joblib.load(os.path.join(path, 'calibrator'))


time: 20.7 s

In [12]:
def apply_models(df):
    diffs = df['clean_diff']
    for task, model in model_dict.items():
        scores = model.predict_proba(diffs)[:,1]
        df['pred_%s_score_uncalibrated' % task] = scores
        df['pred_%s_score_calibrated' % task] = calibrator_dict[task].transform(scores)
    return df


time: 3.29 ms

Load annotated diffs


In [7]:
cols = ['rev_id', 'ns', 'sample', 'src', 'clean_diff', 'diff', 'page_id', 'page_title', 'rev_comment', 'rev_timestamp', 'user_id', 'user_text']
for ns in ['user', 'article']:

    d_annotations = all_annotations.query("sample=='random' and ns=='%s'" % ns)
    d_annotated = d_annotations\
                .drop_duplicates(subset=['rev_id'])[cols]\
                .assign(
                    recipient = plurality(d_annotations['recipient'].dropna()),
                    recipient_score = average(d_annotations['recipient'].dropna()),
                    aggression = plurality(d_annotations['aggression'].dropna()),
                    aggression_score = average(d_annotations['aggression'].dropna()),
                    attack = plurality(d_annotations['attack'].dropna()),
                    attack_score = average(d_annotations['attack'].dropna())
            )

    d_annotated.to_csv('../../data/samples/%s/clean/d_annotated.tsv' % ns, sep = '\t', index = False)
del all_annotations


time: 41.5 s

Load samples and apply models

We take various diff datasets from Hive, apply the clean-and-filter function, and then score the clean diffs using the models.


In [8]:
def pred_helper(df):
    if len(df) == 0:
        return None
    
    return df.assign(rev_timestamp = lambda x: pd.to_datetime(x.rev_timestamp),
                     clean_diff = lambda x: x['clean_diff'].astype(str))\
             .pipe(apply_models)

    
def prep_in_parallel(path, k = 8):
    """Read a TSV of diffs, score it in ~10k-row chunks, return one frame.

    Args:
        path: TSV file containing a 'clean_diff' column plus the other
            diff metadata columns pred_helper expects.
        k: legacy worker count, kept for interface compatibility; chunks
            are currently scored serially (the mp.Pool path was disabled,
            presumably because the models don't pickle — TODO confirm
            before re-enabling parallelism).

    Returns:
        pd.DataFrame: concatenation of all scored chunks.
    """
    df = pd.read_csv(path, sep = '\t', encoding = 'utf-8')
    m = df.shape[0]
    # Fixed: clamp to at least one group. With m < 10000 the old
    # int(m / 10000.0) gave n_groups == 0 and np.random.randint(0, high=0)
    # raised ValueError, so small files crashed.
    n_groups = max(1, m // 10000)
    df['key'] = np.random.randint(0, high=n_groups, size=m)
    chunks = [group for _, group in df.groupby('key')]
    scored = [pred_helper(chunk) for chunk in chunks]
    return pd.concat(scored)


time: 11.8 ms

In [13]:
base = '../../data/comments/'
nss = ['user', 'article']
samples = ['d_annotated.tsv', 'talk_diff_no_admin_sample.tsv', 'talk_diff_no_admin_2015.tsv', 'all_blocked_user.tsv']

base_cols = ['rev_id',
             'clean_diff',
             'rev_timestamp',
             'pred_aggression_score_uncalibrated',
             'pred_recipient_score_uncalibrated',
             'pred_attack_score_uncalibrated',
             'pred_aggression_score_calibrated',
             'pred_recipient_score_calibrated',
             'pred_attack_score_calibrated',
             'page_title',
             'user_text',
             'user_id'
            ]

extra_cols = ['recipient', 'recipient_score', 'aggression', 'aggression_score', 'attack', 'attack_score']

for ns in nss:
    for s in samples:
        inf = os.path.join(base, ns, 'clean', s)
        print(inf)
        outf = os.path.join(base, ns, 'scored', s)
        if s == 'd_annotated.tsv':
            cols = base_cols + extra_cols
        else:
            cols = base_cols
        prep_in_parallel(inf, k = 4)[cols].to_csv(outf, sep = '\t', index = False)


../../data/samples/user/clean/d_annotated.tsv
../../data/samples/user/clean/talk_diff_no_admin_sample.tsv
../../data/samples/user/clean/talk_diff_no_admin_2015.tsv
../../data/samples/user/clean/all_blocked_user.tsv
../../data/samples/article/clean/d_annotated.tsv
../../data/samples/article/clean/talk_diff_no_admin_sample.tsv
../../data/samples/article/clean/talk_diff_no_admin_2015.tsv
../../data/samples/article/clean/all_blocked_user.tsv
time: 8h 32min 10s

In [ ]: