In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime

import datetime
import multiprocessing as mp
import os
import re
import shutil
import subprocess
import tarfile
import time

import joblib
import numpy as np
import pandas as pd


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 6.13 ms

Load Models


In [8]:
# Fetch and load one production classifier per task.
# `model_dict` maps task name -> fitted pipeline and is read as a global by
# `apply_models`.
tasks = ['attack', 'toxicity', 'aggression']
model_dict = {}
for task in tasks:
    # check_call raises CalledProcessError on a non-zero exit status, so a
    # failed fetch stops here instead of surfacing later as a confusing
    # joblib.load error (or silently loading a stale pickle left in /tmp).
    # The argument list also avoids building a shell command string.
    subprocess.check_call(["python", "get_prod_models.py", "--task", task])
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load model files from a trusted source.
    model_dict[task] = joblib.load("/tmp/%s_linear_char_oh_pipeline.pkl" % task)


time: 21min 31s

In [28]:
def apply_models(df):
    """Add one `pred_<task>_score` column to `df` for every loaded model.

    Each model in the module-level `model_dict` is asked for class
    probabilities on `df['comment']`; the second probability column (index 1)
    is stored as that task's score.

    The frame is modified in place and the same object is returned.
    """
    texts = df['comment']
    for name in model_dict:
        classifier = model_dict[name]
        probabilities = classifier.predict_proba(texts)
        # Column 1 of predict_proba is kept as the score.
        df['pred_%s_score' % name] = probabilities[:, 1]
    return df


time: 3.08 ms

In [34]:
def pred_helper(df):
    """Normalise one chunk of comments and score it with `apply_models`.

    Returns None for an empty chunk. Otherwise a copy of `df` is made with
    `timestamp` parsed by `pd.to_datetime` and `comment` cast to `str`, and
    that copy is handed to `apply_models`, whose result is returned.
    """
    if not len(df):
        return None

    # assign() builds a new frame, so the caller's chunk is left untouched.
    normalised = df.assign(
        timestamp = lambda frame: pd.to_datetime(frame.timestamp),
        comment = lambda frame: frame['comment'].astype(str),
    )
    return apply_models(normalised)

    
def prep_in_parallel(path, k = 8):
    """Score every comment in a tab-separated file using a pool of workers.

    The file is read into one DataFrame, each row is randomly assigned to a
    group through a new integer `key` column, and the groups are scored with
    `pred_helper` in a `multiprocessing.Pool`.

    Parameters
    ----------
    path : str
        Path of a UTF-8, tab-separated file to read with `pd.read_csv`.
    k : int, default 8
        Number of worker processes in the pool.

    Returns
    -------
    pandas.DataFrame
        The concatenated per-group results. The `key` column is kept, and
        rows come back grouped by key rather than in file order. For a file
        with no data rows, the empty frame (plus `key`) is returned unscored.

    Notes
    -----
    Group assignment uses the global NumPy random state, so the `key` values
    and the output row order differ between runs unless the caller seeds it.
    """
    df = pd.read_csv(path, sep = '\t', encoding = 'utf-8')
    m = df.shape[0]

    # Files under 15000 rows are scored as one group; larger files get
    # roughly one group per 10000 rows.
    n_groups = 1 if m < 15000 else m // 10000
    df['key'] = np.random.randint(0, high=n_groups, size=m)

    # With no rows there are no groups, and pd.concat([]) would raise
    # "No objects to concatenate" -- return early, without starting a pool.
    if m == 0:
        return df

    dfs = [group for _, group in df.groupby('key')]

    p = mp.Pool(k)
    try:
        dfs = p.map(pred_helper, dfs)
    finally:
        # Always release the worker processes, even when a worker raised.
        p.close()
        p.join()
    return pd.concat(dfs)


time: 17.9 ms

In [37]:
# Score every yearly comment archive and write one gzipped TSV per archive
# into <base>/scored/.
base = '../../data/figshare/'
nss = ['user', 'article']
years = range(2001, 2016)

# Create the output directory once, up front. The previous
# `os.system("mkdir ", path)` passed two arguments to os.system, which is a
# TypeError; makedirs is also a no-op when the directory already exists.
scored_dir = os.path.join(base, "scored")
os.makedirs(scored_dir, exist_ok=True)

for ns in nss:
    for year in years:
        dirname = "comments_%s_%d" % (ns, year)
        print(dirname)

        indir = os.path.join(base, dirname + ".tar.gz")
        outf = os.path.join(scored_dir, dirname + ".tsv.gz")

        dfs = []
        try:
            # Unpack straight from the source archive into the working
            # directory; no local copy of the tarball is needed. A missing
            # or corrupt archive raises here instead of being ignored.
            # Assumes the archive holds a top-level directory named
            # `dirname`, as the listing below requires -- TODO confirm.
            with tarfile.open(indir, "r:gz") as archive:
                archive.extractall()

            # Sorted so chunks are processed in a reproducible order
            # (os.listdir makes no ordering guarantee).
            for inf in sorted(os.listdir(dirname)):
                print(inf)
                if inf.endswith(".tsv"):
                    # `df` is left bound to the most recently scored chunk.
                    df = prep_in_parallel(os.path.join(dirname, inf), k = 8)
                    dfs.append(df)
        finally:
            # Remove the extracted files even when scoring fails part-way.
            shutil.rmtree(dirname, ignore_errors=True)

        # pd.concat([]) raises, so an archive without .tsv chunks is skipped.
        if not dfs:
            print("no .tsv chunks found in %s; skipping" % dirname)
            continue

        pd.concat(dfs).to_csv(outf, sep = '\t', index = False, compression = "gzip")


comments_user_2001
chunk_0.tsv
comments_user_2002
chunk_0.tsv
comments_user_2003
chunk_0.tsv
comments_user_2004
chunk_0.tsv
comments_user_2005
chunk_0.tsv
chunk_1.tsv
chunk_2.tsv
chunk_3.tsv
comments_user_2006
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_user_2007
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_22.tsv
chunk_23.tsv
chunk_24.tsv
chunk_25.tsv
chunk_26.tsv
chunk_27.tsv
chunk_28.tsv
chunk_29.tsv
chunk_3.tsv
chunk_30.tsv
chunk_31.tsv
chunk_32.tsv
chunk_33.tsv
chunk_34.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_user_2008
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_22.tsv
chunk_23.tsv
chunk_24.tsv
chunk_25.tsv
chunk_26.tsv
chunk_27.tsv
chunk_28.tsv
chunk_29.tsv
chunk_3.tsv
chunk_30.tsv
chunk_31.tsv
chunk_32.tsv
chunk_33.tsv
chunk_34.tsv
chunk_35.tsv
chunk_36.tsv
chunk_37.tsv
chunk_38.tsv
chunk_39.tsv
chunk_4.tsv
chunk_40.tsv
chunk_41.tsv
chunk_42.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_user_2009
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_22.tsv
chunk_23.tsv
chunk_24.tsv
chunk_25.tsv
chunk_26.tsv
chunk_27.tsv
chunk_28.tsv
chunk_29.tsv
chunk_3.tsv
chunk_30.tsv
chunk_31.tsv
chunk_32.tsv
chunk_33.tsv
chunk_34.tsv
chunk_35.tsv
chunk_36.tsv
chunk_37.tsv
chunk_38.tsv
chunk_39.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_user_2010
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_22.tsv
chunk_23.tsv
chunk_24.tsv
chunk_25.tsv
chunk_26.tsv
chunk_27.tsv
chunk_28.tsv
chunk_29.tsv
chunk_3.tsv
chunk_30.tsv
chunk_31.tsv
chunk_32.tsv
chunk_33.tsv
chunk_34.tsv
chunk_35.tsv
chunk_36.tsv
chunk_37.tsv
chunk_38.tsv
chunk_39.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_user_2011
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_22.tsv
chunk_23.tsv
chunk_24.tsv
chunk_25.tsv
chunk_26.tsv
chunk_27.tsv
chunk_28.tsv
chunk_29.tsv
chunk_3.tsv
chunk_30.tsv
chunk_31.tsv
chunk_32.tsv
chunk_33.tsv
chunk_34.tsv
chunk_35.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_user_2012
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_22.tsv
chunk_23.tsv
chunk_24.tsv
chunk_25.tsv
chunk_26.tsv
chunk_27.tsv
chunk_28.tsv
chunk_29.tsv
chunk_3.tsv
chunk_30.tsv
chunk_31.tsv
chunk_32.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_user_2013
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_22.tsv
chunk_23.tsv
chunk_24.tsv
chunk_25.tsv
chunk_26.tsv
chunk_27.tsv
chunk_28.tsv
chunk_29.tsv
chunk_3.tsv
chunk_30.tsv
chunk_31.tsv
chunk_32.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_user_2014
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_22.tsv
chunk_23.tsv
chunk_24.tsv
chunk_25.tsv
chunk_26.tsv
chunk_27.tsv
chunk_28.tsv
chunk_29.tsv
chunk_3.tsv
chunk_30.tsv
chunk_31.tsv
chunk_32.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_user_2015
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_22.tsv
chunk_23.tsv
chunk_24.tsv
chunk_25.tsv
chunk_26.tsv
chunk_27.tsv
chunk_28.tsv
chunk_29.tsv
chunk_3.tsv
chunk_30.tsv
chunk_31.tsv
chunk_32.tsv
chunk_33.tsv
chunk_34.tsv
chunk_35.tsv
chunk_36.tsv
chunk_37.tsv
chunk_38.tsv
chunk_39.tsv
chunk_4.tsv
chunk_40.tsv
chunk_41.tsv
chunk_42.tsv
chunk_43.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2001
chunk_0.tsv
comments_article_2002
chunk_0.tsv
comments_article_2003
chunk_0.tsv
comments_article_2004
chunk_0.tsv
chunk_1.tsv
comments_article_2005
chunk_0.tsv
chunk_1.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
comments_article_2006
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2007
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_19.tsv
chunk_2.tsv
chunk_20.tsv
chunk_21.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2008
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_17.tsv
chunk_18.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2009
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_16.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2010
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_14.tsv
chunk_15.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2011
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2012
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_11.tsv
chunk_12.tsv
chunk_13.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2013
chunk_0.tsv
chunk_1.tsv
chunk_10.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2014
chunk_0.tsv
chunk_1.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
chunk_9.tsv
comments_article_2015
chunk_0.tsv
chunk_1.tsv
chunk_2.tsv
chunk_3.tsv
chunk_4.tsv
chunk_5.tsv
chunk_6.tsv
chunk_7.tsv
chunk_8.tsv
time: 1d 16h 58min 23s

In [39]:
# `df` is whatever the scoring loop bound last (the final chunk it scored);
# show its five rows with the highest predicted toxicity score.
df.sort_values(by="pred_toxicity_score", ascending=True).tail(n=5)


Out[39]:
rev_id comment raw_comment timestamp page_id page_title user_id user_text bot admin key pred_attack_score pred_aggression_score pred_toxicity_score
50262 691820150 NEWLINE shit is shit is shit shit... NEWLINE shit is shit is shit shit... 2015-11-22 11:15:24 47744616 Bigg Boss 9 NaN 106.216.154.87 0 0 1 0.987700 0.993569 0.999877
55656 680448189 NEWLINE:::Bullshit. NEWLINE:::Bullshit. -- [[Special:Contributions... 2015-09-10 22:53:04 5977208 HeadOn NaN 184.189.217.91 0 0 4 0.991760 0.996578 0.999880
5603 674681857 NEWLINE:Motherfucker. NEWLINE:Motherfucker. [[User:Rcbutcher|Rcbutch... 2015-08-05 13:28:02 47374798 Killing of Cecil the lion 579140.0 Rcbutcher 0 0 7 0.999136 0.999032 0.999958
45933 683100078 NEWLINE: basically they the Muslims killed 700... NEWLINE: basically they the Muslims killed 700... 2015-09-28 03:52:32 47917574 2015 Mina stampede 25415712.0 SundayRequiem 0 0 5 0.999561 0.999085 0.999969
63621 679578201 NEWLINENEWLINEFucking south korea. Fucking the... NEWLINENEWLINEFucking south korea. Fucking the... 2015-09-05 12:23:59 47442831 The Genius (TV series) NaN 211.36.149.193 0 0 2 0.999689 0.999862 0.999997
time: 202 ms

In [ ]: