In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime
import datetime
import multiprocessing as mp
import os
import re
import shutil
import subprocess
import time

import joblib
import numpy as np
import numpy as np
import pandas as pd
In [8]:
# Fetch and load one production scoring pipeline per task.
# `model_dict` maps task name -> loaded pipeline and is read by `apply_models`.
tasks = ['attack', 'toxicity', 'aggression']
model_dict = {}
for task in tasks:
    # Argument-list form avoids shell string interpolation, and check=True
    # raises CalledProcessError if the fetch script fails instead of silently
    # continuing on to load a stale or missing pickle.
    subprocess.run(["python", "get_prod_models.py", "--task", task], check=True)
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code. Presumably this pickle is written by get_prod_models.py — confirm
    # that /tmp is not writable by untrusted users on the machine running this.
    model_dict[task] = joblib.load("/tmp/%s_linear_char_oh_pipeline.pkl" % task)
In [28]:
def apply_models(df, models=None):
    """Score the 'comment' column of `df` with every model and attach the results.

    For each (task, model) pair, the second column of
    `model.predict_proba(comments)` is stored in a new column named
    'pred_<task>_score'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'comment' column. The frame is modified in place.
    models : mapping of str -> model, optional
        Objects exposing `predict_proba`. Defaults to the notebook-level
        `model_dict`, so existing single-argument callers (e.g.
        `.pipe(apply_models)`) behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the score columns added.
    """
    if models is None:
        # Looked up at call time so the function picks up whatever models the
        # notebook has loaded by then.
        models = model_dict
    comments = df['comment']
    for task, model in models.items():
        # [:, 1] selects the second probability column returned by the model.
        df['pred_%s_score' % task] = model.predict_proba(comments)[:, 1]
    return df
In [34]:
def pred_helper(df):
    """Normalize one chunk of comments and score it with `apply_models`.

    Used as the worker function passed to `Pool.map` in `prep_in_parallel`.

    Returns None for an empty chunk. Otherwise returns a new frame in which
    'timestamp' has been parsed with `pd.to_datetime`, 'comment' has been cast
    to `str`, and the score columns added by `apply_models` are present.
    """
    if not len(df):
        return None

    normalized = df.assign(
        timestamp=lambda frame: pd.to_datetime(frame.timestamp),
        comment=lambda frame: frame['comment'].astype(str),
    )
    return apply_models(normalized)
def prep_in_parallel(path, k = 8):
    """Read a tab-separated comments file and score it across `k` worker processes.

    Rows are assigned a random integer 'key', the frame is split by that key,
    each piece is passed to `pred_helper` in a process pool, and the scored
    pieces are concatenated.

    Parameters
    ----------
    path : str
        Path to a UTF-8, tab-separated file readable by `pd.read_csv`.
    k : int, default 8
        Number of worker processes in the pool.

    Returns
    -------
    pandas.DataFrame
        The scored rows. The helper 'key' column is retained in the result,
        and row order follows the random grouping rather than the file order.
        If the file contains no data rows, the empty frame (with 'key' but
        without score columns) is returned instead of raising.
    """
    df = pd.read_csv(path, sep='\t', encoding='utf-8')
    m = df.shape[0]

    # Files under 15000 rows are scored as one piece; larger files are split
    # into one group per 10000 rows (group sizes are only approximately equal
    # because keys are drawn at random).
    n_groups = 1 if m < 15000 else m // 10000
    df['key'] = np.random.randint(0, high=n_groups, size=m)

    if m == 0:
        # groupby would yield no pieces and pd.concat([]) raises
        # "No objects to concatenate"; there is nothing to score anyway.
        return df

    chunks = [chunk for _, chunk in df.groupby('key')]
    #dfs = [pred_helper(d) for d in dfs]

    # The context manager tears the pool down even if a worker raises, so
    # processes are not leaked into the notebook kernel.
    with mp.Pool(k) as pool:
        scored = pool.map(pred_helper, chunks)

    # pred_helper returns None for empty pieces; drop those explicitly.
    scored = [piece for piece in scored if piece is not None]
    return pd.concat(scored)
In [37]:
# Score every yearly comment archive for each namespace and write one gzipped
# TSV per archive under <base>/scored/.
base = '../../data/figshare/'
nss = ['user', 'article']
years = range(2001, 2016)

# os.system() accepts a single command string, so the original two-argument
# "mkdir" call raised TypeError. Create the output directory once, up front;
# exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join(base, "scored"), exist_ok=True)

for ns in nss:
    for year in years:
        dirname = "comments_%s_%d" % (ns, year)
        print(dirname)
        indir = os.path.join(base, dirname + ".tar.gz")
        outf = os.path.join(base, "scored", dirname + ".tsv.gz")

        # Extract straight from the source archive into the working directory
        # (no intermediate copy to clean up). check=True stops the run if the
        # archive is missing or corrupt.
        # Assumes the archive unpacks into a top-level directory named
        # `dirname`, as the original cell did — confirm for new data.
        subprocess.run(["tar", "-zxf", indir], check=True)

        dfs = []
        try:
            # sorted() makes the concatenation order independent of the
            # filesystem's directory listing order.
            for inf in sorted(os.listdir(dirname)):
                print(inf)
                if inf.endswith(".tsv"):
                    # `df` is kept as a notebook-level name; a later cell
                    # inspects the last scored frame through it.
                    df = prep_in_parallel(os.path.join(dirname, inf), k = 8)
                    dfs.append(df)
        finally:
            # Remove the extracted files even if scoring fails part-way.
            shutil.rmtree(dirname, ignore_errors=True)

        if dfs:
            pd.concat(dfs).to_csv(outf, sep = '\t', index = False, compression = "gzip")
        else:
            # pd.concat([]) would raise; report and move on to the next archive.
            print("no .tsv files found in %s; nothing written" % dirname)
In [39]:
# Relies on `df` left over from an earlier cell (the scoring loop assigns it).
# Sorts ascending by toxicity score and displays the last five rows.
df.sort_values(by="pred_toxicity_score", ascending=True).tail(n=5)
Out[39]:
In [ ]: