In [1]:
import pickle
import pandas as pd
from user_object import User

In [2]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
# The context manager closes the file handle once loading finishes
# (the bare open(...) call previously left it open).
with open("../../data/retention/newcomer_sample_pickle.pkl", "rb") as pickle_file:
    user_objects = pickle.load(pickle_file)
# Re-key the loaded collection by each object's user_text for direct lookup.
user_objects = {u.user_text:u for u in user_objects}

In [3]:
# Columns of the newcomer feature table that are carried forward.
cols = [
    'user_text',
    'registration_day',
    't1_harassment_received',
    't1_harassment_made',
    't1_num_days_active',
    't2_num_days_active',
]
df_reg = pd.read_csv("../../data/retention/newcomer_sample_features.csv")
df_reg = df_reg[cols]

In [4]:
df_blocked = pd.read_csv("../../data/misc/blocked_user.tsv", sep="\t")
# Column headers are dotted ("<prefix>.<name>"); keep only the piece after the first dot.
df_blocked.columns = list(map(lambda header: header.split(".")[1], df_blocked.columns))
# Keep a single row per user so the frame can later be indexed by user_text.
df_blocked = df_blocked.drop_duplicates(subset='user_text')

In [5]:
# Index both frames by user name, keeping user_text as a regular column too,
# so that later column assignments between them align on the user.
df_reg = df_reg.set_index('user_text', drop=False)
df_blocked = df_blocked.set_index('user_text', drop=False)

In [6]:
# Copy the block timestamps onto df_reg (assignment aligns on the user_text index),
# rewriting the literal "PIPE" separator token as "  |  ".
# The vectorised .str accessor passes missing values through as NaN, whereas
# calling x.replace(...) per element raises AttributeError on a NaN entry.
df_reg['blocked_timestamps'] = df_blocked['timestamps'].str.replace("PIPE", "  |  ")
# Flag every row of df_blocked, copy the flag across by index, and treat
# users that do not appear in df_blocked as not blocked (0).
df_blocked['blocked'] = 1
df_reg['blocked'] = df_blocked['blocked']
df_reg['blocked'] = df_reg['blocked'].fillna(0)

In [10]:
from dateutil.relativedelta import relativedelta

def select_month_since_registration(user, activity, t):
    """Keep only the rows of `activity` that fall in month `t` after registration.

    The window is half-open: it opens at `user.registration_day` plus (t-1)
    months (inclusive) and closes at `user.registration_day` plus t months
    (exclusive). Rows are compared on their 'timestamp' column.
    """
    window_open = user.registration_day + relativedelta(months=(t - 1))
    window_close = user.registration_day + relativedelta(months=t)
    before_close = activity[activity['timestamp'] < window_close]
    return before_close[before_close['timestamp'] >= window_open]

def comments_received_above_threshold(user, score, threshold, t):
    """Summarize comments `user` received in month `t` whose `score` exceeds `threshold`.

    Args:
        user: object exposing `registration_day` and `df_comments_received`
            (None, or a DataFrame with 'timestamp', 'user_text' and the
            `score` column).
        score: name of the score column to compare, e.g. 'pred_attack_score'.
        threshold: a comment is kept only when its score is strictly greater.
        t: month since registration, as interpreted by
            `select_month_since_registration` (t=1 is the first month).

    Returns:
        One string containing a
        "From User:<user_text> at <timestamp> with score <x.xx>" entry per
        matching comment, joined by "  |  "; None when the user has no
        received-comments frame or no comment matches.
    """
    if user.df_comments_received is None:
        return None

    # Parse timestamps on a copy: assigning the parsed column directly onto
    # user.df_comments_received would rewrite the frame stored on the user.
    comments = user.df_comments_received.assign(
        timestamp=pd.to_datetime(user.df_comments_received['timestamp'])
    )

    comments = select_month_since_registration(user, comments, t)
    # Compare numerically instead of through a "%f"-formatted query string,
    # which rounds the threshold to six decimal places.
    comments = comments[comments[score] > threshold]

    if comments.shape[0] < 1:
        return None

    s = (
        "From User:" + comments['user_text'].apply(str)
        + " at " + comments['timestamp'].apply(str)
        + " with score " + comments[score].apply(lambda x: "%.2f" % x)
    )
    return "  |  ".join(s)
    

def comments_made_above_threshold(user, score, threshold, t):
    """Summarize comments `user` made in month `t` whose `score` exceeds `threshold`.

    Args:
        user: object exposing `registration_day` and `df_comments_made`
            (None, or a DataFrame with 'timestamp', 'page_title' and the
            `score` column).
        score: name of the score column to compare, e.g. 'pred_attack_score'.
        threshold: a comment is kept only when its score is strictly greater.
        t: month since registration, as interpreted by
            `select_month_since_registration` (t=1 is the first month).

    Returns:
        One string containing a
        "To User:<page_title> at <timestamp> with score <x.xx>" entry per
        matching comment, joined by "  |  "; None when the user has no
        comments-made frame or no comment matches.
    """
    if user.df_comments_made is None:
        return None

    # Parse timestamps on a copy: assigning the parsed column directly onto
    # user.df_comments_made would rewrite the frame stored on the user.
    comments = user.df_comments_made.assign(
        timestamp=pd.to_datetime(user.df_comments_made['timestamp'])
    )

    comments = select_month_since_registration(user, comments, t)
    # Compare numerically instead of through a "%f"-formatted query string,
    # which rounds the threshold to six decimal places.
    comments = comments[comments[score] > threshold]

    if comments.shape[0] < 1:
        return None

    s = (
        "To User:" + comments['page_title'].apply(str)
        + " at " + comments['timestamp'].apply(str)
        + " with score " + comments[score].apply(lambda x: "%.2f" % x)
    )
    return "  |  ".join(s)

In [11]:
threshold = 0.425
models = ['attack', 'aggression', 'toxicity']
t = 1

# (column-name template, extractor) pairs; per model the "made" column is
# added before the "received" column.
extractors = (
    ("%s_comments_made", comments_made_above_threshold),
    ("%s_comments_received", comments_received_above_threshold),
)

for model in models:
    score = 'pred_%s_score' % model
    for col_template, extract in extractors:
        col = col_template % model
        flagged = {}
        for name, user in user_objects.items():
            flagged[str(name)] = extract(user, score, threshold, t)
        # The Series is keyed by user name, so assignment aligns on df_reg's index.
        df_reg[col] = pd.Series(flagged)

In [12]:
# Preview the first rows of the assembled table, including the new per-model comment columns.
df_reg.head()


Out[12]:
user_text registration_day t1_harassment_received t1_harassment_made t1_num_days_active t2_num_days_active blocked_timestamps blocked attack_comments_made attack_comments_received aggression_comments_made aggression_comments_received toxicity_comments_made toxicity_comments_received
user_text
Chrisclements521 Chrisclements521 2010-10-01 0 0 1 0 NaN 0.0 None None None None None None
Callyloo Callyloo 2007-11-21 0 0 1 0 NaN 0.0 None None None None None None
Eagleswar262 Eagleswar262 2009-08-04 0 0 1 0 NaN 0.0 None None None None None None
ILY=)L0Li ILY=)L0Li 2010-04-21 0 0 1 0 NaN 0.0 None None None None None None
Dirtyharry847 Dirtyharry847 2006-05-02 0 0 1 0 NaN 0.0 None None None None None None

In [14]:
# Preview rows whose t1_harassment_received flag equals 1.
df_reg.query("t1_harassment_received == 1").head()


Out[14]:
user_text registration_day t1_harassment_received t1_harassment_made t1_num_days_active t2_num_days_active blocked_timestamps blocked attack_comments_made attack_comments_received aggression_comments_made aggression_comments_received toxicity_comments_made toxicity_comments_received
user_text
Dunno74 Dunno74 2007-06-11 1 0 1 0 NaN 0.0 None From User:Shawnlandden at 2007-06-18 08:49:14 ... None From User:Shawnlandden at 2007-06-18 08:49:14 ... None From User:Shawnlandden at 2007-06-18 08:49:14 ...
StudiesWorld StudiesWorld 2013-12-17 1 0 26 8 NaN 0.0 None From User:Missionedit at 2014-01-02 22:20:11 w... None From User:Missionedit at 2014-01-02 22:20:11 w... None None
Catieinsightdesigns Catieinsightdesigns 2015-11-16 1 1 14 0 NaN 0.0 None From User:Missionedit at 2015-12-10 18:52:32 w... None From User:Missionedit at 2015-12-10 18:52:32 w... To User:Velella at 2015-12-08 16:41:28 with sc... None
Kagemaru2022 Kagemaru2022 2015-01-21 1 0 12 0 NaN 0.0 None From User:ChamithN at 2015-01-23 18:18:29 with... None From User:ChamithN at 2015-01-23 18:18:29 with... None None
Cluelesswonder Cluelesswonder 2014-01-13 1 1 1 0 2014-01-13T18:32:12Z | 2014-01-13T19:04:58Z 1.0 To User:Cluelesswonder at 2014-01-13 19:31:17 ... From User:Zad68 at 2014-01-13 19:39:28 with sc... To User:Cluelesswonder at 2014-01-13 19:31:17 ... From User:Zad68 at 2014-01-13 19:39:28 with sc... To User:Cluelesswonder at 2014-01-13 19:31:17 ... From User:Zad68 at 2014-01-13 19:39:28 with sc...

In [15]:
# Write the assembled table to disk. The index is written too (to_csv default),
# so user_text ends up in the file both as the index column and as a regular column.
df_reg.to_csv("../../data/retention/uxr.csv")

In [ ]:


In [ ]:


In [ ]:
# Scratch: print the name of the first user that has a comments-made frame.
for u, v in user_objects.items():
    if v.df_comments_made is None:
        continue
    print(u)
    break

In [ ]:
# Scratch: hard-coded sample user for manual inspection; grab that user's comments-made frame.
u = 'Dergbuytioporytderwquarrel'
comments =  user_objects[u].df_comments_made

In [ ]:
# Scratch: display the `comments` frame for inspection.
comments

In [ ]:
# Scratch: comments the sample user received in month 1 with an aggression score above 0.
comments_received_above_threshold(user_objects[u], 'pred_aggression_score', 0, 1)

In [ ]:
# Scratch: comments the sample user made in month 1 with an aggression score above 0.
comments_made_above_threshold(user_objects[u], 'pred_aggression_score', 0, 1)

In [ ]: