notebook.community

Edit and run



In [1]:

    
import pickle
import pandas as pd
from user_object import User



In [2]:

    
# Load the pickled list of user objects, then key them by user name for fast lookup.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a pickle file produced by a trusted pipeline.
with open("../../data/retention/newcomer_sample_pickle.pkl", "rb") as pickle_file:
    # The context manager closes the file handle even if unpickling fails.
    user_objects = pickle.load(pickle_file)
user_objects = {u.user_text: u for u in user_objects}



In [3]:

    
# Columns of the newcomer feature table that are carried into the export.
cols = [
    'user_text',
    'registration_day',
    't1_harassment_received',
    't1_harassment_made',
    't1_num_days_active',
    't2_num_days_active',
]
# Read the full feature file first, then narrow it to the selected columns
# (in the order listed above).
df_reg = pd.read_csv("../../data/retention/newcomer_sample_features.csv")
df_reg = df_reg[cols]



In [4]:

    
# Load the blocked-user table, keep only the second dot-separated piece of each
# column header, and retain a single row per user_text.
df_blocked = (
    pd.read_csv("../../data/misc/blocked_user.tsv", sep="\t")
    .rename(columns=lambda c: c.split(".")[1])
    .drop_duplicates(subset='user_text')
)



In [5]:

    
# Index both tables by user name so column assignments between them align on
# user_text; drop=False keeps user_text available as a regular column as well.
df_reg = df_reg.set_index('user_text', drop=False)
df_blocked = df_blocked.set_index('user_text', drop=False)



In [6]:

    
# Copy the block timestamps onto the registration table, swapping the "PIPE"
# placeholder for a readable " | " separator. The vectorised .str.replace
# leaves missing values as NaN instead of raising AttributeError the way
# calling x.replace on a float NaN would; regex=False keeps it a literal swap.
df_reg['blocked_timestamps'] = df_blocked['timestamps'].str.replace("PIPE", "  |  ", regex=False)

# Flag every user present in the blocked table. The assignment aligns on the
# user_text index, so users absent from df_blocked come through as NaN and are
# then filled with 0.
df_blocked['blocked'] = 1
df_reg['blocked'] = df_blocked['blocked']
df_reg['blocked'] = df_reg['blocked'].fillna(0)



In [10]:

    
from dateutil.relativedelta import relativedelta


def select_month_since_registration(user, activity, t):
    """Return the rows of `activity` that fall in the t-th month after registration.

    The window is half-open: it starts at ``user.registration_day`` plus
    ``t - 1`` calendar months (inclusive) and stops at ``user.registration_day``
    plus ``t`` calendar months (exclusive).

    Parameters
    ----------
    user : object
        Must expose a ``registration_day`` attribute that supports
        ``+ relativedelta``.
    activity : pandas.DataFrame
        Must have a ``timestamp`` column comparable with the window bounds.
    t : int
        1-based index of the month since registration.

    Returns
    -------
    pandas.DataFrame
        The rows of `activity` inside the window, in their original order.
    """
    start = user.registration_day + relativedelta(months=(t - 1))
    stop = user.registration_day + relativedelta(months=t)
    in_window = (activity['timestamp'] >= start) & (activity['timestamp'] < stop)
    return activity[in_window]


def _comments_above_threshold(user, comments, score, threshold, t, prefix, counterpart_col):
    """Shared implementation behind the two public ``*_above_threshold`` helpers.

    Restricts `comments` to month `t` since the user's registration, keeps the
    rows whose `score` column is strictly greater than `threshold`, and renders
    each remaining row as ``<prefix><counterpart> at <timestamp> with score
    <score>``, joined with ``"  |  "``. Returns None when `comments` is None or
    no row survives the filters.
    """
    if comments is None:
        return None

    # Parsed in place on the user's own frame (as the original code did), so
    # repeated calls for different scores reuse the already-converted column.
    comments['timestamp'] = pd.to_datetime(comments['timestamp'])

    comments = select_month_since_registration(user, comments, t)
    # Compare numerically rather than via query("%s > %f"): "%f" rounds the
    # threshold to six decimals (3e-7 would become 0.000000), and a query
    # string also breaks on column names that are not valid identifiers.
    comments = comments[comments[score] > threshold]

    if comments.empty:
        return None

    described = (
        prefix
        + comments[counterpart_col].apply(str)
        + " at "
        + comments['timestamp'].apply(str)
        + " with score "
        + comments[score].apply(lambda x: "%.2f" % x)
    )
    return "  |  ".join(described)


def comments_received_above_threshold(user, score, threshold, t):
    """Describe comments the user received in month `t` scoring above `threshold`.

    Parameters
    ----------
    user : object
        Must expose ``registration_day`` and ``df_comments_received`` (a
        DataFrame with ``user_text``, ``timestamp`` and `score` columns, or
        None).
    score : str
        Name of the score column to filter on.
    threshold : float
        Only rows with a score strictly greater than this are kept.
    t : int
        1-based month since registration.

    Returns
    -------
    str or None
        ``"From User:<user_text> at <timestamp> with score <x.xx>"`` entries
        joined by ``"  |  "``, or None when there is nothing to report.
    """
    return _comments_above_threshold(
        user, user.df_comments_received, score, threshold, t,
        prefix="From User:", counterpart_col='user_text',
    )


def comments_made_above_threshold(user, score, threshold, t):
    """Describe comments the user made in month `t` scoring above `threshold`.

    Parameters
    ----------
    user : object
        Must expose ``registration_day`` and ``df_comments_made`` (a DataFrame
        with ``page_title``, ``timestamp`` and `score` columns, or None).
    score : str
        Name of the score column to filter on.
    threshold : float
        Only rows with a score strictly greater than this are kept.
    t : int
        1-based month since registration.

    Returns
    -------
    str or None
        ``"To User:<page_title> at <timestamp> with score <x.xx>"`` entries
        joined by ``"  |  "``, or None when there is nothing to report.
    """
    return _comments_above_threshold(
        user, user.df_comments_made, score, threshold, t,
        prefix="To User:", counterpart_col='page_title',
    )



In [11]:

    
# Score cut-off, the models to report on, and the month since registration
# that the comment summaries cover.
threshold = 0.425
models = ['attack', 'aggression', 'toxicity']
t = 1

# For every model add two text columns to df_reg: first the comments the user
# made, then the comments the user received, each keyed by str(user name) so
# the Series aligns with df_reg's index on assignment.
for model in models:
    score = 'pred_%s_score' % model
    for suffix, summarise in (
        ("comments_made", comments_made_above_threshold),
        ("comments_received", comments_received_above_threshold),
    ):
        col = "%s_%s" % (model, suffix)
        df_reg[col] = pd.Series(
            {str(k): summarise(v, score, threshold, t) for k, v in user_objects.items()}
        )



In [12]:

    
# Preview the first rows of the assembled table (features, block info and the
# per-model comment summary columns).
df_reg.head()









    Out[12]:






  
    
      
      user_text
      registration_day
      t1_harassment_received
      t1_harassment_made
      t1_num_days_active
      t2_num_days_active
      blocked_timestamps
      blocked
      attack_comments_made
      attack_comments_received
      aggression_comments_made
      aggression_comments_received
      toxicity_comments_made
      toxicity_comments_received
    
    
      user_text
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Chrisclements521
      Chrisclements521
      2010-10-01
      0
      0
      1
      0
      NaN
      0.0
      None
      None
      None
      None
      None
      None
    
    
      Callyloo
      Callyloo
      2007-11-21
      0
      0
      1
      0
      NaN
      0.0
      None
      None
      None
      None
      None
      None
    
    
      Eagleswar262
      Eagleswar262
      2009-08-04
      0
      0
      1
      0
      NaN
      0.0
      None
      None
      None
      None
      None
      None
    
    
      ILY=)L0Li
      ILY=)L0Li
      2010-04-21
      0
      0
      1
      0
      NaN
      0.0
      None
      None
      None
      None
      None
      None
    
    
      Dirtyharry847
      Dirtyharry847
      2006-05-02
      0
      0
      1
      0
      NaN
      0.0
      None
      None
      None
      None
      None
      None



In [14]:

    
# Preview only the users whose t1_harassment_received flag equals 1.
df_reg[df_reg['t1_harassment_received'] == 1].head()









    Out[14]:






  
    
      
      user_text
      registration_day
      t1_harassment_received
      t1_harassment_made
      t1_num_days_active
      t2_num_days_active
      blocked_timestamps
      blocked
      attack_comments_made
      attack_comments_received
      aggression_comments_made
      aggression_comments_received
      toxicity_comments_made
      toxicity_comments_received
    
    
      user_text
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Dunno74
      Dunno74
      2007-06-11
      1
      0
      1
      0
      NaN
      0.0
      None
      From User:Shawnlandden at 2007-06-18 08:49:14 ...
      None
      From User:Shawnlandden at 2007-06-18 08:49:14 ...
      None
      From User:Shawnlandden at 2007-06-18 08:49:14 ...
    
    
      StudiesWorld
      StudiesWorld
      2013-12-17
      1
      0
      26
      8
      NaN
      0.0
      None
      From User:Missionedit at 2014-01-02 22:20:11 w...
      None
      From User:Missionedit at 2014-01-02 22:20:11 w...
      None
      None
    
    
      Catieinsightdesigns
      Catieinsightdesigns
      2015-11-16
      1
      1
      14
      0
      NaN
      0.0
      None
      From User:Missionedit at 2015-12-10 18:52:32 w...
      None
      From User:Missionedit at 2015-12-10 18:52:32 w...
      To User:Velella at 2015-12-08 16:41:28 with sc...
      None
    
    
      Kagemaru2022
      Kagemaru2022
      2015-01-21
      1
      0
      12
      0
      NaN
      0.0
      None
      From User:ChamithN at 2015-01-23 18:18:29 with...
      None
      From User:ChamithN at 2015-01-23 18:18:29 with...
      None
      None
    
    
      Cluelesswonder
      Cluelesswonder
      2014-01-13
      1
      1
      1
      0
      2014-01-13T18:32:12Z  |  2014-01-13T19:04:58Z
      1.0
      To User:Cluelesswonder at 2014-01-13 19:31:17 ...
      From User:Zad68 at 2014-01-13 19:39:28 with sc...
      To User:Cluelesswonder at 2014-01-13 19:31:17 ...
      From User:Zad68 at 2014-01-13 19:39:28 with sc...
      To User:Cluelesswonder at 2014-01-13 19:31:17 ...
      From User:Zad68 at 2014-01-13 19:39:28 with sc...



In [15]:

    
# Write the assembled table out as CSV.
# NOTE(review): to_csv writes the index by default, so if df_reg is indexed by
# user_text while also keeping it as a column, the file gets user_text twice.
uxr_csv_path = "../../data/retention/uxr.csv"
df_reg.to_csv(uxr_csv_path)



In [ ]:



In [ ]:



In [ ]:

    
# Scratch check: print the name of the first user that has a comments-made
# frame, then stop scanning.
for u, v in user_objects.items():
    if v.df_comments_made is None:
        continue
    print(u)
    break



In [ ]:

    
# Scratch check: pull one specific user's comments-made frame for inspection.
u = 'Dergbuytioporytderwquarrel'
sample_user = user_objects[u]
comments = sample_user.df_comments_made



In [ ]:

    
# Display the comments-made frame selected in the previous cell.
comments



In [ ]:

    
# Scratch check: with threshold 0, summarise the sample user's first-month
# received comments that have a positive aggression score.
comments_received_above_threshold(
    user_objects[u],
    score='pred_aggression_score',
    threshold=0,
    t=1,
)



In [ ]:

    
# Scratch check: with threshold 0, summarise the sample user's first-month
# made comments that have a positive aggression score.
comments_made_above_threshold(
    user_objects[u],
    score='pred_aggression_score',
    threshold=0,
    t=1,
)



In [ ]:

	user_text	registration_day	t1_harassment_received	t1_harassment_made	t1_num_days_active	t2_num_days_active	blocked_timestamps	blocked	attack_comments_made	attack_comments_received	aggression_comments_made	aggression_comments_received	toxicity_comments_made	toxicity_comments_received
user_text
Chrisclements521	Chrisclements521	2010-10-01	0	0	1	0	NaN	0.0	None	None	None	None	None	None
Callyloo	Callyloo	2007-11-21	0	0	1	0	NaN	0.0	None	None	None	None	None	None
Eagleswar262	Eagleswar262	2009-08-04	0	0	1	0	NaN	0.0	None	None	None	None	None	None
ILY=)L0Li	ILY=)L0Li	2010-04-21	0	0	1	0	NaN	0.0	None	None	None	None	None	None
Dirtyharry847	Dirtyharry847	2006-05-02	0	0	1	0	NaN	0.0	None	None	None	None	None	None

	user_text	registration_day	t1_harassment_received	t1_harassment_made	t1_num_days_active	t2_num_days_active	blocked_timestamps	blocked	attack_comments_made	attack_comments_received	aggression_comments_made	aggression_comments_received	toxicity_comments_made	toxicity_comments_received
user_text
Dunno74	Dunno74	2007-06-11	1	0	1	0	NaN	0.0	None	From User:Shawnlandden at 2007-06-18 08:49:14 ...	None	From User:Shawnlandden at 2007-06-18 08:49:14 ...	None	From User:Shawnlandden at 2007-06-18 08:49:14 ...
StudiesWorld	StudiesWorld	2013-12-17	1	0	26	8	NaN	0.0	None	From User:Missionedit at 2014-01-02 22:20:11 w...	None	From User:Missionedit at 2014-01-02 22:20:11 w...	None	None
Catieinsightdesigns	Catieinsightdesigns	2015-11-16	1	1	14	0	NaN	0.0	None	From User:Missionedit at 2015-12-10 18:52:32 w...	None	From User:Missionedit at 2015-12-10 18:52:32 w...	To User:Velella at 2015-12-08 16:41:28 with sc...	None
Kagemaru2022	Kagemaru2022	2015-01-21	1	0	12	0	NaN	0.0	None	From User:ChamithN at 2015-01-23 18:18:29 with...	None	From User:ChamithN at 2015-01-23 18:18:29 with...	None	None
Cluelesswonder	Cluelesswonder	2014-01-13	1	1	1	0	2014-01-13T18:32:12Z \| 2014-01-13T19:04:58Z	1.0	To User:Cluelesswonder at 2014-01-13 19:31:17 ...	From User:Zad68 at 2014-01-13 19:39:28 with sc...	To User:Cluelesswonder at 2014-01-13 19:31:17 ...	From User:Zad68 at 2014-01-13 19:39:28 with sc...	To User:Cluelesswonder at 2014-01-13 19:31:17 ...	From User:Zad68 at 2014-01-13 19:39:28 with sc...