In [1]:
import pickle
import pandas as pd
from user_object import User
In [2]:
# Load the pickled newcomer sample and key each object by its `user_text`
# attribute (presumably these are `User` instances -- confirm against the
# code that wrote the pickle).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by trusted project code.
with open("../../data/retention/newcomer_sample_pickle.pkl", "rb") as pickle_file:
    # The `with` block closes the handle deterministically; the previous
    # bare open() left it open until garbage collection.
    user_objects = {u.user_text: u for u in pickle.load(pickle_file)}
In [3]:
# Columns of the newcomer feature table that the rest of the notebook uses.
cols = [
    'user_text',
    'registration_day',
    't1_harassment_received',
    't1_harassment_made',
    't1_num_days_active',
    't2_num_days_active',
]

# Read the feature CSV and keep only the columns listed above, in that order.
df_reg = pd.read_csv(
    "../../data/retention/newcomer_sample_features.csv"
)[cols]
In [4]:
# Load the block log, rename every column to the second dot-separated piece of
# its header (presumably the headers are "<table>.<column>" -- confirm against
# the TSV), and keep only the first row per user_text.
df_blocked = (
    pd.read_csv("../../data/misc/blocked_user.tsv", sep="\t")
    .rename(columns=lambda header: header.split(".")[1])
    .drop_duplicates('user_text')
)
In [5]:
# Index both tables by user name so that later column assignments between them
# align row-by-row on user_text. The user_text column itself is kept.
for _frame in (df_reg, df_blocked):
    _frame.index = _frame['user_text']
In [6]:
# Copy block information onto the newcomer table. Both frames are indexed by
# user_text, so assigning a df_blocked column to df_reg aligns on user name;
# newcomers without a block record get NaN.

# Turn the "PIPE" placeholder in the timestamp list into a readable " | "
# separator. The vectorised .str accessor passes missing values through as NaN,
# whereas apply(lambda x: x.replace(...)) raised AttributeError on them.
# ("PIPE" contains no regex metacharacters, so the result does not depend on
# the pandas version's default for `regex`.)
df_reg['blocked_timestamps'] = df_blocked['timestamps'].str.replace("PIPE", " | ")

# Every user that appears in the block log is flagged with 1 ...
df_blocked['blocked'] = 1
# ... and every newcomer that does not appear there gets 0.
df_reg['blocked'] = df_blocked['blocked'].reindex(df_reg.index).fillna(0)
In [10]:
from dateutil.relativedelta import relativedelta


def select_month_since_registration(user, activity, t):
    """Return the rows of `activity` that fall in the t-th month after registration.

    Parameters
    ----------
    user : object
        Must expose `registration_day`, a date/datetime to which a
        `relativedelta` can be added.
    activity : pandas.DataFrame
        Must contain a 'timestamp' column comparable with `registration_day`.
    t : int
        Month number counted from registration; month `t` is the half-open
        window [registration + (t-1) months, registration + t months).

    Returns
    -------
    pandas.DataFrame
        The rows of `activity` inside the window (the input is not modified).
    """
    start = user.registration_day + relativedelta(months=(t - 1))
    stop = user.registration_day + relativedelta(months=t)
    in_window = (activity['timestamp'] >= start) & (activity['timestamp'] < stop)
    return activity[in_window]


def _describe_comments_above_threshold(user, comments, score, threshold, t, prefix, party_col):
    """Shared implementation: summarise month-t comments whose `score` exceeds `threshold`.

    `prefix` is the literal text that starts every entry and `party_col` is the
    column whose value follows it. Returns a " | "-joined string, or None when
    `comments` is None or no row qualifies.
    """
    if comments is None:
        return None

    # assign() builds a new frame, so the DataFrame stored on the user object
    # keeps its original 'timestamp' column instead of being overwritten.
    comments = comments.assign(timestamp=pd.to_datetime(comments['timestamp']))
    comments = select_month_since_registration(user, comments, t)

    # Compare numerically. Formatting the threshold into a query string with
    # "%f" truncated it to six decimals (e.g. 1e-7 became 0.000000).
    comments = comments[comments[score] > threshold]
    if comments.empty:
        return None

    entries = (
        prefix + comments[party_col].apply(str)
        + " at " + comments['timestamp'].apply(str)
        + " with score " + comments[score].apply(lambda x: "%.2f" % x)
    )
    return " | ".join(entries)


def comments_received_above_threshold(user, score, threshold, t):
    """Summarise comments received by `user` in month `t` that score above `threshold`.

    Parameters
    ----------
    user : object
        Must expose `registration_day` and `df_comments_received` (a DataFrame
        with 'user_text', 'timestamp' and the `score` column, or None).
    score : str
        Name of the score column to filter on.
    threshold : float
        Only rows with `score` strictly greater than this are reported.
    t : int
        Month number since registration (see `select_month_since_registration`).

    Returns
    -------
    str or None
        "From User:<user_text> at <timestamp> with score <x.xx>" entries joined
        by " | ", or None if the user has no received-comments frame or no
        comment qualifies.
    """
    return _describe_comments_above_threshold(
        user, user.df_comments_received, score, threshold, t,
        prefix="From User:", party_col='user_text',
    )


def comments_made_above_threshold(user, score, threshold, t):
    """Summarise comments made by `user` in month `t` that score above `threshold`.

    Parameters
    ----------
    user : object
        Must expose `registration_day` and `df_comments_made` (a DataFrame with
        'page_title', 'timestamp' and the `score` column, or None).
    score : str
        Name of the score column to filter on.
    threshold : float
        Only rows with `score` strictly greater than this are reported.
    t : int
        Month number since registration (see `select_month_since_registration`).

    Returns
    -------
    str or None
        "To User:<page_title> at <timestamp> with score <x.xx>" entries joined
        by " | ", or None if the user has no made-comments frame or no comment
        qualifies.
    """
    return _describe_comments_above_threshold(
        user, user.df_comments_made, score, threshold, t,
        prefix="To User:", party_col='page_title',
    )
In [11]:
# Settings passed to the *_above_threshold helpers: score cutoff, the model
# names used to build the 'pred_<model>_score' column names, and the month
# number since registration.
threshold = 0.425
models = ['attack', 'aggression', 'toxicity']
t = 1

# (column-name suffix, helper) pairs, processed in this order for every model.
_directions = (
    ("made", comments_made_above_threshold),
    ("received", comments_received_above_threshold),
)

for model in models:
    score = 'pred_%s_score' % model
    for _suffix, _summarise in _directions:
        col = "%s_comments_%s" % (model, _suffix)
        _per_user = {}
        for _name, _user in user_objects.items():
            _per_user[str(_name)] = _summarise(_user, score, threshold, t)
        # The Series is keyed by user name, so assignment aligns it with
        # df_reg's user_text index.
        df_reg[col] = pd.Series(_per_user)
In [12]:
# Preview the first five rows of the annotated newcomer table.
df_reg.head(n=5)
Out[12]:
In [14]:
# Preview the newcomers whose t1_harassment_received flag equals 1.
_harassed_newcomers = df_reg.query("t1_harassment_received == 1")
_harassed_newcomers.head()
Out[14]:
In [15]:
# Write the annotated table to disk.
# NOTE(review): the index was set to user_text earlier and is written by
# default, so user_text appears both as the index column and as a regular
# column in the CSV -- consider index=False if no consumer relies on it.
df_reg.to_csv(path_or_buf="../../data/retention/uxr.csv")
In [ ]:
In [ ]:
In [ ]:
# Scratch cell: print the name of the first user that has a df_comments_made
# frame, then stop. `u` and `v` stay bound to that user afterwards.
for u in user_objects:
    v = user_objects[u]
    if v.df_comments_made is None:
        continue
    print(u)
    break
In [ ]:
# Scratch cell: pick one specific user by name and grab the comments they made.
u = 'Dergbuytioporytderwquarrel'
_example_user = user_objects[u]
comments = _example_user.df_comments_made
In [ ]:
# Display the example user's df_comments_made frame selected in the previous cell.
comments
In [ ]:
# Spot-check: with a threshold of 0, list the comments the example user
# received in their first month since registration.
comments_received_above_threshold(
    user=user_objects[u],
    score='pred_aggression_score',
    threshold=0,
    t=1,
)
In [ ]:
# Spot-check: with a threshold of 0, list the comments the example user
# made in their first month since registration.
comments_made_above_threshold(
    user=user_objects[u],
    score='pred_aggression_score',
    threshold=0,
    t=1,
)
In [ ]: