In [1]:
import json
import pandas as pd
import numpy as np
import datetime
import os
In [2]:
with open('/scratch/wiki_dumps/user_data/metadata.json') as f:
    usrdata = json.load(f)
with open('/scratch/wiki_dumps/talk_page_article_link.json') as f:
    subjectpage = json.load(f)

def timestamp_2_sec(timestamp):
    # Convert a MediaWiki ISO-8601 timestamp (e.g. '2001-01-15T00:00:00Z')
    # to seconds since the Unix epoch.
    return (datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ') - datetime.datetime(1970, 1, 1)).total_seconds()

registration = {}
groups = {}
blocking = {}
for user, data in usrdata.items():
    if 'registration' in data and data['registration']:
        registration[user] = timestamp_2_sec(data['registration'])
    if 'groups' in data:
        groups[user] = data['groups']
    else:
        groups[user] = []
    if 'blockedtimestamp' in data:
        blocking[user] = timestamp_2_sec(data['blockedtimestamp'])
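A quick sanity check of the epoch conversion; this cell is illustrative only and not part of the pipeline.
In [ ]:
# Illustrative sanity check: the helper should agree with a direct
# datetime subtraction against the epoch.
assert timestamp_2_sec('2001-01-15T00:00:00Z') == (
    datetime.datetime(2001, 1, 15) - datetime.datetime(1970, 1, 1)
).total_seconds()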
In [7]:
edits = []
data_dir = "/scratch/wiki_dumps/user_data/editing_data"
for _, _, filenames in os.walk(data_dir):
    for filename in filenames:
        df = pd.read_csv(os.path.join(data_dir, filename))
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['year'] = df['timestamp'].apply(lambda x: x.year)
        df['month'] = df['timestamp'].apply(lambda x: x.month)
        month_groups = df.groupby(['year', 'month'])
        month_dict_edits = {}
        res = []
        for ind, dfg in month_groups:
            # Per-(user, page) edit counts within this (year, month) group.
            cur = dfg.groupby(['user', 'page_id']).size().reset_index(name='count')
            month_dict_edits[ind] = dfg
            cur['year'] = ind[0]
            cur['month'] = ind[1]
            res.append(cur)
        edits_months = pd.concat(res)
        edits.append(edits_months)
        with open("/scratch/wiki_dumps/user_data/editing_cnts/%s" % filename, "w") as w:
            edits_months.to_csv(w)
        break  # debugging: process only the first file
    break
edits = pd.concat(edits)
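The same monthly counts can be computed in a single groupby; a sketch under the same column assumptions (`user`, `page_id`, `timestamp`), shown only as an idiomatic alternative to the per-month loop above.
In [ ]:
# Sketch: one-shot monthly aggregation, assuming the same input columns.
df = pd.read_csv(os.path.join(data_dir, filename))
df['timestamp'] = pd.to_datetime(df['timestamp'])
monthly = (df.assign(year=df['timestamp'].dt.year, month=df['timestamp'].dt.month)
             .groupby(['year', 'month', 'user', 'page_id'])
             .size()
             .reset_index(name='count'))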
In [11]:
edits_bak = edits
edits = edits.head(100)
In [ ]:
with open('/scratch/wiki_dumps/user_data/editing_data/all_edits.csv', 'w') as f:
edits_bak.to_csv(f)
In [12]:
len(edits)
Out[12]:
In [13]:
res = {}
In [ ]:
edits['timestamp'].head(10)
In [ ]:
# Derive the month from the timestamp, mirroring the per-file processing above.
edits['month'] = edits['timestamp'].apply(lambda x: x.month)
In [26]:
talk_page = []
for year in range(2001, 2016):
    df = pd.read_csv('/scratch/wiki_dumps/user_data/talk_data/activity_article_%d.csv' % year, sep="\t")
    talk_page.append(df)
talk_page = pd.concat(talk_page)
In [11]:
talk_page['timestamp_in_sec'] = talk_page.apply(lambda x: timestamp_2_sec(x['timestamp']), axis=1)
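The row-wise `apply` above is correct but slow on a frame this size; a vectorized sketch, assuming every timestamp parses as ISO-8601 UTC:
In [ ]:
# Sketch: vectorized epoch conversion (assumes ISO-8601 UTC timestamps).
ts = pd.to_datetime(talk_page['timestamp'], utc=True)
talk_page['timestamp_in_sec'] = ts.astype('int64') // 10**9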
In [14]:
len(sub_edits)
Out[14]:
In [15]:
len(ids)
Out[15]:
In [17]:
constraints = ['delta2_no_users', 'delta2_no_users_attacker_in_conv']
for constraint in constraints:
    with open('/scratch/wiki_dumps/expr_with_matching/%s/data/all.json' % (constraint)) as f:
        for line in f:
            conv_id, clss, conversation = json.loads(line)
            break  # inspect only the first conversation per constraint
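Each line of `all.json` is a JSON array `[conv_id, class, conversation]`; the next cell just peeks at the example loaded above to confirm the fields used below (illustrative only).
In [ ]:
# Illustrative: inspect the structure the feature extraction relies on.
print(conv_id, clss)
print(sorted(conversation['action_feature'][0].keys()))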
In [ ]:
users = []
user_id = {}
start_time = np.inf
for action in conversation['action_feature']:
    start_time = min(start_time, action['timestamp_in_sec'])
    if 'user_text' in action:
        users.append(action['user_text'])
        if 'user_id' in action:
            user_id[action['user_text']] = action['user_id']
        else:
            user_id[action['user_text']] = '0|' + action['user_text']
page_title = conversation['action_feature'][0]['page_title']
if page_title in subjectpage:
    p_id = subjectpage[page_title]
else:
    p_id = -1
user_features = {}
ids = list(user_id.values())
date = datetime.datetime.fromtimestamp(start_time)
year = date.year
month = date.month
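The `(year, month)` pair derived here matches the keys of `month_dict_edits` built in the In [7] cell; a hypothetical lookup, assuming that cell has been run in the same session:
In [ ]:
# Hypothetical: fetch the edit activity for the month the conversation started,
# assuming month_dict_edits from the In [7] cell is still in scope.
month_edits = month_dict_edits.get((year, month))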
In [ ]:
constraints = ['delta2_no_users', 'delta2_no_users_attacker_in_conv']
for constraint in constraints:
    with open('/scratch/wiki_dumps/expr_with_matching/%s/data/all.json' % (constraint)) as f:
        for line in f:
            conv_id, clss, conversation = json.loads(line)
            users = []
            user_id = {}
            start_time = np.inf
            for action in conversation['action_feature']:
                start_time = min(start_time, action['timestamp_in_sec'])
                if 'user_text' in action:
                    users.append(action['user_text'])
                    if 'user_id' in action:
                        user_id[action['user_text']] = action['user_id']
                    else:
                        user_id[action['user_text']] = '0|' + action['user_text']
            page_title = conversation['action_feature'][0]['page_title']
            if page_title in subjectpage:
                p_id = subjectpage[page_title]
            else:
                p_id = -1
            # Strip the 'Talk:' prefix to get the article title.
            page_title = page_title[page_title.find('Talk') + 5:]
            user_features = {}
            comments = {}
            for user in users:
                info = {}
                # metadata
                if user in registration:
                    info['registration'] = registration[user]
                else:
                    info['anon'] = True
                info['groups'] = groups.get(user, [])  # anonymous users may be absent from the metadata
                if user in blocking and blocking[user] < start_time:
                    info['blocked'] = blocking[user]
                # editing data
                """
                u_id = user_id[user]
                try:
                    with open('/scratch/wiki_dumps/user_data/editing_per_user/%s' % (u_id)) as f:
                        edits = pd.read_csv(f)
                    info['edits_on_subjectpage'] = len(edits[(edits['page_id'] == p_id) & (edits['timestamp_in_sec'] < start_time)])
                    info['edits_on_wikipedia_articles'] = len(edits[edits['timestamp_in_sec'] < start_time])
                except:
                    info['edits_on_subjectpage'] = 0
                    info['edits_on_wikipedia_articles'] = 0
                """
                week = 7 * 24 * 60 * 60
                # talk page data
                try:
                    # Use a distinct handle so the outer file iterator is not shadowed.
                    with open('/scratch/wiki_dumps/user_data/talk_per_user/%s' % (user)) as uf:
                        edits = pd.read_csv(uf, sep='\t')
                    edits['timestamp_in_sec'] = edits.apply(lambda x: timestamp_2_sec(x['timestamp']), axis=1)
                    info['edits_on_this_talk_page'] = len(edits[(edits['page_title'] == page_title) & (edits['timestamp_in_sec'] < start_time)])
                    info['edits_on_wikipedia_talks'] = len(edits[edits['timestamp_in_sec'] < start_time])
                    # Up to 100 most recent comments posted at least a week before the conversation started.
                    comments[user] = edits[edits['timestamp_in_sec'] < start_time - week].sort_values('timestamp_in_sec', ascending=False).head(100)
                    comments[user] = comments[user]['comment'].values.tolist()
                    comments[user] = [x.replace('NEWLINE', ' ') for x in comments[user]]
                    comments[user] = [x.replace('NEWTAB', ' ') for x in comments[user]]
                except Exception:  # no per-user talk file (or unparsable rows)
                    info['edits_on_this_talk_page'] = 0
                    info['edits_on_wikipedia_talks'] = 0
                    comments[user] = []
                user_features[user] = info
            with open('/scratch/wiki_dumps/expr_with_matching/%s/user_features.json' % (constraint), 'a') as w:
                w.write(json.dumps([conv_id, user_features]) + '\n')
            with open('/scratch/wiki_dumps/expr_with_matching/%s/comments.json' % (constraint), 'a') as w:
                w.write(json.dumps([conv_id, comments]) + '\n')
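Reading the output back is symmetric: one JSON array per line. A minimal sketch, assuming the `delta2_no_users` files written above exist:
In [ ]:
# Minimal sketch: load the per-conversation user features written above.
user_features_by_conv = {}
with open('/scratch/wiki_dumps/expr_with_matching/delta2_no_users/user_features.json') as f:
    for line in f:
        c_id, feats = json.loads(line)
        user_features_by_conv[c_id] = feats
len(user_features_by_conv)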
In [ ]:
last_100_comments = pd.concat(last_100_comments)