In [12]:
import json
import pickle
import numpy as np
from collections import defaultdict
import os
import re
import gzip
import csv
import datetime
import pandas as pd
import requests
In [30]:
def load_data(page):
    # query the MediaWiki API for the page's subjectid; return -1 if absent
    apilink = 'https://en.wikipedia.org/w/api.php'
    response = requests.get(apilink,
        params={'action': 'query', 'titles': page, 'prop': 'info',
                'inprop': 'subjectid', 'format': 'json'})
    data = response.json()
    # 'pages' is keyed by page id; a single-title query has exactly one entry
    page_dict = data['query']['pages']
    page_info = page_dict[list(page_dict.keys())[0]]
    if 'subjectid' in page_info:
        return page_info['subjectid']
    return -1
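In [ ]:
# Quick check of load_data. 'Talk:Albert Einstein' is only an illustrative
# title (echoing the API example below, not a page from the dataset); for a
# talk page the API reports the id of its subject page, and load_data
# returns -1 when no 'subjectid' field comes back.
print(load_data('Talk:Albert Einstein'))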
In [ ]:
#api.php?action=query&titles=Albert%20Einstein&prop=info&intestactions=edit|delete|rollback&formatversion=2
In [6]:
constraints = ['delta2_no_users', 'delta2_no_users_attacker_in_conv']
In [7]:
users = defaultdict(list)
In [8]:
for constraint in constraints:
    with open('/scratch/wiki_dumps/expr_with_matching/%s/data/all.json' % (constraint)) as f:
        for line in f:
            conv_id, clss, conversation = json.loads(line)
            # a conversation's start time is its earliest action timestamp
            start_time = np.inf
            for action in conversation['action_feature']:
                start_time = min(start_time, action['timestamp_in_sec'])
            # record the start time under each participating user
            for action in conversation['action_feature']:
                if 'user_text' in action:
                    users[action['user_text']].append(start_time)
In [9]:
with open('/scratch/wiki_dumps/user_data/user_timestamp_pair.json', 'w') as w:
    json.dump(users, w)
In [31]:
pages = {}
for constraint in constraints:
    with open('/scratch/wiki_dumps/expr_with_matching/%s/data/all.json' % (constraint)) as f:
        for line in f:
            conv_id, clss, conversation = json.loads(line)
            # every action in a conversation shares the same page title
            page = conversation['action_feature'][0]['page_title']
            pages[page] = load_data(page)
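In [ ]:
# Keep only pages whose subjectid was resolved (load_data returns -1 on
# failure); the counts give a quick sense of API coverage.
resolved = {page: sid for page, sid in pages.items() if sid != -1}
print('%d of %d pages resolved' % (len(resolved), len(pages)))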
In [ ]:
def get_activity_cnt(ns, year, usrlst):
    # extract one year of talk comments for namespace `ns`, keeping only
    # rows written by users in `usrlst`
    os.system('tar -xzf /scratch/wiki_dumps/wikipedia_talk_corpus/comments_%s_%d.tar.gz' % (ns, year))
    data_dir = "comments_%s_%d" % (ns, year)
    total = []
    for _, _, filenames in os.walk(data_dir):
        for filename in filenames:
            if re.match(r"chunk_\d*\.tsv", filename):
                df = pd.read_csv(os.path.join(data_dir, filename), sep="\t")
                df = df[df['user_text'].isin(usrlst)]
                total.append(df)
    ret = pd.concat(total)
    # remove the extracted archive once the chunks are concatenated
    os.system('rm -r comments_%s_%d' % (ns, year))
    return ret
In [ ]:
for year in range(2001, 2016):
    for ns in ['article', 'user']:
        # users_of_interest is built a few cells below
        act = get_activity_cnt(ns, year, users_of_interest)
        with gzip.open("../user_activity_monthly_count/activity_%s_%d.csv.gz" % (ns, year), "wt", encoding="utf-8") as f:
            act.to_csv(f, sep="\t")
    break  # stop after the first year for now
In [20]:
users_of_interest = list(users.keys())
print(len(users_of_interest))
with open('/scratch/wiki_dumps/user_data.json', 'w') as w:
    json.dump(users_of_interest, w)
In [1]:
# Article Activity Data
# From Toktrack https://zenodo.org/record/345571#.WdwhSyErLCL
# revisions.7z
# gather_user_data.py
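In [ ]:
# Hedged sketch of the per-user aggregation that gather_user_data.py
# presumably performs on the TokTrack revisions. The file name and the
# column names ('user_text', 'rev_id') are assumptions — check them
# against the actual revisions.7z schema before running.
# revs = pd.read_csv('revisions.tsv', sep='\t', usecols=['user_text', 'rev_id'])
# revs = revs[revs['user_text'].isin(users_of_interest)]
# article_activity = revs.groupby('user_text').size()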
In [3]:
# Metadata
with open('/scratch/wiki_dumps/user_data/metadata.json') as f:
    usrdata = json.load(f)
# collect the set of all user-group labels that occur in the metadata
values = set([])
for user, data in usrdata.items():
    if 'groups' in data:
        values |= set(data['groups'])
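In [ ]:
# Example use of the group metadata: flag administrators. 'sysop' is the
# standard MediaWiki group name for admins; treating it as the flag of
# interest is an assumption about the downstream analysis.
is_admin = {user: 'sysop' in data.get('groups', [])
            for user, data in usrdata.items()}
print(sum(is_admin.values()), 'admins out of', len(is_admin), 'users')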
In [ ]:
# Talk Page Activity Data
# Only considering insertions
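In [ ]:
# Hedged sketch: count per-user insertions by reusing get_activity_cnt
# above. The assumption is that each row of the chunk_*.tsv files is one
# inserted comment, so a groupby on 'user_text' yields insertion counts.
talk = get_activity_cnt('user', 2015, users_of_interest)
insertion_counts = talk.groupby('user_text').size()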