This notebook loads and parses the reverted_bot2bot
datasets for seven languages, which were created by processing the Wikipedia revision history dumps by the scripts called in the Makefile
in the root directory of the repository. This is the first script that you can run based entirely off the files in this GitHub repository. This generates:
/datasets/parsed_dataframes/df_all_2016.pickle
/datasets/parsed_dataframes/df_all_2016.pickle.xz
These datasets are used for the analyses in section 5 (5-*.ipynb) and as the basis of the comment parsing analysis in sections 7 and 8.
This entire notebook can be run from the beginning with Kernel -> Restart & Run All in the menu bar. On a laptop running a Core i5-2540M processor, it takes about 5 minutes to run, then another 5 minutes to compress to xz.
In [1]:
import datetime
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
pd.set_option("display.max_columns",100)
In [2]:
# Wall-clock start time; the final cell subtracts this to report total runtime.
start = datetime.datetime.now()
In [3]:
!ls -lah ../../datasets/reverted_bot2bot/*.bz2
In [4]:
!bunzip2 -kf ../../datasets/reverted_bot2bot/*.bz2
In [5]:
!ls -lah ../../datasets/reverted_bot2bot/*.tsv
In [6]:
glob.glob("../../datasets/reverted_bot2bot/??wiki_20170420.tsv")
Out[6]:
In [7]:
# Load each per-language TSV of bot-bot reverts into df_dict, keyed by the
# two-letter language code, dropping exact duplicate rows.
df_dict = {}
for filename in glob.glob("../../datasets/reverted_bot2bot/??wiki_20170420.tsv"):
    # Take the language code from the filename itself rather than a
    # hard-coded character offset (filename[32:34]), which silently breaks
    # if the relative path ever changes length.
    lang_code = os.path.basename(filename)[0:2]
    df_dict[lang_code] = pd.read_csv(filename, sep="\t")
    df_dict[lang_code] = df_dict[lang_code].drop_duplicates()
In [8]:
# Quick sanity check: de-duplicated row count per language.
for code, frame in df_dict.items():
    print(code, len(frame))
In [9]:
df_dict['en'][0:2].transpose()
Out[9]:
In [10]:
# Tag each per-language frame with its language code, then combine them into
# one frame with a single concat. (Appending inside the loop is quadratic,
# and the old "empty copy of the en frame" seed is unnecessary.)
frames = []
for lang, lang_df in df_dict.items():
    lang_df['language'] = lang  # mutates the frame stored in df_dict, as before
    frames.append(lang_df)
df_all = pd.concat(frames)
In [11]:
df_all['language'].value_counts()
Out[11]:
In [12]:
len(df_all)
Out[12]:
In [13]:
def namespace_type(item):
    """
    Map a MediaWiki namespace number to a coarse page-type label.
    To be used with df.apply() on ['page_namespace'].
    """
    ns = int(item)
    if ns == 0:
        return 'article'
    if ns == 14:
        return 'category'
    # In MediaWiki, odd-numbered namespaces are the talk counterparts.
    return 'other talk' if ns % 2 == 1 else 'other page'
In [14]:
# Label every revert with the coarse page type of the page it occurred on.
df_all['namespace_type'] = df_all['page_namespace'].apply(namespace_type)
In [15]:
df_all['namespace_type'].value_counts()
Out[15]:
In [16]:
def get_year(timestamp):
    """
    Return the year component of a datetime-like value.
    To be used with df.apply() on a parsed timestamp column.
    """
    return timestamp.year
In [17]:
# Parse both revision timestamps, index the frame by reverting time, and
# derive time-to-revert features in hours/days (plus log10 variants).
df_all['reverting_timestamp_dt'] = pd.to_datetime(df_all['reverting_timestamp'], format="%Y%m%d%H%M%S")
df_all['reverted_timestamp_dt'] = pd.to_datetime(df_all['rev_timestamp'], format="%Y%m%d%H%M%S")
# drop=False keeps the column alongside the index, so we no longer need to
# re-parse the same timestamp a second time after set_index consumes it.
df_all = df_all.set_index('reverting_timestamp_dt', drop=False)
df_all['time_to_revert'] = df_all['reverting_timestamp_dt'] - df_all['reverted_timestamp_dt']
# .dt.total_seconds() instead of .astype('timedelta64[s]'): the astype form
# stopped returning float seconds in pandas 2.0 and now raises/returns a
# timedelta dtype, which would break the divisions and np.log10 below.
df_all['time_to_revert_hrs'] = df_all['time_to_revert'].dt.total_seconds() / (60 * 60)
df_all['time_to_revert_days'] = df_all['time_to_revert'].dt.total_seconds() / (60 * 60 * 24)
df_all['reverting_year'] = df_all['reverting_timestamp_dt'].apply(get_year)
df_all['time_to_revert_days_log10'] = df_all['time_to_revert_days'].apply(np.log10)
df_all['time_to_revert_hrs_log10'] = df_all['time_to_revert_hrs'].apply(np.log10)
In [18]:
# Keep only reverts through the end of 2016, via partial-string .loc slicing
# on the reverting_timestamp_dt index.
# NOTE(review): slicing a non-monotonic DatetimeIndex can raise in newer
# pandas; the concatenated per-language frames may not be time-sorted —
# confirm, or sort_index() first.
df_all = df_all.loc["2001-01-01":"2016-12-31"]
In [19]:
df_all[df_all['language']=='en'].reverting_year.value_counts().sort_index()
Out[19]:
In [20]:
df_all.reverting_year.value_counts().sort_index()
Out[20]:
In [21]:
# by http://stackoverflow.com/questions/14596884/remove-text-between-and-in-python
def remove_brackets(test_str):
    """
    Takes a string and returns that string with text in [brackets] and
    (parentheses) removed, including the delimiters themselves. Nesting is
    handled by depth counters; unmatched closers outside any group are kept.
    Runs of whitespace in the result are collapsed to single spaces.
    """
    text = str(test_str)
    kept = []
    bracket_depth = 0
    paren_depth = 0
    for ch in text:
        if ch == '[':
            bracket_depth += 1
        elif ch == '(':
            paren_depth += 1
        elif ch == ']' and bracket_depth > 0:
            bracket_depth -= 1
        elif ch == ')' and paren_depth > 0:
            paren_depth -= 1
        elif bracket_depth == 0 and paren_depth == 0:
            kept.append(ch)
    return " ".join("".join(kept).split())
In [22]:
# Strip [bracketed]/(parenthesized) markup from revert comments, for the
# later comment-text analysis.
df_all['reverting_comment_nobracket'] = df_all['reverting_comment'].apply(remove_brackets)
In [23]:
def concat_botpair(row):
    """
    Build the directed "X rv Y" label for a revert: reverting user name,
    then the reverted user name. To be used with df.apply() on the entire row.
    """
    return " rv ".join([str(row['reverting_user_text']), str(row['rev_user_text'])])


def sorted_botpair(row):
    """
    Return the string repr of the sorted [user, user] pair, so the same two
    bots map to one key regardless of revert direction. list.sort() is locale
    dependent, but it doesn't matter because all we need is a consistent way
    of uniquely sorting.
    """
    pair = [row['reverting_user_text'], row['rev_user_text']]
    pair.sort()
    return str(pair)
In [24]:
# Directed bot-pair label ("reverting rv reverted") for every revert row.
df_all['botpair'] = df_all.apply(concat_botpair, axis=1)
In [25]:
# Undirected (sorted) bot-pair label, so A-reverts-B and B-reverts-A share a key.
df_all['botpair_sorted'] = df_all.apply(sorted_botpair, axis=1)
In [26]:
# Group reverts by (language, page, bot pair): directed and undirected variants.
gb_lpb = df_all.groupby(["language", "rev_page", "botpair"])
gb_lpb_s = df_all.groupby(["language", "rev_page", "botpair_sorted"])
In [27]:
# Revert count per (language, page, directed bot pair), as a flat frame.
df_lpb = (
    gb_lpb['rev_id']
    .count()
    .rename("reverts_per_page_botpair")
    .reset_index()
)
df_lpb[0:5]
Out[27]:
In [28]:
# Revert count per (language, page, undirected bot pair), as a flat frame.
df_lpb_s = (
    gb_lpb_s['rev_id']
    .count()
    .rename("reverts_per_page_botpair_sorted")
    .reset_index()
)
df_lpb_s[0:5]
Out[28]:
In [29]:
# Attach the per-(page, bot pair) revert counts back onto the revert-level
# frame. The join keys are identically named on both sides, so `on=` suffices.
df_all = df_all.merge(df_lpb, how='left',
                      on=["language", "rev_page", "botpair"])
df_all = df_all.merge(df_lpb_s, how='left',
                      on=["language", "rev_page", "botpair_sorted"])
In [30]:
len(df_all.query("time_to_revert_days < 0"))
Out[30]:
In [31]:
len(df_all.query("time_to_revert_days > 0"))
Out[31]:
In [32]:
df_all.query("time_to_revert_days < 0").groupby("language")['rev_id'].count()
Out[32]:
In [33]:
df_all.query("time_to_revert_days > 0").groupby("language")['rev_id'].count()
Out[33]:
In [34]:
len(df_all)
Out[34]:
In [35]:
df_all.sample(2).transpose()
Out[35]:
In [36]:
!rm ../../datasets/parsed_dataframes/df_all_2016.p*
In [37]:
df_all.to_pickle("../../datasets/parsed_dataframes/df_all_2016.pickle")
In [38]:
!xz -k -e -9 ../../datasets/parsed_dataframes/df_all_2016.pickle
In [39]:
df_all.to_csv("../../datasets/parsed_dataframes/df_all_2016.tsv", sep="\t")
In [40]:
!xz -k -e -9 ../../datasets/parsed_dataframes/df_all_2016.tsv
In [41]:
# Report total notebook runtime against the `start` captured in the first cell.
end = datetime.datetime.now()
time_to_run = end - start
# total_seconds() rather than .seconds: .seconds only counts the sub-day
# remainder, so a run longer than 24 hours would be silently truncated.
total_seconds = int(time_to_run.total_seconds())
minutes, seconds = divmod(total_seconds, 60)
print("Total runtime: ", minutes, "minutes, ", seconds, "seconds")