This is a data analysis script used to produce findings in the paper; it can be run entirely from the files in this GitHub repository.
The entire notebook can be re-run from the beginning with Kernel -> Restart & Run All in the menu bar. It takes about 2 minutes on a laptop with a Core i5-2540M processor.
In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import glob
import pickle
import datetime
%matplotlib inline
In [2]:
start = datetime.datetime.now()
In [3]:
!unxz --keep --force ../../datasets/parsed_dataframes/df_all_comments_parsed_2016.pickle.xz
In [4]:
# Load the parsed dataframe of all bot-bot reverts
with open("../../datasets/parsed_dataframes/df_all_comments_parsed_2016.pickle", "rb") as f:
    df_all = pickle.load(f)
In [5]:
len(df_all)
Out[5]:
In [6]:
df_all[0:2].transpose()
Out[6]:
In [7]:
df_all_ns0 = df_all.query("page_namespace == 0")  # namespace 0: articles only
In [8]:
# Per-language counts and percentages of each fine-grained bot revert type
counts_bottype_dict = {}
for lang in df_all_ns0['language'].unique():
    df_lang_ns0 = df_all_ns0[df_all_ns0['language'] == lang]
    type_counts = df_lang_ns0['bottype'].value_counts().rename("count")
    type_percent = df_lang_ns0['bottype'].value_counts(normalize=True).rename("percent") * 100
    type_percent = type_percent.round(2).astype(str) + "%"
    counts_bottype_dict[lang] = pd.concat([type_counts, type_percent], axis=1)

# The same, but for the broader bottype_group categories
counts_bottype_group_dict = {}
for lang in df_all_ns0['language'].unique():
    df_lang_ns0 = df_all_ns0[df_all_ns0['language'] == lang]
    type_counts = df_lang_ns0['bottype_group'].value_counts().rename("count")
    type_percent = df_lang_ns0['bottype_group'].value_counts(normalize=True).rename("percent") * 100
    type_percent = type_percent.round(2).astype(str) + "%"
    counts_bottype_group_dict[lang] = pd.concat([type_counts, type_percent], axis=1)
In [9]:
counts_bottype_group_dict['en']
Out[9]:
In [10]:
prop_bottype_group_df = pd.DataFrame()
In [11]:
pd.set_option('precision', 4)
In [12]:
for df in counts_bottype_group_dict.items():
    concat_df = df[1]['percent']
    concat_df.name = df[0] + " %"
    prop_bottype_group_df = pd.concat([prop_bottype_group_df, concat_df], axis=1)
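As an aside, an equivalent per-language percentage table can be built in one step with pd.crosstab. This is a minimal sketch of that alternative, not how the table below was produced:

# Percent of each language's bot-bot article reverts in each bottype_group;
# normalize='columns' makes every language column sum to 100
alt_prop_df = pd.crosstab(df_all_ns0['bottype_group'],
                          df_all_ns0['language'],
                          normalize='columns') * 100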
In [13]:
prop_bottype_group_df.fillna("---")
Out[13]:
In [14]:
pd.concat([df[1]['percent'], df[1]['percent']], axis=1)
Out[14]:
In [15]:
gb_lang_bottype = df_all_ns0.groupby(["language", "bottype"])['revisions_reverted']
gb_lang_bottype_group = df_all_ns0.groupby(["language", "bottype_group"])['revisions_reverted']
In [16]:
gb_lang_bottype.count().unstack().transpose().replace(np.nan,0)
Out[16]:
In [17]:
gb_lang_bottype_group.count().unstack().transpose().replace(np.nan,0).sort_values(by='en', ascending=False)
Out[17]:
In [18]:
sns.set(font_scale=1.5)
sns.set_style("whitegrid")
sns.set_palette("husl")
gb_lang_bottype_group.sum().unstack().transpose().plot(kind='bar', subplots=False, figsize=[12,6])
Out[18]:
In [19]:
sns.set(font_scale=1.5)
sns.set_style("whitegrid")
sns.set_palette("husl")
gb_lang_bottype_group.sum().unstack().transpose().plot(kind='barh', subplots=False, figsize=[12,6])
plt.xscale("log")
In [20]:
sns.set(font_scale=2)
pal = sns.color_palette("hls", 10)
g = sns.FacetGrid(df_all.query("page_namespace == 0 and language == 'en'"),
                  palette=pal, hue="bottype_group", size=8, aspect=2)
g.map(sns.kdeplot, "time_to_revert_hrs_log10")
#g.add_legend()
leg = plt.legend()
for legobj in leg.legendHandles:
    legobj.set_linewidth(8.0)
g.ax.set_xlim([np.log10(1/90), np.log10(24*365*5)])
g.ax.set_ylim(0, 1.25)
g.ax.set_xticks([np.log10(1/60), np.log10(1), np.log10(24), np.log10(24*7), np.log10(24*30), np.log10(24*365)])
g.ax.set_xticklabels(["minute", "hour", "day", "week", "month", "year"])
Out[20]:
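These plots use the precomputed time_to_revert_hrs_log10 column. Assuming it is the base-10 logarithm of the time to revert in hours, it is consistent with the time_to_revert_days column used later in this notebook; a minimal sketch of that relationship (whether the shipped column was computed exactly this way is an assumption):

# Sketch: days -> hours, then log10; the dataset ships this precomputed
ttr_hrs_log10 = np.log10(df_all['time_to_revert_days'] * 24)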
In [21]:
sns.set(font_scale=2)
pal = sns.color_palette("hls", 10)
g = sns.FacetGrid(df_all.query("page_namespace == 0"),
                  palette=pal, hue="bottype_group", size=8, aspect=2)
g.map(sns.kdeplot, "time_to_revert_hrs_log10")
#g.add_legend()
leg = plt.legend()
for legobj in leg.legendHandles:
    legobj.set_linewidth(8.0)
g.ax.set_xlim([np.log10(1/90), np.log10(24*365*5)])
g.ax.set_ylim(0, 1.25)
g.ax.set_xticks([np.log10(1/60), np.log10(1), np.log10(24), np.log10(24*7), np.log10(24*30), np.log10(24*365)])
g.ax.set_xticklabels(["minute", "hour", "day", "week", "month", "year"])
Out[21]:
In [22]:
sns.set(font_scale=2)
pal = sns.color_palette("husl", 7)
g = sns.FacetGrid(df_all.query("page_namespace == 0"),
                  palette=pal, row="bottype_group", size=3, aspect=4, sharex=False, sharey=False)
g.map(sns.kdeplot, "time_to_revert_hrs_log10")
xticks = [np.log10(1/60), np.log10(1), np.log10(24), np.log10(24*7), np.log10(24*30), np.log10(24*365)]
xticklabels = ["minute", "hour", "day", "week", "month", "year"]
for ax in g.axes.flatten():
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlim(np.log10(1/90), np.log10(24*365*5))
In [23]:
sns.set(font_scale=2.25)
pal = sns.color_palette("husl", 7)
g = sns.FacetGrid(df_all.query("page_namespace == 0"),
                  palette=pal, col="bottype_group", size=3, aspect=4,
                  col_wrap=2, sharex=False, sharey=True)
g.map(sns.kdeplot, "time_to_revert_hrs_log10", shade=True)
xticks = [np.log10(1/60), np.log10(1), np.log10(24), np.log10(24*7), np.log10(24*30), np.log10(24*365)]
xticklabels = ["minute", "hour", "day", "week", "month", "year"]
for ax in g.axes.flatten():
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_ylim(0, 1.25)
    ax.set_xlim(np.log10(1/90), np.log10(24*365*5))
    if ax.colNum == 0:
        ax.set_ylabel("Probability")
    if ax.rowNum == 4:
        ax.set_xlabel("Time to revert")
plt.savefig("ttr-categorized.pdf", dpi=600)
In [24]:
gb_group_per_page = df_all.query("page_namespace == 0").groupby(["language","bottype_group"])['reverts_per_page_botpair_sorted']
In [25]:
gb_group_per_page.mean().unstack()
Out[25]:
In [26]:
# Possible botfights: pages where the same pair of bots reverted each other
# more than once, with each revert happening within 180 days
df_all_ns0_multiple_reverts = df_all_ns0.query("reverts_per_page_botpair_sorted > 1 and time_to_revert_days < 180")
gb_lang_bottype_group_rr = df_all_ns0_multiple_reverts.groupby(["language", "bottype_group"])['revisions_reverted']
In [27]:
gb_lang_bottype_group_rr.count().unstack().transpose().replace(np.nan,0).sort_values(by='en', ascending=False)
Out[27]:
Sum by language
In [28]:
gb_lang_bottype_group_rr.count().unstack().transpose().replace(np.nan,0).sort_values(by='en', ascending=False).sum()
Out[28]:
Total number of reverts that are part of possible botfights
In [30]:
gb_lang_bottype_group_rr.count().unstack().transpose().replace(np.nan,0).sort_values(by='en', ascending=False).sum().sum()
Out[30]:
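Because each group count is just the number of rows in that group, the double sum above should match the row count of the filtered dataframe directly, assuming revisions_reverted is never null:

# Sanity check: equals the double-summed group counts above
len(df_all_ns0_multiple_reverts)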
In [31]:
sns.set(font_scale=1.5)
gb_lang_bottype_group_rr.count().unstack().transpose().replace(np.nan,0).sort_values(by='en', ascending=False).plot(kind='bar')
Out[31]:
This is computed by counting the reverts that were not classified, were classified as a "botfight", or had "revert" in the edit summary.
In [32]:
to_keep = ["botfight", "other w/ revert in comment", "not classified"]
In [33]:
len(df_all_ns0[df_all_ns0['bottype_group'].isin(to_keep)])
Out[33]:
So there are 19,673 bot-bot reverts to articles that we cannot assume are not conflict. What is the proportion? Divide this figure by the total number of bot-bot reverts to articles, then subtract from 1.
In [34]:
1 - len(df_all_ns0[df_all_ns0['bottype_group'].isin(to_keep)])/len(df_all_ns0)
Out[34]:
In [35]:
len(df_all_ns0[df_all_ns0['bottype_group'].isin(to_keep)])/len(df_all_ns0)
Out[35]:
What is the proportion if we restrict to bot-bot reverts where the time to revert was under 180 days and the same pair of bots reverted each other more than once on a particular article, keeping only the cases classified as "botfight", "other w/ revert in comment", or "not classified"?
In [36]:
df_all_ns0_addl_assmpt = df_all_ns0.query("reverts_per_page_botpair_sorted > 1 and time_to_revert_days < 180")
len(df_all_ns0_addl_assmpt[df_all_ns0_addl_assmpt['bottype_group'].isin(to_keep)])
Out[36]:
So under these additional assumptions there are 3,007 possible cases of bot-bot conflict, which makes for what proportion of all bot-bot reverts to articles?
In [37]:
1 - len(df_all_ns0_addl_assmpt[df_all_ns0_addl_assmpt['bottype_group'].isin(to_keep)])/len(df_all_ns0)
Out[37]:
In [38]:
len(df_all_ns0_addl_assmpt[df_all_ns0_addl_assmpt['bottype_group'].isin(to_keep)])/len(df_all_ns0)
Out[38]:
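For reference, the four proportions above can be wrapped in a small helper. This is hypothetical, purely for illustration; the published figures come from the cells above:

# Hypothetical helper: share of reverts that remain possible conflict
# (bottype_group in to_keep), plus its complement
def conflict_share(df, to_keep, total):
    possible = df['bottype_group'].isin(to_keep).sum()
    return possible / total, 1 - possible / total

conflict_share(df_all_ns0, to_keep, len(df_all_ns0))
conflict_share(df_all_ns0_addl_assmpt, to_keep, len(df_all_ns0))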
In [39]:
df_all_ns0_multiple_reverts.to_pickle("../../datasets/parsed_dataframes/possible_botfights.pickle")
df_all_ns0_multiple_reverts.to_csv("../../datasets/parsed_dataframes/possible_botfights.tsv", sep="\t")
In [40]:
!xz -9 -e --keep ../../datasets/parsed_dataframes/possible_botfights.pickle
!xz -9 -e --keep ../../datasets/parsed_dataframes/possible_botfights.tsv
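To reuse the exported dataset elsewhere, either file can be read back with pandas; a sketch using the paths above:

# Load the exported possible-botfights data back into a dataframe
possible_botfights = pd.read_pickle("../../datasets/parsed_dataframes/possible_botfights.pickle")
# or, from the TSV export:
possible_botfights = pd.read_csv("../../datasets/parsed_dataframes/possible_botfights.tsv", sep="\t")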
In [41]:
end = datetime.datetime.now()
time_to_run = end - start
minutes, seconds = divmod(time_to_run.seconds, 60)
print("Total runtime: ", minutes, "minutes, ", seconds, "seconds")