This is the first data analysis script used to produce the findings in the paper. You can run it using only the files in this GitHub repository.
This entire notebook can be run from the beginning with Kernel -> Restart & Run All in the menu bar. It takes about 1 minute to run on a laptop running a Core i5-2540M processor.
In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import datetime
%matplotlib inline
In [2]:
# Record the wall-clock time at the start of the run; the final cell
# subtracts this from the end time to report the notebook's total runtime.
start = datetime.datetime.now()
In [3]:
# Decompress the xz-compressed pickle of the 2016 dataframe via the shell.
# -k keeps the compressed .xz file; -f overwrites any existing decompressed copy.
!unxz -kf ../../datasets/parsed_dataframes/df_all_2016.pickle.xz
In [4]:
# List the parsed-dataframe files with human-readable sizes, to confirm that
# the decompressed pickle is now present next to the .xz archive.
!ls -lah ../../datasets/parsed_dataframes/*
In [5]:
# Load the decompressed 2016 dataframe into memory.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a source you trust.
with open("../../datasets/parsed_dataframes/df_all_2016.pickle", "rb") as pickle_file:
    df_all = pickle.load(pickle_file)
In [6]:
# Number of rows in the full dataset.
df_all.shape[0]
Out[6]:
In [7]:
# Peek at two randomly chosen rows, transposed so that each column of the
# dataframe is shown as a row of the display.
df_all.sample(n=2).T
Out[7]:
In [8]:
# Keep only rows whose page_namespace is 0, then group them by language and
# by the year of the reverting edit.
in_namespace_zero = df_all['page_namespace'] == 0
gb = df_all.loc[in_namespace_zero].groupby(["language", "reverting_year"])
In [9]:
# Use a larger font scale for seaborn-styled output.
sns.set(font_scale=1.5)

# Count of non-null rev_id values per group, displayed with one row per
# reverting year and one column per language.
rev_counts_by_language = gb['rev_id'].count().unstack()
rev_counts_by_language.T
Out[9]:
In [10]:
# Line plot of the yearly number of reverts per language (log-scaled y axis),
# saved as reverts-yearly-counts.pdf.
# Figure subject: bot-bot reverts per language by reverting year, articles only
# (no title is drawn on the figure itself).
sns.set(font_scale=1.5)
sns.set_style("whitegrid")

# Rows are reverting years, columns are languages, so each language is one line.
groupby_unstack = gb['revisions_reverted'].count().unstack().transpose()
ax = groupby_unstack.plot(kind='line', logy=True, figsize=[10,6], colormap="Accent")

plt.xlim(2004,2018)
plt.ylabel("Number of bot-bot reverts (log scaled)")
plt.xlabel("Year of reverting edit")

leg = plt.legend()

# Matplotlib 3.7 renamed Legend.legendHandles to Legend.legend_handles and
# later removed the old name; prefer the new attribute and fall back to the
# old one so the cell runs on both old and new Matplotlib.
legend_handles = getattr(leg, "legend_handles", None)
if legend_handles is None:
    legend_handles = leg.legendHandles

# Thicken the legend's line samples so the colours are easy to tell apart.
for legobj in legend_handles:
    legobj.set_linewidth(8.0)

plt.savefig("reverts-yearly-counts.pdf", bbox_inches='tight', dpi=600)
In [11]:
# Total count per language, summed across all reverting years.
language_by_year = gb['rev_id'].count().unstack()
language_by_year.sum(axis=1)
Out[11]:
In [12]:
# Grand total across all languages and reverting years.
year_by_language = gb['rev_id'].count().unstack().T
per_language_sum = year_by_language.sum()
per_language_sum.sum()
Out[12]:
In [13]:
# Group the full dataset (all namespaces) by language and namespace type.
gb_lang_nstype = df_all.groupby(["language", "namespace_type"])

# Non-null revisions_reverted counts, shown with one row per namespace type
# and one column per language.
nstype_counts_by_language = gb_lang_nstype['revisions_reverted'].count().unstack()
nstype_counts_by_language.T
Out[13]:
In [14]:
# Count plot of rows per language, with one bar colour per namespace type,
# saved as reverts-namespace-counts.pdf.
sns.set(font_scale=2)
sns.set_style("whitegrid")

# seaborn 0.9 renamed factorplot() to catplot() and its `size` argument to
# `height`; factorplot() was removed in a later release. Use whichever API
# the installed seaborn provides so the cell runs on old and new versions.
if hasattr(sns, "catplot"):
    draw_counts, height_kwarg = sns.catplot, "height"
else:
    draw_counts, height_kwarg = sns.factorplot, "size"

g = draw_counts(data=df_all,
                x='language',
                y=None,
                hue='namespace_type',
                kind='count',
                palette="Accent",
                aspect=1,
                **{height_kwarg: 8})

plt.savefig("reverts-namespace-counts.pdf", bbox_inches='tight', dpi=600)
In [15]:
# Total count per language, summed across all namespace types.
language_by_nstype = gb_lang_nstype['revisions_reverted'].count().unstack()
language_by_nstype.sum(axis=1)
Out[15]:
In [16]:
# Total count per namespace type, summed across all languages.
counts_per_language_and_nstype = gb_lang_nstype['revisions_reverted'].count().unstack()
counts_per_language_and_nstype.sum()
Out[16]:
In [17]:
# Share of rows falling in each namespace type (fractions that sum to 1).
namespace_shares = df_all['namespace_type'].value_counts(normalize=True)
namespace_shares
Out[17]:
In [18]:
# Share of rows whose namespace type is anything other than 'article'.
article_share = df_all['namespace_type'].value_counts(normalize=True)['article']
1 - article_share
Out[18]:
In [19]:
# Report the notebook's total wall-clock runtime, measured from the `start`
# timestamp recorded near the top of the notebook.
end = datetime.datetime.now()
time_to_run = end - start

# timedelta.seconds excludes the `days` component, so it under-reports any run
# of a day or longer; total_seconds() covers the whole duration.
minutes, seconds = divmod(int(time_to_run.total_seconds()), 60)
print("Total runtime: ", minutes, "minutes, ", seconds, "seconds")
In [20]: