This notebook is the data analysis script used to produce the findings in this section of the paper. It can be run entirely from the files in this GitHub repository.
This entire notebook can be run from the beginning with Kernel -> Restart & Run All in the menu bar. It takes about 2 minutes to run on a laptop with a Core i5-2540M processor.
In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import glob
import pickle
import datetime
%matplotlib inline
In [2]:
# Record the wall-clock time at the start of the run; the last cell subtracts
# this from the end time to print the notebook's total runtime.
start = datetime.datetime.now()
In [3]:
# Decompress the xz-compressed pickle in place: -k keeps the .xz archive and
# -f overwrites an existing decompressed file, so the cell can be re-run.
!unxz -kf ../../datasets/parsed_dataframes/df_all_2016.pickle.xz
In [4]:
# List everything matching ../../datasets/parsed_dataframes/*; the output is
# only displayed, it is not captured or used by later cells.
!ls ../../datasets/parsed_dataframes/*
In [5]:
# Load the decompressed DataFrame that every later cell reads as `df_all`.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling, so
# only run this against a pickle file you trust.
with open("../../datasets/parsed_dataframes/df_all_2016.pickle", "rb") as f:
    df_all = pickle.load(f)
In [6]:
# Count, per language, the rows whose time_to_revert_days is negative.
(
    df_all
    .query("time_to_revert_days < 0")
    .groupby("language")["time_to_revert_days"]
    .count()
)
Out[6]:
In [7]:
# Summary statistics of time_to_revert_days over all namespace-0 rows.
(
    df_all
    .query("page_namespace == 0")["time_to_revert_days"]
    .describe()
)
Out[7]:
In [8]:
# Namespace-0 rows grouped by language.
ns0_gb_language = (
    df_all
    .query("page_namespace == 0")
    .groupby("language")
)

# Per-language describe() of time_to_revert_days, unstacked for display.
(
    ns0_gb_language["time_to_revert_days"]
    .describe()
    .unstack()
)
Out[8]:
In [9]:
# Namespace-0 rows grouped by (language, reverting_year).
ns0_gb_language_year = (
    df_all
    .query("page_namespace == 0")
    .groupby(["language", "reverting_year"])
)

# Mean time_to_revert_days per group, with reverting_year pivoted into columns.
(
    ns0_gb_language_year["time_to_revert_days"]
    .mean()
    .unstack()
)
Out[9]:
In [10]:
# Rebuild the (language, reverting_year) grouping of namespace-0 rows so this
# cell does not depend on any other cell having defined it.
ns0_gb_language_year = (
    df_all
    .query("page_namespace == 0")
    .groupby(["language", "reverting_year"])
)

# Median time_to_revert_days per group, with reverting_year pivoted into columns.
(
    ns0_gb_language_year["time_to_revert_days"]
    .median()
    .unstack()
)
Out[10]:
In [11]:
# Distribution of log10(hours to revert) for English-language, namespace-0 rows.
# NOTE(review): `size=` and `sns.distplot` are deprecated in newer seaborn
# releases -- confirm the seaborn version this notebook is pinned to.
sns.set(font_scale=2)

en_ns0_rows = df_all.query("page_namespace == 0 and language == 'en'")
g = sns.FacetGrid(en_ns0_rows, size=8, aspect=2)
g.map(sns.distplot, "time_to_revert_hrs_log10", bins=25)

# The x axis is in log10(hours); place one tick at each human-readable duration.
tick_hours = (1/60, 1, 24, 24*7, 24*30, 24*365)
g.ax.set_xticks([np.log10(hours) for hours in tick_hours])
g.ax.set_xticklabels(["minute", "hour", "day", "week", "month", "year"])

# Show from just under one minute (1/90 hour) up to five years.
g.ax.set_xlim(np.log10(1/90), np.log10(24*365*5))
g.ax.set_ylabel("Kernel Density Estimate value")
g.ax.set_xlabel("Time to revert (log10 scaled)")
Out[11]:
In [19]:
# Figure: distribution of time to revert for article (namespace 0) reverts,
# one KDE curve per language, saved to time-to-revert-log-articles.pdf.
# NOTE(review): `size=` (seaborn) and `Legend.legendHandles` (matplotlib) are
# deprecated in newer releases -- confirm the pinned library versions.
sns.set(font_scale=3.25, rc={'lines.linewidth': 4})
pal = sns.color_palette("hls", 10)

ns0_rows = df_all.query("page_namespace == 0")
g = sns.FacetGrid(ns0_rows, palette=pal, hue="language", size=8, aspect=3.5)
g.map(sns.kdeplot, "time_to_revert_hrs_log10")

# The legend is created through pyplot instead of g.add_legend(), and its
# line samples are thickened so they stay readable at this font scale.
leg = plt.legend()
for handle in leg.legendHandles:
    handle.set_linewidth(8.0)

# Show from just under one minute (1/90 hour) up to five years.
g.ax.set_xlim(np.log10(1/90), np.log10(24*365*5))
g.ax.set_ylabel("KDE probability")
g.ax.set_xlabel("Time to revert (log10 scaled)")

# The x axis is in log10(hours); place one tick at each human-readable duration.
tick_hours = (1/60, 1, 24, 24*7, 24*30, 24*365)
g.ax.set_xticks([np.log10(hours) for hours in tick_hours])
g.ax.set_xticklabels(["minute", "hour", "day", "week", "month", "year"])

plt.savefig("time-to-revert-log-articles.pdf", bbox_inches='tight', dpi=600)
In [13]:
# One distribution panel per language (one row each, independent x and y axes)
# of log10(hours to revert) for namespace-0 rows.
# NOTE(review): `size=` and `sns.distplot` are deprecated in newer seaborn
# releases -- confirm the seaborn version this notebook is pinned to.
sns.set(font_scale=1.25)
pal = sns.color_palette("husl", 7)

ns0_rows = df_all.query("page_namespace == 0")
g = sns.FacetGrid(
    ns0_rows,
    palette=pal,
    row="language",
    size=3,
    aspect=4,
    sharex=False,
    sharey=False,
)
g.map(sns.distplot, "time_to_revert_hrs_log10")

# The x axis is in log10(hours); ticks sit at human-readable durations.
xticks = [np.log10(1/60), np.log10(1), np.log10(24), np.log10(24*7), np.log10(24*30), np.log10(24*365)]
xticklabels = ["minute", "hour", "day", "week", "month", "year"]

# Axes are not shared, so ticks, labels and limits are applied to every panel.
# NOTE(review): assumes all three calls belong inside the loop -- the exported
# source lost its indentation; confirm against the original notebook.
for ax in g.axes.flat:
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlim(np.log10(1/90), np.log10(24*365*5))
In [20]:
# Figure: mean days to revert by language and reverting year (articles only,
# 95% CI bars), saved to mean-days-to-revert-articles.pdf.
# NOTE(review): `sns.factorplot` and `size=` are deprecated in newer seaborn
# releases -- confirm the seaborn version this notebook is pinned to.
sns.set(font_scale=3)

article_rows = df_all[df_all['page_namespace']==0]
g = sns.factorplot(
    data=article_rows,
    x="language",
    y="time_to_revert_days",
    hue="reverting_year",
    kind="bar",
    size=10,
    aspect=3,
)

plt.savefig("mean-days-to-revert-articles.pdf", bbox_inches='tight', dpi=600)
In [15]:
# Same bar chart as the mean-days figure, but each bar is the median
# (estimator=np.median) of time_to_revert_days; displayed only, not saved.
# NOTE(review): `sns.factorplot` and `size=` are deprecated in newer seaborn
# releases -- confirm the seaborn version this notebook is pinned to.
sns.set(font_scale=3)

article_rows = df_all[df_all['page_namespace']==0]
g = sns.factorplot(
    data=article_rows,
    x="language",
    y="time_to_revert_days",
    hue="reverting_year",
    kind="bar",
    estimator=np.median,
    size=10,
    aspect=3,
)

plt.suptitle("Median days to revert by language and reverting year (articles only, 95% CI bars)")
Out[15]:
In [16]:
# Report how long the whole notebook took, measured from `start` (set in the
# first timing cell) to now.
end = datetime.datetime.now()
time_to_run = end - start

# timedelta.seconds only holds the seconds component left over after whole days,
# so use total_seconds() to avoid under-reporting a run that lasts over 24 hours.
minutes, seconds = divmod(int(time_to_run.total_seconds()), 60)
print("Total runtime: ", minutes, "minutes, ", seconds, "seconds")