In [1]:
!pip install pandas seaborn mwapi
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mwapi
%matplotlib inline
In [2]:
# One API session per wiki we will query; all sessions share one UA string.
languages = ["en", "de", "es", "ja", "zh", "ar", "hu", "fr"]
ua = "Research script by User:Staeiou"
sessions = {lang: mwapi.Session("https://%s.wikipedia.org" % lang, user_agent=ua)
            for lang in languages}
sessions['meta'] = mwapi.Session("https://meta.wikimedia.org", user_agent=ua)
sessions['commons'] = mwapi.Session("https://commons.wikimedia.org", user_agent=ua)
In [3]:
# Load the unified cross-wiki bot list (one bot user name per line).
with open("../../datasets/crosswiki_unified_bot_20170328.tsv") as f:
    bots = f.read()
# splitlines() + filtering avoids the trailing empty entry that
# bots.split("\n") leaves when the file ends with a newline; an empty
# name would otherwise be sent to the API as a bogus user lookup below.
bot_list = [name for name in bots.splitlines() if name]
print(len(bot_list))
In [ ]:
In [4]:
# Sanity check: look up the edit count for a single well-known bot on enwiki.
query = dict(action='query',
             list='allusers',
             auprop='editcount',
             aulimit=1,
             aufrom="ClueBot_NG")
result = sessions['en'].get(**query)
In [5]:
# Edit count of the first user returned (expected to be ClueBot NG itself).
result['query']['allusers'][0]['editcount']
Out[5]:
In [6]:
def get_edit_count(language, bots=None, session=None):
    """
    Query a wiki's API for the total edit count of every bot in a list.

    Parameters:
        language: key into the global ``sessions`` dict (e.g. "en");
            also recorded in the output's "language" column.
        bots: iterable of bot user names; defaults to the global ``bot_list``.
        session: object with a ``.get(**kwargs)`` API method (an
            ``mwapi.Session``); defaults to ``sessions[language]``.

    Returns:
        pd.DataFrame with columns ["bot", "language", "edit_count"], one
        row per bot that was actually found on that wiki (bots with no
        account there are skipped).
    """
    if bots is None:
        bots = bot_list              # module-level list loaded above
    if session is None:
        session = sessions[language]
    records = []
    for bot in bots:
        try:
            result = session.get(action='query',
                                 list='allusers',
                                 auprop='editcount',
                                 aulimit=1,
                                 aufrom=bot)
            users = result['query']['allusers']
            if not users:
                continue
            name = users[0]['name']
            editcount = users[0]['editcount']
        except Exception:
            # Broad on purpose: a failed lookup just skips this bot.
            # (Not a bare ``except:``, so KeyboardInterrupt still works.)
            continue
        # ``aufrom`` returns the first user *at or after* the given name
        # alphabetically, so a bot missing from this wiki would otherwise
        # be credited with its neighbour's edit count. MediaWiki reports
        # names with spaces where the bot list may use underscores, so
        # normalise both sides before comparing.
        if name.replace(' ', '_') != bot.replace(' ', '_'):
            continue
        records.append({'bot': bot,
                        'language': language,
                        'edit_count': editcount})
        if len(records) % 500 == 0:
            print(len(records))      # coarse progress indicator
    # Build the frame once at the end: row-by-row DataFrame.append was
    # quadratic and was removed in pandas 2.0.
    return pd.DataFrame(records, columns=["bot", "language", "edit_count"])
In [7]:
# Crawl every language edition. This makes one API request per bot per
# wiki, so it is slow — results are pickled in the next cell.
first_edit_df_dict = {}
for lang in languages:
    print(lang)
    print("-----")
    first_edit_df_dict[lang] = get_edit_count(lang)
In [10]:
import pickle
# Persist the per-language frames so the slow API crawl above does not
# have to be repeated on a re-run.
with open("bot_edits_per_lang_dict.pickle", "wb") as f:
    pickle.dump(first_edit_df_dict, f)
In [12]:
# Alias under a more accurate name: these frames hold edit counts,
# not first-edit timestamps.
bot_edits_dict = first_edit_df_dict
In [14]:
# Combine the per-language frames into one tidy frame (one row per
# bot/language pair). pd.concat builds it in a single pass; row-by-row
# DataFrame.append was quadratic and was removed in pandas 2.0.
tidy_df = pd.concat(bot_edits_dict.values())[["bot", "language", "edit_count"]]
In [19]:
# Drop exact duplicate rows (bot, language and edit_count all equal).
tidy_df = tidy_df.drop_duplicates()
In [22]:
# Total bot edits per language edition.
tidy_df.groupby("language")['edit_count'].sum()
Out[22]:
In [ ]:
In [23]:
# Load previously-parsed revision/comment data from a sibling analysis.
# NOTE: pickle.load can execute arbitrary code — only open trusted files.
with open("../comments/df_all_comments_parsed.pickle", "rb") as f:
    df_all = pickle.load(f)
In [26]:
# Total reverted revisions per language.
# NOTE(review): assumes df_all (loaded above) has 'language' and
# 'revisions_reverted' columns — confirm against the comments notebook.
df_all.groupby("language")['revisions_reverted'].sum()
Out[26]:
In [27]:
# Back-of-the-envelope revert rate: reverted revisions / total bot edits.
# Presumably values copied from the two groupby outputs above for one
# language — TODO confirm which.
1501386/104277118
Out[27]:
In [28]:
# Revert rate for a second language (values presumably copied from the
# groupby outputs above — TODO confirm which).
188344/17358680
Out[28]:
In [29]:
# Revert rate for a third language (values presumably copied from the
# groupby outputs above — TODO confirm which).
121652/7660553
Out[29]:
In [ ]:
# Redundant: pickle was already imported above; harmless on re-run.
import pickle
In [ ]:
# NOTE(review): filename says "first_edit" but first_edit_df_dict was
# built by get_edit_count and holds edit counts — confirm intent before
# relying on this pickle downstream.
with open("bot_first_edit_df_dict.pickle", "wb") as f:
    pickle.dump(first_edit_df_dict, f)
In [ ]:
# Flatten the per-language frames into one tidy frame of first edits.
# Build a list of records and construct the frame once: row-by-row
# DataFrame.append was quadratic and was removed in pandas 2.0.
# NOTE(review): this expects a 'first_edit' column, but the frames built
# by get_edit_count above carry 'edit_count' — confirm which pickle these
# frames were loaded from.
records = [
    {"bot": row["bot"], "language": lang, "first_edit": row["first_edit"]}
    for lang, lang_df in first_edit_df_dict.items()
    for _, row in lang_df.iterrows()
]
tidy_df = pd.DataFrame(records, columns=["bot", "language", "first_edit"])
In [ ]:
# Parse MediaWiki ISO-8601 timestamps (e.g. "2010-01-02T03:04:05Z").
tidy_df['first_edit_dt'] = pd.to_datetime(tidy_df['first_edit'], format="%Y-%m-%dT%H:%M:%SZ")
In [ ]:
# A datetime index enables the time-based grouping used below.
tidy_df_i = tidy_df.set_index('first_edit_dt')
In [ ]:
# Persist the tidy, datetime-indexed frame for reuse elsewhere.
tidy_df_i.to_pickle("bot_first_edit_tidy_df.pickle")
In [ ]:
# Compress a copy for distribution; -k keeps the uncompressed original.
!bzip2 -k bot_first_edit_tidy_df.pickle
In [ ]:
# Monthly counts of new bots per language, grouped on the datetime index.
# pd.TimeGrouper was removed in pandas 1.0; pd.Grouper(freq=...) is the
# supported replacement.
gb = tidy_df_i.groupby([pd.Grouper(freq="1M"), 'language'])['bot']
In [ ]:
# Pivot languages into columns: one row per month, one column per language.
monthly_lang = gb.count().unstack()
# Peek at the first ten months.
monthly_lang[0:10]
In [ ]:
# Larger fonts for the figures below.
sns.set(font_scale=2)
In [ ]:
# Small multiples: one area chart of monthly new bots per language,
# sharing the y-axis so panels are comparable.
axes = monthly_lang.plot(kind="area", figsize=[12,17], subplots=True, sharey=True)
In [ ]:
# Cumulative number of bots over time, one line per language (linear y).
# pd.TimeGrouper was removed in pandas 1.0; pd.Grouper(freq=...) is the
# replacement. The unused ``pal`` palette assignment was dropped.
# NOTE(review): grouping a Series by pd.Grouper requires a datetime index
# on these frames — confirm first_edit_df_dict holds datetime-indexed
# frames at this point.
for lang, lang_df in first_edit_df_dict.items():
    ax = (lang_df['bot']
          .groupby(pd.Grouper(freq="1M"))
          .count()
          .cumsum()
          .plot(figsize=[14, 8], logy=False))
ax.set_xlabel("Date")
ax.set_ylabel("# of approved bots")
plt.suptitle("Number of bots that have ever run in various Wikipedia editions")
leg = plt.legend(first_edit_df_dict.keys())
# NOTE(review): Legend.legendHandles was renamed legend_handles in
# matplotlib 3.7 — update if the environment is newer.
for legobj in leg.legendHandles:
    legobj.set_linewidth(8.0)
In [ ]:
# Same cumulative-bots figure as above but with a logarithmic y-axis.
# pd.TimeGrouper was removed in pandas 1.0; pd.Grouper(freq=...) is the
# replacement. The unused ``pal`` palette assignment was dropped.
# NOTE(review): grouping a Series by pd.Grouper requires a datetime index
# on these frames — confirm first_edit_df_dict holds datetime-indexed
# frames at this point.
for lang, lang_df in first_edit_df_dict.items():
    ax = (lang_df['bot']
          .groupby(pd.Grouper(freq="1M"))
          .count()
          .cumsum()
          .plot(figsize=[14, 8], logy=True))
ax.set_xlabel("Date")
ax.set_ylabel("# of approved bots")
plt.suptitle("Number of bots that have ever run in various Wikipedia editions")
leg = plt.legend(first_edit_df_dict.keys())
# NOTE(review): Legend.legendHandles was renamed legend_handles in
# matplotlib 3.7 — update if the environment is newer.
for legobj in leg.legendHandles:
    legobj.set_linewidth(8.0)
In [ ]:
import matplotlib.pyplot as plt

sns.set(font_scale=1.5)
# Daily cumulative count of bots for the English-language Wikipedia only.
# pd.TimeGrouper was removed in pandas 1.0; pd.Grouper is the replacement.
# NOTE(review): requires first_edit_df_dict['en'] to be datetime-indexed.
ax = (first_edit_df_dict['en']['bot']
      .groupby(pd.Grouper(freq="1D"))
      .count()
      .cumsum()
      .plot(figsize=[14, 4]))
ax.set_xlabel("Date")
ax.set_ylabel("cumulative # of bots")
plt.suptitle("Total number of bots that have ever run on the English-language Wikipedia")
In [ ]:
In [ ]:
In [ ]: