In [1]:
!pip install pandas seaborn mwapi
import pandas as pd
import seaborn as sns
import mwapi
import numpy as np
%matplotlib inline


Requirement already satisfied: pandas in /home/staeiou/conda/lib/python3.5/site-packages
Requirement already satisfied: seaborn in /home/staeiou/conda/lib/python3.5/site-packages
Requirement already satisfied: mwapi in /home/staeiou/conda/lib/python3.5/site-packages
Requirement already satisfied: python-dateutil>=2 in /home/staeiou/conda/lib/python3.5/site-packages (from pandas)
Requirement already satisfied: pytz>=2011k in /home/staeiou/conda/lib/python3.5/site-packages (from pandas)
Requirement already satisfied: numpy>=1.7.0 in /home/staeiou/conda/lib/python3.5/site-packages (from pandas)
Requirement already satisfied: requests in /home/staeiou/conda/lib/python3.5/site-packages (from mwapi)
Requirement already satisfied: six>=1.5 in /home/staeiou/conda/lib/python3.5/site-packages (from python-dateutil>=2->pandas)

Load data and initalize mwapi


In [2]:
languages = ["en", "de", "es", "ja", "zh", "ar", "hu", "fr"]

sessions = {}
for language in languages:
    sessions[language] = mwapi.Session("https://" + language + ".wikipedia.org",
                                       user_agent="Research script by User:Staeiou")
    
sessions['meta'] = mwapi.Session("https://meta.wikimedia.org",
                                       user_agent="Research script by User:Staeiou")

sessions['commons'] = mwapi.Session("https://commons.wikimedia.org",
                                       user_agent="Research script by User:Staeiou")

In [3]:
with open("../../datasets/crosswiki_unified_bot_20170328.tsv") as f:
    bots = f.read()
    bot_list = bots.split("\n")
    
print(len(bot_list))


6523

In [ ]:


In [4]:
result = sessions['en'].get(action='query',
                                            list='allusers',
                                            auprop='editcount',
                                            aulimit=1,
                                            aufrom="ClueBot_NG")

In [5]:
result['query']['allusers'][0]['editcount']


Out[5]:
4440861

Function for querying API to get edit counts


In [6]:
def get_edit_count(language):
    """
    With list of all bots, get the datetime of first edit for a language version.
    Bots with no edits have a datetime of np.nan
    
    Parameters:
        language, a subdomain of en.wikipedia.org (that has a Session obj in sessions[])
    Returns:
        first_edit_df, a dataframe containing {bot name, datetime of first edit}
    """
    first_edit_df = pd.DataFrame(columns=["bot", "language", "edit_count"])

    for bot in bot_list:
        try:
            result = sessions[language].get(action='query',
                                            list='allusers',
                                            auprop='editcount',
                                            aulimit=1,
                                            aufrom=bot)
            editcount = result['query']['allusers'][0]['editcount']
        except:
            editcount = np.nan

        if editcount is not np.nan:
            bot_dict = {'bot':bot, 'language':language, 'edit_count' : editcount}
            #print(bot_dict)
            first_edit_df = first_edit_df.append(bot_dict, ignore_index=True)
            #print(first_edit_df)

        if len(first_edit_df) % 500 == 0:
            print(len(first_edit_df), sep=" ")
    return first_edit_df

Get first edit dataframes for each language


In [7]:
first_edit_df_dict = {}

for language in languages:
    print(language)
    print("-----")
    first_edit_df_dict[language] = get_edit_count(language)


en
-----
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
de
-----
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
es
-----
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
ja
-----
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
zh
-----
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
ar
-----
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
hu
-----
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
fr
-----
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500

In [10]:
import pickle
with open("bot_edits_per_lang_dict.pickle", "wb") as f:
    pickle.dump(first_edit_df_dict, f)

In [12]:
bot_edits_dict = first_edit_df_dict

In [14]:
tidy_df = pd.DataFrame(columns=["bot", "language", "edit_count"])
for lang in bot_edits_dict.keys():
    tidy_df = tidy_df.append(bot_edits_dict[lang])

In [19]:
tidy_df = tidy_df.drop_duplicates()

In [22]:
tidy_df.groupby("language")['edit_count'].sum()


Out[22]:
language
ar     10832578.0
de     17358680.0
en    104277118.0
es     19280270.0
fr     22658191.0
hu      5971308.0
ja      7660553.0
zh     11678907.0
Name: edit_count, dtype: float64

In [ ]:


In [23]:
with open("../comments/df_all_comments_parsed.pickle", "rb") as f:
    df_all = pickle.load(f)

In [26]:
df_all.groupby("language")['revisions_reverted'].sum()


Out[26]:
language
de     188344
en    1501386
es     249560
fr     253060
ja     121652
pt     187968
zh     142870
Name: revisions_reverted, dtype: int64

In [27]:
1501386/104277118


Out[27]:
0.01439803888711232

In [28]:
188344/17358680


Out[28]:
0.01085013376593151

In [29]:
121652/7660553


Out[29]:
0.015880315690003057

Export raw dataset to pickle


In [ ]:
import pickle

In [ ]:
with open("bot_first_edit_df_dict.pickle", "wb") as f:
    pickle.dump(first_edit_df_dict, f)

Make a tidy dataframe

One dataframe, one row for each bot in each language


In [ ]:
tidy_df = pd.DataFrame(columns=["bot","language","first_edit"])

for lang, lang_df in first_edit_df_dict.items():
    for index, row in lang_df.iterrows():        
        bot_dict = {"bot":row['bot'], "language":lang, "first_edit":row['first_edit']}
        tidy_df = tidy_df.append(bot_dict, ignore_index=True)

In [ ]:
tidy_df['first_edit_dt'] = pd.to_datetime(tidy_df['first_edit'], format="%Y-%m-%dT%H:%M:%SZ")

In [ ]:
tidy_df_i = tidy_df.set_index('first_edit_dt')

In [ ]:
tidy_df_i.to_pickle("bot_first_edit_tidy_df.pickle")

In [ ]:
!bzip2 -k bot_first_edit_tidy_df.pickle

Group by number of new bots per langauge per month


In [ ]:
gb = tidy_df_i.groupby([pd.TimeGrouper("1M"), 'language'])['bot']

In [ ]:
monthly_lang = gb.count().unstack()
monthly_lang[0:10]

Plot number of new bots per month per language


In [ ]:
sns.set(font_scale=2)

In [ ]:
axes = monthly_lang.plot(kind="area", figsize=[12,17], subplots=True, sharey=True)

In [ ]:
pal = sns.color_palette("husl", 7)

for lang, lang_df in first_edit_df_dict.items():
        
    ax = lang_df['bot'].groupby([pd.TimeGrouper("1M")]).count().cumsum().plot(figsize=[14,8], logy=False)

    ax.set_xlabel("Date")
    ax.set_ylabel("# of approved bots")
plt.suptitle("Number of bots that have ever run in various Wikipedia editions")
leg = plt.legend(first_edit_df_dict.keys())
for legobj in leg.legendHandles:
    legobj.set_linewidth(8.0)

In [ ]:
pal = sns.color_palette("husl", 7)

for lang, lang_df in first_edit_df_dict.items():
        
    ax = lang_df['bot'].groupby([pd.TimeGrouper("1M")]).count().cumsum().plot(figsize=[14,8], logy=True)

    ax.set_xlabel("Date")
    ax.set_ylabel("# of approved bots")
plt.suptitle("Number of bots that have ever run in various Wikipedia editions")
leg = plt.legend(first_edit_df_dict.keys())
for legobj in leg.legendHandles:
    legobj.set_linewidth(8.0)

In [ ]:
import matplotlib.pyplot as plt
sns.set(font_scale=1.5)

ax = first_edit_df_dict['en']['bot'].groupby([pd.TimeGrouper("1D")]).count().cumsum().plot(figsize=[14,4])
ax.set_xlabel("Date")
ax.set_ylabel("cumulative # of bots")
plt.suptitle("Total number of bots that have ever run on the English-language Wikipedia")

In [ ]:


In [ ]:


In [ ]: