In [1]:

    
!pip install pandas seaborn mwapi
import pandas as pd
import seaborn as sns
import mwapi
import numpy as np
%matplotlib inline









    



Requirement already satisfied: pandas in /home/staeiou/conda/lib/python3.5/site-packages
Requirement already satisfied: seaborn in /home/staeiou/conda/lib/python3.5/site-packages
Requirement already satisfied: mwapi in /home/staeiou/conda/lib/python3.5/site-packages
Requirement already satisfied: python-dateutil>=2 in /home/staeiou/conda/lib/python3.5/site-packages (from pandas)
Requirement already satisfied: pytz>=2011k in /home/staeiou/conda/lib/python3.5/site-packages (from pandas)
Requirement already satisfied: numpy>=1.7.0 in /home/staeiou/conda/lib/python3.5/site-packages (from pandas)
Requirement already satisfied: requests in /home/staeiou/conda/lib/python3.5/site-packages (from mwapi)
Requirement already satisfied: six>=1.5 in /home/staeiou/conda/lib/python3.5/site-packages (from python-dateutil>=2->pandas)

Load data and initalize mwapi



In [2]:

    
languages = ["en", "de", "es", "ja", "zh", "ar", "hu", "fr"]

sessions = {}
for language in languages:
    sessions[language] = mwapi.Session("https://" + language + ".wikipedia.org",
                                       user_agent="Research script by User:Staeiou")
    
sessions['meta'] = mwapi.Session("https://meta.wikimedia.org",
                                       user_agent="Research script by User:Staeiou")

sessions['commons'] = mwapi.Session("https://commons.wikimedia.org",
                                       user_agent="Research script by User:Staeiou")



In [3]:

    
with open("../../datasets/crosswiki_unified_bot_20170328.tsv") as f:
    bots = f.read()
    bot_list = bots.split("\n")
    
print(len(bot_list))



In [ ]:



In [4]:

    
result = sessions['en'].get(action='query',
                                            list='allusers',
                                            auprop='editcount',
                                            aulimit=1,
                                            aufrom="ClueBot_NG")



In [5]:

    
result['query']['allusers'][0]['editcount']









    Out[5]:





4440861

Function for querying API to get edit counts



In [6]:

    
def get_edit_count(language):
    """
    With list of all bots, get the datetime of first edit for a language version.
    Bots with no edits have a datetime of np.nan
    
    Parameters:
        language, a subdomain of en.wikipedia.org (that has a Session obj in sessions[])
    Returns:
        first_edit_df, a dataframe containing {bot name, datetime of first edit}
    """
    first_edit_df = pd.DataFrame(columns=["bot", "language", "edit_count"])

    for bot in bot_list:
        try:
            result = sessions[language].get(action='query',
                                            list='allusers',
                                            auprop='editcount',
                                            aulimit=1,
                                            aufrom=bot)
            editcount = result['query']['allusers'][0]['editcount']
        except:
            editcount = np.nan

        if editcount is not np.nan:
            bot_dict = {'bot':bot, 'language':language, 'edit_count' : editcount}
            #print(bot_dict)
            first_edit_df = first_edit_df.append(bot_dict, ignore_index=True)
            #print(first_edit_df)

        if len(first_edit_df) % 500 == 0:
            print(len(first_edit_df), sep=" ")
    return first_edit_df

Get first edit dataframes for each language



In [7]:

    
first_edit_df_dict = {}

for language in languages:
    print(language)
    print("-----")
    first_edit_df_dict[language] = get_edit_count(language)



In [10]:

    
import pickle
with open("bot_edits_per_lang_dict.pickle", "wb") as f:
    pickle.dump(first_edit_df_dict, f)



In [12]:

    
bot_edits_dict = first_edit_df_dict



In [14]:

    
tidy_df = pd.DataFrame(columns=["bot", "language", "edit_count"])
for lang in bot_edits_dict.keys():
    tidy_df = tidy_df.append(bot_edits_dict[lang])



In [19]:

    
tidy_df = tidy_df.drop_duplicates()



In [22]:

    
tidy_df.groupby("language")['edit_count'].sum()









    Out[22]:





language
ar     10832578.0
de     17358680.0
en    104277118.0
es     19280270.0
fr     22658191.0
hu      5971308.0
ja      7660553.0
zh     11678907.0
Name: edit_count, dtype: float64



In [ ]:



In [23]:

    
with open("../comments/df_all_comments_parsed.pickle", "rb") as f:
    df_all = pickle.load(f)



In [26]:

    
df_all.groupby("language")['revisions_reverted'].sum()









    Out[26]:





language
de     188344
en    1501386
es     249560
fr     253060
ja     121652
pt     187968
zh     142870
Name: revisions_reverted, dtype: int64



In [27]:

    
1501386/104277118









    Out[27]:





0.01439803888711232



In [28]:

    
188344/17358680









    Out[28]:





0.01085013376593151



In [29]:

    
121652/7660553









    Out[29]:





0.015880315690003057

Export raw dataset to pickle



In [ ]:

    
import pickle



In [ ]:

    
with open("bot_first_edit_df_dict.pickle", "wb") as f:
    pickle.dump(first_edit_df_dict, f)

Make a tidy dataframe

One dataframe, one row for each bot in each language



In [ ]:

    
tidy_df = pd.DataFrame(columns=["bot","language","first_edit"])

for lang, lang_df in first_edit_df_dict.items():
    for index, row in lang_df.iterrows():        
        bot_dict = {"bot":row['bot'], "language":lang, "first_edit":row['first_edit']}
        tidy_df = tidy_df.append(bot_dict, ignore_index=True)



In [ ]:

    
tidy_df['first_edit_dt'] = pd.to_datetime(tidy_df['first_edit'], format="%Y-%m-%dT%H:%M:%SZ")



In [ ]:

    
tidy_df_i = tidy_df.set_index('first_edit_dt')



In [ ]:

    
tidy_df_i.to_pickle("bot_first_edit_tidy_df.pickle")



In [ ]:

    
!bzip2 -k bot_first_edit_tidy_df.pickle

Group by number of new bots per langauge per month



In [ ]:

    
gb = tidy_df_i.groupby([pd.TimeGrouper("1M"), 'language'])['bot']



In [ ]:

    
monthly_lang = gb.count().unstack()
monthly_lang[0:10]

Plot number of new bots per month per language



In [ ]:

    
sns.set(font_scale=2)



In [ ]:

    
axes = monthly_lang.plot(kind="area", figsize=[12,17], subplots=True, sharey=True)



In [ ]:

    
pal = sns.color_palette("husl", 7)

for lang, lang_df in first_edit_df_dict.items():
        
    ax = lang_df['bot'].groupby([pd.TimeGrouper("1M")]).count().cumsum().plot(figsize=[14,8], logy=False)

    ax.set_xlabel("Date")
    ax.set_ylabel("# of approved bots")
plt.suptitle("Number of bots that have ever run in various Wikipedia editions")
leg = plt.legend(first_edit_df_dict.keys())
for legobj in leg.legendHandles:
    legobj.set_linewidth(8.0)



In [ ]:

    
pal = sns.color_palette("husl", 7)

for lang, lang_df in first_edit_df_dict.items():
        
    ax = lang_df['bot'].groupby([pd.TimeGrouper("1M")]).count().cumsum().plot(figsize=[14,8], logy=True)

    ax.set_xlabel("Date")
    ax.set_ylabel("# of approved bots")
plt.suptitle("Number of bots that have ever run in various Wikipedia editions")
leg = plt.legend(first_edit_df_dict.keys())
for legobj in leg.legendHandles:
    legobj.set_linewidth(8.0)



In [ ]:

    
import matplotlib.pyplot as plt
sns.set(font_scale=1.5)

ax = first_edit_df_dict['en']['bot'].groupby([pd.TimeGrouper("1D")]).count().cumsum().plot(figsize=[14,4])
ax.set_xlabel("Date")
ax.set_ylabel("cumulative # of bots")
plt.suptitle("Total number of bots that have ever run on the English-language Wikipedia")



In [ ]:



In [ ]:



In [ ]: