By Stuart Geiger, Berkeley Institute for Data Science
(C) 2016, Released under The MIT license.
This data is collected and aggregated by Erik Zachte, and is available for the English Wikipedia at https://stats.wikimedia.org/EN/TablesWikipediaEN.htm. I have just copied that data from the HTML tables into a CSV (that step is not done in this notebook), then imported it into Pandas dataframes and plotted it with matplotlib.
In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import numpy as np
%matplotlib inline
matplotlib.style.use('ggplot')
In [2]:
# English Wikipedia statistics tables compiled by Erik Zachte:
# https://stats.wikimedia.org/EN/TablesWikipediaEN.htm
# The local copy is read as a tab-separated file.
counts = pd.read_csv("edit_counts.tsv", delimiter="\t")
In [3]:
# Retention data gathered by Stuart Geiger: a random sample of 50000 users
# who registered and made a userpage, with how many edits each of them made
# 1 to 2 years after their registration date.
retention = pd.read_csv("retention.tsv", delimiter="\t")
# reg_date is stored in YYYYMMDDHHMMSS form; parse it with an explicit format.
retention["reg_date"] = pd.to_datetime(retention["reg_date"], format="%Y%m%d%H%M%S")
In [4]:
def survived(edits):
    """Return 1 when the edit count is strictly positive, otherwise 0."""
    return 1 if edits > 0 else 0
In [5]:
# Flag every sampled user as 1 (made edits in the edits_1yr window) or 0 (made none).
retention["survival"] = retention["edits_1yr"].apply(survived)
In [6]:
# Peek at the first ten rows of the retention sample
retention.head(10)
Out[6]:
In [7]:
# Convert dates to datetimes.
# `infer_datetime_format` is intentionally not passed: it was only a
# parsing-speed hint, and pandas deprecated it in 2.0, where format inference
# became the default behaviour.
counts["date"] = pd.to_datetime(counts["date"])
In [8]:
# Look at the first ten rows of the dataset
counts.head(10)
Out[8]:
Some of the columns use 'k' for thousands and 'M' for millions, so we need to convert them.
In [9]:
def units_convert(s):
    """
    Convert a table cell to a float, expanding a trailing 'k' to times 1,000
    and a trailing 'M' to times 1,000,000.

    Based on the solution from
    http://stackoverflow.com/questions/14218728/converting-string-of-numbers-and-letters-to-int-float-in-pandas-dataframe

    Parameters
    ----------
    s : str or number
        Cell value such as "12.5k", "3M" or "42". Values that are already
        numeric (including NaN) are passed straight through float().

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the text (after removing a unit suffix) is not a valid number.
    """
    powers = {'k': 1000, 'M': 10 ** 6}
    # Cells that pandas already parsed as numbers (or NaN for missing values)
    # cannot be indexed with [-1]; they carry no suffix, so convert directly.
    if not isinstance(s, str):
        return float(s)
    text = s.strip()
    # Guard against the empty string before looking at the last character.
    if text and text[-1] in powers:
        return float(text[:-1]) * powers[text[-1]]
    return float(text)
In [10]:
# Build numeric *_float columns from the two columns that carry 'k' / 'M' unit suffixes
counts["edits_float"] = counts["edits"].apply(units_convert)
counts["article_count_float"] = counts["article count"].apply(units_convert)
In [11]:
# Make sure we've got data types figured out
# (the *_float columns are filled from units_convert, which returns floats)
counts.dtypes
Out[11]:
In [12]:
# 'date' deliberately stays a regular column: the plotting cells pass x='date'.
# (The earlier `counts.set_index(['date'])` call threw away its result, so it
# never changed the index; it has been removed rather than made effective.)

# Calculate some ratios
# Highly active (>100 edits) editors per new account
counts['highly_active_to_newcomer_ratio'] = counts['>100 edits'] / counts['new accts']
# Active (>5 edits) editors per new account
counts['active_to_newcomer_ratio'] = counts['>5 edits'] / counts['new accts']
# Highly active editors relative to active editors who are NOT highly active
counts['highly_active_to_active_ratio'] = counts['>100 edits'] / (counts['>5 edits'] - counts['>100 edits'])
In [13]:
import datetime
def dt_to_yearmofirst(dt):
    """
    Map a registration datetime to the first day of the same month, one year later.

    The year is shifted by one because we are looking at whether people who
    registered were still editing 1 year later.
    """
    return datetime.datetime(dt.year + 1, dt.month, 1)
In [ ]:
In [14]:
# Bucket each user by the first of their registration month, shifted one year ahead
retention["reg_mo_first"] = retention["reg_date"].apply(dt_to_yearmofirst)
retention.head(10)
Out[14]:
In [15]:
# Mean of the 0/1 survival flag per reg_mo_first bucket, i.e. the share of
# sampled users in each bucket who made edits in the edits_1yr window.
retention_group = retention.groupby(["reg_mo_first"])
# The string "mean" selects pandas' own groupby mean. Passing np.mean is
# deprecated in recent pandas (it emits a FutureWarning) even though it is
# currently mapped to the same routine.
monthly_averages = retention_group.aggregate({"survival": "mean"})
In [16]:
def add_year(dt):
    """
    Return a datetime one calendar year after ``dt``, at midnight.

    Only the year, month and day of ``dt`` are used; any time-of-day component
    is dropped. February 29 is mapped to February 28 of the following year,
    because the year right after a leap year is never a leap year and
    constructing Feb 29 there would raise ValueError.
    """
    day = dt.day
    if dt.month == 2 and day == 29:
        # Leap day has no counterpart next year; clamp to the last day of February.
        day = 28
    return datetime.datetime(dt.year + 1, dt.month, day)
In [ ]:
Filter the monthly averages to a fixed date window, because there is missing data before 2003 and counts after July 2014 aren't accurate.
In [17]:
# Keep only the buckets strictly after 2004-01-01 and strictly before 2014-09-01.
# NOTE(review): the index is registration month + 1 year, so these bounds are
# on the shifted dates — confirm they match the intended cut-offs.
monthly_avg = monthly_averages[
    (monthly_averages.index > datetime.datetime(2004, 1, 1))
    & (monthly_averages.index < datetime.datetime(2014, 9, 1))
]
In [18]:
# Quick default plot of the filtered monthly survival averages
monthly_avg.plot()
Out[18]:
In [19]:
# Global look for the figures: 'bmh' style sheet and a larger font
matplotlib.style.use(['bmh'])
font = {'weight': 'regular',
        'size': 16}
matplotlib.rc('font', **font)

# Two columns are plotted here, so pandas needs one label per column (a lone
# string with a list `y` raises "label should be list-like and same length as
# y" in current pandas) and one colour per line so the two can be told apart.
ax1 = counts.plot(x='date', y=['>5 edits', '>100 edits'], figsize=(12, 4),
                  label=["Users making >5 edits in a month",
                         "Users making >100 edits in a month"],
                  color=['r', 'g'])
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users")

# Highly active editors on their own figure
ax2 = counts.plot(x='date', y='>100 edits', figsize=(12, 4),
                  label="Users making >100 edits in a month", color="g")
ax2.set_xlabel("Year")
ax2.set_ylabel("Number of editors")

# New accounts on their own figure
ax3 = counts.plot(x='date', y='new accts', figsize=(12, 4),
                  label="New users making >10 edits in a month", color="b")
ax3.set_xlabel("Year")
ax3.set_ylabel("Number of editors")
# Plain numbers on the y axis instead of the default offset/scientific notation
ax3.yaxis.set_major_formatter(ScalarFormatter())
In [20]:
# Global look for the figure: 'bmh' style sheet and a larger font
matplotlib.style.use(['bmh'])
font = {'weight': 'regular',
        'size': 16}
matplotlib.rc('font', **font)

# Two columns are plotted, so pandas needs one label per column (a lone string
# with a list `y` raises "label should be list-like and same length as y" in
# current pandas). Colours are spelled out per line: blue, then green.
ax1 = counts.plot(x='date', y=['>5 edits', '>100 edits'], figsize=(12, 7),
                  label=["Users making >5 edits in a month",
                         "Users making >100 edits in a month"],
                  color=['b', 'g'])
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users")
ax1.set_ylim(0, 70000)

# Overlay the 1-year newcomer survival rate against a second y axis
ax2 = ax1.twinx()
ax2 = monthly_avg.plot(ax=ax2, secondary_y=True, color="r")
# NOTE(review): 372 and 519 are raw x-axis units; presumably pandas' monthly
# period ordinals (months since 1970) — confirm against the rendered axis.
ax1.set_xlim(372, 519)

# Place both legends above the plot area, side by side
ax1.legend(loc='upper center', bbox_to_anchor=(0.35, 1.15),
           ncol=2, fancybox=True, shadow=True)
ax2.legend(loc='upper center', bbox_to_anchor=(0.75, 1.15),
           ncol=1, fancybox=True, shadow=True)
ax2.set_ylabel("1 year newcomer survival")
Out[20]:
In [21]:
# Global look for the figure: 'bmh' style sheet and a larger font
matplotlib.style.use(['bmh'])
font = {'weight': 'regular',
        'size': 16}
matplotlib.rc('font', **font)

# Two columns are plotted, so pandas needs one label per column (a lone string
# with a list `y` raises "label should be list-like and same length as y" in
# current pandas). Colours are spelled out per line: blue, then green.
ax1 = counts.plot(x='date', y=['>5 edits', '>100 edits'], figsize=(12, 7),
                  label=["Users making >5 edits in a month",
                         "Users making >100 edits in a month"],
                  color=['b', 'g'])
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users making >x edits/month")
ax1.set_ylim(0, 60000)
# NOTE(review): 372 and 529 are raw x-axis units; presumably pandas' monthly
# period ordinals (months since 1970) — confirm against the rendered axis.
ax1.set_xlim(372, 529)
# Legend above the plot area
ax1.legend(loc='upper center', bbox_to_anchor=(0.35, 1.15),
           ncol=2, fancybox=True, shadow=True)
Out[21]:
In [23]:
# Active editors, highly active editors and new accounts on one linear-scale figure.
# Three columns are plotted, so pandas needs one label per column (a lone string
# with a list `y` raises "label should be list-like and same length as y" in
# current pandas).
ax1 = counts.plot(x='date', y=['>5 edits', '>100 edits', 'new accts'], figsize=(12, 4),
                  label=["Users making >5 edits in a month",
                         "Users making >100 edits in a month",
                         "New users making >10 edits in a month"],
                  color=['r', 'g', 'b'])
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users")
# Plain numbers on the y axis instead of the default offset/scientific notation
ax1.yaxis.set_major_formatter(ScalarFormatter())
In [27]:
# Same three series as the linear-scale figure, on a logarithmic y axis.
# Three columns are plotted, so pandas needs one label per column (a lone string
# with a list `y` raises "label should be list-like and same length as y" in
# current pandas).
ax1 = counts.plot(x='date', y=['>5 edits', '>100 edits', 'new accts'], figsize=(12, 4),
                  label=["Users making >5 edits in a month",
                         "Users making >100 edits in a month",
                         "New users making >10 edits in a month"],
                  logy=True, color=['r', 'g', 'b'])
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users")
# Show tick values as plain numbers rather than powers of ten
ax1.yaxis.set_major_formatter(ScalarFormatter())
In [28]:
# Highly active (>100 edits) editors relative to the remaining active editors, over time
ax3 = counts.plot.line(
    x='date',
    y='highly_active_to_active_ratio',
    label="Highly active users to active users ratio",
    color="k",
    figsize=(12, 4),
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Ratio")
Out[28]:
In [29]:
# Highly active (>100 edits) editors per new account, over time
ax3 = counts.plot.line(
    x='date',
    y='highly_active_to_newcomer_ratio',
    label="Highly active users to newcomers ratio",
    color="k",
    figsize=(12, 4),
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Ratio")
Out[29]:
In [30]:
# Active (>5 edits) editors per new account, over time
ax3 = counts.plot.line(
    x='date',
    y='active_to_newcomer_ratio',
    label="Active users to newcomers ratio",
    color="k",
    figsize=(12, 4),
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Ratio")
Out[30]:
In [31]:
# Total edits per month (the unit-converted edits column), over time
ax3 = counts.plot(x='date', y='edits_float', figsize=(12, 4),
                  label="Number of edits per month", color="k")
ax3.set_xlabel("Year")
# This series counts edits, not editors, so the axis is labelled accordingly.
ax3.set_ylabel("Number of edits")
Out[31]:
In [ ]:
In [32]:
# The 'new per day' column (new articles written per day), over time
ax3 = counts.plot.line(
    x='date',
    y='new per day',
    label="New articles written per day",
    color="k",
    figsize=(12, 4),
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Number of articles")
Out[32]:
In [33]:
# Article count (unit-converted) over time, linear y axis
ax3 = counts.plot.line(
    x='date',
    y='article_count_float',
    label="Number of articles",
    color="k",
    figsize=(12, 4),
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Number of articles")
Out[33]:
In [34]:
# Article count (unit-converted) over time, logarithmic y axis
ax3 = counts.plot.line(
    x='date',
    y='article_count_float',
    label="Number of articles",
    color="k",
    logy=True,
    figsize=(12, 4),
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Number of articles")
Out[34]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: