By Stuart Geiger, Berkeley Institute for Data Science
(C) 2016, Released under The MIT license.
This data is collected and aggregated by Erik Zachte, and is published in the Wikistats tables for the English Wikipedia (https://stats.wikimedia.org/EN/TablesWikipediaEN.htm). I have just copied that data from the HTML tables into a tab-separated file (that step is not done in this notebook), then imported it into a Pandas dataframe and plotted it with matplotlib.
In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
%matplotlib inline
matplotlib.style.use('ggplot')
In [2]:
# Source tables: Erik Zachte, https://stats.wikimedia.org/EN/TablesWikipediaEN.htm
# The local copy is tab-separated, one row per record.
DATA_FILE = "edit_counts.tsv"
counts = pd.read_csv(DATA_FILE, delimiter="\t")
In [3]:
# Convert the 'date' column from strings to datetimes.
# The infer_datetime_format flag is omitted: it is deprecated since pandas 2.0,
# where format inference is the default, and on older versions it only
# affected parsing speed.
counts['date'] = pd.to_datetime(counts['date'])
In [4]:
# Show the first ten rows as a quick sanity check of the import
counts.head(10)
Out[4]:
Some of the columns use 'k' for thousands and 'M' for millions, so we need to convert them.
In [5]:
def units_convert(s):
    """
    Convert a cell that may carry a 'k' or 'M' suffix into a float.

    'k' multiplies the number by 1,000 and 'M' by 1,000,000; a value without
    a suffix is converted with float() unchanged.

    Adapted from
    http://stackoverflow.com/questions/14218728/converting-string-of-numbers-and-letters-to-int-float-in-pandas-dataframe

    Parameters
    ----------
    s : str or number
        Cell value such as "3.2k", "1.5M", "42", or an already-numeric value
        (including NaN).

    Returns
    -------
    float
        The numeric value; NaN for an empty or whitespace-only string.

    Raises
    ------
    ValueError
        If the numeric part of a string cannot be parsed as a float.
    """
    powers = {'k': 1000, 'M': 10 ** 6}

    # Values that are already numeric (including NaN) have no suffix to strip,
    # and indexing them with [-1] would raise.
    if not isinstance(s, str):
        return float(s)

    text = s.strip()
    if not text:
        # Nothing to parse; treat it as a missing value
        return float('nan')

    suffix = text[-1]
    if suffix in powers:
        return float(text[:-1]) * powers[suffix]
    return float(text)
In [6]:
# Run units_convert over each column that uses 'k' / 'M' suffixes and keep
# the numeric result in a new *_float column next to the original strings.
float_columns = {
    'edits_float': 'edits',
    'article_count_float': 'article count',
}
for target_column, source_column in float_columns.items():
    counts[target_column] = counts[source_column].apply(units_convert)
In [7]:
# Make sure we've got data types figured out: list the dtype of every column,
# including the *_float columns created from the 'k'/'M' strings
counts.dtypes
Out[7]:
In [38]:
# 'date' stays a regular column (it is not made the index) because the
# plotting cells select it with x='date'.

# Calculate some ratios
# Highly active (>100 edits) users per new account
counts['highly_active_to_newcomer_ratio']=counts['>100 edits']/counts['new accts']
# Active (>5 edits) users per new account
counts['active_to_newcomer_ratio']=counts['>5 edits']/counts['new accts']
# Highly active users relative to the remaining active users
# (the '>5 edits' count minus the '>100 edits' count)
counts['highly_active_to_active_ratio']=counts['>100 edits']/(counts['>5 edits']-counts['>100 edits'])
In [10]:
# Shared look for the figures: 'bmh' style sheet with a larger font
matplotlib.style.use(['bmh'])
font = dict(weight='regular', size=16)
matplotlib.rc('font', **font)

# One figure per series: (column, legend label, line colour, y-axis label)
panels = [
    ('>5 edits', "Users making >5 edits in a month", "r", "Number of users"),
    ('>100 edits', "Users making >100 edits in a month", "g", "Number of editors"),
    ('new accts', "New users making >10 edits in a month", "b", "Number of editors"),
]

panel_axes = []
for column, legend_label, colour, y_label in panels:
    panel_ax = counts.plot(x='date', y=column, figsize=(12,4),
                           label=legend_label, color=colour)
    panel_ax.set_xlabel("Year")
    panel_ax.set_ylabel(y_label)
    panel_axes.append(panel_ax)

ax1, ax2, ax3 = panel_axes
# Only the last figure gets an explicit ScalarFormatter on its y axis
ax3.yaxis.set_major_formatter(ScalarFormatter())
In [36]:
# All three series on one log-scale axis, zoomed in on 2001-2005.
# With several y columns, pandas needs one label per column; a single string
# raises "label should be list-like and same length as y".
ax1 = counts.plot(x='date',y=['>5 edits','>100 edits','new accts'], figsize=(12,4),
                  label=["Users making >5 edits in a month",
                         "Users making >100 edits in a month",
                         "New users making >10 edits in a month"],
                  color=['r','g','b'],logy=True)
ax1.set_xlim("2001-01-01","2005-01-01")
# A log axis cannot start at 0 (matplotlib ignores a non-positive limit),
# so the lower bound is 1.
ax1.set_ylim(1,10000)
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users")
# Plain numbers instead of powers of ten on the log axis
ax1.yaxis.set_major_formatter(ScalarFormatter())
In [ ]:
# All three series on one linear axis, with fixed red/green/blue colours.
# With several y columns, pandas needs one label per column; a single string
# raises "label should be list-like and same length as y".
ax1 = counts.plot(x='date',y=['>5 edits','>100 edits','new accts'], figsize=(12,4),
                  label=["Users making >5 edits in a month",
                         "Users making >100 edits in a month",
                         "New users making >10 edits in a month"],
                  color=['r','g','b'])
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users")
ax1.yaxis.set_major_formatter(ScalarFormatter())
In [12]:
# All three series on one linear axis, using the style sheet's default colours.
# With several y columns, pandas needs one label per column; a single string
# raises "label should be list-like and same length as y".
ax1 = counts.plot(x='date',y=['>5 edits','>100 edits','new accts'], figsize=(12,4),
                  label=["Users making >5 edits in a month",
                         "Users making >100 edits in a month",
                         "New users making >10 edits in a month"])
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users")
ax1.yaxis.set_major_formatter(ScalarFormatter())
In [13]:
# Shared look for the figures: 'bmh' style sheet with a larger font
matplotlib.style.use(['bmh'])
font = dict(weight='regular', size=16)
matplotlib.rc('font', **font)

# Options common to the three log-scale figures in this cell
log_panel_opts = dict(x='date', figsize=(12,4), logy=True)

# Figure 1: users with >5 edits (default line colour), legend moved in figure coordinates
ax1 = counts.plot(y='>5 edits', label="Users making >5 edits in a month", **log_panel_opts)
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users")
ax1.yaxis.set_major_formatter(ScalarFormatter())
legend_placement = dict(bbox_to_anchor=(.9, .3),
                        bbox_transform=plt.gcf().transFigure)
plt.legend(**legend_placement)

# Figure 2: users with >100 edits
ax2 = counts.plot(y='>100 edits', label="Users making >100 edits in a month",
                  color="g", **log_panel_opts)
ax2.set_xlabel("Year")
ax2.set_ylabel("Number of editors")
ax2.yaxis.set_major_formatter(ScalarFormatter())

# Figure 3: new accounts
ax3 = counts.plot(y='new accts', label="New users making >10 edits in a month",
                  color="r", **log_panel_opts)
ax3.set_xlabel("Year")
ax3.set_ylabel("Number of editors")
ax3.yaxis.set_major_formatter(ScalarFormatter())
In [14]:
# All three series on one log-scale axis, with fixed red/green/blue colours.
# With several y columns, pandas needs one label per column; a single string
# raises "label should be list-like and same length as y".
ax1 = counts.plot(x='date',y=['>5 edits','>100 edits','new accts'], figsize=(12,4),
                  label=["Users making >5 edits in a month",
                         "Users making >100 edits in a month",
                         "New users making >10 edits in a month"],
                  logy=True, color=['r','g','b'])
ax1.set_xlabel("Year")
ax1.set_ylabel("Number of users")
# Plain numbers instead of powers of ten on the log axis
ax1.yaxis.set_major_formatter(ScalarFormatter())
In [15]:
# Black line: the highly_active_to_active_ratio column against date
ratio_plot_opts = dict(
    x='date',
    y='highly_active_to_active_ratio',
    figsize=(12,4),
    label="Highly active users to active users ratio",
    color="k",
)
ax3 = counts.plot(**ratio_plot_opts)
ax3.set_xlabel("Year")
ax3.set_ylabel("Ratio")
Out[15]:
In [16]:
# Black line: the highly_active_to_newcomer_ratio column against date
ratio_column = 'highly_active_to_newcomer_ratio'
ratio_legend = "Highly active users to newcomers ratio"
ax3 = counts.plot(
    x='date', y=ratio_column,
    figsize=(12,4),
    label=ratio_legend, color="k",
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Ratio")
Out[16]:
In [17]:
# Black line: the active_to_newcomer_ratio column against date
figure_size = (12,4)
ax3 = counts.plot(
    x='date',
    y='active_to_newcomer_ratio',
    figsize=figure_size,
    label="Active users to newcomers ratio",
    color="k",
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Ratio")
Out[17]:
In [18]:
# Monthly edit volume, using the numeric edits_float column
ax3 = counts.plot(x='date',y='edits_float', figsize=(12,4),
                  label="Number of edits per month",color="k")
ax3.set_xlabel("Year")
# The series counts edits, not editors, so the axis is labelled accordingly
ax3.set_ylabel("Number of edits")
Out[18]:
In [ ]:
In [19]:
# Black line: the 'new per day' column against date
new_articles_opts = {
    'x': 'date',
    'y': 'new per day',
    'figsize': (12,4),
    'label': "New articles written per day",
    'color': "k",
}
ax3 = counts.plot(**new_articles_opts)
ax3.set_xlabel("Year")
ax3.set_ylabel("Number of articles")
Out[19]:
In [20]:
# Black line: the numeric article_count_float column against date (linear y axis)
article_series = 'article_count_float'
ax3 = counts.plot(
    x='date', y=article_series,
    figsize=(12,4),
    label="Number of articles", color="k",
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Number of articles")
Out[20]:
In [21]:
# Same article_count_float series as a black line, on a logarithmic y axis
article_log_opts = dict(figsize=(12,4), color="k", logy=True)
ax3 = counts.plot(
    x='date',
    y='article_count_float',
    label="Number of articles",
    **article_log_opts
)
ax3.set_xlabel("Year")
ax3.set_ylabel("Number of articles")
Out[21]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: