Nous allons étudier l'activité des dépôts R par source, c'est-à-dire en distinguant les dépôts qui hébergent un paquet en provenance de CRAN, de BioConductor ou de R-Forge et, enfin, ceux qui sont uniquement présents sur GitHub.
In [1]:
%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('svg')
from matplotlib import pyplot as plt
In [2]:
import pandas
# Load the package table. DataFrame.from_csv was deprecated and later removed
# from pandas; read_csv with index_col=0 and parse_dates=True reproduces its
# defaults (first column as index, index parsed as dates when possible).
packages = pandas.read_csv('../data/R-Packages.csv', index_col=0, parse_dates=True)
In [3]:
# Keep only the per-source columns and the two timestamps used below, then
# retain the rows flagged both `github == 1` and `canonical == 1`.
kept_columns = [
    'bioconductor', 'cran', 'rforge', 'github',
    'Github only', 'canonical', 'creation', 'last_push',
]
packages = (
    packages[kept_columns]
    .query('github == 1 and canonical == 1')
    .copy()  # detach from the original frame so later column assignments are safe
)
In [4]:
# Convert both timestamp columns to pandas datetimes.
for timestamp_column in ('creation', 'last_push'):
    packages[timestamp_column] = pandas.to_datetime(packages[timestamp_column])
Par date de création
In [5]:
# Index the repositories by their creation timestamp, in chronological order.
creations = packages.set_index('creation').sort_index()
In [6]:
# Per-source columns ordered by creation date, relabelled for the legend.
# `y` is reused further down for the monthly creation counts.
source_columns = creations[['github', 'Github only', 'cran', 'bioconductor', 'rforge']]
legend_names = {'github': 'Overall', 'Github only': 'Only on Github', 'rforge': 'R-Forge'}
y = source_columns.rename(columns=legend_names)

# Running total of each column over time, drawn on a logarithmic y axis.
cumulative_creations = y.cumsum()
ax = cumulative_creations.plot(
    figsize=(15, 6),
    style=['k--'],
    logy=True,
    title='Accumulated number of newly created repositories on Github\n',
)
ax.set_xlabel('creation date')
ax.set_ylabel('accumulated number of packages')
Out[6]:
In [16]:
# Number of repositories created per calendar month, for each source.
# The original used pandas.stats.moments.rolling_sum(..., freq='1M', how='sum',
# window=1), which has been removed from pandas. A rolling sum of width 1 over
# monthly sums is simply the monthly sum, which resample() computes directly.
# The MonthEnd offset object is used instead of the 'M' alias string, which
# recent pandas versions deprecate.
monthly_creations = y.resample(pandas.offsets.MonthEnd()).sum().loc['2012-01-01':'2014-12-31']

ax = monthly_creations.plot(
    figsize=(9, 5),
    ylim=(0, 400),
    style=['k--', "red", "blue", "cyan", "green"],
)
ax.set_xlabel('creation date')
ax.set_ylabel('number of created repositories (by month)')

# Raw strings: the labels contain backslashes (TeX commands inside $...$) that
# are not valid Python escape sequences. The resulting text is unchanged.
ax.legend(
    (
        'GitHub',
        r'GitHub \ (CRAN $\cup$ BioConductor $\cup$ R-Forge)',
        r'GitHub $\cap$ CRAN',
        r'GitHub $\cap$ BioConductor',
        r'GitHub $\cap$ R-Forge',
    ),
    bbox_to_anchor=(1.05, 1),
    loc=2,
    borderaxespad=0.,
)
plt.savefig("new_repositories_by_month.svg", bbox_inches="tight")
Par durée de vie
In [8]:
# Index the repositories by the timestamp of their last push, oldest first.
last_push = packages.set_index('last_push').sort_index()
In [9]:
# Per-source columns ordered by last-push date, relabelled for the legend.
source_columns = last_push[['github', 'Github only', 'cran', 'bioconductor', 'rforge']]
legend_names = {'github': 'Overall', 'Github only': 'Only on Github', 'rforge': 'R-Forge'}
cumulative_last_pushes = source_columns.rename(columns=legend_names).cumsum()

# Running total of each column over time, drawn on a logarithmic y axis.
ax = cumulative_last_pushes.plot(
    figsize=(15, 6),
    style=['k--'],
    logy=True,
    title='Accumulated number of inactive repositories on Github\n',
)
ax.set_xlabel('last PushEvent date')
ax.set_ylabel('accumulated number of repositories')
Out[9]:
Par (avant-dernière) activité
In [10]:
def filtered_count(df, category, date):
    """Sum a category column over the rows considered active at ``date``.

    A row is kept when its ``creation`` value is on or before ``date`` and its
    ``last_push`` value is no earlier than three months before ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``creation`` and ``last_push`` columns comparable with
        ``date``, plus the ``category`` column.
    category : str
        Name of the column to sum.
    date : datetime-like
        Reference date; must support subtraction of a ``pandas.DateOffset``.

    Returns
    -------
    The sum of ``df[category]`` over the selected rows (0 when none match).
    """
    window_start = date - pandas.DateOffset(months=3)
    created_by_date = df['creation'] <= date
    pushed_recently = df['last_push'] >= window_start
    # Combine both masks in a single selection. Chaining df[mask1][mask2]
    # applies a full-length mask to an already filtered frame, which relies on
    # implicit reindexing and breaks when the index holds duplicate labels.
    return df.loc[created_by_date & pushed_recently, category].sum()
In [11]:
# Work on a copy of `creations` that exposes the creation timestamp (its
# index) as a regular 'creation' column, which filtered_count() reads.
active = creations.copy()
active['creation'] = active.index

# One row per month end: 84 consecutive months starting with January 2008.
# A MonthEnd offset object replaces the 'M' alias string, which recent pandas
# versions deprecate. date_range replaces bdate_range because a month-end
# frequency does not involve business days; the resulting index is the same.
month_ends = pandas.date_range('2008-1-1', periods=84, freq=pandas.offsets.MonthEnd())
temp = pandas.DataFrame(index=month_ends)

for date in temp.index:
    for cat in ['github', 'rforge', 'bioconductor', 'cran', 'Github only']:
        # Value of filtered_count() for this category at this month end.
        temp.at[date, cat + ' active'] = filtered_count(active, cat, date)
In [12]:
# Plot the monthly per-category counts over the selected period.
plotted_period = temp['2008-1-1':'2014-10-01']
ax = plotted_period.plot(
    figsize=(15, 6),
    style=['k--'],
    title='Number of active repositories on Github\n',
)
ax.set_xlabel('date')
ax.set_ylabel('number of repositories')
Out[12]: