In [38]:
import pandas
from matplotlib import pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')
# Load the combined GitHub/CRAN/Bioconductor dataset with the default integer
# index. `DataFrame.from_csv` has been removed from pandas; with
# `index_col=None` the equivalent call is plain `read_csv`.
data = pandas.read_csv('../data/github-cran-bioc-alldata.csv')
In [39]:
# Convert every date-like column to datetime so the rows can be compared,
# sorted and resampled by date further down.
for datefield in ['Date', 'CommitDate', 'CRANRelease',
                  'SnapshotFirstDate', 'SnapshotLastDate', 'BiocDate']:
    data[datefield] = pandas.to_datetime(data[datefield])

# Keep only rows whose `Date` is strictly before 2015-01-01.
data = data.query('Date < "2015-01-01"')

# Split the remaining rows by their `Source` value.
github = data.query('Source == "github"')
cran = data.query('Source == "cran"')
In [43]:
# One row per package: order by date, then keep the first row seen for each
# `Package` (drop_duplicates keeps the first occurrence by default).
# `DataFrame.sort` has been removed from pandas; `sort_values` replaces it.
first_github = github.sort_values('Date').drop_duplicates('Package')
first_cran = cran.sort_values('CRANRelease').drop_duplicates('Package')

# Packages that appear in both sources; columns that clash are
# suffixed with their origin.
common = first_github.merge(
    first_cran,
    how='inner',
    on='Package',
    suffixes=('_github', '_cran'),
)
In [44]:
# Row counts of: first GitHub appearances, first CRAN releases, and the
# packages present in both.
tuple(len(frame) for frame in (first_github, first_cran, common))
Out[44]:
In [57]:
# Cumulative number of packages per month, keyed on the date each package
# first shows up in its source. `resample(..., how='count')` has been removed
# from pandas; the aggregation is now called on the resampler itself.
# NOTE(review): recent pandas releases prefer the 'ME' alias for month-end
# bins; '1M' is kept here so older pandas versions keep working.
github_growth = (
    first_github.set_index('Date')
    .resample('1M')[['Package']]
    .count()
    .cumsum()
)
cran_growth = (
    first_cran.set_index('CRANRelease')
    .resample('1M')[['Package']]
    .count()
    .cumsum()
)

# Draw both curves on the same axes; the x-axis starts at 2009-01-01.
ax = github_growth.plot(figsize=(15, 6))
ax = cran_growth.plot(ax=ax, xlim=('2009-01-01', None))
# Both frames expose a 'Package' column, so label the curves explicitly.
ax.legend(['Github', 'CRAN Release'], loc='best')
ax.set_xlabel('Date')
Out[57]:
In [42]: