In [152]:
import pandas
import deps
import itertools
from collections import OrderedDict
%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')
# Workaround: when deps.py has been edited and "Run All" is hit without
# restarting the kernel, re-importing an already-loaded module does not pick up
# the edits. reload() re-executes the module; its return value is rebound to deps.
deps = reload(deps)
In [153]:
# Load the combined GitHub / CRAN / Bioconductor dataset.
# DataFrame.from_csv is deprecated (and removed in pandas 1.0); read_csv is the
# supported equivalent. from_csv's extra default, parse_dates=True, only applies
# to the index, and no index column is requested here, so it is not carried over.
data = pandas.read_csv('../data/github-cran-bioc-alldata.csv', index_col=None)
sources = ['github', 'cran', 'bioc']
# Every non-empty subset of `sources`, as lists, smallest subsets first.
combinations = [
    list(subset)
    for size in range(1, len(sources) + 1)
    for subset in itertools.combinations(sources, size)
]
Let us create the graphs for several dates: one every six months between September 2013 and January 2015.
In [154]:
# One graph per snapshot date, kept in chronological order.
snapshot_dates = pandas.date_range(start='2013-09', end='2015-01', freq='6M')
graphs = OrderedDict(
    (snapshot, deps.create_graph_for(data, snapshot))
    for snapshot in snapshot_dates
)
Let us compute which packages are installable from each source, for every combination of sources.
In [155]:
installability = OrderedDict()
packages = OrderedDict()

# (dictionary key, source list) pairs: the 'None' entry stands for "no source
# allowed" and is passed to deps.installable as a fresh empty list.
labelled_combinations = [('None', None)]
labelled_combinations += [(' '.join(combo), combo) for combo in combinations]

for date, graph in graphs.iteritems():
    # Snapshots are keyed by a 'year-month-day' string rather than the timestamp.
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    packages[date] = {}
    installability[date] = {}
    for from_source in sources:
        # Packages of this snapshot that are available from `from_source` alone.
        packages[date][from_source] = [
            pkg for pkg in graph.iterkeys()
            if deps.available(graph, pkg, [from_source])
        ]
        per_combination = {}
        for label, allowed in labelled_combinations:
            per_combination[label] = deps.installable(
                graph, from_source, [] if allowed is None else allowed)
        installability[date][from_source] = per_combination
In [156]:
# For every snapshot date, the number of packages available from each source.
available_counts = {
    snapshot: {origin: len(found) for origin, found in by_origin.iteritems()}
    for snapshot, by_origin in packages.iteritems()
}
df_packages = pandas.DataFrame.from_dict(available_counts, orient='index')
df_packages.index = pandas.to_datetime(df_packages.index)
# Chronological rows, columns in the order given by `sources`.
df_packages = df_packages.sort_index()
df_packages = df_packages[sources]

line_styles = [None, None, None, '--', '--', '--', ':']
ax = df_packages.plot(
    title=u'Number of available packages',
    style=line_styles,
    figsize=(15,6),
)
ax.legend(ncol=2, loc='upper left')
Out[156]:
In [157]:
In [158]:
# Per snapshot: how many packages are on GitHub, on CRAN, and on both.
number = OrderedDict()
for date, graph in graphs.iteritems():
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    number[date] = {'github': 0, 'cran': 0, 'both': 0}
    tally = number[date]
    for name, package in graph.iteritems():
        hosts = set(package.iterkeys())
        on_github = 'github' in hosts
        on_cran = 'cran' in hosts
        tally['github'] += int(on_github)
        tally['cran'] += int(on_cran)
        tally['both'] += int(on_github and on_cran)

df_N = pandas.DataFrame.from_dict(number, orient='index')
df_N.index = pandas.to_datetime(df_N.index)
df_N = df_N.sort_index()
# Share of each source's packages that is also present in the other source.
for origin in ('github', 'cran'):
    df_N[origin + 'P'] = 100. * df_N['both'] / df_N[origin]

# Absolute counts on the left axis ...
ax = df_N[['github', 'cran', 'both']].plot(
    title=u'Number of available packages\n',
    figsize=(8,4),
    ylim=(0,8000),
)
ax.legend(['github (left)', 'cran (left)', 'github $\cap$ cran (left)'], ncol=1, loc='best')
# ... and the overlap percentages, dashed, on a twin right axis.
ax2 = ax.twinx()
ax2 = df_N[['githubP', 'cranP']].plot(
    ax=ax2, ylim=(0,100), style=['--', '--'], legend=False, grid=False)
ax2.set_yticklabels([str(int(tick))+'%' for tick in ax2.get_yticks()])
df_N
Out[158]:
In [159]:
# Column order shared by every plot: 'None' first, then each source combination.
combination_labels = [' '.join(comb) for comb in [['None']] + combinations]

df_installability = {}
for source in sources:
    # Rows: snapshot dates; columns: number of packages of `source` that are
    # installable with each set of allowed sources.
    counts = pandas.DataFrame.from_dict(
        {
            snapshot: {label: len(found) for label, found in by_source[source].iteritems()}
            for snapshot, by_source in installability.iteritems()
        },
        orient='index')
    counts.index = pandas.to_datetime(counts.index)
    df_installability[source] = counts

    df = counts.sort_index()[combination_labels]
    ax = df.plot(
        title=u'Number of installable packages from {} using given set of sources'.format(source),
        style=['k:', 'b', 'g', 'r', 'b--', 'g--', 'r--', 'r:'],
        figsize=(15, 6),
    )
    ax.legend(ncol=2, loc='best')
In [160]:
# Focus on Github
df = df_installability['github']
df.index = pandas.to_datetime(df.index)
# Number of GitHub packages at each date; the assignment aligns on the date index.
df['number'] = df_N['github']
df = df.sort_index()[['number'] + [' '.join(comb) for comb in [['None']] + combinations]]

# The y axis below is labelled in percent and limited to 0-100, so each count is
# plotted as a share of 'number' instead of as a raw count. The shares live in a
# separate frame so that the table displayed at the end of the cell keeps the
# absolute counts.
plotted = ['None', 'github', 'cran', 'github cran']
df_percent = (100.0 * df[plotted]).div(df['number'], axis=0)
ax = df_percent.plot(title=u'Installable packages from GitHub\n',
                     style=['k--', 'b', 'g', 'r--'],
                     ylim=(0,100),
                     figsize=(8,4))
# Raw string so the LaTeX backslash is never read as a Python escape sequence.
ax.legend(['None', 'github', 'cran', r'github $\cup$ cran'], ncol=1, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])
df
Out[160]: