In [1]:
import pandas
import deps
import itertools
from matplotlib import pyplot as plt
from collections import OrderedDict
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf')
# Workaround if changes are made to deps.py and "Run All" is hit
deps = reload(deps)
data = pandas.DataFrame.from_csv('../data/github-cran-bioc-alldata.csv', index_col=None)
sources = ['github', 'cran']#, 'bioc']
In [2]:
# One dependency graph per month-end date of the studied period, stored in
# chronological order and keyed by the pandas timestamp of the snapshot.
snapshot_dates = pandas.date_range(start='2013-09', end='2015-01', freq='1M')
graphs = OrderedDict(
    (snapshot, deps.create_graph_for(data, snapshot)) for snapshot in snapshot_dates
)
In [3]:
# For every snapshot, tally the packages that have a 'github' entry, a 'cran'
# entry, and both entries at once.
number = OrderedDict()
for snapshot, graph in graphs.iteritems():
    counts = {'github': 0, 'cran': 0, 'both': 0}
    for _, package in graph.iteritems():
        on_github = 'github' in package
        on_cran = 'cran' in package
        counts['github'] += int(on_github)
        counts['cran'] += int(on_cran)
        counts['both'] += int(on_github and on_cran)
    number['{}-{}-{}'.format(snapshot.year, snapshot.month, snapshot.day)] = counts

# One row per snapshot, ordered by date.
df_N = pandas.DataFrame.from_dict(number, orient='index')
df_N.index = pandas.to_datetime(df_N.index)
df_N = df_N.sort_index()

# Share of each source's packages that is also present in the other source.
for percent_column, base_column in (('githubP', 'github'), ('cranP', 'cran')):
    df_N[percent_column] = 100. * df_N['both'] / df_N[base_column]

# Absolute counts on the left axis, percentages (dashed) on a twin right axis.
ax = df_N[['github', 'cran', 'both']].plot(
    title=u'Number of available packages\n',
    figsize=(8,4),
    ylim=(0,9000))
ax.legend(['github', 'cran', 'github $\cap$ cran'], title='on left y-axis', ncol=1, loc='best')

ax2 = df_N[['githubP', 'cranP']].plot(
    ax=ax.twinx(),
    ylim=(0,100),
    style=['--', '--'],
    legend=False,
    grid=False)
ax2.legend(['github', 'cran'], title='on right y-axis', ncol=2, loc='best')
percent_labels = ['{}%'.format(int(tick)) for tick in ax2.get_yticks()]
ax2.set_yticklabels(percent_labels)

df_N
Out[3]:
In [4]:
# For every snapshot, collect the CRAN packages that packages of each source
# depend on (ignoring dependencies listed in deps.R_packages), then derive:
#   * cran_share:    that set's size as a percentage of all CRAN packages;
#   * intersections: raw sizes of the Venn regions formed by the per-source sets.
# The accumulator is called `cran_share` rather than `data` so that the DataFrame
# loaded at the top of the notebook (read again when `graphs` is rebuilt) is not
# rebound to a dict by this cell.
cran_share = {}
intersections = {}
for snapshot, graph in graphs.iteritems():
    label = '{}-{}-{}'.format(snapshot.year, snapshot.month, snapshot.day)
    needed = {source: set() for source in sources}
    n_cran = 0  # number of packages available on CRAN at this snapshot
    for name, package in graph.iteritems():
        if 'cran' in package:
            n_cran += 1
        for source in package.iterkeys():
            if source not in sources:
                continue
            for dependency in package[source]['Dependencies']:
                if dependency in deps.R_packages:
                    continue
                # Only dependencies that are themselves available on CRAN count.
                if dependency in graph and 'cran' in graph[dependency]:
                    needed[source].add(dependency)

    github = needed['github']
    cran = needed['cran']
    # Empty unless 'bioc' is re-enabled in `sources`.
    bioc = needed.get('bioc', set())
    intersections[label] = OrderedDict([
        ('overall github', len(github)),
        ('overall cran', len(cran)),
        ('github', len(github - cran - bioc)),
        ('cran', len(cran - github - bioc)),
        ('bioc', len(bioc - github - cran)),
        ('github $\cap$ cran', len((github & cran) - bioc)),
        ('github $\cap$ bioc', len((github & bioc) - cran)),
        ('cran $\cap$ bioc', len((cran & bioc) - github)),
        ('github $\cap$ cran $\cap$ bioc', len(github & cran & bioc)),
    ])

    # A snapshot without any CRAN package yields NaN instead of a ZeroDivisionError.
    cran_share[label] = OrderedDict(
        (source, (len(needed[source]) * 100.0 / n_cran) if n_cran else float('nan'))
        for source in sources
    )

df = pandas.DataFrame.from_dict(cran_share, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()[['github', 'cran']]
ax = df.plot(title=u'Proportion of CRAN packages needed for packages from given source\n',
             ylim=(10,30),
             figsize=(8, 4))
ax.legend(ncol=2, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])
df
Out[4]:
In [5]:
# Plot the sizes of the Venn regions computed in `intersections`, then append
# each size expressed as a percentage of the number of CRAN packages (df_N['cran']).
venn_columns = ['github', 'overall github', 'cran', 'overall cran', 'github $\cap$ cran']

df = pandas.DataFrame.from_dict(intersections, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()[venn_columns]

ax = df.plot(
    title=u'Size of the Venn sets containing CRAN packages needed by given set of sources\n',
    style=['b', 'b--', 'g', 'g--', 'r'],
    ylim=(0,2100),
    figsize=(8, 4))
ax.legend(
    ['only github', 'at least github', 'only cran', 'at least cran', 'both'],
    title='Required by',
    ncol=3,
    loc='best')

# Percent columns are added after plotting so that they do not appear on the figure.
for column in venn_columns:
    df[column+'%'] = 100.0 * df[column] / df_N['cran']

df
Out[5]:
In [6]:
# Disabled experiment: everything between the triple quotes is a plain string
# literal, so none of it runs. When enabled, the code would
#   1. enumerate every subset of `sources` into the list `combinations`;
#   2. record, per snapshot and per CRAN package, the set of sources whose
#      packages depend on it, and replace that set by its index in `combinations`;
#   3. draw the resulting package x date table as a heatmap with labelled colorbar.
# NOTE(review): the colorbar label positions use (2 * j + 1) / 16.0, which matches
# eight combinations (three sources); with a two-element `sources` list only four
# combinations are generated -- TODO confirm before re-enabling.
# NOTE(review): the string contains its own `import numpy as np`; if the code is
# revived, that import belongs with the imports at the top of the notebook.
"""
data = OrderedDict()
combinations = [set()]
for n in range(len(sources)):
combinations += [set(x) for x in itertools.combinations(sources, n+1)]
# Gather all packages from CRAN
cran = set()
for date, graph in graphs.iteritems():
for name, package in graph.iteritems():
if 'cran' in package:
cran.add(name)
for date, graph in graphs.iteritems():
date = '{}-{}-{}'.format(date.year, date.month, date.day)
data[date] = {k: set()for k in cran}
for name, package in graph.iteritems():
for source in package.iterkeys():
dependencies = filter(lambda p: p not in deps.R_packages, package[source]['Dependencies'])
# Filter dependencies to CRAN
dependencies = filter(lambda p: p in graph and 'cran' in graph[p], dependencies)
# Every dependency is required by current 'name' from 'source'
for dependency in dependencies:
data[date][dependency].add(source)
# Compute the index in "combinations" for every CRAN packages
for name, source in data[date].iteritems():
data[date][name] = combinations.index(source)
df = pandas.DataFrame.from_dict(data, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()
df = df.T
df = df.sort(columns=df.columns[-1])
import numpy as np
column_labels = df.columns
row_labels = df.index
fig, ax = plt.subplots(figsize=(20,15))
heatmap = ax.pcolor(df, cmap=plt.cm.Spectral)
cbar = plt.colorbar(heatmap)
cbar.ax.get_yaxis().set_ticks([])
for j, lab in enumerate(combinations):
cbar.ax.text(.5, (2 * j + 1) / 16.0, lab, ha='center', va='center')
cbar.ax.get_yaxis().labelpad = 30
plt.show()
"""
# Bare Python 2 print statement: writes an empty line. Because it is the last
# statement, the string above is not the cell's final expression -- presumably
# this is here to keep the string from being echoed as the cell output.
print
In [7]:
# For every snapshot and source, among the packages that have at least one
# dependency outside deps.R_packages, compute the percentage having at least one
# dependency that is not available from that same source.
# The accumulator is called `external_share` to avoid rebinding `data`, which the
# cell building `graphs` reads as the input DataFrame.
external_share = {}
for snapshot, graph in graphs.iteritems():
    label = '{}-{}-{}'.format(snapshot.year, snapshot.month, snapshot.day)
    # 'n': packages with at least one dependency; 'e': those with an external one.
    value = {k: {'n': 0, 'e': 0} for k in sources + ['bioc']}
    for name, package in graph.iteritems():
        for source in package.iterkeys():
            dependencies = [p for p in package[source]['Dependencies']
                            if p not in deps.R_packages]
            if not dependencies:
                continue
            value[source]['n'] += 1
            # A single missing dependency is enough; any() stops at the first hit.
            if any(dep not in graph or source not in graph[dep] for dep in dependencies):
                value[source]['e'] += 1

    row = OrderedDict()
    for source in sources:
        total = value[source]['n']
        # No package with dependencies for this source: NaN, not ZeroDivisionError.
        row[source] = (value[source]['e'] * 100.0 / total) if total else float('nan')
    external_share[label] = row

df = pandas.DataFrame.from_dict(external_share, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()
ax = df[['github', 'cran']].plot(title=u'Proportion of packages in given source with at least one external dependency, \nconsidering packages with at least one dependency\n',
                                 ylim=(0,100),
                                 figsize=(8, 4))
ax.legend(ncol=3, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])
df
Out[7]:
In [8]:
# Per-snapshot text report on the dependencies of GitHub-hosted packages.
# Printed fields, in order: snapshot date, number of packages with a 'github'
# entry, total count of their dependencies (excluding deps.R_packages), count of
# dependencies not available on GitHub, count of dependencies available on CRAN,
# and the ratio of the last two as a percentage.
# NOTE(review): indentation was lost in this export, so it is unclear whether the
# CRAN check is nested under the "not on GitHub" check (CRAN-hosted *unsatisfied*
# dependencies) or stands beside it (all CRAN-hosted dependencies) -- TODO confirm
# against the original notebook before refactoring.
for date, graph in graphs.iteritems():
# (name, github-entry) pairs for every package that has a 'github' entry.
github = [(name, package['github']) for name, package in graph.iteritems() if 'github' in package]
unsatisfied_deps = 0
cran_deps = 0
github_deps = 0
for name, package in github:
# Dependencies listed in deps.R_packages are ignored.
dependencies = filter(lambda p: p not in deps.R_packages, package['Dependencies'])
if len(dependencies) > 0:
for dep in dependencies:
github_deps += 1
# The dependency is unknown to the graph or has no 'github' entry.
if dep not in graph or 'github' not in graph[dep]:
unsatisfied_deps += 1
# The dependency has a 'cran' entry in the graph.
if dep in graph and 'cran' in graph[dep]:
cran_deps += 1
# NOTE(review): raises ZeroDivisionError when unsatisfied_deps is still 0 for a snapshot.
print date, len(github), github_deps, unsatisfied_deps, cran_deps, 100.0 * cran_deps / unsatisfied_deps
In [9]:
# For every snapshot and source, compute the percentage of packages without any
# dependency outside deps.R_packages ('0 on <source>') and with at least one
# ('>0 on <source>').
# The accumulator is called `no_dependency_share` to avoid rebinding `data`, which
# the cell building `graphs` reads as the input DataFrame.
no_dependency_share = {}
for snapshot, graph in graphs.iteritems():
    label = '{}-{}-{}'.format(snapshot.year, snapshot.month, snapshot.day)
    # 'n': all packages of the source; '0': without dependency; '+': with some.
    value = {k: {'n': 0, '0': 0, '+': 0} for k in sources + ['bioc']}
    for name, package in graph.iteritems():
        for source in package.iterkeys():
            has_dependency = any(p not in deps.R_packages
                                 for p in package[source]['Dependencies'])
            value[source]['n'] += 1
            value[source]['+' if has_dependency else '0'] += 1

    row = OrderedDict()
    for source in sources:
        total = value[source]['n']
        # A source without packages at this snapshot yields NaN, not ZeroDivisionError.
        row['0 on {}'.format(source)] = (value[source]['0'] * 100.0 / total) if total else float('nan')
        row['>0 on {}'.format(source)] = (value[source]['+'] * 100.0 / total) if total else float('nan')
    no_dependency_share[label] = row

df = pandas.DataFrame.from_dict(no_dependency_share, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()
ax = df[['0 on github', '0 on cran']].plot(title=u'Proportion of packages with no dependency\n',
                                           ylim=(20,50),
                                           figsize=(8, 4))
ax.legend(['github', 'cran'], ncol=2, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])
df
Out[9]:
We compute, for every GitHub package, the sources that are required to install the package.
In [10]:
# For every snapshot, classify each GitHub package by the repositories able to
# provide all of its dependencies (those in deps.R_packages are ignored).
# A package may fall into several categories at once; 'other' collects packages
# that fit none of them or that depend on something absent from the graph.
installable = OrderedDict()
for snapshot, graph in graphs.iteritems():
    label = '{}-{}-{}'.format(snapshot.year, snapshot.month, snapshot.day)
    tally = installable[label] = {'number': 0, 'empty set': 0, 'github': 0, 'cran': 0, 'github and cran': 0, 'other': 0}
    for _, package in graph.iteritems():
        if 'github' not in package:
            continue
        tally['number'] += 1
        dependencies = [p for p in package['github']['Dependencies']
                        if p not in deps.R_packages]

        # Need other sources? A dependency unknown to the graph settles it.
        if any(p not in graph for p in dependencies):
            tally['other'] += 1
            continue

        # Can be installed with packages in our graph: test each repository set.
        # With no dependency at all, every all(...) below is trivially true.
        satisfied_by = (
            ('empty set', len(dependencies) == 0),
            ('github', all('github' in graph[p] for p in dependencies)),
            ('cran', all('cran' in graph[p] for p in dependencies)),
            ('github and cran', all('github' in graph[p] or 'cran' in graph[p]
                                    for p in dependencies)),
        )
        matched = False
        for category, satisfied in satisfied_by:
            if satisfied:
                tally[category] += 1
                matched = True

        # ... with something else
        if not matched:
            tally['other'] += 1
In [11]:
# Convert the per-snapshot tallies in `installable` into percentages of the number
# of GitHub packages, plot them, and display the resulting table.
share_columns = ['empty set', 'github', 'cran', 'github and cran']

df = pandas.DataFrame.from_dict(installable, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()
for key in share_columns:
    df[key] = 100.0 * df[key] / df['number']
df = df[share_columns]

ax = df.plot(title=u'% of github packages with dependencies satisfied using given repositories\n',
             ylim=(0,100),
             style=['k--', 'b', 'g', 'r'],
             figsize=(8, 4))
ax.legend(['R core packages', 'github', 'cran', 'github$\cup$cran'], ncol=2, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])

# Bare last expression instead of `print df`: rich table display, consistent with
# the other cells of this notebook.
df