R Package Dependencies - Installability



In [152]:

    
import pandas
import deps
import itertools
from collections import OrderedDict

%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')

# Workaround if changes are made to deps.py and "Run All" is hit:
# re-import the already-loaded module so that edits are picked up without
# restarting the kernel.
# NOTE(review): `reload` is used as a bare name with no import in this cell --
# presumably the Python 2 builtin; confirm the kernel version.
deps = reload(deps)



In [153]:

    
# Load the dataset from CSV; index_col=None asks pandas not to use any CSV
# column as the row index.
# NOTE(review): the file name suggests GitHub, CRAN and Bioconductor records
# merged into one table -- confirm the schema against deps.py.
data = pandas.DataFrame.from_csv('../data/github-cran-bioc-alldata.csv', index_col=None)
# Package sources considered in this notebook.
sources = ['github', 'cran', 'bioc']
# Every non-empty subset of `sources`, smallest subsets first, each as a list.
combinations = [
    list(subset)
    for size in range(1, len(sources) + 1)
    for subset in itertools.combinations(sources, size)
]

Let us create the graphs for several dates.



In [154]:

    
# Build one graph per sampled date via deps.create_graph_for; the OrderedDict
# keeps the timestamps in the order date_range produces them.
snapshot_dates = pandas.date_range(start='2013-09', end='2015-01', freq='6M')
graphs = OrderedDict(
    (date, deps.create_graph_for(data, date)) for date in snapshot_dates
)

Let us compute which packages are installable.



In [155]:

    
# For every snapshot and every source, record:
#   packages[date][source]             -> graph keys for which deps.available(...) holds
#   installability[date][source][label] -> result of deps.installable(...) for the
#                                          source set named by `label`
installability = OrderedDict()
packages = OrderedDict()

for date, graph in graphs.iteritems():
    # Snapshots are keyed by an unpadded 'year-month-day' string.
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    installability[date] = {}
    packages[date] = {}
    for from_source in sources:
        packages[date][from_source] = [
            p for p in graph.iterkeys()
            if deps.available(graph, p, [from_source])
        ]
        per_combination = {}
        for combination in [['None']] + combinations:
            # The 'None' label stands for an empty list of allowed sources.
            allowed = [] if combination == ['None'] else combination
            per_combination[' '.join(combination)] = deps.installable(graph, from_source, allowed)
        installability[date][from_source] = per_combination

Evolution of the number of packages



In [156]:

    
# Turn `packages` (snapshot -> source -> list) into a snapshot x source table
# holding the length of each list.
package_counts = {}
for snapshot, by_source in packages.iteritems():
    package_counts[snapshot] = {src: len(names) for src, names in by_source.iteritems()}

df_packages = pandas.DataFrame.from_dict(package_counts, orient='index')
df_packages.index = pandas.to_datetime(df_packages.index)
# Chronological rows, columns in the order given by `sources`.
df_packages = df_packages.sort_index()[sources]

ax = df_packages.plot(
    title=u'Number of available packages',
    style=[None, None, None, '--', '--', '--', ':'],
    figsize=(15,6),
)
ax.legend(ncol=2, loc='upper left')









    Out[156]:





<matplotlib.legend.Legend at 0x7f348a73f290>



In [157]:



In [158]:

    
# Per snapshot, count the graph entries that have a 'github' key, a 'cran' key,
# and both keys.
number = OrderedDict()
for date, graph in graphs.iteritems():
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    counts = {'github': 0, 'cran': 0, 'both': 0}
    for name, package in graph.iteritems():
        origins = set(package.iterkeys())
        github = 'github' in origins
        cran = 'cran' in origins
        counts['github'] += int(github)
        counts['cran'] += int(cran)
        counts['both'] += int(github and cran)
    number[date] = counts

df_N = pandas.DataFrame.from_dict(number, orient='index')
df_N.index = pandas.to_datetime(df_N.index)
df_N = df_N.sort_index()
# Share of each source's packages that also appear in the other source, in percent.
for share_column, total_column in [('githubP', 'github'), ('cranP', 'cran')]:
    df_N[share_column] = 100. * df_N['both'] / df_N[total_column]

# Absolute counts on the left axis...
ax = df_N[['github', 'cran', 'both']].plot(
    title=u'Number of available packages\n',
    figsize=(8,4),
    ylim=(0,8000),
)
ax.legend(['github (left)', 'cran (left)', 'github $\cap$ cran (left)'], ncol=1, loc='best')

# ...and the percentage shares, dashed, on a secondary right-hand axis.
ax2 = ax.twinx()
ax2 = df_N[['githubP', 'cranP']].plot(
    ax=ax2,
    ylim=(0,100),
    style=['--', '--'],
    legend=False,
    grid=False,
)

ax2.set_yticklabels([str(int(v))+'%' for v in ax2.get_yticks()])
df_N

Evolution of the installability



In [159]:

    
# One table per source: rows are snapshots, columns are source-set labels,
# values are the sizes of the corresponding entries in `installability`.
df_installability = {}

for source in sources:
    counts_by_snapshot = {}
    for snapshot, by_source in installability.iteritems():
        counts_by_snapshot[snapshot] = {
            label: len(found) for label, found in by_source[source].iteritems()
        }
    df_installability[source] = pandas.DataFrame.from_dict(counts_by_snapshot, orient='index')

# Column order used for plotting: 'None' first, then every combination label.
ordered_labels = [' '.join(comb) for comb in [['None']] + combinations]

for source in sources:
    df = df_installability[source]
    # The stored frame gets a datetime index in place; the sorted,
    # column-ordered view below is only used for the plot.
    df.index = pandas.to_datetime(df.index)
    df = df.sort_index()[ordered_labels]
    ax = df.plot(
        title=u'Number of installable packages from {} using given set of sources'.format(source),
        style=['k:', 'b', 'g', 'r', 'b--', 'g--', 'r--', 'r:'],
        figsize=(15, 6),
    )
    ax.legend(ncol=2, loc='best')



In [160]:

    
# Focus on GitHub

# Work on a copy so the extra 'number' column does not leak into the shared
# df_installability['github'] frame.
df = df_installability['github'].copy()
df.index = pandas.to_datetime(df.index)
# df_N['github'] is the per-snapshot count of packages with a 'github' entry;
# the assignment aligns on the datetime index.
df['number'] = df_N['github']

df = df.sort_index()[['number'] + [' '.join(comb) for comb in [['None']] + combinations]]

# The chart's y-axis is a percentage (0-100 with '%' tick labels), so plot each
# series as a share of df['number']. The raw counts stay untouched in `df`,
# which is what the cell displays below.
plotted = ['None', 'github', 'cran', 'github cran']
df_percent = df[plotted].copy()
for key in plotted:
    df_percent[key] = 100.0 * df[key] / df['number']

ax = df_percent.plot(title=u'Installable packages from GitHub\n',
                     style=['k--', 'b', 'g', 'r--'],
                     ylim=(0,100),
                     figsize=(8,4))
# Raw string: '\c' is not a valid escape sequence, so keep the backslash explicit.
ax.legend(['None', 'github', 'cran', r'github $\cup$ cran'], ncol=1, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])

df









    Out[160]:






  
    
      
      number
      None
      github
      cran
      bioc
      github cran
      github bioc
      cran bioc
      github cran bioc
    
  
  
    
      2013-09-30
      1817
      497
      503
      511
      498
      513
      504
      512
      514
    
    
      2014-03-31
      2898
      789
      800
      824
      790
      826
      801
      825
      827
    
    
      2014-09-30
      4609
      1310
      1339
      1366
      1312
      1371
      1341
      1368
      1373

	both	cran	github	githubP	cranP
2013-09-30	509	4852	1817	28.013209	10.490519
2014-03-31	713	5489	2898	24.603175	12.989616
2014-09-30	1004	6215	4609	21.783467	16.154465