R Packages Maintainer


In [1]:
import pandas
import re

from matplotlib import pyplot as plt

%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')

data = pandas.DataFrame.from_csv('../data/github-cran-150601.csv')
data['Date'] = pandas.to_datetime(data['Date'])

In [2]:
print len(data.drop_duplicates('Maintainer')), 'maintainer values'


8533 maintainer values

In [3]:
def clean_maintainer(v):
    v = v if isinstance(v, str) else ''
    words = map(str.strip, v.split(' '))
    words = map(lambda w: re.sub(r'[^@a-zA-Z]', '', w), words)
    words = filter(lambda w: len(w) > 2 and '@' not in w, words)
    words.sort()
    return ' '.join(words)

data['Maintainer_can'] = data['Maintainer'].apply(clean_maintainer)
print len(data.drop_duplicates('Maintainer_can')), 'canonical maintainer values'


6044 canonical maintainer values

In [4]:
_maint_first = data.sort('Date').drop_duplicates('Maintainer_can').set_index('Maintainer_can')[['Date', 'InGitHub', 'InCRAN']]
_maint_last = data.sort('Date', ascending=False).drop_duplicates('Maintainer_can').set_index('Maintainer_can')[['Date', 'InGitHub', 'InCRAN']]

github = data.query('InGitHub == 1').sort('Date')
cran = data.query('InCRAN == 1').sort('Date')

__f = (lambda d,rev,name: d.drop_duplicates('Maintainer_can', take_last=rev)
               .set_index('Maintainer_can')[['Date']]
               .rename(columns={'Date':name}))

maintainers = __f(github, False, 'GitHubFirstDate').join(
    [__f(github, True, 'GitHubLastDate'), __f(cran, False, 'CRANFirstDate'), __f(cran, True, 'CRANLastDate'),],
    how='outer')

In [5]:
maintainers['DaysOnGitHub'] = maintainers['GitHubLastDate'] - maintainers['GitHubFirstDate']
maintainers['DaysOnCRAN'] = maintainers['CRANLastDate'] - maintainers['CRANFirstDate']
# We consider CRAN first
maintainers['CRANFirst-->GHFirst'] = maintainers['GitHubFirstDate'] - maintainers['CRANFirstDate'] 
maintainers['CRANLast-->GitHubLast'] = maintainers['GitHubLastDate'] - maintainers['CRANLastDate']
maintainers['CRANLast-->GitHubFirst'] = maintainers['GitHubFirstDate'] - maintainers['CRANLastDate']
maintainers['CRANFirst-->GitHubLast'] = maintainers['GitHubLastDate'] - maintainers['CRANFirstDate']

fields = ['DaysOnGitHub', 'DaysOnCRAN', 'CRANFirst-->GHFirst', 'CRANLast-->GitHubLast', 'CRANLast-->GitHubFirst', 'CRANFirst-->GitHubLast']
for field in fields:
    maintainers[field] = maintainers[field].astype('timedelta64[D]')

In [6]:
from collections import OrderedDict

testdata = OrderedDict([
    ('all', maintainers), 
    ('on CRAN', maintainers.dropna(subset=['CRANFirstDate', 'CRANLastDate'])),
    ('on GH', maintainers.dropna(subset=['GitHubFirstDate', 'GitHubLastDate'])),
    ('on both', maintainers.dropna()),
    ('on both at least 1 minute', maintainers.dropna().query('GitHubFirstDate <= CRANLastDate and GitHubLastDate >= CRANFirstDate')),
    ('first on CRAN', maintainers.dropna().query('CRANFirstDate <= GitHubFirstDate')),
    ('first on GH', maintainers.dropna().query('GitHubFirstDate <= CRANFirstDate')),
    ('migrate from GH to CRAN', maintainers.dropna().query('GitHubLastDate <= CRANFirstDate')),
    ('migrate from CRAN to GH', maintainers.dropna().query('CRANLastDate <= GitHubFirstDate')),
])

for key, value in testdata.iteritems():
    print key
    print len(value), 'maintainers of', len(data[data['Maintainer_can'].isin(value.index)].drop_duplicates(subset=['Package'])), 'packages'
    for field in fields:
        print field, '| mean:', value[field].mean(), '| median:', value[field].median(), '| stddev:', value[field].std()
    print


all
6044 maintainers of 11442 packages
DaysOnGitHub | mean: 275.196850394 | median: 82.0 | stddev: 452.322268348
DaysOnCRAN | mean: 817.769325658 | median: 380.5 | stddev: 1084.94367843
CRANFirst-->GHFirst | mean: 597.544827586 | median: 137.0 | stddev: 1073.92398917
CRANLast-->GitHubLast | mean: 103.053793103 | median: 0.0 | stddev: 396.265029521
CRANLast-->GitHubFirst | mean: -355.652413793 | median: -259.0 | stddev: 707.699850157
CRANFirst-->GitHubLast | mean: 1056.27310345 | median: 706.0 | stddev: 1169.92601169

on CRAN
4864 maintainers of 10125 packages
DaysOnGitHub | mean: 458.275862069 | median: 276.0 | stddev: 581.670782258
DaysOnCRAN | mean: 817.769325658 | median: 380.5 | stddev: 1084.94367843
CRANFirst-->GHFirst | mean: 597.544827586 | median: 137.0 | stddev: 1073.92398917
CRANLast-->GitHubLast | mean: 103.053793103 | median: 0.0 | stddev: 396.265029521
CRANLast-->GitHubFirst | mean: -355.652413793 | median: -259.0 | stddev: 707.699850157
CRANFirst-->GitHubLast | mean: 1056.27310345 | median: 706.0 | stddev: 1169.92601169

on GH
1905 maintainers of 5841 packages
DaysOnGitHub | mean: 275.196850394 | median: 82.0 | stddev: 452.322268348
DaysOnCRAN | mean: 952.773793103 | median: 544.0 | stddev: 1140.48640691
CRANFirst-->GHFirst | mean: 597.544827586 | median: 137.0 | stddev: 1073.92398917
CRANLast-->GitHubLast | mean: 103.053793103 | median: 0.0 | stddev: 396.265029521
CRANLast-->GitHubFirst | mean: -355.652413793 | median: -259.0 | stddev: 707.699850157
CRANFirst-->GitHubLast | mean: 1056.27310345 | median: 706.0 | stddev: 1169.92601169

on both
725 maintainers of 4516 packages
DaysOnGitHub | mean: 458.275862069 | median: 276.0 | stddev: 581.670782258
DaysOnCRAN | mean: 952.773793103 | median: 544.0 | stddev: 1140.48640691
CRANFirst-->GHFirst | mean: 597.544827586 | median: 137.0 | stddev: 1073.92398917
CRANLast-->GitHubLast | mean: 103.053793103 | median: 0.0 | stddev: 396.265029521
CRANLast-->GitHubFirst | mean: -355.652413793 | median: -259.0 | stddev: 707.699850157
CRANFirst-->GitHubLast | mean: 1056.27310345 | median: 706.0 | stddev: 1169.92601169

on both at least 1 minute
509 maintainers of 4096 packages
DaysOnGitHub | mean: 613.404715128 | median: 424.0 | stddev: 623.723131321
DaysOnCRAN | mean: 1113.14734774 | median: 767.0 | stddev: 1165.63265403
CRANFirst-->GHFirst | mean: 506.611001965 | median: 41.0 | stddev: 1000.71954524
CRANLast-->GitHubLast | mean: 6.84872298625 | median: -1.0 | stddev: 257.214253011
CRANLast-->GitHubFirst | mean: -607.029469548 | median: -411.0 | stddev: 612.973616595
CRANFirst-->GitHubLast | mean: 1120.5108055 | median: 755.0 | stddev: 1159.68419302

first on CRAN
434 maintainers of 3476 packages
DaysOnGitHub | mean: 421.3640553 | median: 163.0 | stddev: 607.382112949
DaysOnCRAN | mean: 1352.62442396 | median: 1038.0 | stddev: 1275.67457478
CRANFirst-->GHFirst | mean: 1107.65207373 | median: 724.0 | stddev: 1117.14711553
CRANLast-->GitHubLast | mean: 176.313364055 | median: 37.5 | stddev: 467.675306777
CRANLast-->GitHubFirst | mean: -245.444700461 | median: -123.0 | stddev: 790.964271291
CRANFirst-->GitHubLast | mean: 1529.41705069 | median: 1165.5 | stddev: 1251.47586546

first on GH
291 maintainers of 1342 packages
DaysOnGitHub | mean: 513.326460481 | median: 362.0 | stddev: 537.382724933
DaysOnCRAN | mean: 356.432989691 | median: 186.0 | stddev: 470.504444786
CRANFirst-->GHFirst | mean: -163.233676976 | median: -71.0 | stddev: 213.74945138
CRANLast-->GitHubLast | mean: -6.20618556701 | median: -1.0 | stddev: 213.168345558
CRANLast-->GitHubFirst | mean: -520.017182131 | median: -370.0 | stddev: 520.541001446
CRANFirst-->GitHubLast | mean: 350.621993127 | median: 208.0 | stddev: 493.88724952

migrate from GH to CRAN
66 maintainers of 99 packages
DaysOnGitHub | mean: 115.424242424 | median: 18.0 | stddev: 179.974960703
DaysOnCRAN | mean: 48.4393939394 | median: 0.0 | stddev: 166.130544479
CRANFirst-->GHFirst | mean: -166.439393939 | median: -67.5 | stddev: 215.792074339
CRANLast-->GitHubLast | mean: -99.1818181818 | median: -21.5 | stddev: 190.49109885
CRANLast-->GitHubFirst | mean: -214.96969697 | median: -92.5 | stddev: 260.016151551
CRANFirst-->GitHubLast | mean: -50.6060606061 | median: -5.5 | stddev: 113.405450797

migrate from CRAN to GH
150 maintainers of 411 packages
DaysOnGitHub | mean: 82.7266666667 | median: 7.5 | stddev: 165.134027187
DaysOnCRAN | mean: 806.48 | median: 302.5 | stddev: 1097.35017457
CRANFirst-->GHFirst | mean: 1242.26666667 | median: 725.5 | stddev: 1210.02277236
CRANLast-->GitHubLast | mean: 518.493333333 | median: 385.0 | stddev: 546.007113884
CRANLast-->GitHubFirst | mean: 435.453333333 | median: 248.5 | stddev: 518.071457157
CRANFirst-->GitHubLast | mean: 1325.32 | median: 902.5 | stddev: 1195.93651669


In [7]:
import lifelines
import datetime 

survival = maintainers.copy()
kmf = lifelines.KaplanMeierFitter()

fill = pandas.to_datetime(datetime.date(2015, 6, 1))


Using pure Python version of concordance index. 
You can speed this up 100x by compiling the Fortran code with:
>>> python setup.py build_ext --inplace

In [8]:
ax = plt.subplot()

loc = survival.dropna(subset=('GitHubFirstDate',),)
T, E = lifelines.utils.datetimes_to_durations(loc['GitHubFirstDate'], loc['CRANFirstDate'], fill_date=fill)
kmf.fit(T, event_observed=E, label='GitHub --> CRAN').plot(ax=ax)

loc = survival.dropna(subset=('CRANFirstDate',),)
T, E = lifelines.utils.datetimes_to_durations(loc['CRANFirstDate'], loc['GitHubFirstDate'], fill_date=fill)
kmf.fit(T, event_observed=E, label='CRAN --> GitHub').plot(ax=ax)


Warning: some values of start_times are before end_times
Warning: some values of start_times are before end_times
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5879e36050>

In [9]:
ax = plt.subplot()

limit = pandas.to_datetime(datetime.date(2015,1,1)) #datetime.date.today() - datetime.timedelta(weeks=13)


loc = survival.dropna(subset=('GitHubFirstDate',),)

_F = lambda r: r if r < limit else ''

T, E = lifelines.utils.datetimes_to_durations(loc['GitHubFirstDate'], loc['GitHubLastDate'].apply(_F), fill_date=fill)
kmf.fit(T, event_observed=E, label='GitHub').plot(ax=ax)

loc = survival.dropna(subset=('CRANFirstDate',),)
T, E = lifelines.utils.datetimes_to_durations(loc['CRANFirstDate'], loc['CRANLastDate'].apply(_F), fill_date=fill)
kmf.fit(T, event_observed=E, label='CRAN').plot(ax=ax)


Warning: some values of start_times are before end_times
Warning: some values of start_times are before end_times
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5879e46410>

In [31]:
gh = github.dropna(subset=['Version']).sort('Date', ascending=True).drop_duplicates('Package', take_last=True).set_index('Package')
cr = cran.dropna(subset=['Version']).sort('Date', ascending=True).drop_duplicates('Package', take_last=True).set_index('Package')



_ = gh.join(cr, how='inner', lsuffix='gh', rsuffix='cr')[['Versiongh', 'Versioncr', 'Dependenciesgh', 'Dependenciescr', 'Maintainer_cangh', 'Maintainer_cancr']]

#pandas.set_option('display.max_rows', None)
_.query('Maintainer_cangh != Maintainer_cancr and Dependenciescr != Dependenciesgh and Versioncr != Versiongh')


Out[31]:
Versiongh Versioncr Dependenciesgh Dependenciescr Maintainer_cangh Maintainer_cancr
Package
genefu 1.15.1 1.0.9 amap survcomp mclust biomaRt R survcomp amap mclust gplots R Benjamin HaibeKains Markus Schroeder Benjamin HaibeKains
graph 0.0.0.9000 1.30.0 sp R methods stats tools utils R methods Falcon Seth
Rfun 0.1 1.0 data.table R NaN Andrew Brooks Huidong Tian
roxygen 1.0 0.1-3 stringr tools brew digest digest gsubfn Hadley Wickham Danenberg Peter
nnls 1.0 1.4 NaN NaN Donachy Ljiljana Shaun Zigic and Katharine Mullen
corpora 1.0.0 0.4-3 jsonlite R Csardi Gabor Evert Stefan
RTAQ 0.3 0.2 xts timeDate R xts timeDate Cornelissen Jonathan Jonathan
primer 0.1 1.0 R deSolve lattice Mahr Tristan Hank Stevens
concord 0.1 1.4-9 dplyr lme4 lubridate Matrix R NaN Jim Lemon
cat 0.1 0.0-6.5 R NaN Fernando Tusell
gtx 0.0.10 0.0.8 beeswarm survival meta rmeta data.table R survival Johnson Toby
LaplacesDemon 15.03.19 13.03.04 parallel R R parallel LLC Statisticat Hall Martina
BiGGR 1.5.0 1.8 hypergraph R rsbml hyperdraw LIM stringr R LIM igraph Rgraphviz minet graph rsbml limSo... Anand Gavai Hannes Hettling Anand Gavai
eventstudies 1.2 1.1 R zoo xts boot testthat sandwich R zoo xts boot Anand Chirag Bahure Vikram
pcurve 0.1 0.6-4 ggplot2 R mgcv vegan MASS stats Dablander Fabian Gavin Simpson
rv 0.1 2.3.1 NaN R stats utils grDevices graphics parallel Hadley Wickham Jouni Kerman
lazy 0.1 1.2-15 R NaN Lerman Liran
pedigree 0.1 1.4 R Matrix HaploSim reshape Albart Coster
ClustOfVar 1.1 0.8 R PCAmixdata NaN Chavent Marie
assertthat 0.1.0.99 0.1 tools NaN Hadley Wickham
rfishbase 2.0.0 0.2-2 dplyr lazyeval httr tidyr RCurl R XML RCurl R Boettiger Carl
rbhl 0.1.7.99 0.1.0 httr XML jsonlite plyr httr XML RJSONIO plyr Chamberlain Scott
rebird 0.1.5.99 0.1.1 jsonlite httr dplyr RJSONIO httr RCurl plyr Maia Rafael
Jmisc 0.1 0.3.1 R ROCR pROC NaN Chan Julian TszKin
itertools 0.0.0.9000 0.1-3 R parallel R iterators Steve Weston
rgauges 0.2.2.99 0.2.0 ggplot2 lubridate plyr httr jsonlite reshape2 ... ggplot2 lubridate plyr httr reshape2 gridExtra... Chamberlain Scott
solr 0.1.9.999 0.1.4 dplyr plyr httr XML jsonlite plyr httr XML assertthat rjson Chamberlain Scott
quipu 1.9.5 1.9.0 agricolae stringr pixmap shiny xtable R R stats agricolae stringr pixmap shiny xtable Reinhard Simon
ngram 1.0 1.1 NaN R methods Who complain Drew Schmidt
SOD 0.1 1.0 NaN methods Rcpp Noam Ross Jakt Martin
translateR 0.1 1.0 car data.table e1071 foreign plyr stringr tidy... RJSONIO RCurl textcat parallel httr Andreas Wygrabek Christopher Lucas
refund.wave 0.1-1 0.1 MASS glmnet wavethresh ncvreg R R glmnet wavethresh Huo Lan Adam Ciarleglio
classify 0.1 1.3 MASS bdsmatrix mvtnorm R Rcpp plyr ggplot2 lattice methods R2jags reshape2 John Ramey Chris Wheadon
rsunlight 0.3.2.99 0.3.0 httr plyr jsonlite stringr httr plyr jsonlite stringr assertthat Chamberlain Scott
agop 0.2-0 0.1-4 R stats grDevices graphics R stats grDevices graphics Matrix igraph Gagolewski Marek
HIBAG 1.3.1 1.2.4 methods R R Xiuwen Zheng
NB 0.0.0.9000 0.9 R NaN Hui TinYu
shape 1.0.0 1.4.2 R R stats Kwame Okrah Karline Soetaert
PANDA 0.3 0.9.9 R GO.db cluster Hua
RCMIP5 1.1.9000 1.1 abind dplyr assertthat digest R abind dplyr reshape2 digest R Kathe ToddBrown
jaatha 2.99.0.9030 2.7.0 assertthat coala PopGenome R6 reshape2 paralle... phyclust Rcpp reshape2 parallel R methods Paul Staab
lintr 0.3.0.9000 0.2.0 rex crayon codetools stringdist testthat diges... rex crayon codetools stringdist testthat diges... Hester Jim
taxize 0.5.4.9320 0.5.2 XML RCurl reshape2 stringr plyr httr jsonlite ... XML RCurl stringr plyr httr jsonlite foreach a... Chamberlain Scott
rnoaa 0.3.5.99 0.3.3 httr lubridate plyr dplyr tidyr ggplot2 scales... httr lubridate plyr ggplot2 scales sp RCurl rg... Chamberlain Scott
selectr 0.1 0.2-3 NaN methods XML stringr R Janko Thyson Potter Simon
mlr 2.4 2.3 checkmate parallelMap plyr reshape2 survival R... checkmate parallelMap plyr reshape2 survival R... Bernd Bischl
DNMF 1.1 1.0 Matrix gplots parallel doParallel foreach NaN Jia Zhilong
rentrez 0.5.1 0.4.1 httr jsonlite XML R httr jsonlite R XML David Winter
Gmisc 1.0 1.1 NaN grid lattice Hmisc forestplot abind methods kn... Giuseppe Paleologo Gordon Max
webchem 0.0.2 0.0.1 XML RCurl jsonlite XML RCurl RJSONIO Eduard Szoecs
gsheet 0.2.0 0.1.0 rvest stringr magrittr dplyr rvest stringr Conway Max
tab 1.0.0 3.1.1 magrittr crayon methods survey survival gee R Dane Domelen Van
repmis 0.4.1 0.4.2 data.table digest httr plyr R.cache xlsx R data.table digest httr plyr R.cache R Christopher Gandrud
flowr 0.9.6.7 0.9.6.5 knitr ggplot2 RSQLite reshape2 openxlsx diagra... knitr ggplot2 RSQLite reshape2 methods stats d... Sahil Seth
rivr 0.0.0.9000 0.9.2 R6 R Rcpp reshape2 R Koohafkan Michael
falsy 1.0 1.0.1 NaN NaN Csardi Gabor
ramify 0.2.1.9999 0.2.0 NaN NaN Brandon Greenwell
dga 0.8 1.2 NaN chron Ball Patrick Kristian Lum
bold 0.2.2.99 0.2.6 XML httr stringr assertthat jsonlite reshape p... XML httr stringr assertthat jsonlite reshape plyr Chamberlain Scott
treatSens 1.2 1.1 methods stats graphics grDevices lme4 dbarts R R Bohme Carnegie Nicole
RDML 0.8 0.8-4 R6 assertthat dpcR plyr dplyr R6 assertthat XML plyr dplyr R Blagodatskikh Konstantin
effsize 0.5.5 0.5.4 NaN NaN Marco Torchiano
genderizeR 1.0.0.1 1.1.0 jsonlite stringr tm data.table magrittr RCurl ... stringr httr tm data.table magrittr parallel R Kamil Wais
quickpsy 0.1 0.1.0 boot DEoptim dplyr ggplot2 tidyr R MPDiR R boot DEoptim dplyr ggplot2 tidyr Daniel Linares
gistr 0.1.2.9999 0.2.0 jsonlite httr magrittr assertthat knitr rmarkdown jsonlite httr magrittr assertthat knitr rmarkd... Chamberlain Scott
abc 0.0.1.9000 2.1 R R abc.data nnet quantreg MASS locfit Blum Michael
Deriv 3.1 3.4 NaN NaN Serguei Sokol
sdcMicroGUI 1.1.6 1.2.0 vcd foreign Hmisc tools sdcMicro gWidgetsRGtk2... vcd foreign Hmisc tools sdcMicro gWidgetsRGtk2... Matthias Templ Alexander Kowarik
mixedMem 1.0.0 1.0.2 Rcpp gtools Rcpp gtools R Samuel Wang
dmm 0.1 1.6-1 httr xml2 RCurl MASS Matrix nadiv robustbase pls Jackson Neville
rotationForest 1.0 0.1 rpart R rpart Manan Shah Ballings Michel
covr 0.2.0.9000 1.0.0 jsonlite rex devtools httr testthat crayon R m... jsonlite rex devtools httr crayon htmltools R ... Hester Jim
trip 1.1-20 1.1-21 maptools MASS spatstat methods sp maptools MASS raster spatstat methods sp Michael Sumner
meteoForecast 0.44 0.45 ncdf rjson raster sp zoo ncdf raster sp zoo Lamigueiro Oscar Perpinan
dpmr 0.1.6 0.1.7-1 data.table digest httr jsonlite magrittr digest httr jsonlite magrittr rio Christopher Gandrud
dendextendRcpp 0.5.0 0.6.1 Rcpp dendextend R Rcpp dendextend Galili Tal
plspm 0.4.4 0.4.7 tester turner shape diagram amap R tester turner diagram shape amap R Gaston Sanchez
knitcitations 1.0.4 1.0.6 RefManageR digest httr R methods RefManageR digest httr methods R Boettiger Carl
aroma.core 2.13.0 2.13.1 utils R.methodsS3 R.oo R.cache R.rsp matrixSta... stats utils R.methodsS3 R.oo R.cache R.rsp mat... Bengtsson Henrik
progress 1.0.0 1.0.1 prettyunits magrittr R6 prettyunits R6 Csardi Gabor
signal 0.0.0.9000 0.7-5 R MASS graphics grDevices R stats Ligges Uwe