Merging GitHub and CRAN data

Files (where "date" stands for the snapshot date, e.g. 150601): github-raw-date.csv contains the metadata of the extracted package repositories. cran-descfiles-date.csv contains the metadata of all CRAN packages. cran-packages-date.csv contains, for each CRAN package version, a date at which the package was available (i.e. not archived).



In [133]:

    
import pandas



In [134]:

    
# Load the three input snapshots. `pandas.read_csv` is used instead of
# `DataFrame.from_csv`, which is deprecated and no longer available in
# current pandas releases; `index_col=None` keeps every column as data.
github = (
    pandas.read_csv('../data/github-raw-150601.csv', index_col=None)
    # 'Unnamed: 0' is presumably a row index saved by an earlier export.
    .drop(labels='Unnamed: 0', axis=1)
    # Rows lacking a package name or a version cannot be indexed later on.
    .dropna(subset=['Package', 'Version'])
)
desc = pandas.read_csv('../data/cran-descfiles-150601.csv', index_col=None)
pkg = pandas.read_csv('../data/cran-packages-150601.csv', index_col=None)

# Path of the merged CSV file written at the end of the notebook.
OUTPUT = '../data/github-cran-150601.csv'
# Metadata fields kept from both the CRAN and the GitHub tables.
FIELDS = ['Author', 'Maintainer', 'Depends', 'Imports']
# GitHub owners whose repositories are excluded before merging.
IGNORED_OWNERS = ['Bioconductor-mirror', 'rpkg', 'cran']

What we do:

- Parse and clean the dependencies in Depends and Imports, and merge them into a single field Dependencies.
- Add (numerical) Boolean attributes InGitHub and InCRAN.
- Add a CommitDate for GitHub packages and a ServerDate for CRAN packages, and merge them into a single field Date.



In [135]:

    
# Keep only the rows whose key is one of FIELDS, then pivot the table so
# that every (package, version) pair becomes one row with a column per key.
desc = desc.query(' or '.join('key == "{}"'.format(name) for name in FIELDS))
desc = desc.set_index(['package', 'version', 'key']).unstack('key')['value']

# The `mtime` column of the package table, aligned on (package, version),
# is stored both as the CRAN-specific date and as the generic date.
desc['ServerDate'] = pandas.to_datetime(pkg.set_index(['package', 'version'])['mtime'])
desc['Date'] = desc['ServerDate']

# Flag every row as coming from CRAN and use the same index names as the
# GitHub table.
desc['InCRAN'] = 1
desc.index.names = ['Package', 'Version']



In [136]:

    
# Discard the repositories owned by any of IGNORED_OWNERS.
github = github.query(' and '.join('Owner != "{}"'.format(owner) for owner in IGNORED_OWNERS))

# Index by (Package, Version) and keep only the shared fields plus the
# commit date.
github = github.set_index(['Package', 'Version'])
github = github[FIELDS + ['CommitDate']]

# The parsed commit date is stored both as the GitHub-specific date and as
# the generic date.
github['CommitDate'] = pandas.to_datetime(github['CommitDate'])
github['Date'] = github['CommitDate']

# Flag every row as coming from GitHub.
github['InGitHub'] = 1



In [137]:

    
def parse_dependencies(str_list, ignored=()):
    """
    Return a list of strings where each string is a package name not in `ignored`.

    The input is a string that lists dependencies, as in a DESCRIPTION file:
    comma-separated package names, each optionally followed by a version
    constraint in parentheses, e.g. ``"R (>= 2.10), methods"``.

    Parameters:
        str_list: the dependency string. A missing value (None or NaN) is
            treated as an empty list of dependencies.
        ignored: collection of package names to leave out of the result.

    Returns:
        A list with the package names in order of appearance, stripped of
        version constraints and surrounding whitespace, without empty
        entries and without the names found in `ignored`.
    """
    # NaN compares unequal to everything (itself included), so a `!=` test
    # can never detect it; isnull() handles both NaN and None.
    if pandas.isnull(str_list):
        return []

    # Drop the version constraint: keep only what precedes the first '('.
    names = (dep.split('(')[0].strip() for dep in str_list.split(','))
    # Build a real list so the result is the same on Python 2 and Python 3.
    return [name for name in names if name and name not in ignored]



In [138]:

    
# Stack the CRAN and the GitHub tables into a single one.
packages = pandas.concat([desc, github])


def dependencies_formatter(x):
    """Return the package names listed in `x`, separated by single spaces."""
    return ' '.join(parse_dependencies(x))


# Clean both raw dependency fields; missing values become empty strings
# before being parsed.
for field in ['Imports', 'Depends']:
    packages[field] = packages[field].fillna(value='').map(dependencies_formatter)

# Dependencies is the concatenation of Imports and Depends; strip() removes
# the separating space when either side is empty.
packages['Dependencies'] = (packages['Imports'] + ' ' + packages['Depends']).str.strip()

# Rows coming from only one source lack the other source's columns: use an
# empty string for the text fields and 0 for the membership flags.
packages = packages.fillna(value={
    'Author': '',
    'InGitHub': 0,
    'InCRAN': 0,
    'Maintainer': '',
})



In [140]:

    
# Remove useless packages (see http://cran.r-project.org/doc/manuals/r-release/R-exts.html#Creating-R-packages)
# The mandatory ‘Package’ field gives the name of the package. 
# This should contain only (ASCII) letters, numbers and dot, have at least two characters and 
# start with a letter and not end in a dot. 

# Turn the (Package, Version) index back into regular columns so that the
# package name can be tested.
packages = packages.reset_index()
# na=False makes a missing name count as invalid instead of producing a
# NaN entry that cannot be used in the boolean mask.
packages = packages[packages.Package.str.match(r'^[a-zA-Z][a-zA-Z0-9\.]*[a-zA-Z0-9]$', na=False)]

# `DataFrame.sort` is no longer available in current pandas releases;
# `sort_values` is its replacement for sorting by a column.
output = packages.sort_values('Package')



In [141]:

    
# Write the table to the OUTPUT path as UTF-8 encoded CSV. The row index is
# written too, since `index` is left at its default.
output.to_csv(path_or_buf=OUTPUT, encoding='utf-8')