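Files (*date* in the file names stands for the snapshot date, e.g. `150601` below):

- `github-raw-date.csv` contains the metadata of the extracted GitHub package repositories.
- `cran-descfiles-date.csv` contains the metadata (DESCRIPTION fields) of all CRAN packages.
- `cran-packages-date.csv` contains, for each non-archived CRAN package version, its date on the CRAN server (the `mtime` column, used below as `ServerDate`).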
In [133]:
import pandas
In [134]:
# Load the three input snapshots.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv` is the supported reader. `from_csv`'s implicit `parse_dates=True`
# only applied to the index column, and none is requested here
# (`index_col=None`), so no date parsing is lost.
github = (
    pandas.read_csv('../data/github-raw-150601.csv', index_col=None)
    # presumably an index column written along with the CSV -- not a real field
    .drop(labels='Unnamed: 0', axis=1)
    # Package and Version are used as the index later on; rows lacking either
    # one are discarded.
    .dropna(subset=['Package', 'Version'])
)
desc = pandas.read_csv('../data/cran-descfiles-150601.csv', index_col=None)
pkg = pandas.read_csv('../data/cran-packages-150601.csv', index_col=None)

# Path of the CSV file written at the end of the notebook.
OUTPUT = '../data/github-cran-150601.csv'
# DESCRIPTION fields kept for every package.
FIELDS = ['Author', 'Maintainer', 'Depends', 'Imports']
# GitHub repositories belonging to these owners are dropped.
IGNORED_OWNERS = ['Bioconductor-mirror', 'rpkg', 'cran']
What we do:
- Parse `Depends` and `Imports` and merge them in a single field `Dependencies`.
- Flag the origin of every row with `InGitHub` and `InCRAN`.
- Keep a `CommitDate` for GitHub packages and a `ServerDate` for CRAN packages, and merge them in a single field `Date`.
In [135]:
# Restrict the long-format DESCRIPTION table to the keys named in FIELDS, then
# pivot it so that each (package, version) row carries one column per key,
# filled from the `value` column.
desc = (
    desc
    .query(' or '.join('key == "{}"'.format(f) for f in FIELDS))
    .set_index(['package', 'version', 'key'])
    .unstack('key')['value']
)
# The CRAN-side date is the `mtime` recorded in `pkg`, aligned on
# (package, version); `Date` is the generic copy shared with GitHub rows.
desc['ServerDate'] = pandas.to_datetime(pkg.set_index(['package', 'version'])['mtime'])
desc['Date'] = desc['ServerDate']
# Origin flag for rows coming from CRAN.
desc['InCRAN'] = 1
# Use the same index level names as the GitHub frame.
desc.index.names = ['Package', 'Version']
In [136]:
# Discard every repository whose owner is listed in IGNORED_OWNERS.
github = github.query(' and '.join('Owner != "{}"'.format(ow) for ow in IGNORED_OWNERS))
# Index by (Package, Version) and keep only the shared fields plus the commit date.
github = github.set_index(['Package', 'Version'])
github = github[FIELDS + ['CommitDate']]
# Parse the commit date once; `Date` is the generic copy shared with CRAN rows.
github['Date'] = pandas.to_datetime(github['CommitDate'])
github['CommitDate'] = github['Date']
# Origin flag for rows coming from GitHub.
github['InGitHub'] = 1
In [137]:
def parse_dependencies(str_list, ignored=()):
    """
    Return a list of strings where each string is a package name not in `ignored`.

    The input is a string that lists dependencies, as in a DESCRIPTION file:
    comma-separated package names, each optionally followed by a version
    constraint in parentheses, e.g. ``"R (>= 2.10), methods"``.

    Parameters
    ----------
    str_list : str or missing value
        The raw dependency field. A missing value (NaN or None) is treated
        as an empty list of dependencies.
    ignored : collection of str, optional
        Package names to leave out of the result.

    Returns
    -------
    list of str
        Package names in their order of appearance, without version
        constraints, empty entries or ignored names.
    """
    # NaN never compares equal to anything (not even itself), so a
    # `!= nan` test cannot detect it; `pandas.isnull` can.
    if pandas.isnull(str_list):
        return []
    # Drop the "(version constraint)" suffix and surrounding whitespace.
    names = (dep.split('(')[0].strip() for dep in str_list.split(','))
    # Build a real list: `filter` is lazy on Python 3.
    return [name for name in names if name and name not in ignored]
In [138]:
# Merge: stack the CRAN rows and the GitHub rows in a single frame.
packages = pandas.concat([desc, github])


def dependencies_formatter(x):
    """Turn a raw dependency field into space-separated package names."""
    return ' '.join(parse_dependencies(x))


# Deal with dependencies lists: missing fields become empty strings, every
# other field is reduced to bare package names.
for field in ['Imports', 'Depends']:
    packages[field] = packages[field].fillna(value='').apply(dependencies_formatter)

# Both columns hold plain strings at this point, so they can be concatenated
# with vectorised string operations instead of a Python-level row-wise apply.
# The strip removes the separator when one (or both) of the sides is empty.
packages['Dependencies'] = (packages['Imports'] + ' ' + packages['Depends']).str.strip()

# A row coming from only one source has no value for the other source's flag,
# nor possibly for Author/Maintainer; give those explicit defaults.
packages = packages.fillna({'Author': '', 'InGitHub': 0, 'InCRAN': 0, 'Maintainer': ''})
In [140]:
# Remove useless packages (see http://cran.r-project.org/doc/manuals/r-release/R-exts.html#Creating-R-packages)
# The mandatory ‘Package’ field gives the name of the package.
# This should contain only (ASCII) letters, numbers and dot, have at least two characters and
# start with a letter and not end in a dot.
# Package and Version become regular columns again so the name can be tested.
packages = packages.reset_index()
# The pattern is anchored on both ends: first character is a letter, last one
# is a letter or digit, anything in between is a letter, digit or dot.
packages = packages[packages.Package.str.match(r'^[a-zA-Z][a-zA-Z0-9\.]*[a-zA-Z0-9]$')]
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is its replacement.
output = packages.sort_values('Package')
In [141]:
# Write the merged, filtered and sorted table to the OUTPUT path as UTF-8 CSV.
output.to_csv(path_or_buf=OUTPUT, encoding='utf-8')