Analyse license choices for bioRxiv preprints


In [1]:
import json
import os

import pandas
import altair

import utilities

Read data


In [2]:
# License labels, in the order used on https://creativecommons.org/licenses/
licenses = [
    'CC BY',
    'CC BY-ND',
    'CC BY-NC',
    'CC BY-NC-ND',
    'None',
]
# Presumably one color per entry of `licenses`, in the same order
license_colors = [
    '#F68212',
    '#AC5B0C',
    '#934E0A',
    '#492705',
    '#000000',
]

# Score given to each license label; values line up with `licenses`
license_scores = dict(zip(licenses, [5, 3, 3, 2, 1]))

In [3]:
path = os.path.join('data', 'preprints.tsv')
# 'None' is a real license label in this analysis, but recent pandas
# versions list the string "None" among the default NA markers of
# read_table/read_csv. Limit NA detection to empty fields so those rows are
# not read as missing and silently dropped from the per-license counts.
preprint_df = pandas.read_table(
    path,
    parse_dates=['Date'],
    keep_default_na=False,
    na_values=[''],
)
# Categorical whose categories follow the order of `licenses`
preprint_df['License'] = pandas.Categorical(preprint_df['License'], licenses)
preprint_df.head(2)


Out[3]:
DOI Date License
0 10.1101/000026 2014-09-08 CC BY
1 10.1101/000042 2013-12-01 CC BY

In [4]:
# Table of licensing choices
# size() counts the rows of each license group directly, without a
# Python-level apply(len) call per group.
counts = preprint_df.groupby('License').size().rename('Count')
count_df = counts.reset_index()
count_df['Percent'] = count_df['Count'] / count_df['Count'].sum()
# Re-apply the category order of `licenses` to the License column
count_df['License'] = pandas.Categorical(count_df['License'], licenses)
# Format a copy for display only; count_df itself keeps its numeric columns
count_df.assign(
    Count=count_df['Count'].map('{:,}'.format),
    Percent=count_df['Percent'].map('{:.1%}'.format),
    Score=count_df['License'].map(license_scores),
)


Out[4]:
License Count Percent Score
0 CC BY 2,228 19.6% 5
1 CC BY-ND 768 6.8% 3
2 CC BY-NC 1,067 9.4% 3
3 CC BY-NC-ND 3,966 34.9% 2
4 None 3,325 29.3% 1

In [5]:
# Preprints that forbid derivatives
# ('None' is grouped with the ND licenses here, presumably because it
# means no license was selected)
ND_licenses = {'CC BY-ND', 'CC BY-NC-ND', 'None'}
forbids_derivatives = count_df['License'].isin(ND_licenses)
count_df[forbids_derivatives].sum(numeric_only=True)


Out[5]:
Count      8059.000000
Percent       0.709794
dtype: float64

In [6]:
# Preprints that forbid commercial use
# Stored under its own name so the no-derivatives set `ND_licenses`
# defined for the previous table is not overwritten.
NC_licenses = {'CC BY-NC', 'CC BY-NC-ND', 'None'}
count_df.query("License in @NC_licenses").sum(numeric_only=True)


Out[6]:
Count      8358.000000
Percent       0.736128
dtype: float64

License distribution over time


In [7]:
# Export the per-preprint table for the license-vs-time figure
# (presumably written as JSON by the project helper; see utilities)
figure_dir = os.path.join('figure', 'license-vs-time')
path = os.path.join(figure_dir, 'vega-lite-data.json')
utilities.df_to_vega_lite(preprint_df, path)

License distribution by subject


In [8]:
path = os.path.join('data', 'subjects.tsv')
# Shorten the longest subject name
subject_abbreviations = {
    'Animal Behavior and Cognition': 'Animal Behavior & Cogn.',
}
subjects = pandas.read_table(path).replace({'Subject': subject_abbreviations})
# merge() without `on` joins on the columns the two tables share
subject_df = preprint_df.merge(subjects)
subject_df.tail(2)


Out[8]:
DOI Date License Subject
10040 10.1101/146712 2017-06-06 CC BY-NC-ND Developmental Biology
10041 10.1101/146779 2017-06-06 None Neuroscience

In [9]:
# Keep only the subjects that have at least 100 preprints
subject_counts = subject_df['Subject'].value_counts()
popular_subjects = [
    subject for subject, n_preprints in subject_counts.items()
    if n_preprints >= 100
]
popular_subject_df = subject_df[subject_df['Subject'].isin(popular_subjects)]
len(popular_subjects)


Out[9]:
19

In [10]:
# Export for vega-lite: only preprints in the popular subjects are written
figure_dir = os.path.join('figure', 'license-vs-subject')
path = os.path.join(figure_dir, 'vega-lite-data.json')
utilities.df_to_vega_lite(popular_subject_df, path)

Licensing by author


In [11]:
path = os.path.join('data', 'authors.tsv')
authors = pandas.read_table(path)
# merge() without `on` joins on the columns the two tables share
author_df = preprint_df.merge(authors)

In [12]:
# Attach the score of each row's license (lookup in `license_scores`)
author_df['score'] = author_df['License'].map(license_scores)

In [13]:
# Show the last two author rows
author_df.tail(n=2)


Out[13]:
DOI Date License Author Standard_Author score
70102 10.1101/146837 2017-06-06 CC BY Roxana E. Georgescu Roxana Georgescu 5
70103 10.1101/146837 2017-06-06 CC BY Ryan Mayle Ryan Mayle 5

In [14]:
def summarize(df):
    """
    Collapse the rows of one group into summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must have a numeric `score` column.

    Returns
    -------
    pandas.Series
        `Preprints` (number of rows in `df`) and `Score` (sum of
        `df.score`; missing scores are skipped by `Series.sum`).
    """
    # Build the Series in one step instead of growing an empty, dtype-less
    # `pandas.Series()`, which pandas 1.x flags as deprecated.
    return pandas.Series({
        'Preprints': len(df),
        'Score': df.score.sum(),
    })

# One summary row per standardized author name
author_score_df = (
    author_df
    .groupby('Standard_Author')
    .apply(summarize)
    .reset_index()
)
# method='min' gives tied authors the same rank; the next rank is skipped
score_ranks = author_score_df['Score'].rank(method='min', ascending=False)
author_score_df['Rank'] = score_ranks.astype(int)

In [15]:
# Authors with the highest total score first; head() shows five rows by default
author_score_df.sort_values('Score', ascending=False).head()


Out[15]:
Standard_Author Preprints Score Rank
28606 Mark Daly 43 105 1
15229 George Smith 28 86 2
5208 Benjamin Neale 34 84 3
30639 Michael Inouye 18 84 3
15766 Graham Coop 17 77 5

In [16]:
# Save the author scores as a tab-separated file, without the index column
path = os.path.join('data', 'author-scores.tsv')
author_score_df.to_csv(path, index=False, sep='\t')

In [17]:
# Export the author scores through the project's datatables helper
# (output format is defined in utilities)
data_dir = 'data'
path = os.path.join(data_dir, 'author-scores.json')
utilities.df_to_datatables(author_score_df, path)