Analyse license choices for bioRxiv preprints



In [1]:

    
import json
import os

import pandas
import altair

import utilities

Read data



In [2]:

    
# License labels, listed in the order used on https://creativecommons.org/licenses/
# ('None' is a literal label in the data, not Python's None).
licenses = [
    'CC BY',
    'CC BY-ND',
    'CC BY-NC',
    'CC BY-NC-ND',
    'None',
]

# One hex color per entry of `licenses`, in the same order.
license_colors = [
    '#F68212',
    '#AC5B0C',
    '#934E0A',
    '#492705',
    '#000000',
]

# Numeric score per license, aligned positionally with `licenses`.
license_scores = dict(zip(licenses, (5, 3, 3, 2, 1)))



In [3]:

    
path = os.path.join('data', 'preprints.tsv')
# 'None' is a genuine license label in this dataset. Recent pandas versions list the
# string "None" among the default NA markers, which would turn those rows into NaN
# and silently drop them from the categorical (and from every count) below.
# Disable the default markers and treat only empty fields as missing.
preprint_df = pandas.read_table(
    path,
    parse_dates=['Date'],
    keep_default_na=False,
    na_values=[''],
)
# Categorical with categories in the order of `licenses`, so grouped tables follow
# that order; any value not listed in `licenses` becomes NaN.
preprint_df['License'] = pandas.Categorical(preprint_df['License'], categories=licenses)
preprint_df.head(2)









    Out[3]:






  
    
      
      DOI
      Date
      License
    
  
  
    
      0
      10.1101/000026
      2014-09-08
      CC BY
    
    
      1
      10.1101/000042
      2013-12-01
      CC BY



In [4]:

    
# Table of licensing choices
# Number of preprints per license. `.size()` is the built-in group count and avoids
# calling `len` on every group through `.apply`.
counts = preprint_df.groupby('License').size()
counts.name = 'Count'
count_df = counts.reset_index()
# Share of all preprints held by each license (fraction in [0, 1]).
count_df['Percent'] = count_df['Count'] / count_df['Count'].sum()
# Re-apply the category order from `licenses` after the index was reset.
count_df['License'] = pandas.Categorical(count_df['License'], licenses)
# Display-only formatting: `assign` returns a new frame, so `count_df` keeps its
# numeric Count/Percent columns for the summaries in later cells.
count_df.assign(
    Count=count_df['Count'].map('{:,}'.format),
    Percent=count_df['Percent'].map('{:.1%}'.format),
    Score=count_df['License'].map(license_scores),
)









    Out[4]:






  
    
      
      License
      Count
      Percent
      Score
    
  
  
    
      0
      CC BY
      2,228
      19.6%
      5
    
    
      1
      CC BY-ND
      768
      6.8%
      3
    
    
      2
      CC BY-NC
      1,067
      9.4%
      3
    
    
      3
      CC BY-NC-ND
      3,966
      34.9%
      2
    
    
      4
      None
      3,325
      29.3%
      1



In [5]:

    
# Preprints that forbid derivatives
# 'None' is counted together with the no-derivatives (ND) licenses.
ND_licenses = {'CC BY-ND', 'CC BY-NC-ND', 'None'}
# Total count and total share over the selected license rows.
count_df.loc[count_df['License'].isin(ND_licenses)].sum(numeric_only=True)









    Out[5]:





Count      8059.000000
Percent       0.709794
dtype: float64



In [6]:

    
# Preprints that forbid commercial use
# 'None' is counted together with the non-commercial (NC) licenses.
# Stored under its own name so the no-derivatives set `ND_licenses` is not overwritten.
NC_licenses = {'CC BY-NC', 'CC BY-NC-ND', 'None'}
# Total count and total share over the selected license rows.
count_df.query("License in @NC_licenses").sum(numeric_only=True)









    Out[6]:





Count      8358.000000
Percent       0.736128
dtype: float64

License distribution over time



In [7]:

    
# Hand the full preprint table to the project helper, which is given the
# destination path for the license-vs-time figure data.
time_figure_dir = os.path.join('figure', 'license-vs-time')
path = os.path.join(time_figure_dir, 'vega-lite-data.json')
utilities.df_to_vega_lite(preprint_df, path)

License distribution by subject



In [8]:

    
path = os.path.join('data', 'subjects.tsv')
# Shorten the one long subject name in the Subject column.
subject_renames = {'Animal Behavior and Cognition': 'Animal Behavior & Cogn.'}
subject_table = pandas.read_table(path).replace({'Subject': subject_renames})
# Default merge: inner join on the columns the two tables share
# (presumably DOI — confirm against subjects.tsv).
subject_df = preprint_df.merge(subject_table)
subject_df.tail(2)









    Out[8]:






  
    
      
      DOI
      Date
      License
      Subject
    
  
  
    
      10040
      10.1101/146712
      2017-06-06
      CC BY-NC-ND
      Developmental Biology
    
    
      10041
      10.1101/146779
      2017-06-06
      None
      Neuroscience



In [9]:

    
# Subset subject_df for subjects with 100+ preprints
subject_counts = subject_df['Subject'].value_counts()
# Subject names whose row count reaches the threshold, most frequent first.
popular_subjects = [
    subject for subject, n_preprints in subject_counts.items()
    if n_preprints >= 100
]
popular_subject_df = subject_df[subject_df['Subject'].isin(popular_subjects)]
len(popular_subjects)









    Out[9]:





19



In [10]:

    
# Export for vega-lite
# Only rows for the popular subjects are passed to the project helper.
subject_figure_dir = os.path.join('figure', 'license-vs-subject')
path = os.path.join(subject_figure_dir, 'vega-lite-data.json')
utilities.df_to_vega_lite(popular_subject_df, path)

Licensing by author



In [11]:

    
path = os.path.join('data', 'authors.tsv')
author_table = pandas.read_table(path)
# Default merge: inner join on the columns shared by both tables, giving one row
# per preprint-author pair.
author_df = preprint_df.merge(author_table)



In [12]:

    
# Look up each row's license in `license_scores` and store the result as `score`.
license_score = author_df['License'].map(license_scores)
author_df['score'] = license_score



In [13]:

    
# Peek at the last two preprint-author rows, including the new `score` column.
author_df.tail(n=2)









    Out[13]:






  
    
      
      DOI
      Date
      License
      Author
      Standard_Author
      score
    
  
  
    
      70102
      10.1101/146837
      2017-06-06
      CC BY
      Roxana E. Georgescu
      Roxana Georgescu
      5
    
    
      70103
      10.1101/146837
      2017-06-06
      CC BY
      Ryan Mayle
      Ryan Mayle
      5



In [14]:

    
def summarize(df):
    """
    Summarize one group of author rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single group; must contain a `score` column.

    Returns
    -------
    pandas.Series
        Two entries: `Preprints` (number of rows in `df`) and `Score`
        (sum of the `score` column).
    """
    # Build the Series in one constructor call. Starting from a bare
    # `pandas.Series()` relies on a deprecated, version-dependent default dtype
    # and can leave the result columns as object dtype after groupby-apply.
    return pandas.Series({
        'Preprints': len(df),
        # skipna=False mirrors builtin sum(): a missing score yields NaN rather
        # than being silently ignored.
        'Score': df['score'].sum(skipna=False),
    })

# One summary row per standardized author name.
grouped_authors = author_df.groupby('Standard_Author')
author_score_df = grouped_authors.apply(summarize).reset_index()
# Highest score gets rank 1; tied scores share the smallest rank of their group.
score_rank = author_score_df['Score'].rank(method='min', ascending=False)
author_score_df['Rank'] = score_rank.astype(int)



In [15]:

    
# Show the five authors with the highest total score.
authors_by_score = author_score_df.sort_values(by='Score', ascending=False)
authors_by_score.head(5)









    Out[15]:






  
    
      
      Standard_Author
      Preprints
      Score
      Rank
    
  
  
    
      28606
      Mark Daly
      43
      105
      1
    
    
      15229
      George Smith
      28
      86
      2
    
    
      5208
      Benjamin Neale
      34
      84
      3
    
    
      30639
      Michael Inouye
      18
      84
      3
    
    
      15766
      Graham Coop
      17
      77
      5



In [16]:

    
# Save the author scores as a tab-separated file, without the row index.
scores_filename = 'author-scores.tsv'
path = os.path.join('data', scores_filename)
author_score_df.to_csv(path, index=False, sep='\t')



In [17]:

    
# Pass the author scores and a JSON destination path to the project helper
# (presumably serializes the table for a DataTables view — confirm in `utilities`).
scores_json_name = 'author-scores.json'
path = os.path.join('data', scores_json_name)
utilities.df_to_datatables(author_score_df, path)

	DOI	Date	License
0	10.1101/000026	2014-09-08	CC BY
1	10.1101/000042	2013-12-01	CC BY

	License	Count	Percent	Score
0	CC BY	2,228	19.6%	5
1	CC BY-ND	768	6.8%	3
2	CC BY-NC	1,067	9.4%	3
3	CC BY-NC-ND	3,966	34.9%	2
4	None	3,325	29.3%	1

	DOI	Date	License	Subject
10040	10.1101/146712	2017-06-06	CC BY-NC-ND	Developmental Biology
10041	10.1101/146779	2017-06-06	None	Neuroscience

	DOI	Date	License	Author	Standard_Author	score
70102	10.1101/146837	2017-06-06	CC BY	Roxana E. Georgescu	Roxana Georgescu	5
70103	10.1101/146837	2017-06-06	CC BY	Ryan Mayle	Ryan Mayle	5

	Standard_Author	Preprints	Score	Rank
28606	Mark Daly	43	105	1
15229	George Smith	28	86	2
5208	Benjamin Neale	34	84	3
30639	Michael Inouye	18	84	3
15766	Graham Coop	17	77	5