Download the bioRxiv preprint table from the PrePubMed repository



In [1]:

    
import os
import json
import logging

import pandas
import requests

import utilities



In [2]:

    
# Configure logging to write INFO-and-above records to a file, truncating it on every run.
# The log directory is created first: logging.basicConfig opens the file immediately and
# raises FileNotFoundError when the parent directory does not exist (e.g. fresh checkout).
log_dir = 'logs'
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    # NOTE(review): the file name is spelled 'donwload' — kept as-is so anything that
    # already references this log file keeps working.
    filename=os.path.join(log_dir, 'donwload.log'),
    filemode='w',
)

Get `OmnesRes/prepub` version



In [3]:

    
# Look up the git ref for the master branch of OmnesRes/prepub so that the upstream
# commit is recorded in the notebook output.
url = 'https://api.github.com/repos/OmnesRes/prepub/git/refs/heads/master'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Surface HTTP errors (e.g. API rate limiting) here rather than as a KeyError below.
response.raise_for_status()
response = response.json()
response['object']









    Out[3]:





{'sha': '6e5c180b3cf81cff9c58653aebd6b35556470260',
 'type': 'commit',
 'url': 'https://api.github.com/repos/OmnesRes/prepub/git/commits/6e5c180b3cf81cff9c58653aebd6b35556470260'}

Load bioRxiv data



In [4]:

    
# Read the tab-separated licenses table directly from the master branch of OmnesRes/prepub.
url = 'https://github.com/OmnesRes/prepub/raw/master/biorxiv/biorxiv_licenses.tsv'
biorxiv_df = pandas.read_csv(url, sep='\t')



In [5]:

    
# Limit to preprints dated on or before 2017-06-07.
# .copy() makes the filtered result an independent frame, so the column assignments
# performed on biorxiv_df later on are not flagged with SettingWithCopyWarning.
biorxiv_df = biorxiv_df.query("Date <= '2017-06-07'").copy()



In [6]:

    
# Preview the first two rows of the table.
biorxiv_df.head(2)









    Out[6]:






  
    
      
      DOI
      Date
      Subjects
      License
      Title
      Authors
      Affiliations
    
  
  
    
      0
      http://dx.doi.org/10.1101/049031
      2016-04-16
      Microbiology
      CC BY-NC
      Alternative Growth Behavior of Mycobacterium A...
      Peilin Zhang|Lawrence M Minardi|J. Todd Kuenst...
      PZM Diagnostics, LLC
    
    
      1
      http://dx.doi.org/10.1101/049049
      2016-04-16
      Genomics
      CC BY-NC-ND
      Lateral genetic transfers between eukaryotes a...
      Sarah R Bordenstein|Seth R Bordenstein
      Vanderbilt University

Processing



In [7]:

    
# Normalize the DOI and License columns.
# assign() returns a new frame instead of setting attributes on a frame that was
# produced by filtering, which avoids pandas' SettingWithCopyWarning.
biorxiv_df = biorxiv_df.assign(
    # Keep only the bare DOI (e.g. 10.1101/049031), dropping the URL prefix.
    DOI=biorxiv_df.DOI.str.extract(r'(10\.[0-9]+/[0-9]+)', expand=False),
    # Spell licenses as "CC BY..." and label missing licenses with the string 'None'.
    License=biorxiv_df.License.str.replace('CC-BY', 'CC BY').fillna('None'),
)

Authors



In [8]:

    
# Build the author table with columns DOI and Author, sorted and de-duplicated.
# NOTE(review): assumes utilities.tidy_split expands the delimited Authors column
# into one row per author — confirm in utilities.
author_df = utilities.tidy_split(biorxiv_df, column='Authors')
author_df = author_df.rename(columns={'Authors': 'Author'})
author_df = author_df[['DOI', 'Author']]
author_df = author_df.sort_values(['DOI', 'Author'])
author_df = author_df.drop_duplicates()
author_df = author_df.reset_index(drop=True)



In [9]:

    
# Add a Standard_Author column by passing every Author value through
# utilities.get_standard_author (the normalization rules live in that helper).
author_df['Standard_Author'] = author_df['Author'].map(utilities.get_standard_author)



In [10]:

    
# Show the last two rows of the author table.
author_df.tail(2)









    Out[10]:






  
    
      
      DOI
      Author
      Standard_Author
    
  
  
    
      70102
      10.1101/146837
      Roxana E. Georgescu
      Roxana Georgescu
    
    
      70103
      10.1101/146837
      Ryan Mayle
      Ryan Mayle



In [11]:

    
# The two standardized author names that appear on the most rows (preprints)
author_df['Standard_Author'].value_counts().head(2)









    Out[11]:





Mark Daly         43
Benjamin Neale    34
Name: Standard_Author, dtype: int64

Subjects



In [12]:

    
# Build a table with one row per preprint-subject pair (columns DOI and Subject).
# NOTE(review): assumes utilities.tidy_split expands the delimited Subjects column
# into one row per subject — confirm in utilities.
subject_df = utilities.tidy_split(biorxiv_df, column='Subjects')
subject_df = subject_df.rename(columns={'Subjects': 'Subject'})
subject_df = subject_df[['DOI', 'Subject']]
subject_df = subject_df.sort_values(['DOI', 'Subject'])
subject_df = subject_df.reset_index(drop=True)
subject_df.tail(2)









    Out[12]:






  
    
      
      DOI
      Subject
    
  
  
    
      10040
      10.1101/146712
      Developmental Biology
    
    
      10041
      10.1101/146779
      Neuroscience



In [13]:

    
# Distribution of subject counts: the inner value_counts gives rows per DOI,
# the outer one tallies how many DOIs have each of those counts.
subject_df['DOI'].value_counts().value_counts()









    Out[13]:





1    9985
2      22
5       2
3       1
Name: DOI, dtype: int64



In [14]:

    
# How many rows (preprints) fall under each subject, most common first
subject_df['Subject'].value_counts()









    Out[14]:





Bioinformatics                            1486
Neuroscience                              1346
Evolutionary Biology                      1251
Genomics                                  1099
Genetics                                   799
Microbiology                               522
Ecology                                    483
Systems Biology                            387
Biophysics                                 346
Cell Biology                               317
Developmental Biology                      243
Plant Biology                              235
Cancer Biology                             232
Molecular Biology                          195
Biochemistry                               188
Epidemiology                               168
Animal Behavior and Cognition              146
Synthetic Biology                          128
Immunology                                 110
Bioengineering                              99
Physiology                                  57
Zoology                                     50
Scientific Communication and Education      44
Pharmacology and Toxicology                 28
Scientific Communication                    25
Pathology                                   25
Pharmacology                                18
Paleontology                                 9
Clinical Trials                              6
Name: Subject, dtype: int64

Preprints



In [15]:

    
# Preprint-level table: keep DOI, Date and License, ordered by DOI with a fresh index.
preprint_df = biorxiv_df.loc[:, ['DOI', 'Date', 'License']]
preprint_df = preprint_df.sort_values('DOI').reset_index(drop=True)
preprint_df.tail(4)









    Out[15]:






  
    
      
      DOI
      Date
      License
    
  
  
    
      11350
      10.1101/146779
      2017-06-06
      None
    
    
      11351
      10.1101/146787
      2017-06-06
      CC BY
    
    
      11352
      10.1101/146811
      2017-06-06
      CC BY-ND
    
    
      11353
      10.1101/146837
      2017-06-06
      CC BY



In [16]:

    
# Number of rows in the preprint table.
len(preprint_df)









    Out[16]:





11354



In [17]:

    
# Share of preprints under each license (normalize=True yields proportions, not counts)
preprint_df['License'].value_counts(normalize=True).reset_index()









    Out[17]:






  
    
      
      index
      License
    
  
  
    
      0
      CC BY-NC-ND
      0.349304
    
    
      1
      None
      0.292848
    
    
      2
      CC BY
      0.196230
    
    
      3
      CC BY-NC
      0.093976
    
    
      4
      CC BY-ND
      0.067641



In [18]:

    
# Preprints per year: the year is the text before the first '-' in each Date string
preprint_df['Date'].str.split('-').str[0].value_counts()









    Out[18]:





2016    4901
2017    3864
2015    1706
2014     806
2013      77
Name: Date, dtype: int64

Save as TSVs



In [19]:

    
# Write each table to a tab-separated file inside data/, leaving out the index column.
for filename, frame in [
    ('preprints.tsv', preprint_df),
    ('subjects.tsv', subject_df),
    ('authors.tsv', author_df),
]:
    path = os.path.join('data', filename)
    frame.to_csv(path, sep='\t', index=False)

	DOI	Date	Subjects	License	Title	Authors	Affiliations
0	http://dx.doi.org/10.1101/049031	2016-04-16	Microbiology	CC BY-NC	Alternative Growth Behavior of Mycobacterium A...	Peilin Zhang\|Lawrence M Minardi\|J. Todd Kuenst...	PZM Diagnostics, LLC
1	http://dx.doi.org/10.1101/049049	2016-04-16	Genomics	CC BY-NC-ND	Lateral genetic transfers between eukaryotes a...	Sarah R Bordenstein\|Seth R Bordenstein	Vanderbilt University

	DOI	Author	Standard_Author
70102	10.1101/146837	Roxana E. Georgescu	Roxana Georgescu
70103	10.1101/146837	Ryan Mayle	Ryan Mayle

	DOI	Subject
10040	10.1101/146712	Developmental Biology
10041	10.1101/146779	Neuroscience

	DOI	Date	License
11350	10.1101/146779	2017-06-06	None
11351	10.1101/146787	2017-06-06	CC BY
11352	10.1101/146811	2017-06-06	CC BY-ND
11353	10.1101/146837	2017-06-06	CC BY

	index	License
0	CC BY-NC-ND	0.349304
1	None	0.292848
2	CC BY	0.196230
3	CC BY-NC	0.093976
4	CC BY-ND	0.067641

Download the bioRxiv preprint table from the PrePubMed repository

Get OmnesRes/prepub version

Load bioRxiv data

Processing

Authors

Subjects

Preprints

Save as TSVs

Get `OmnesRes/prepub` version