Download the bioRxiv preprint table from the PrePubMed repository


In [1]:
import os
import json
import logging

import pandas
import requests

import utilities

In [2]:
# Configure logging to write to a file that is overwritten on every run (filemode='w').
# NOTE(review): 'donwload' looks like a typo for 'download'; the file name is kept
# unchanged in case other tooling refers to this path.
log_path = os.path.join('logs', 'donwload.log')
# logging does not create missing directories, so make sure the log directory exists.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_path, filemode='w')

Get OmnesRes/prepub version


In [3]:
# Look up the commit that the master branch of OmnesRes/prepub currently points to,
# so the repository version is recorded in the notebook output.
url = 'https://api.github.com/repos/OmnesRes/prepub/git/refs/heads/master'
# A timeout keeps the notebook from hanging forever if the API is unreachable.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors (e.g. rate limiting) rather than with a KeyError below.
response.raise_for_status()
response = response.json()
response['object']


Out[3]:
{'sha': '6e5c180b3cf81cff9c58653aebd6b35556470260',
 'type': 'commit',
 'url': 'https://api.github.com/repos/OmnesRes/prepub/git/commits/6e5c180b3cf81cff9c58653aebd6b35556470260'}

Load bioRxiv data


In [4]:
# Read the tab-separated biorxiv_licenses table directly from the master branch on GitHub
url = 'https://github.com/OmnesRes/prepub/raw/master/biorxiv/biorxiv_licenses.tsv'
biorxiv_df = pandas.read_csv(url, sep='\t')

In [5]:
# Limit to preprints dated on or before 2017-06-07.
# The comparison is done on strings — presumably ISO formatted (YYYY-MM-DD)
# dates, for which lexicographic order matches chronological order; TODO confirm.
# .copy() yields an independent frame so that later column assignments on
# biorxiv_df do not trigger pandas' SettingWithCopyWarning.
biorxiv_df = biorxiv_df.query("Date <= '2017-06-07'").copy()

In [6]:
# Preview the first two rows of the filtered table
biorxiv_df.iloc[:2]


Out[6]:
DOI Date Subjects License Title Authors Affiliations
0 http://dx.doi.org/10.1101/049031 2016-04-16 Microbiology CC BY-NC Alternative Growth Behavior of Mycobacterium A... Peilin Zhang|Lawrence M Minardi|J. Todd Kuenst... PZM Diagnostics, LLC
1 http://dx.doi.org/10.1101/049049 2016-04-16 Genomics CC BY-NC-ND Lateral genetic transfers between eukaryotes a... Sarah R Bordenstein|Seth R Bordenstein Vanderbilt University

Processing


In [7]:
# Reduce DOI URLs to the bare DOI (e.g. 'http://dx.doi.org/10.1101/049031' -> '10.1101/049031')
# and normalize the license labels.
# .assign builds a new frame instead of setting columns on a filtered frame,
# which avoids SettingWithCopyWarning and keeps this cell safe to re-run.
biorxiv_df = biorxiv_df.assign(
    DOI=biorxiv_df['DOI'].str.extract(r'(10\.[0-9]+/[0-9]+)', expand=False),
    License=(
        biorxiv_df['License']
        # Spell the license prefix with a space: 'CC-BY...' -> 'CC BY...'
        .str.replace('CC-BY', 'CC BY')
        # Missing licenses become the string 'None' so they show up as their own category
        .fillna('None')
    ),
)

Authors


In [8]:
# Build a table of unique (DOI, Author) pairs, sorted by DOI then author.
# utilities.tidy_split presumably expands the 'Authors' column into one row
# per author — confirm against the utilities module.
author_split_df = utilities.tidy_split(biorxiv_df, column='Authors')
author_split_df = author_split_df.rename(columns={'Authors': 'Author'})
author_df = (
    author_split_df
    .loc[:, ['DOI', 'Author']]
    .sort_values(['DOI', 'Author'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [9]:
# Add a standardized version of every author name as a new column
standard_names = author_df['Author'].map(utilities.get_standard_author)
author_df['Standard_Author'] = standard_names

In [10]:
# Preview the last two rows of the author table
author_df.iloc[-2:]


Out[10]:
DOI Author Standard_Author
70102 10.1101/146837 Roxana E. Georgescu Roxana Georgescu
70103 10.1101/146837 Ryan Mayle Ryan Mayle

In [11]:
# The two standardized author names that appear on the most preprints
preprints_per_author = author_df['Standard_Author'].value_counts()
preprints_per_author.head(2)


Out[11]:
Mark Daly         43
Benjamin Neale    34
Name: Standard_Author, dtype: int64

Subjects


In [12]:
# Build a table with a row per preprint-subject pair, sorted by DOI then subject.
# utilities.tidy_split presumably expands the 'Subjects' column into one row
# per subject — confirm against the utilities module.
subject_split_df = utilities.tidy_split(biorxiv_df, column='Subjects')
subject_df = (
    subject_split_df
    .rename(columns={'Subjects': 'Subject'})
    .loc[:, ['DOI', 'Subject']]
    .sort_values(['DOI', 'Subject'])
    .reset_index(drop=True)
)
subject_df.iloc[-2:]


Out[12]:
DOI Subject
10040 10.1101/146712 Developmental Biology
10041 10.1101/146779 Neuroscience

In [13]:
# Distribution of the number of subject rows per DOI
# (index: subjects per preprint, value: how many DOIs have that many)
subjects_per_preprint = subject_df['DOI'].value_counts()
subjects_per_preprint.value_counts()


Out[13]:
1    9985
2      22
5       2
3       1
Name: DOI, dtype: int64

In [14]:
# Count of preprint-subject rows for each subject, most frequent first
subject_df['Subject'].value_counts()


Out[14]:
Bioinformatics                            1486
Neuroscience                              1346
Evolutionary Biology                      1251
Genomics                                  1099
Genetics                                   799
Microbiology                               522
Ecology                                    483
Systems Biology                            387
Biophysics                                 346
Cell Biology                               317
Developmental Biology                      243
Plant Biology                              235
Cancer Biology                             232
Molecular Biology                          195
Biochemistry                               188
Epidemiology                               168
Animal Behavior and Cognition              146
Synthetic Biology                          128
Immunology                                 110
Bioengineering                              99
Physiology                                  57
Zoology                                     50
Scientific Communication and Education      44
Pharmacology and Toxicology                 28
Scientific Communication                    25
Pathology                                   25
Pharmacology                                18
Paleontology                                 9
Clinical Trials                              6
Name: Subject, dtype: int64

Preprints


In [15]:
# Preprint-level table: DOI, date and license for every row of biorxiv_df, ordered by DOI
preprint_columns = ['DOI', 'Date', 'License']
preprint_df = (
    biorxiv_df
    .loc[:, preprint_columns]
    .sort_values(by='DOI')
    .reset_index(drop=True)
)
preprint_df.iloc[-4:]


Out[15]:
DOI Date License
11350 10.1101/146779 2017-06-06 None
11351 10.1101/146787 2017-06-06 CC BY
11352 10.1101/146811 2017-06-06 CC BY-ND
11353 10.1101/146837 2017-06-06 CC BY

In [16]:
# Total number of rows in the preprint table
preprint_df.shape[0]


Out[16]:
11354

In [17]:
# Fraction of preprints under each license, largest share first
license_fractions = preprint_df['License'].value_counts(normalize=True)
license_fractions.reset_index()


Out[17]:
index License
0 CC BY-NC-ND 0.349304
1 None 0.292848
2 CC BY 0.196230
3 CC BY-NC 0.093976
4 CC BY-ND 0.067641

In [18]:
# Number of preprints per year; the year is the text before the first '-' in Date
preprint_years = preprint_df['Date'].map(lambda iso_date: iso_date.split('-')[0])
preprint_years.value_counts()


Out[18]:
2016    4901
2017    3864
2015    1706
2014     806
2013      77
Name: Date, dtype: int64

Save as TSVs


In [19]:
# Write each table to a tab-separated file in the data directory, omitting the index column
for filename, frame in [
    ('preprints.tsv', preprint_df),
    ('subjects.tsv', subject_df),
    ('authors.tsv', author_df),
]:
    path = os.path.join('data', filename)
    frame.to_csv(path, sep='\t', index=False)