In [1]:
import os
import json
import logging
import pandas
import requests
import utilities
In [2]:
# Configure logging to write INFO-and-above records to a file; filemode='w'
# truncates the log at the start of every run.
# The log directory is created first because the file handler does not create
# missing parent directories.
# NOTE(review): 'donwload' looks like a typo for 'download'; the name is kept
# as-is so the log path does not change.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=os.path.join('logs', 'donwload.log'), filemode='w')
In [3]:
# Look up the git ref for the master branch of OmnesRes/prepub via the GitHub
# API and display its 'object' entry (presumably to record which upstream
# commit the data download corresponds to -- confirm).
url = 'https://api.github.com/repos/OmnesRes/prepub/git/refs/heads/master'
# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors (e.g. rate limiting) instead of hitting a
# KeyError on 'object' below.
response.raise_for_status()
# From here on `response` is the decoded JSON payload, as in the original cell.
response = response.json()
response['object']
Out[3]:
In [4]:
# Download the bioRxiv license table (tab-separated) directly from GitHub.
# NOTE(review): the URL points at the master branch, so the data retrieved
# depends on when this cell is run.
url = 'https://github.com/OmnesRes/prepub/raw/master/biorxiv/biorxiv_licenses.tsv'
biorxiv_df = pandas.read_csv(url, sep='\t')
In [5]:
# Limit to preprints dated on or before 2017-06-07.
# The comparison is done on the Date strings, so it is chronological only if
# dates are zero-padded YYYY-MM-DD -- verify against the source table.
# .copy() detaches the filtered frame from the original so the column
# assignments later in the notebook do not raise SettingWithCopyWarning.
biorxiv_df = biorxiv_df.query("Date <= '2017-06-07'").copy()
In [6]:
# Preview the first two rows of the date-filtered table.
biorxiv_df.head(2)
Out[6]:
In [7]:
# Keep only the bare DOI (the "10.<digits>/<digits>" part) from each DOI value,
# discarding any surrounding URL text; values without a match become NaN.
biorxiv_df['DOI'] = biorxiv_df['DOI'].str.extract(r'(10\.[0-9]+/[0-9]+)', expand=False)
# Normalize license labels to the spaced form ("CC BY ...") and give
# preprints with no license value the explicit label 'None'.
biorxiv_df['License'] = (
    biorxiv_df['License']
    .str.replace('CC-BY', 'CC BY')
    .fillna('None')
)
In [8]:
# Build a table of unique (DOI, Author) pairs, sorted by DOI then Author.
# utilities.tidy_split presumably expands the delimited Authors column into
# one row per author -- confirm in utilities.
author_df = (
    utilities.tidy_split(biorxiv_df, column='Authors')
    .rename(columns={'Authors': 'Author'})
    .loc[:, ['DOI', 'Author']]
    .sort_values(by=['DOI', 'Author'])
    .drop_duplicates()
    .reset_index(drop=True)
)
In [9]:
# Add a Standard_Author column by passing every raw author name through
# utilities.get_standard_author.
author_df['Standard_Author'] = author_df['Author'].map(utilities.get_standard_author)
In [10]:
# Show the last two author rows, including the Standard_Author column.
author_df.tail(2)
Out[10]:
In [11]:
# Authors with the most preprints: count rows per standardized author name
# and show the two most frequent.
# NOTE(review): rows were de-duplicated on (DOI, Author) before names were
# standardized, so two spellings of one author on the same preprint would be
# counted twice here -- confirm whether that can occur.
author_df.Standard_Author.value_counts().head(2)
Out[11]:
In [12]:
# Build a table with one row per preprint-subject pair, sorted by DOI then
# Subject (it is written to a TSV file at the end of the notebook).
# utilities.tidy_split presumably expands the delimited Subjects column into
# one row per subject -- confirm in utilities.
# drop_duplicates mirrors the author table and guarantees the
# one-row-per-pair invariant even if a subject is listed twice for a preprint.
subject_df = (biorxiv_df
.pipe(utilities.tidy_split, column='Subjects')
.rename(columns={'Subjects': 'Subject'})
[['DOI', 'Subject']]
.sort_values(['DOI', 'Subject'])
.drop_duplicates()
.reset_index(drop=True)
)
subject_df.tail(2)
Out[12]:
In [13]:
# Distribution of the number of subjects per preprint: the inner value_counts
# gives the number of subject rows for each DOI, and the outer one counts how
# many preprints have each such number.
subject_df.DOI.value_counts().value_counts()
Out[13]:
In [14]:
# Number of preprints by subject (rows per Subject value).
subject_df.Subject.value_counts()
Out[14]:
In [15]:
# Per-preprint table: keep only the DOI, Date and License columns, order the
# rows by DOI and renumber the index from zero.
preprint_df = (
    biorxiv_df
    .loc[:, ['DOI', 'Date', 'License']]
    .sort_values(by='DOI')
    .reset_index(drop=True)
)
# Show the last four rows as a spot check.
preprint_df.tail(4)
Out[15]:
In [16]:
# Number of rows in the per-preprint table.
len(preprint_df)
Out[16]:
In [17]:
# Preprints by license, as a share of all preprints: normalize=True yields
# fractions rather than raw counts, and reset_index turns the result into a
# two-column table for display.
preprint_df.License.value_counts(normalize=True).reset_index()
Out[17]:
In [18]:
# Preprints by year: the year is the first '-'-separated field of Date.
# The vectorised .str accessor replaces a per-row Python lambda and passes
# missing values through as NaN instead of raising AttributeError.
preprint_df.Date.str.split('-').str[0].value_counts()
Out[18]:
In [19]:
# Write the three tables as tab-separated files (no index column) under data/.
# The directory is created first so the notebook runs top to bottom on a
# fresh checkout; to_csv does not create missing directories.
os.makedirs('data', exist_ok=True)
for filename, frame in [
    ('preprints.tsv', preprint_df),
    ('subjects.tsv', subject_df),
    ('authors.tsv', author_df),
]:
    path = os.path.join('data', filename)
    frame.to_csv(path, sep='\t', index=False)