In [1]:
import json
import os
import pandas
import altair
import utilities
In [2]:
# License labels, ordered according to https://creativecommons.org/licenses/
licenses = [
    'CC BY',
    'CC BY-ND',
    'CC BY-NC',
    'CC BY-NC-ND',
    'None',
]

# Hex colors, one per license label (presumably parallel to `licenses` -- confirm
# against the figure specs that consume them).
license_colors = [
    '#F68212',
    '#AC5B0C',
    '#934E0A',
    '#492705',
    '#000000',
]

# Numeric score attached to each license label; used below to score
# preprints and authors.
license_scores = {
    'CC BY': 5,
    'CC BY-ND': 3,
    'CC BY-NC': 3,
    'CC BY-NC-ND': 2,
    'None': 1,
}
In [3]:
# Load the preprint table; License becomes a categorical whose categories
# follow the order of `licenses`.
path = os.path.join('data', 'preprints.tsv')
preprint_df = pandas.read_table(path, parse_dates=['Date'])
# Recent pandas versions list the literal string "None" among the default NA
# tokens of read_csv/read_table, so rows labelled 'None' can come back as NaN
# and would then fall outside every category. Map missing values back to the
# 'None' label before building the categorical.
# NOTE(review): this also labels genuinely blank License cells as 'None' --
# confirm that is the intended reading of a missing license.
preprint_df['License'] = pandas.Categorical(
    preprint_df['License'].fillna('None'),
    categories=licenses,
)
preprint_df.head(2)
Out[3]:
In [4]:
# Table of licensing choices
# GroupBy.size() counts the rows of each license directly instead of calling
# len() on every group through apply().
counts = preprint_df.groupby('License').size()
counts.name = 'Count'
count_df = counts.reset_index()
count_df['Percent'] = count_df['Count'] / count_df['Count'].sum()
# Re-apply the category order of `licenses` to the License column.
count_df['License'] = pandas.Categorical(count_df['License'], licenses)
# Display-only copy: Count and Percent are formatted as strings and a Score
# column is added; count_df itself keeps its numeric columns for later cells.
count_df.assign(
    Count=count_df['Count'].map('{:,}'.format),
    Percent=count_df['Percent'].map('{:.1%}'.format),
    Score=count_df['License'].map(license_scores),
)
Out[4]:
In [5]:
# Preprints that forbid derivatives: the two ND licenses plus the 'None' label.
# Sums the numeric columns of count_df over those rows.
ND_licenses = {'CC BY-ND', 'CC BY-NC-ND', 'None'}
(
    count_df
    .query("License in @ND_licenses")
    .sum(numeric_only=True)
)
Out[5]:
In [6]:
# Preprints that forbid commercial use: the two NC licenses plus the 'None' label.
# Named NC_licenses so this set no longer overwrites the no-derivatives set
# (ND_licenses) defined for the previous table.
NC_licenses = {'CC BY-NC', 'CC BY-NC-ND', 'None'}
# Sum the numeric columns of count_df over the matching rows.
count_df.query("License in @NC_licenses").sum(numeric_only=True)
Out[6]:
In [7]:
# Export preprint_df for the license-vs-time vega-lite figure
path = os.path.join(
    'figure',
    'license-vs-time',
    'vega-lite-data.json',
)
utilities.df_to_vega_lite(preprint_df, path)
In [8]:
# Join the subjects table onto the preprints. The merge uses pandas' default
# join keys, and one long subject name is replaced by a shorter label.
path = os.path.join('data', 'subjects.tsv')
subject_df = preprint_df.merge(
    pandas.read_table(path).replace(
        to_replace={'Subject': {'Animal Behavior and Cognition': 'Animal Behavior & Cogn.'}},
    )
)
subject_df.tail(2)
Out[8]:
In [9]:
# Subset subject_df for subjects with 100+ preprints
subject_counts = subject_df['Subject'].value_counts()
popular_subjects = subject_counts.loc[subject_counts >= 100].index.tolist()
popular_subject_df = subject_df.query("Subject in @popular_subjects")
# Show how many subjects pass the threshold
len(popular_subjects)
Out[9]:
In [10]:
# Export the popular-subject rows for the license-vs-subject vega-lite figure
path = os.path.join(
    'figure',
    'license-vs-subject',
    'vega-lite-data.json',
)
utilities.df_to_vega_lite(popular_subject_df, path)
In [11]:
# Join the authors table onto the preprints (pandas' default join keys)
path = os.path.join('data', 'authors.tsv')
author_df = preprint_df.merge(pandas.read_table(path))
In [12]:
# Look up each row's license in license_scores and store it as `score`
author_df['score'] = author_df['License'].map(license_scores)
In [13]:
# Peek at the last two rows of the author table
author_df.tail(n=2)
Out[13]:
In [14]:
def summarize(df):
    """
    Summarize one group of rows as a two-entry Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single group; must expose a `score` column.

    Returns
    -------
    pandas.Series
        `Preprints` (the number of rows in `df`) and `Score` (the sum of
        `df.score`), in that order.
    """
    # Build the Series from a dict instead of growing an empty
    # `pandas.Series()`: the default dtype of an empty Series differs between
    # pandas versions (and constructing one without a dtype warns on pandas
    # 1.x), whereas here the dtype is inferred from the values themselves.
    return pandas.Series({
        'Preprints': len(df),
        # Builtin sum: a NaN score propagates to the total instead of being skipped.
        'Score': sum(df.score),
    })
# One row per Standard_Author, with the Preprints and Score computed by summarize
author_score_df = (
    author_df
    .groupby('Standard_Author')
    .apply(summarize)
    .reset_index()
)
# Rank 1 is the highest Score; tied authors share the lowest rank of their group
author_score_df['Rank'] = (
    author_score_df['Score']
    .rank(ascending=False, method='min')
    .astype(int)
)
In [15]:
# Five highest-scoring authors
author_score_df.sort_values(by='Score', ascending=False).head(n=5)
Out[15]:
In [16]:
# Write the author scores as a tab-separated file without the index column
path = os.path.join('data', 'author-scores.tsv')
author_score_df.to_csv(
    path,
    index=False,
    sep='\t',
)
In [17]:
# Export the author scores to JSON through utilities.df_to_datatables
path = os.path.join(
    'data',
    'author-scores.json',
)
utilities.df_to_datatables(author_score_df, path)