In [1]:
from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
The desired data structure for article information is the following JSON object:
<doi1>: {
    author: [ ... ],
    title: <title>,
    journal: <journal>,
    publication_date: <yyyy>,
    subject: [ <full '/'-separated subject strings>, ... ],
    subj_top: [ <set of the top-level term of each subject> ],
    subj_leaf: [ <set of the last (leaf) term of each subject> ]
},
<doi2>: { ... },
...
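For concreteness, a single (entirely hypothetical) entry could look like this, with made-up values:
"10.1371/journal.pone.0000000": {
    "author": ["A. Author", "B. Author"],
    "title": "An example title",
    "journal": "PLOS ONE",
    "publication_date": "2013",
    "subject": ["/Biology and life sciences/Genetics/Genomics"],
    "subj_top": ["Biology and life sciences"],
    "subj_leaf": ["Genomics"]
}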
In [2]:
df = pd.read_pickle('../data/all_plos_df.pkl')
df.head()
Out[2]:
In [3]:
# Drop unused data.
df.drop(['author', 'title_display', 'journal', 'abstract', 'score'], axis=1, inplace=True)
df.set_index('id', inplace=True)
# We just want the year.
df.publication_date = df.publication_date.str[:4]
df.head()
Out[3]:
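The [:4] slice assumes each raw publication_date string begins with the four-digit year (e.g. an ISO-style timestamp); a one-line illustration with a made-up value:
'2006-10-31T00:00:00Z'[:4]   # -> '2006'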
In [4]:
def get_subj_top(subjects):
    subj_top = set()
    for s in subjects:
        # Subject strings start with '/', so split('/') yields an empty
        # first element; the top-level term is at index 1, not 0.
        subj_top.add(s.split('/')[1])
    return subj_top

def get_subj_leaf(subjects):
    subj_leaf = set()
    for s in subjects:
        subj_leaf.add(s.split('/')[-1])
    return subj_leaf
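A quick sanity check on an invented subject string in the PLOS '/Top/.../Leaf' style:
print get_subj_top(['/Biology and life sciences/Genetics/Genomics'])
# -> set(['Biology and life sciences'])
print get_subj_leaf(['/Biology and life sciences/Genetics/Genomics'])
# -> set(['Genomics'])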
In [5]:
df['subj_top'] = df.subject.apply(get_subj_top)
df['subj_leaf'] = df.subject.apply(get_subj_leaf)
In [6]:
test_df = df.sort_index().head()
test_df
Out[6]:
In [7]:
def make_dicts(values):
    # Dict to translate original terms into integer codes
    d1 = {}
    index = 1
    for i in values:
        for s in i:
            # Normalize curly apostrophes so the same term always maps to one code
            s = s.replace(u'\u2019', "'")
            if s not in d1:
                d1[s] = index
                index += 1
    # Dict to translate codes back into original terms
    d2 = {v: k for k, v in d1.items()}
    return d1, d2

def encode_terms(values, d):
    # Returned Series must share the same Index as the input
    coded_values = values.copy()
    for i in range(len(values)):
        coded_values.iloc[i] = set(d[s.replace(u'\u2019', "'")] for s in values.iloc[i])
    return coded_values
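These two can be exercised on a toy Series that does not depend on the PLOS data (values invented here; the exact integer codes depend on set iteration order):
toy = pd.Series([{'Genomics', 'Ecology'}, {'Ecology', 'Neuroscience'}], index=['doi1', 'doi2'])
toy_enc, toy_dec = make_dicts(toy)
print encode_terms(toy, toy_enc)   # each set of terms becomes a set of small integer codes
print toy_dec                      # maps codes back to the original terms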
In [8]:
# Test!
# - There should be repeated uses of the keys (not just 1, 2, 3, 4, 5...)
# - There should be the same number of terms in original and 'encoded' data
print test_df['subj_leaf'].apply(len)
d_to_code, d_from_code = make_dicts(test_df['subj_leaf'])
test_enc1 = encode_terms(test_df['subj_leaf'], d_to_code)
print test_enc1
print test_enc1.apply(len) == test_df['subj_leaf'].apply(len)
In [9]:
# Test again!
print test_df['subject'].apply(len)
d_to_code, d_from_code = make_dicts(test_df['subject'])
test_enc2 = encode_terms(test_df['subject'], d_to_code)
print test_enc2
print test_enc2.apply(len) == test_df['subject'].apply(len)
In [10]:
# Test output dict as JSON:
json.dumps(d_from_code)
Out[10]:
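JSON object keys are always strings, so the integer codes in d_from_code are written out as strings; whatever reads these dicts later needs to convert the keys back. A minimal illustration:
json.loads(json.dumps({1: 'Genomics'}))   # -> {u'1': u'Genomics'}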
In [11]:
# subject
subject_enc, subject_dec = make_dicts(df['subject'])
with open('../data/dict_subject_enc.json', 'wb') as f:
json.dump(subject_enc, f)
with open('../data/dict_subject_dec.json', 'wb') as f:
json.dump(subject_dec, f)
# subj_top
subj_top_enc, subj_top_dec = make_dicts(df['subj_top'])
with open('../data/dict_subj_top.json', 'wb') as f:
json.dump(subj_top_dec, f)
# subj_leaf
subj_leaf_enc, subj_leaf_dec = make_dicts(df['subj_leaf'])
with open('../data/dict_subj_leaf.json', 'wb') as f:
json.dump(subj_leaf_dec, f)
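A loading sketch for whatever consumes these files later (assuming it wants integer keys back, which JSON does not preserve):
with open('../data/dict_subj_top.json') as f:
    subj_top_dec_loaded = {int(k): v for k, v in json.load(f).items()}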
The next cell also takes a while.
In [12]:
df['subject'] = encode_terms(df['subject'], subject_enc)
df['subj_top'] = encode_terms(df['subj_top'], subj_top_enc)
df['subj_leaf'] = encode_terms(df['subj_leaf'], subj_leaf_enc)
df.head()
Out[12]:
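encode_terms walks the Series position by position, which is why the cell above is slow; an .apply-based variant (a sketch, not timed here) expresses the same mapping and lets pandas drive the iteration:
def encode_terms_apply(values, d):
    # Same term-to-code mapping as encode_terms
    return values.apply(lambda terms: set(d[s.replace(u'\u2019', "'")] for s in terms))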
In [13]:
df.head().to_json(orient='index', force_ascii=False)
Out[13]:
In [14]:
df.to_json(path_or_buf='../data/articles_coded.json', orient='index', force_ascii=False)
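As a final round-trip check (a sketch, not run as part of this notebook), the exported file can be read back and indexed by DOI:
with open('../data/articles_coded.json') as f:
    articles = json.load(f)
print len(articles)                      # one record per article id (DOI)
print articles[next(iter(articles))]     # spot-check a single record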