In [1]:
    
from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
    
The desired data structure for article information is the following JSON object:
<doi1>: {
    author: [ ... ]
    title:
    journal:
    publication_date: <yyyy>
    subject: [ <full subject /-separated strings>, ... ]
    subj_top: [ set of top levels of each subject ]
    subj_leaf: [ set of last terms of each subject ]
},
<doi2>: { ... }, 
...
In [2]:
    
# Load the pickled DataFrame of article records (presumably all PLOS articles, per the file name).
# NOTE(review): read_pickle unpickles arbitrary objects; only load files from a trusted source.
df = pd.read_pickle('../data/all_plos_df.pkl')
# Show the first rows to check what was loaded.
df.head()
    
    Out[2]:
In [3]:
    
# Discard the columns that are not used below and index the rows by article id.
df = (
    df
    .drop(['author', 'title_display', 'journal', 'abstract', 'score'], axis=1)
    .set_index('id')
)
# Only the year is wanted: keep the first four characters of the date string.
df['publication_date'] = df['publication_date'].str[:4]
df.head()
    
    Out[3]:
In [4]:
    
def get_subj_top(subjects):
    """Return the set of top-level terms of the given subject paths.

    Each subject is a '/'-separated string. The element at index 1 of the
    split is taken rather than index 0 because the path is split at its
    first character, which leaves an empty string at index 0.
    """
    return {path.split('/')[1] for path in subjects}
def get_subj_leaf(subjects):
    """Return the set of last ('leaf') terms of the given subject paths.

    Each subject is a '/'-separated string; the leaf is whatever follows
    the final '/'.
    """
    return {path.split('/')[-1] for path in subjects}
    
In [5]:
    
# For every article, derive the set of top-level terms and the set of leaf terms
# from its list of full subject paths.
df['subj_top'] = df.subject.apply(get_subj_top)
df['subj_leaf'] = df.subject.apply(get_subj_leaf)
    
In [6]:
    
# Take the first rows in index order as a small sample for trying out the encoders below.
test_df = df.sort_index().head()
test_df
    
    Out[6]:
In [7]:
    
def make_dicts(values):
    """Build lookup tables between terms and integer codes.

    Parameters
    ----------
    values : iterable of iterables of strings
        One collection of terms per record (e.g. a Series of sets/lists).

    Returns
    -------
    (d1, d2) : tuple of dict
        ``d1`` maps each distinct term to an integer code; codes start at 1
        and are assigned in order of first appearance. ``d2`` is the inverse
        mapping, from code back to term.

    Right single quotation marks (U+2019) in terms are replaced by a plain
    apostrophe before the lookup, so both spellings share one code.
    """
    # Dict to translate original terms into codes
    d1 = {}

    for terms in values:
        for s in terms:
            s = s.replace(u'\u2019', "'")
            # Test membership on the dict itself: `in d1.keys()` builds a list
            # on every test under Python 2, which makes the loop quadratic.
            if s not in d1:
                # Codes are consecutive from 1, so the next one is len(d1) + 1.
                d1[s] = len(d1) + 1

    # Dict to translate codes back into original terms
    d2 = {code: term for term, code in d1.items()}

    return d1, d2
def encode_terms(values, d):
    """Replace every term in a Series of term collections by its code.

    Parameters
    ----------
    values : pandas.Series
        Each element is an iterable of term strings.
    d : dict
        Maps term -> code, as returned first by ``make_dicts``.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``values``; each element is the
        set of codes of the corresponding terms. ``values`` is not modified.

    Raises
    ------
    KeyError
        If a term (after apostrophe normalisation) is not a key of ``d``.
    """
    def encode(terms):
        # Normalise U+2019 to a plain apostrophe, as make_dicts does for its keys.
        return set(d[s.replace(u'\u2019', "'")] for s in terms)

    # Series.apply keeps the input's index and works element by element, so it
    # does not depend on the removed `.ix` indexer, whose integer lookups were
    # label-based on an integer index and positional otherwise.
    return values.apply(encode)
    
In [8]:
    
# Test!
# - There should be repeated uses of the keys (not just 1, 2, 3, 4, 5...)
# - There should be the same number of terms in original and 'encoded' data
# print is called with a single parenthesised argument so the cell runs
# unchanged on both Python 2 and Python 3.
print(test_df['subj_leaf'].apply(len))
d_to_code, d_from_code = make_dicts(test_df['subj_leaf'])
test_enc1 = encode_terms(test_df['subj_leaf'], d_to_code)
print(test_enc1)
# Element-wise comparison: True wherever the term count survived encoding.
print(test_enc1.apply(len) == test_df['subj_leaf'].apply(len))
    
    
In [9]:
    
# Test again! Same checks as for the leaf terms, this time on the full subject paths.
# print is called with a single parenthesised argument so the cell runs
# unchanged on both Python 2 and Python 3.
print(test_df['subject'].apply(len))
d_to_code, d_from_code = make_dicts(test_df['subject'])
test_enc2 = encode_terms(test_df['subject'], d_to_code)
print(test_enc2)
# Element-wise comparison: True wherever the term count survived encoding.
print(test_enc2.apply(len) == test_df['subject'].apply(len))
    
    
In [10]:
    
# Test output dict as JSON:
# The decoder's keys are integer codes; json.dumps writes them as JSON strings.
json.dumps(d_from_code)
    
    Out[10]:
In [11]:
    
# Build the term <-> code dictionaries from the full data set and save them.
# The files are opened in text mode ('w'): json.dump writes text, so a binary
# handle ('wb') raises TypeError on Python 3. Text mode also works on Python 2.

# subject: both directions are saved.
subject_enc, subject_dec = make_dicts(df['subject'])
with open('../data/dict_subject_enc.json', 'w') as f:
    json.dump(subject_enc, f)
with open('../data/dict_subject_dec.json', 'w') as f:
    json.dump(subject_dec, f)

# subj_top: only the decoder (code -> term) is saved.
subj_top_enc, subj_top_dec = make_dicts(df['subj_top'])
with open('../data/dict_subj_top.json', 'w') as f:
    json.dump(subj_top_dec, f)

# subj_leaf: only the decoder (code -> term) is saved.
subj_leaf_enc, subj_leaf_dec = make_dicts(df['subj_leaf'])
with open('../data/dict_subj_leaf.json', 'w') as f:
    json.dump(subj_leaf_dec, f)
    
The next cell also takes a while.
In [12]:
    
# Replace the term strings in each of the three subject columns by their integer codes.
# NOTE(review): this overwrites the columns, so the cell cannot be re-run as is --
# encode_terms calls .replace on every term, which the integer codes do not support.
df['subject'] = encode_terms(df['subject'], subject_enc)
df['subj_top'] = encode_terms(df['subj_top'], subj_top_enc)
df['subj_leaf'] = encode_terms(df['subj_leaf'], subj_leaf_enc)
df.head()
    
    Out[12]:
In [13]:
    
# Preview the JSON export on the first rows: orient='index' keys each record by its
# index label, and force_ascii=False leaves non-ASCII characters unescaped.
df.head().to_json(orient='index', force_ascii=False)
    
    Out[13]:
In [14]:
    
# Write the whole coded table to disk as JSON keyed by index label (orient='index'),
# leaving non-ASCII characters unescaped (force_ascii=False).
df.to_json(path_or_buf='../data/articles_coded.json', orient='index', force_ascii=False)