In [1]:
    
from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
    
The desired data structure for article information is the following JSON object:
articles:
  [ <doi>:
      { author: [ ... ]
        title:
        journal:
        publication_date: <yyyy>
        subject: [ <full subject /-separated strings>, ... ]
        subj_top: [ set of top levels of each subject ]
        subj_leaf: [ set of last terms of each subject ]
      },
    ...
  ]
In [2]:
    
df = pd.read_pickle('../data/abstract_df.pkl')
# Dropping abstract and score.
df.drop(['abstract', 'score'], axis=1, inplace=True)
df.set_index('id', inplace=True)
df.columns = ['author', 'journal', 'publication_date', 'subject', 'title']
df = df.reindex(columns=['author', 'title', 'journal', 'publication_date', 'subject'])
# We just want the year.
df.publication_date = df.publication_date.str[:4]
df.head()
    
    Out[2]:
In [3]:
    
def get_subj_top(subjects):
    subj_top = set()
    for s in subjects:
        # the string gets split at its first character, so not [0] here:
        subj_top.add(s.split('/')[1])
    return subj_top
def get_subj_leaf(subjects):
    subj_top = set()
    for s in subjects:
        subj_top.add(s.split('/')[-1])
    return subj_top
    
In [4]:
    
df['subj_top'] = df.subject.apply(get_subj_top)
df['subj_leaf'] = df.subject.apply(get_subj_leaf)
    
In [5]:
    
df.head()
    
    Out[5]:
In [6]:
    
#df.head().to_json(orient='index', force_ascii=False)
    
In [7]:
    
#df.to_json(path_or_buf='../data/articles.json', orient='index', force_ascii=False)
    
The PLOS thesaurus was kindly provided to us as a spreadsheet with thousands of rows, one node per row. It is a polyhierarchy represented in the form of a tree. We need to transform it into a JSON object that also includes article counts for all the nodes in the tree.
An example of the desired data structure for PLOS thesaurus:
{ 
  "name": "Computer and information sciences",
  "count": ###,
  "children": [
    {
      "name": "Information technology",
      "count": ###,
      "children": [
        {"name": "Data mining", "count": ###},
        {"name": "Data reduction", "count": ###}, 
        {
          "name":  "Databases",
          "count": ###,
          "children": [
            {"name": "Relational databases", "count": ###}
          ]
        },
        ...,
        {"name": "Text mining","count": ###} 
      ]
    }
  ]
}, 
...
In Python, each node is a dict. Children are specified as a list of dicts. The whole thing is a list of nodes, therefore, a list of dicts.
In [8]:
    
# Let's make sure we are counting articles correctly for each subject node.
def count_articles(df, subject_path):
    s = df.subject.apply(lambda s: str(s))
    matching = s[s.str.contains(subject_path)]
    return len(matching)
print 'Total articles:', len(df)
print 'Science policy:', count_articles(df, 'Science policy')
print 'Science policy/Bioethics:', count_articles(df, 'Science policy/Bioethics')
    
    
In [9]:
    
import xlrd
from collections import defaultdict
def tree_from_spreadsheet(f, df, verbose=False):
    
    subjects = df.subject.apply(lambda s: str(s))
    
    book = xlrd.open_workbook(f)
    pt = book.sheet_by_index(0)
    # spreadsheet cells : (row, col) :: cell A1 = (0, 0)
    # Initialize a list to contain the thesaurus.
    # Our test case will only have one item in this list.
    pt_test = []
    # Keep track of the path in the tree.
    cur_path = Series([np.nan]*10)
    for r in range(1, pt.nrows):
        # Start on row two.
        # Columns: the hierarchy goes up to 10 tiers.
        for c in range(10):
            if pt.cell_value(r, c):
                # If this condition is satisfied, we are at the node that's in this line.
                # Construct the path to this node.
                # Clean strings because some terms (RNA nomenclature) cause unicode error
                text = pt.cell_value(r, c).replace(u'\u2019', "'")
                cur_path[c] = text
                cur_path[c+1:] = np.nan
                path_list = list(cur_path.dropna())
                tier = len(path_list)
                path_str = '/'.join(path_list)
                if verbose:
                    print tier, path_str
                # Add the node to the JSON-like tree structure.
                node = defaultdict(list)
                node['name'] = text
                node['count']=  len(subjects[subjects.str.contains(path_str)])
                # This part is completely ridiculous. But it seems to work.
                if tier == 1:
                    pt_test.append(node)
                    pt_test.append
                elif tier == 2:
                    pt_test[-1]['children'].append(node)
                elif tier == 3:
                    pt_test[-1]['children'][-1]['children'].append(node)
                elif tier == 4:
                    pt_test[-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 5:
                    pt_test[-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 6:
                    pt_test[-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 7:
                    pt_test[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 8:
                    pt_test[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 9:
                    pt_test[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 10:
                    pt_test[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                # Go to next row after finding a term. There is only one term listed per row.
                break
    return pt_test
    
In [12]:
    
plosthes_test_file = '../data/plosthes_test.xlsx'
json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True))
    
    
    Out[12]:
In [11]:
    
plosthes_full_file = '../data/plosthes.2014-1.full.xlsx'
plos_tree = tree_from_spreadsheet(plosthes_full_file, df)
with open('../data/plos_full.json', 'wb') as f:
    json.dump(plos_tree, f)