In [1]:

    
from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import xlrd
from collections import defaultdict

Transforming the PLOS thesaurus

The PLOS thesaurus was kindly provided to us as a spreadsheet with thousands of rows, one node per row. It is a polyhierarchy represented in the form of a tree. We need to transform it into a JSON object that also includes article counts for all the nodes in the tree.

An example of the desired data structure for PLOS thesaurus:

{"count": #total, "name": "PLOS", "children": [
     {"name": "Computer and information sciences",
      "count": ###,
      "children": [
            {"name": "Information technology",
             "count": ###,
             "children": [
                {"name": "Data mining", "count": ###},
                {"name": "Data reduction", "count": ###},
                {"name":  "Databases",
                  "count": ###,
                  "children": [
                    {"name": "Relational databases", "count": ###}
              ]
            },
            ...,
            {"name": "Text mining","count": ###} 
          ]
        }
      ]
    }, 
    ...
  ]
}

Thinking about a Python data structure that could be output to JSON, each node would be a dict. Children are specified as a list of dicts.



In [2]:

    
# Import article data. This is a 400 MB file.
df = pd.read_pickle('../data/all_plos_df.pkl')

# Drop unused data
df.drop(['author', 'title_display', 'journal', 'abstract', 'publication_date', 'score'], axis=1, inplace=True)
df.set_index('id', inplace=True)
df.head()









    Out[2]:






  
    
      
      subject
    
    
      id
      
    
  
  
    
      10.1371/journal.pone.0100281
       [/Medicine and health sciences/Pathology and l...
    
    
      10.1371/journal.pone.0070935
       [/Biology and life sciences/Molecular biology/...
    
    
      10.1371/journal.pone.0066671
       [/Biology and life sciences/Population biology...
    
    
      10.1371/journal.pntd.0002444
       [/Earth sciences/Geography/Geographic areas/Ru...
    
    
      10.1371/journal.pone.0003605
       [/Biology and life sciences/Cell biology/Cell ...
    
  

5 rows × 1 columns



In [3]:

    
# Let's make sure we are counting articles correctly for each subject node.

def count_articles(df, subject_path):
    s = df.subject.apply(lambda s: str(s))
    matching = s[s.str.contains(subject_path)]
    return len(matching)

print 'Total articles:', len(df)
print 'Science policy:', count_articles(df, 'Science policy')
print 'Science policy/Bioethics:', count_articles(df, 'Science policy/Bioethics')









    



Total articles: 121450
Science policy: 531
Science policy/Bioethics: 72



In [4]:

    
def tree_from_spreadsheet(f, df, verbose=False):
    
    subjects = df.subject.apply(lambda s: str(s))
    
    book = xlrd.open_workbook(f)
    pt = book.sheet_by_index(0)
    # spreadsheet cells : (row, col) :: cell A1 : (0, 0)

    # Initialize a list to contain the thesaurus.
    # Our test case will only have one item in this list.
    pt_list = []

    # Keep track of the path in the tree.
    cur_path = Series([np.nan]*10)

    for r in range(1, pt.nrows):
        # Start on row two.

        # Columns: the hierarchy goes up to 10 tiers.
        for c in range(10):
            if pt.cell_value(r, c):
                # If this condition is satisfied, we are at the node that's in this line.

                # Construct the path to this node.
                # Clean strings because some terms (RNA nomenclature) cause unicode error
                text = pt.cell_value(r, c).replace(u'\u2019', "'")
                cur_path[c] = text
                cur_path[c+1:] = np.nan
                path_list = list(cur_path.dropna())
                tier = len(path_list)
                path_str = '/'.join(path_list)
                if verbose:
                    print tier, path_str

                # Add the node to the JSON-like tree structure.
                node = defaultdict(list)
                node['name'] = text
                node['count']=  len(subjects[subjects.str.contains(path_str)])

                # This part is completely ridiculous. But it seems to work.
                if tier == 1:
                    pt_list.append(node)
                elif tier == 2:
                    pt_list[-1]['children'].append(node)
                elif tier == 3:
                    pt_list[-1]['children'][-1]['children'].append(node)
                elif tier == 4:
                    pt_list[-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 5:
                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 6:
                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 7:
                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 8:
                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 9:
                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
                elif tier == 10:
                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)

                # Go to next row after finding a term. There is only one term listed per row.
                break

    # Make a single JSON object to contain all the branches.
    pt_obj = {'count': len(df), 'name': 'PLOS', 'children': pt_list}

    return pt_obj

Prototyping

Experimenting on smaller subsets of the thesaurus.



In [ ]:

    
# Test 1: Science policy
plosthes_test_file = '../data/plosthes_test.xlsx'
json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True))



In [ ]:

    
# Test 2: An edited subset of Earth Sciences
plosthes_test_file = '../data/plosthes_test_2.xlsx'
json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True))

How to check for specific paths in the article data

In case something weird shows up in the tree, like a bug or something. (We fixed it.)



In [ ]:

    
df.subject[df.subject.apply(lambda x: u'/Earth sciences/Mineralogy/Minerals/Gemstones/Diamonds' in x)]

Ready for full conversion and export?

You can either run the cell below, or run the Python script crunch_tree.py. Warning: it takes a few hours. Don't forget to update the filename of the thesaurus spreadsheet if needed.

This will not overwrite the existing JSON file used by the D3 visualisation, data/plos_tree.json. You need to do that yourself, if you are updating the tree.



In [ ]:

    
# Update this filename if you use a newer version!
plosthes_full_file = '../data/plosthes.2014-2.full.xlsx'

# Generate tree structure
# Change to verbose=True if you want to see it happening. 
# (Fills up the output cell with ~10000 lines.)
plos_tree = tree_from_spreadsheet(plosthes_full_file, df, verbose=False)

# Export tree structure as JSON
# Note: the D3 tree visualization uses plos_tree.json -- this script won't overwrite it.
with open('../data/plos_hierarchy_full.json', 'wb') as f:
    json.dump(plos_tree, f)

	subject
id
10.1371/journal.pone.0100281	[/Medicine and health sciences/Pathology and l...
10.1371/journal.pone.0070935	[/Biology and life sciences/Molecular biology/...
10.1371/journal.pone.0066671	[/Biology and life sciences/Population biology...
10.1371/journal.pntd.0002444	[/Earth sciences/Geography/Geographic areas/Ru...
10.1371/journal.pone.0003605	[/Biology and life sciences/Cell biology/Cell ...