In [1]:
from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import xlrd
from collections import defaultdict
The PLOS thesaurus was kindly provided to us as a spreadsheet with thousands of rows, one node per row. It is a polyhierarchy represented in the form of a tree. We need to transform it into a JSON object that also includes article counts for all the nodes in the tree.
An example of the desired data structure for PLOS thesaurus:
{"count": #total, "name": "PLOS", "children": [
{"name": "Computer and information sciences",
"count": ###,
"children": [
{"name": "Information technology",
"count": ###,
"children": [
{"name": "Data mining", "count": ###},
{"name": "Data reduction", "count": ###},
{"name": "Databases",
"count": ###,
"children": [
{"name": "Relational databases", "count": ###}
]
},
...,
{"name": "Text mining","count": ###}
]
}
]
},
...
]
}
Thinking about a Python data structure that could be output to JSON, each node would be a dict. Children are specified as a list of dicts.
In [2]:
# Import article data. This is a 400 MB file.
df = pd.read_pickle('../data/all_plos_df.pkl')
# Drop unused data
df.drop(['author', 'title_display', 'journal', 'abstract', 'publication_date', 'score'], axis=1, inplace=True)
df.set_index('id', inplace=True)
df.head()
Out[2]:
In [3]:
# Let's make sure we are counting articles correctly for each subject node.
def count_articles(df, subject_path):
s = df.subject.apply(lambda s: str(s))
matching = s[s.str.contains(subject_path)]
return len(matching)
print 'Total articles:', len(df)
print 'Science policy:', count_articles(df, 'Science policy')
print 'Science policy/Bioethics:', count_articles(df, 'Science policy/Bioethics')
In [4]:
def tree_from_spreadsheet(f, df, verbose=False):
subjects = df.subject.apply(lambda s: str(s))
book = xlrd.open_workbook(f)
pt = book.sheet_by_index(0)
# spreadsheet cells : (row, col) :: cell A1 : (0, 0)
# Initialize a list to contain the thesaurus.
# Our test case will only have one item in this list.
pt_list = []
# Keep track of the path in the tree.
cur_path = Series([np.nan]*10)
for r in range(1, pt.nrows):
# Start on row two.
# Columns: the hierarchy goes up to 10 tiers.
for c in range(10):
if pt.cell_value(r, c):
# If this condition is satisfied, we are at the node that's in this line.
# Construct the path to this node.
# Clean strings because some terms (RNA nomenclature) cause unicode error
text = pt.cell_value(r, c).replace(u'\u2019', "'")
cur_path[c] = text
cur_path[c+1:] = np.nan
path_list = list(cur_path.dropna())
tier = len(path_list)
path_str = '/'.join(path_list)
if verbose:
print tier, path_str
# Add the node to the JSON-like tree structure.
node = defaultdict(list)
node['name'] = text
node['count']= len(subjects[subjects.str.contains(path_str)])
# This part is completely ridiculous. But it seems to work.
if tier == 1:
pt_list.append(node)
elif tier == 2:
pt_list[-1]['children'].append(node)
elif tier == 3:
pt_list[-1]['children'][-1]['children'].append(node)
elif tier == 4:
pt_list[-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 5:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 6:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 7:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 8:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 9:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 10:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
# Go to next row after finding a term. There is only one term listed per row.
break
# Make a single JSON object to contain all the branches.
pt_obj = {'count': len(df), 'name': 'PLOS', 'children': pt_list}
return pt_obj
In [ ]:
# Test 1: Science policy
plosthes_test_file = '../data/plosthes_test.xlsx'
json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True))
In [ ]:
# Test 2: An edited subset of Earth Sciences
plosthes_test_file = '../data/plosthes_test_2.xlsx'
json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True))
In [ ]:
df.subject[df.subject.apply(lambda x: u'/Earth sciences/Mineralogy/Minerals/Gemstones/Diamonds' in x)]
You can either run the cell below, or run the Python script crunch_tree.py
. Warning: it takes a few hours. Don't forget to update the filename of the thesaurus spreadsheet if needed.
This will not overwrite the existing JSON file used by the D3 visualisation, data/plos_tree.json
. You need to do that yourself, if you are updating the tree.
In [ ]:
# Update this filename if you use a newer version!
plosthes_full_file = '../data/plosthes.2014-2.full.xlsx'
# Generate tree structure
# Change to verbose=True if you want to see it happening.
# (Fills up the output cell with ~10000 lines.)
plos_tree = tree_from_spreadsheet(plosthes_full_file, df, verbose=False)
# Export tree structure as JSON
# Note: the D3 tree visualization uses plos_tree.json -- this script won't overwrite it.
with open('../data/plos_hierarchy_full.json', 'wb') as f:
json.dump(plos_tree, f)