In [1]:
from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
The desired data structure for article information is the following JSON object:
articles:
{ <doi>:
{ author: [ ... ]
title:
journal:
publication_date: <yyyy>
subject: [ <full subject /-separated strings>, ... ]
subj_top: [ set of top levels of each subject ]
subj_leaf: [ set of last terms of each subject ]
},
...
}
In [2]:
# Load the article table and reshape it to the target schema.
# Chained (non-inplace) calls: each step returns a new frame, so the cell
# is idempotent under Restart & Run All.
df = (
    pd.read_pickle('../data/abstract_df.pkl')
      .drop(['abstract', 'score'], axis=1)  # not needed downstream
      .set_index('id')
)
# Positional rename — relies on the pickled column order. TODO confirm order.
df.columns = ['author', 'journal', 'publication_date', 'subject', 'title']
df = df.reindex(columns=['author', 'title', 'journal', 'publication_date', 'subject'])
# We just want the year: keep the first four characters of the date string.
df.publication_date = df.publication_date.str[:4]
df.head()
Out[2]:
In [3]:
def get_subj_top(subjects):
    """Return the set of top-level terms of the given subject paths.

    Each subject string begins with '/', so splitting on '/' yields an
    empty first element — the top-level term is at index 1, not 0.
    """
    return set(path.split('/')[1] for path in subjects)
def get_subj_leaf(subjects):
    """Return the set of last (most specific) terms of the given subject paths."""
    return set(path.split('/')[-1] for path in subjects)
In [4]:
# Derive per-article subject summaries from the full '/'-separated paths.
df['subj_top'] = df['subject'].apply(get_subj_top)
df['subj_leaf'] = df['subject'].apply(get_subj_leaf)
In [5]:
df.head()
Out[5]:
In [6]:
#df.head().to_json(orient='index', force_ascii=False)
In [7]:
#df.to_json(path_or_buf='../data/articles.json', orient='index', force_ascii=False)
The PLOS thesaurus was kindly provided to us as a spreadsheet with thousands of rows, one node per row. It is a polyhierarchy represented in the form of a tree. We need to transform it into a JSON object that also includes article counts for all the nodes in the tree.
An example of the desired data structure for PLOS thesaurus:
{
"name": "Computer and information sciences",
"count": ###,
"children": [
{
"name": "Information technology",
"count": ###,
"children": [
{"name": "Data mining", "count": ###},
{"name": "Data reduction", "count": ###},
{
"name": "Databases",
"count": ###,
"children": [
{"name": "Relational databases", "count": ###}
]
},
...,
{"name": "Text mining","count": ###}
]
}
]
},
...
In Python, each node is a dict. Children are specified as a list of dicts. The whole thing is a list of nodes, therefore, a list of dicts.
In [8]:
# Let's make sure we are counting articles correctly for each subject node.
def count_articles(df, subject_path):
    """Count articles whose subject list mentions ``subject_path``.

    A match is a plain substring hit against the string form of each
    article's subject list, so 'Science policy' also counts deeper paths
    such as 'Science policy/Bioethics'. NOTE(review): substring matching
    can overcount when one term is a prefix of another — confirm acceptable.

    Parameters
    ----------
    df : DataFrame
        Must have a 'subject' column of lists of '/'-separated paths.
    subject_path : str
        Subject term or path, e.g. 'Science policy/Bioethics'.

    Returns
    -------
    int : number of matching articles.
    """
    s = df.subject.apply(lambda subj: str(subj))
    # regex=False: subject terms can contain regex metacharacters
    # (e.g. parentheses), which must be matched literally.
    matching = s[s.str.contains(subject_path, regex=False)]
    return len(matching)
# Sanity checks on the counting helper. Single-string print() produces
# identical output on Python 2 and Python 3 (the old `print a, b`
# statement is Python-2-only syntax).
print('Total articles: %s' % len(df))
print('Science policy: %s' % count_articles(df, 'Science policy'))
print('Science policy/Bioethics: %s' % count_articles(df, 'Science policy/Bioethics'))
In [9]:
import xlrd
from collections import defaultdict
def tree_from_spreadsheet(f, df, verbose=False):
    """Build a JSON-ready subject tree from the PLOS thesaurus spreadsheet.

    Parameters
    ----------
    f : str
        Path to the .xlsx thesaurus file: one term per row, one column per
        tier (the hierarchy goes up to 10 tiers), row 0 is a header.
        Spreadsheet cells : (row, col) :: cell A1 = (0, 0).
    df : DataFrame
        Article table with a 'subject' column; used for per-node counts.
    verbose : bool
        If True, print each node's tier and full path as it is added.

    Returns
    -------
    list of dicts — top-level nodes, each shaped
    {'name': ..., 'count': ..., 'children': [...]}.
    """
    subjects = df.subject.apply(lambda s: str(s))
    book = xlrd.open_workbook(f)
    pt = book.sheet_by_index(0)
    tree = []
    # Most recently created node at each tier: a tier-t node's parent is
    # the last node seen at tier t-1. This replaces the old hard-coded
    # tier-by-tier indexing chain, whose tier-4 and tier-5 branches used
    # the same nesting depth and so mis-nested one of those tiers.
    last_at_tier = {}
    # Keep track of the current path down the tree while scanning rows.
    cur_path = Series([np.nan] * 10)
    for r in range(1, pt.nrows):  # start on row two; row one is the header
        for c in range(10):
            if pt.cell_value(r, c):
                # This row's single term lives in column c; build its path.
                # Clean strings because some terms (RNA nomenclature)
                # cause a unicode error.
                text = pt.cell_value(r, c).replace(u'\u2019', "'")
                cur_path[c] = text
                cur_path[c + 1:] = np.nan
                path_list = list(cur_path.dropna())
                tier = len(path_list)
                path_str = '/'.join(path_list)
                if verbose:
                    print('%d %s' % (tier, path_str))
                # defaultdict(list): the 'children' key springs into
                # being only when a child is actually appended, so leaf
                # nodes serialize without an empty children list.
                node = defaultdict(list)
                node['name'] = text
                # regex=False: subject terms can contain regex
                # metacharacters, which must be matched literally.
                node['count'] = len(subjects[subjects.str.contains(path_str, regex=False)])
                if tier == 1:
                    tree.append(node)
                else:
                    last_at_tier[tier - 1]['children'].append(node)
                last_at_tier[tier] = node
                # Only one term is listed per row; go to the next row.
                break
    return tree
In [12]:
# Smoke test on a small thesaurus file: dump the tree to eyeball the structure.
plosthes_test_file = '../data/plosthes_test.xlsx'
json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True))
Out[12]:
In [11]:
# Build the tree from the full thesaurus and export it for the visualization.
plosthes_full_file = '../data/plosthes.2014-1.full.xlsx'
plos_tree = tree_from_spreadsheet(plosthes_full_file, df)
# Text mode ('w'), not binary ('wb'): json.dump writes str, which a
# binary-mode file rejects on Python 3.
with open('../data/plos_full.json', 'w') as f:
    json.dump(plos_tree, f)