Go through all the authors in citations.h5 and bin them according to the first character of the last name, then save the bins back into the same database under a new group called buckets.
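(For reference, citations.h5 is assumed to hold one DataFrame of author records per citation under a top-level dois group; that layout is an assumption here, but it can be checked with a quick sketch like this:)

import pandas as pd

with pd.HDFStore('citations.h5', mode='r') as store:
    # list the keys of the per-citation DataFrames stored under /dois
    print([key for key in store.keys() if key.startswith('/dois')])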
In [1]:
import pandas as pd
import json
import os
import sys
from unidecode import unidecode
from IPython.display import clear_output
In [2]:
# open the citations database for reading and access the group
# holding all the citation information
store = pd.HDFStore("citations.h5", mode='r')
doi_group = store.get_node('dois')
# allocate our bins
buckets = {}
# walk through all the nodes in the database
for df in doi_group._f_walknodes('Group'):
    # skip groups that don't have a leaf (data) attached to them
    if len(df._v_leaves) == 0:
        continue
    # get the full path in the database to the data
    citation = df._v_pathname
    # print progress
    clear_output()
    print("Binning authors in '{}'".format(citation))
    sys.stdout.flush()
    # get the author information
    data = store[citation]
    # go through each author for the citation
    for i, row in data.iterrows():
        # skip authors that don't have a first or last name
        if 'family' not in row or 'given' not in row:
            continue
        if not isinstance(row['family'], str):
            continue
        # get the first letter of the last name -- we need to use
        # unidecode because we want to bin names together even if
        # some of them have accents
        letter = unidecode(row['family'][0]).lower()
        if letter not in buckets:
            buckets[letter] = []
        buckets[letter].append(row)
# close the database
store.close()
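The unidecode call above is what lets accented and unaccented variants of a surname fall into the same bucket. A quick illustration (the names here are made up, purely for demonstration):

from unidecode import unidecode

for name in ['Éluard', 'Eluard', 'Østergaard']:
    # accents are stripped before taking the first letter, so
    # 'Éluard' and 'Eluard' both map to the 'e' bucket
    print("{} -> {}".format(name, unidecode(name)[0].lower()))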
In [3]:
# print out information about the size of the buckets
for bucket in sorted(buckets.keys()):
    print("{}: {}".format(bucket, len(buckets[bucket])))
In [5]:
# open the database again to store the buckets
store = pd.HDFStore('citations.h5', mode='a')
for bucket in buckets:
    # turn the bucket into a dataframe, sorted by author name
    # (sort_values replaces the long-removed DataFrame.sort)
    df = pd.DataFrame(buckets[bucket])\
        .sort_values(['family', 'given'])\
        .reset_index(drop=True)
    # save it into the database
    store.put("buckets/_{}".format(bucket), df)
# close the database
store.close()
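To double-check the result, any bucket can be read straight back out by its key; each one lives at buckets/_<letter> (the leading underscore presumably keeps the node names safe regardless of the leading character). A minimal sketch, assuming at least one surname starting with 'a' was binned:

with pd.HDFStore('citations.h5', mode='r') as store:
    # read the 'a' bucket back as a sorted DataFrame of authors
    authors_a = store['buckets/_a']
print(authors_a.head())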