Go through all the authors in citations.h5 and bin them according to the first character of the last name. Save the bins back into the same database under a new group called `buckets`.


In [1]:
import pandas as pd
import json
import os
import sys
from unidecode import unidecode
from IPython.display import clear_output

In [2]:
# open the citations database for reading and access the group
# holding all the citation information
store = pd.HDFStore("citations.h5", mode='r')
doi_group = store.get_node('dois')

# allocate our bins: first-letter-of-last-name -> list of author rows
buckets = {}

# walk through all the nodes in the database
for df in doi_group._f_walknodes('Group'):
    # skip groups that don't have a leaf (data) attached to them
    if len(df._v_leaves) == 0:
        continue

    # get the full path in the database to the data
    citation = df._v_pathname

    # print progress -- wait=True defers clearing until the next output
    # arrives, which avoids flicker between iterations
    clear_output(wait=True)
    print("Binning authors in '{}'".format(citation))
    sys.stdout.flush()

    # get the author information
    data = store[citation]

    # go through each author for the citation
    for i, row in data.iterrows():
        # skip authors that don't have a first or last name
        if 'family' not in row or 'given' not in row:
            continue
        # also skip non-string and empty last names -- an empty string
        # would raise IndexError when we take its first character
        if not isinstance(row['family'], str) or not row['family']:
            continue

        # transliterate the *whole* last name before taking the first
        # character -- we want to bin names together even if some of
        # them have accents, and unidecode may map a single accented
        # character to several ASCII characters (e.g. 'Æ' -> 'AE') or
        # to none at all, so decoding just row['family'][0] could yield
        # a multi-character bucket key like 'ae' or an empty one
        family_ascii = unidecode(row['family'])
        if not family_ascii:
            continue
        letter = family_ascii[0].lower()
        buckets.setdefault(letter, []).append(row)

# close the database
store.close()


Binning authors in '/dois/_10_1371/journal_pmed/_0/_0/_1/_0/_0/_0010066'

In [3]:
# print out information about the size of the buckets
for bucket in sorted(buckets.keys()):
    print("{}: {}".format(bucket, len(buckets[bucket])))


3: 1
a: 2364
b: 4433
c: 4827
d: 2972
e: 943
f: 2144
g: 3284
h: 4230
i: 628
j: 1576
k: 3773
l: 5434
m: 5051
n: 1652
o: 1128
p: 3163
q: 296
r: 2644
s: 6438
t: 2647
u: 282
v: 1726
w: 3617
x: 715
y: 1754
z: 2465

In [5]:
# open the database again to store the buckets
store = pd.HDFStore('citations.h5', mode='a')

for bucket in buckets:
    # turn the bucket into a dataframe
    df = pd.DataFrame(buckets[bucket])\
        .sort(['family', 'given'])\
        .reset_index(drop=True)

    # save it into the database
    store.put("buckets/_{}".format(bucket), df)

# close the database
store.close()


/usr/lib/python3/dist-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['affiliation', 'doi', 'email', 'family', 'given']]

  warnings.warn(ws, PerformanceWarning)