Go through all the authors in citations.h5 and bin them according to the first character of the last name. Save the bins back into the same database under a new group called `buckets`.


In [1]:
import pandas as pd
import json
import os
import sys
from unidecode import unidecode
from IPython.display import clear_output

In [2]:
# open the citations database for reading and access the group
# holding all the citation information
store = pd.HDFStore("citations.h5", mode='r')
doi_group = store.get_node('dois')

# allocate our bins: first-letter-of-last-name -> list of author rows
buckets = {}

# walk through all the nodes in the database
for df in doi_group._f_walknodes('Group'):
    # skip groups that don't have a leaf (data) attached to them
    if len(df._v_leaves) == 0:
        continue

    # get the full path in the database to the data
    citation = df._v_pathname

    # print progress -- wait=True defers clearing until the next output
    # arrives, which avoids flicker between iterations
    clear_output(wait=True)
    print("Binning authors in '{}'".format(citation))
    sys.stdout.flush()

    # get the author information
    data = store[citation]

    # go through each author for the citation
    for i, row in data.iterrows():
        # skip authors that don't have a first or last name
        if 'family' not in row or 'given' not in row:
            continue
        # also skip non-string and empty last names -- an empty string
        # would raise IndexError when we take its first character
        if not isinstance(row['family'], str) or not row['family']:
            continue

        # transliterate the *whole* last name before taking the first
        # character -- we want to bin names together even if some of
        # them have accents, and unidecode may map a single accented
        # character to several ASCII characters (e.g. 'Æ' -> 'AE') or
        # to none at all, so decoding just row['family'][0] could yield
        # a multi-character bucket key like 'ae' or an empty one
        family_ascii = unidecode(row['family'])
        if not family_ascii:
            continue
        letter = family_ascii[0].lower()
        buckets.setdefault(letter, []).append(row)

# close the database
store.close()


Binning authors in '/dois/_10_1371/journal_pmed/_0/_0/_1/_0/_0/_0010066'

In [3]:
# print out information about the size of the buckets
for bucket in sorted(buckets.keys()):
    print("{}: {}".format(bucket, len(buckets[bucket])))


3: 1
a: 2364
b: 4433
c: 4827
d: 2972
e: 943
f: 2144
g: 3284
h: 4230
i: 628
j: 1576
k: 3773
l: 5434
m: 5051
n: 1652
o: 1128
p: 3163
q: 296
r: 2644
s: 6438
t: 2647
u: 282
v: 1726
w: 3617
x: 715
y: 1754
z: 2465

In [5]:
# open the database again to store the buckets
store = pd.HDFStore('citations.h5', mode='a')

for bucket in buckets:
    # turn the bucket into a dataframe
    df = pd.DataFrame(buckets[bucket])\
        .sort(['family', 'given'])\
        .reset_index(drop=True)

    # save it into the database
    store.put("buckets/_{}".format(bucket), df)

# close the database
store.close()


/usr/lib/python3/dist-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['affiliation', 'doi', 'email', 'family', 'given']]

  warnings.warn(ws, PerformanceWarning)