In [1]:
from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

Transforming batch-collected article data

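The desired data structure for article information is the following JSON object. (Note: this notebook currently produces only the publication_date, subject, subj_top and subj_leaf fields; the author, title and journal columns are dropped below.)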

<doi1>: {
    author: [ ... ]
    title:
    journal:
    publication_date: <yyyy>
    subject: [ <full subject /-separated strings>, ... ]
    subj_top: [ set of top levels of each subject ]
    subj_leaf: [ set of last terms of each subject ]
},
<doi2>: { ... }, 
...

In [2]:
# Load the pickled DataFrame of article records and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.read_pickle('../data/all_plos_df.pkl')
df.head()


Out[2]:
abstract author id journal publication_date score subject title_display
0 [\nClenbuterol, a β2-agonist, induces skeletal... [Pascal Sirvent, Aymerick Douillard, Olivier G... 10.1371/journal.pone.0100281 PLoS ONE 2014-06-27T00:00:00Z 1 [/Medicine and health sciences/Pathology and l... Effects of Chronic Administration of Clenbuter...
1 [\nRecent studies point to an association betw... [Lan Zhang, Hui Ding, Dan-Hui Wang, Yan-Li Zha... 10.1371/journal.pone.0070935 PLoS ONE 2013-08-09T00:00:00Z 1 [/Biology and life sciences/Molecular biology/... Calpastatin Gene (CAST) Is Not Associated with...
2 [\nThe reduction of game and fish populations ... [Silvia Díaz-Fernández, Beatriz Arroyo, Fabián... 10.1371/journal.pone.0066671 PLoS ONE 2013-06-19T00:00:00Z 1 [/Biology and life sciences/Population biology... Effect of Game Management on Wild Red-Legged P...
3 [Background: Cebu has been one of the most lep... [Pauline F D Scheelbeek, Marivic V F Balagon, ... 10.1371/journal.pntd.0002444 PLoS Neglected Tropical Diseases 2013-09-19T00:00:00Z 1 [/Earth sciences/Geography/Geographic areas/Ru... A Retrospective Study of the Epidemiology of L...
4 [Background: Understanding how androgen recept... [Adam T Szafran, Maria Szwarc, Marco Marcelli,... 10.1371/journal.pone.0003605 PLoS ONE 2008-11-03T00:00:00Z 1 [/Biology and life sciences/Cell biology/Cell ... Androgen Receptor Functional Analyses by High ...

5 rows × 8 columns


In [3]:
# Keep only the publication date and the subject paths, indexed by DOI ('id').
df = df.drop(
    ['author', 'title_display', 'journal', 'abstract', 'score'], axis=1
).set_index('id')

# The dates are ISO timestamp strings; the first four characters are the year.
df['publication_date'] = df['publication_date'].str[:4]

df.head()


Out[3]:
publication_date subject
id
10.1371/journal.pone.0100281 2014 [/Medicine and health sciences/Pathology and l...
10.1371/journal.pone.0070935 2013 [/Biology and life sciences/Molecular biology/...
10.1371/journal.pone.0066671 2013 [/Biology and life sciences/Population biology...
10.1371/journal.pntd.0002444 2013 [/Earth sciences/Geography/Geographic areas/Ru...
10.1371/journal.pone.0003605 2008 [/Biology and life sciences/Cell biology/Cell ...

5 rows × 2 columns


In [4]:
def get_subj_top(subjects):
    """Return the set of top-level categories of the given subject paths.

    Each path is a '/'-separated string with a leading '/', so splitting
    yields an empty first piece and the top level sits at position 1.
    """
    return {path.split('/')[1] for path in subjects}

def get_subj_leaf(subjects):
    """Return the set of last (leaf) terms of the given '/'-separated subject paths."""
    return {path.split('/')[-1] for path in subjects}

In [5]:
# For every article, derive the set of top-level categories and the set of
# leaf terms from its list of full subject paths.
df['subj_top'] = df['subject'].apply(get_subj_top)
df['subj_leaf'] = df['subject'].apply(get_subj_leaf)

In [6]:
test_df = df.sort_index().head()
test_df


Out[6]:
publication_date subject subj_top subj_leaf
id
10.1371/journal.pbio.0000001 2003 [/Biology and life sciences/Molecular biology/... set([Biology and life sciences, Physical scien... set([Recombinant proteins, Nucleotide sequenci...
10.1371/journal.pbio.0000002 2003 [/Biology and life sciences/Molecular biology/... set([Biology and life sciences, Physical scien... set([DNA sequences, Microarrays, Oligonucleoti...
10.1371/journal.pbio.0000003 2003 [/Biology and life sciences/Organisms/Viruses/... set([Biology and life sciences, Medicine and h... set([DNA sequences, West Nile virus, Sequence ...
10.1371/journal.pbio.0000004 2003 [/Medicine and health sciences/Immunology/Immu... set([Biology and life sciences, Medicine and h... set([Recombinant proteins, Antibodies, Immune ...
10.1371/journal.pbio.0000005 2003 [/Biology and life sciences/Biochemistry/Prote... set([Biology and life sciences, Physical scien... set([Plasmodium, Merozoites, Oligonucleotides,...

5 rows × 4 columns

Start making dictionaries

We need to reduce the amount of data that the web app has to load. To do that, we replace every subject term and subject path with a small integer code, and build dicts that translate between the original strings and their codes.


In [7]:
def make_dicts(values):
    """Build a term -> code dict and its inverse code -> term dict.

    Codes are consecutive integers starting at 1, handed out in the order in
    which distinct terms are first encountered while iterating ``values``.

    Parameters
    ----------
    values : iterable of iterables of str
        One collection of terms per record (e.g. a Series of lists or sets).

    Returns
    -------
    tuple of (dict, dict)
        ``(term_to_code, code_to_term)``.
    """
    # Dict to translate original terms into codes
    term_to_code = {}

    for terms in values:
        for term in terms:
            # Fold the typographic apostrophe into the ASCII one so that both
            # spellings of a term share a single code.
            term = term.replace(u'\u2019', "'")
            # Test membership on the dict itself (a hash lookup). Going through
            # `.keys()` builds a list on Python 2, which turns every check into
            # a linear scan and the whole build into quadratic time.
            if term not in term_to_code:
                term_to_code[term] = len(term_to_code) + 1

    # Dict to translate codes back into original terms
    code_to_term = {code: term for term, code in term_to_code.items()}

    return term_to_code, code_to_term


def encode_terms(values, d):
    """Translate each record's terms into their integer codes.

    Parameters
    ----------
    values : pandas.Series
        Each element is an iterable of term strings.
    d : dict
        Term -> code mapping (the first dict returned by ``make_dicts``).

    Returns
    -------
    pandas.Series
        A new Series sharing the index of ``values``; each element is the set
        of codes for that record's terms. ``values`` itself is not modified.

    Raises
    ------
    KeyError
        If a term (after apostrophe normalization) is not a key of ``d``.
    """
    def to_codes(terms):
        # Same apostrophe normalization that make_dicts applies to its keys.
        return set(d[term.replace(u'\u2019', "'")] for term in terms)

    # Element-wise apply keeps the input's index, avoids the `.ix` indexer
    # (removed from pandas), and replaces slow per-element assignment.
    return values.apply(to_codes)

In [8]:
# Test on the small sample!
# - There should be repeated uses of the keys (not just 1, 2, 3, 4, 5...)
# - There should be the same number of terms in original and 'encoded' data

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(test_df['subj_leaf'].apply(len))

d_to_code, d_from_code = make_dicts(test_df['subj_leaf'])
test_enc1 = encode_terms(test_df['subj_leaf'], d_to_code)
print(test_enc1)

# Enforce the second expectation instead of only eyeballing the printout.
same_length = test_enc1.apply(len) == test_df['subj_leaf'].apply(len)
print(same_length)
assert same_length.all(), "encoding changed the number of subj_leaf terms"


id
10.1371/journal.pbio.0000001    8
10.1371/journal.pbio.0000002    8
10.1371/journal.pbio.0000003    8
10.1371/journal.pbio.0000004    8
10.1371/journal.pbio.0000005    8
Name: subj_leaf, dtype: int64
id
10.1371/journal.pbio.0000001            set([1, 2, 3, 4, 5, 6, 7, 8])
10.1371/journal.pbio.0000002       set([3, 6, 9, 10, 11, 12, 13, 14])
10.1371/journal.pbio.0000003      set([3, 9, 12, 14, 15, 16, 17, 18])
10.1371/journal.pbio.0000004      set([1, 3, 19, 20, 21, 22, 23, 24])
10.1371/journal.pbio.0000005    set([10, 25, 26, 27, 28, 29, 30, 31])
Name: subj_leaf, dtype: object
id
10.1371/journal.pbio.0000001    True
10.1371/journal.pbio.0000002    True
10.1371/journal.pbio.0000003    True
10.1371/journal.pbio.0000004    True
10.1371/journal.pbio.0000005    True
Name: subj_leaf, dtype: bool

In [9]:
# Test again, this time on the full subject paths!

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(test_df['subject'].apply(len))

# These dicts overwrite the ones built from subj_leaf in the previous test.
d_to_code, d_from_code = make_dicts(test_df['subject'])
test_enc2 = encode_terms(test_df['subject'], d_to_code)
print(test_enc2)

# Each article must keep the same number of entries after encoding.
same_length = test_enc2.apply(len) == test_df['subject'].apply(len)
print(same_length)
assert same_length.all(), "encoding changed the number of subject paths"


id
10.1371/journal.pbio.0000001    13
10.1371/journal.pbio.0000002    14
10.1371/journal.pbio.0000003    18
10.1371/journal.pbio.0000004    16
10.1371/journal.pbio.0000005    12
Name: subject, dtype: int64
id
10.1371/journal.pbio.0000001     set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
10.1371/journal.pbio.0000002    set([5, 9, 10, 12, 14, 15, 16, 17, 18, 19, 20,...
10.1371/journal.pbio.0000003    set([32, 33, 5, 12, 15, 16, 17, 18, 20, 21, 24...
10.1371/journal.pbio.0000004    set([34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 4...
10.1371/journal.pbio.0000005    set([54, 47, 48, 49, 50, 51, 52, 53, 22, 55, 5...
Name: subject, dtype: object
id
10.1371/journal.pbio.0000001    True
10.1371/journal.pbio.0000002    True
10.1371/journal.pbio.0000003    True
10.1371/journal.pbio.0000004    True
10.1371/journal.pbio.0000005    True
Name: subject, dtype: bool

In [10]:
# Test output dict as JSON:
# JSON object keys are always strings, so the integer codes of the
# code -> term dict are serialized as "1", "2", ...
json.dumps(d_from_code)


Out[10]:
'{"1": "/Biology and life sciences/Molecular biology/Molecular biology techniques/Molecular biology assays and analysis techniques/Library screening/Recombination-based assay", "2": "/Research and analysis methods/Molecular biology techniques/Sequencing techniques/Sequence analysis/DNA sequence analysis", "3": "/Physical sciences/Chemistry/Chemical reactions/Recombination reactions", "4": "/Biology and life sciences/Molecular biology/Molecular biology techniques/Sequencing techniques/Nucleotide sequencing", "5": "/Biology and life sciences/Biochemistry/DNA/DNA sequences", "6": "/Biology and life sciences/Biochemistry/Proteins/Recombinant proteins", "7": "/Biology and life sciences/Molecular biology/Molecular biology techniques/Sequencing techniques/Sequence analysis/DNA sequence analysis", "8": "/Biology and life sciences/Biochemistry/Nucleotides", "9": "/Research and analysis methods/Molecular biology techniques/Sequencing techniques/Sequence analysis/Sequence motif analysis", "10": "/Biology and life sciences/Molecular biology/Molecular biology techniques/Sequencing techniques/Sequence analysis/Sequence motif analysis", "11": "/Research and analysis methods/Molecular biology techniques/Molecular biology assays and analysis techniques/Library screening/Recombination-based assay", "12": "/Biology and life sciences/Genetics/DNA/DNA sequences", "13": "/Research and analysis methods/Molecular biology techniques/Sequencing techniques/Nucleotide sequencing", "14": "/Biology and life sciences/Molecular biology/Molecular biology techniques/Artificial gene amplification and extension/Polymerase chain reaction", "15": "/Biology and life sciences/Organisms/Viruses/Viral pathogens/Coronaviruses", "16": "/Medicine and health sciences/Pathology and laboratory medicine/Pathogens/Microbial pathogens/Viral pathogens/Coronaviruses", "17": "/Biology and life sciences/Microbiology/Medical microbiology/Microbial pathogens/Viral pathogens/Coronaviruses", "18": "/Medicine and health 
sciences/Infectious diseases/Viral diseases/SARS", "19": "/Research and analysis methods/Molecular biology techniques/Artificial gene amplification and extension/Polymerase chain reaction", "20": "/Biology and life sciences/Organisms/Viruses/RNA viruses/Coronaviruses", "21": "/Research and analysis methods/Bioassays and physiological analysis/Microarrays", "22": "/Physical sciences/Materials science/Materials by structure/Polymers/Oligonucleotides", "23": "/Biology and life sciences/Biochemistry/Nucleic acids", "24": "/Biology and life sciences/Organisms/Viruses/Viral pathogens/Flaviviruses/West Nile virus", "25": "/Research and analysis methods/Molecular biology techniques/Sequencing techniques/Sequence analysis", "26": "/Biology and life sciences/Microbiology/Medical microbiology/Microbial pathogens/Viral pathogens/Flaviviruses/West Nile virus", "27": "/Medicine and health sciences/Pathology and laboratory medicine/Pathogens/Microbial pathogens/Viral pathogens/Flaviviruses/West Nile virus", "28": "/Biology and life sciences/Microbiology/Microbial genomics/Viral genomics", "29": "/Biology and life sciences/Molecular biology/Molecular biology techniques/Sequencing techniques/Sequence analysis", "30": "/Biology and life sciences/Genetics/Genomics/Microbial genomics/Viral genomics", "31": "/Biology and life sciences/Microbiology/Virology/Viral genomics", "32": "/Biology and life sciences/Organisms/Viruses/RNA viruses/Flaviviruses/West Nile virus", "33": "/Biology and life sciences/Organisms/Viruses/DNA viruses", "34": "/Medicine and health sciences/Immunology/Immune system proteins", "35": "/Biology and life sciences/Biochemistry/Proteins/Protein interactions", "36": "/Medicine and health sciences/Physiology/Immune physiology/Antibodies", "37": "/Biology and life sciences/Immunology/Immune system proteins/Antibodies", "38": "/Biology and life sciences/Physiology/Immune physiology/Antibodies", "39": "/Biology and life sciences/Biochemistry/Proteins/Immune system 
proteins/Antibodies", "40": "/Biology and life sciences/Immunology/Immune system proteins", "41": "/Medicine and health sciences/Immunology/Immune system proteins/Antibodies", "42": "/Biology and life sciences/Biochemistry/Proteins/Immune system proteins", "43": "/Medicine and health sciences", "44": "/Biology and life sciences/Biochemistry/DNA/DNA recombination", "45": "/Biology and life sciences/Genetics/Gene expression/Gene regulation", "46": "/Biology and life sciences/Genetics/DNA/DNA recombination", "47": "/Biology and life sciences/Biochemistry/Proteins/Enzymes/Proteases", "48": "/Biology and life sciences/Parasitology/Parasite groups/Apicomplexa/Merozoites", "49": "/Biology and life sciences/Biochemistry/Enzymology/Enzymes/Proteases", "50": "/Biology and life sciences/Computational biology/Genome analysis/Transcriptome analysis", "51": "/Biology and life sciences/Genetics/Genomics/Genome analysis/Gene prediction", "52": "/Biology and life sciences/Computational biology/Genome analysis/Gene prediction", "53": "/Biology and life sciences/Genetics/Gene expression", "54": "/Biology and life sciences/Genetics/Genomics/Genome analysis/Transcriptome analysis", "55": "/Biology and life sciences/Parasitology/Parasite groups/Apicomplexa/Plasmodium", "56": "/Biology and life sciences/Cell biology/Plant cell biology/Plastids", "57": "/Biology and life sciences/Plant science/Plant cell biology/Plastids"}'

Ready to generate dicts & translate the entire dataset

This cell takes a long time to run.


In [11]:
def _dump_json(obj, path):
    """Write ``obj`` to ``path`` as JSON."""
    # Text mode: json.dump emits str, which a binary ('wb') handle rejects on
    # Python 3. The default ASCII-only output makes text mode safe on
    # Python 2 as well.
    with open(path, 'w') as f:
        json.dump(obj, f)


# subject: full paths; both the encoding and the decoding dict are saved.
subject_enc, subject_dec = make_dicts(df['subject'])
_dump_json(subject_enc, '../data/dict_subject_enc.json')
_dump_json(subject_dec, '../data/dict_subject_dec.json')

# subj_top: only the decoding (code -> term) dict is saved.
subj_top_enc, subj_top_dec = make_dicts(df['subj_top'])
_dump_json(subj_top_dec, '../data/dict_subj_top.json')

# subj_leaf: only the decoding (code -> term) dict is saved.
subj_leaf_enc, subj_leaf_dec = make_dicts(df['subj_leaf'])
_dump_json(subj_leaf_dec, '../data/dict_subj_leaf.json')

The next cell also takes a while.


In [12]:
# Replace the term strings in each column with their integer codes.
# This overwrites the columns, so the cell cannot be re-run as is: the
# second time around the elements are ints, not strings the encoders know.
df['subject'] = encode_terms(values=df['subject'], d=subject_enc)
df['subj_top'] = encode_terms(values=df['subj_top'], d=subj_top_enc)
df['subj_leaf'] = encode_terms(values=df['subj_leaf'], d=subj_leaf_enc)

df.head()


Out[12]:
publication_date subject subj_top subj_leaf
id
10.1371/journal.pone.0100281 2014 set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... set([1, 2, 3]) set([1, 2, 3, 4, 5, 6, 7, 8])
10.1371/journal.pone.0070935 2013 set([32, 17, 18, 19, 20, 21, 22, 23, 24, 25, 2... set([1, 2, 3, 4, 5]) set([9, 10, 11, 12, 13, 14, 15, 16])
10.1371/journal.pone.0066671 2013 set([33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]) set([8, 1, 6, 7]) set([17, 18, 19, 20, 21, 22, 23, 24])
10.1371/journal.pntd.0002444 2013 set([44, 45, 46, 47, 48, 49, 50, 51, 52, 53]) set([2, 5, 7]) set([32, 25, 26, 27, 28, 29, 30, 31])
10.1371/journal.pone.0003605 2008 set([54, 55, 56, 57, 58, 59, 60, 61, 62]) set([1, 2, 3]) set([33, 34, 35, 36, 37, 38, 39, 40])

5 rows × 4 columns

Sanity check: preview the JSON export for the first few rows to confirm the encoding worked.


In [13]:
# Preview the export on the first rows: orient='index' produces one JSON
# object keyed by the index (the DOI), with one nested object per article.
# force_ascii=False leaves non-ASCII characters unescaped.
df.head().to_json(orient='index', force_ascii=False)


Out[13]:
'{"10.1371\\/journal.pone.0100281":{"publication_date":"2014","subject":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16],"subj_top":[1,2,3],"subj_leaf":[1,2,3,4,5,6,7,8]},"10.1371\\/journal.pone.0070935":{"publication_date":"2013","subject":[32,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],"subj_top":[1,2,3,4,5],"subj_leaf":[9,10,11,12,13,14,15,16]},"10.1371\\/journal.pone.0066671":{"publication_date":"2013","subject":[33,34,35,36,37,38,39,40,41,42,43],"subj_top":[8,1,6,7],"subj_leaf":[17,18,19,20,21,22,23,24]},"10.1371\\/journal.pntd.0002444":{"publication_date":"2013","subject":[44,45,46,47,48,49,50,51,52,53],"subj_top":[2,5,7],"subj_leaf":[32,25,26,27,28,29,30,31]},"10.1371\\/journal.pone.0003605":{"publication_date":"2008","subject":[54,55,56,57,58,59,60,61,62],"subj_top":[1,2,3],"subj_leaf":[33,34,35,36,37,38,39,40]}}'

If everything looks right, export the full dataset.


In [14]:
# Write the whole table to disk as a single JSON object keyed by the index
# (the DOI); force_ascii=False leaves non-ASCII characters unescaped.
df.to_json(path_or_buf='../data/articles_coded.json', orient='index', force_ascii=False)