In [1]:
from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
The desired data structure for article information is the following JSON object:
<doi1>: {
    author: [ ... ],
    title: <title>,
    journal: <journal>,
    publication_date: <yyyy>,
    subject: [ <full '/'-separated subject strings>, ... ],
    subj_top: [ <set of the top-level term of each subject> ],
    subj_leaf: [ <set of the last (leaf) term of each subject> ]
},
<doi2>: { ... },
...
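For concreteness, a single (entirely hypothetical) entry could look like this, with made-up values:
"10.1371/journal.pone.0000000": {
    "author": ["A. Author", "B. Author"],
    "title": "An example title",
    "journal": "PLOS ONE",
    "publication_date": "2013",
    "subject": ["/Biology and life sciences/Genetics/Genomics"],
    "subj_top": ["Biology and life sciences"],
    "subj_leaf": ["Genomics"]
}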
In [2]:
df = pd.read_pickle('../data/all_plos_df.pkl')
df.head()
Out[2]:
In [3]:
# Drop unused data.
df.drop(['author', 'title_display', 'journal', 'abstract', 'score'], axis=1, inplace=True)
df.set_index('id', inplace=True)
# We just want the year.
df.publication_date = df.publication_date.str[:4]
df.head()
Out[3]:
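The [:4] slice assumes each raw publication_date string begins with the four-digit year (e.g. an ISO-style timestamp); a one-line illustration with a made-up value:
'2006-10-31T00:00:00Z'[:4]   # -> '2006'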
In [4]:
def get_subj_top(subjects):
    subj_top = set()
    for s in subjects:
        # Subject strings start with '/', so split('/') yields an empty
        # first element; the top-level term is at index 1, not 0.
        subj_top.add(s.split('/')[1])
    return subj_top

def get_subj_leaf(subjects):
    subj_leaf = set()
    for s in subjects:
        subj_leaf.add(s.split('/')[-1])
    return subj_leaf
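A quick sanity check on an invented subject string in the PLOS '/Top/.../Leaf' style:
print get_subj_top(['/Biology and life sciences/Genetics/Genomics'])
# -> set(['Biology and life sciences'])
print get_subj_leaf(['/Biology and life sciences/Genetics/Genomics'])
# -> set(['Genomics'])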
In [5]:
df['subj_top'] = df.subject.apply(get_subj_top)
df['subj_leaf'] = df.subject.apply(get_subj_leaf)
In [6]:
test_df = df.sort_index().head()
test_df
Out[6]:
In [7]:
def make_dicts(values):
    # Dict to translate original terms into integer codes
    d1 = {}
    index = 1
    for i in values:
        for s in i:
            # Normalize curly apostrophes so the same term always maps to one code
            s = s.replace(u'\u2019', "'")
            if s not in d1:
                d1[s] = index
                index += 1
    # Dict to translate codes back into original terms
    d2 = {v: k for k, v in d1.items()}
    return d1, d2

def encode_terms(values, d):
    # Returned Series must share the same Index as the input
    coded_values = values.copy()
    for i in range(len(values)):
        coded_values.iloc[i] = set(d[s.replace(u'\u2019', "'")] for s in values.iloc[i])
    return coded_values
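These two can be exercised on a toy Series that does not depend on the PLOS data (values invented here; the exact integer codes depend on set iteration order):
toy = pd.Series([{'Genomics', 'Ecology'}, {'Ecology', 'Neuroscience'}], index=['doi1', 'doi2'])
toy_enc, toy_dec = make_dicts(toy)
print encode_terms(toy, toy_enc)   # each set of terms becomes a set of small integer codes
print toy_dec                      # maps codes back to the original terms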
In [8]:
# Test!
# - There should be repeated uses of the keys (not just 1, 2, 3, 4, 5...)
# - There should be the same number of terms in original and 'encoded' data
print test_df['subj_leaf'].apply(len)
d_to_code, d_from_code = make_dicts(test_df['subj_leaf'])
test_enc1 = encode_terms(test_df['subj_leaf'], d_to_code)
print test_enc1
print test_enc1.apply(len) == test_df['subj_leaf'].apply(len)
In [9]:
# Test again!
print test_df['subject'].apply(len)
d_to_code, d_from_code = make_dicts(test_df['subject'])
test_enc2 = encode_terms(test_df['subject'], d_to_code)
print test_enc2
print test_enc2.apply(len) == test_df['subject'].apply(len)
In [10]:
# Test output dict as JSON:
json.dumps(d_from_code)
Out[10]:
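JSON object keys are always strings, so the integer codes in d_from_code are written out as strings; whatever reads these dicts later needs to convert the keys back. A minimal illustration:
json.loads(json.dumps({1: 'Genomics'}))   # -> {u'1': u'Genomics'}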
In [11]:
# subject
subject_enc, subject_dec = make_dicts(df['subject'])
with open('../data/dict_subject_enc.json', 'wb') as f:
json.dump(subject_enc, f)
with open('../data/dict_subject_dec.json', 'wb') as f:
json.dump(subject_dec, f)
# subj_top
subj_top_enc, subj_top_dec = make_dicts(df['subj_top'])
with open('../data/dict_subj_top.json', 'wb') as f:
json.dump(subj_top_dec, f)
# subj_leaf
subj_leaf_enc, subj_leaf_dec = make_dicts(df['subj_leaf'])
with open('../data/dict_subj_leaf.json', 'wb') as f:
json.dump(subj_leaf_dec, f)
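A loading sketch for whatever consumes these files later (assuming it wants integer keys back, which JSON does not preserve):
with open('../data/dict_subj_top.json') as f:
    subj_top_dec_loaded = {int(k): v for k, v in json.load(f).items()}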
The next cell also takes a while.
In [12]:
df['subject'] = encode_terms(df['subject'], subject_enc)
df['subj_top'] = encode_terms(df['subj_top'], subj_top_enc)
df['subj_leaf'] = encode_terms(df['subj_leaf'], subj_leaf_enc)
df.head()
Out[12]:
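encode_terms walks the Series position by position, which is why the cell above is slow; an .apply-based variant (a sketch, not timed here) expresses the same mapping and lets pandas drive the iteration:
def encode_terms_apply(values, d):
    # Same term-to-code mapping as encode_terms
    return values.apply(lambda terms: set(d[s.replace(u'\u2019', "'")] for s in terms))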
In [13]:
df.head().to_json(orient='index', force_ascii=False)
Out[13]:
In [14]:
df.to_json(path_or_buf='../data/articles_coded.json', orient='index', force_ascii=False)
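As a final round-trip check (a sketch, not run as part of this notebook), the exported file can be read back and indexed by DOI:
with open('../data/articles_coded.json') as f:
    articles = json.load(f)
print len(articles)                      # one record per article id (DOI)
print articles[next(iter(articles))]     # spot-check a single record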