notebook.community

Edit and run



In [2]:

    
import pandas as pd
import json
import gzip
import os
import requests
import pickle
from tqdm import tqdm
from functools import lru_cache
from collections import Counter
from itertools import chain
pd.set_option("display.max_columns", 30)

import sys, os
sys.path.insert(0, "/home/gstupp/projects/WikidataIntegrator")
sys.path.insert(0, "/home/gstupp/projects/wikidata-biothings/scheduled_bots")
from wikidataintegrator import wdi_helpers



In [3]:

    
# Load the openFDA dump (one JSON document per line) and keep only the documents that
#   1. are human prescription drugs,
#   2. carry at least one application number containing 'NDA', and
#   3. list at least one unii.
# The file is opened in a `with` block so the gzip handle is closed once the list is
# built, and the filters run in a single streaming pass instead of three list copies.
# NOTE(review): `'NDA' in x` is a substring test, so it presumably also matches
# application numbers that start with 'ANDA' — confirm that this is intended.
with gzip.open("openfda/openfda.json.gz", 'rt', encoding='utf8') as f:
    ds = [
        d for d in map(json.loads, f)
        if ("product_type" in d) and ('HUMAN PRESCRIPTION DRUG' in d['product_type'])
        and ("application_number" in d) and any('NDA' in x for x in d['application_number'])
        and ("unii" in d) and d['unii']
    ]



In [4]:

    
# Give every document the same set of keys: any key seen in at least one
# document is added to the others with a fresh empty list as its value.
keys = {key for doc in ds for key in doc}
for doc in ds:
    for key in keys:
        # setdefault only inserts when the key is absent, leaving existing values alone
        doc.setdefault(key, [])



In [5]:

    
# Find the keys whose value is a one-element list in *every* document,
# then replace those lists by their (upper-cased) only element.
single = {key for key in keys if {len(doc[key]) for doc in ds} == {1}}
print(single)
for doc in ds:
    for key in single:
        doc[key] = doc[key][0].upper()
    # generic_name is expected to be a plain string at this point
    doc['generic_name'] = doc['generic_name'].upper()
    doc['substance_name'] = [name.strip().upper() for name in doc['substance_name']]









    



{'manufacturer_name', 'spl_id', 'spl_set_id', 'brand_name', 'application_number', 'generic_name', 'product_type'}



In [6]:

    
print(len(ds))
# drop records whose brand name is identical to the generic name
ds = list(filter(lambda rec: rec['brand_name'] != rec['generic_name'], ds))
print(len(ds))
# drop records whose substance names and unii ids differ in number
ds = [rec for rec in ds if len(rec['unii']) == len(rec['substance_name'])]
print(len(ds))
# drop every brand name that is associated with more than one distinct set of unii ids
df = pd.DataFrame(ds)
bn_unii = df.groupby("brand_name").agg(
    {'unii': lambda unii_lists: set(frozenset(unii_list) for unii_list in unii_lists)}
)
has_conflicting_unii = bn_unii.unii.apply(len) > 1
bn_unii = bn_unii[has_conflicting_unii]
is_conflicting_brand = df.brand_name.isin(bn_unii.index)
df = df[~is_conflicting_brand]
print(len(df))



In [7]:

    
@lru_cache(maxsize=100000)
def get_rxcui_brandname(rxcui):
    """
    Look up the brand name (tty=BN) concept related to an rxcui via the RxNav REST API.

    :param rxcui: RxNorm concept id; it is formatted into the request URL
    :return: the 'rxcui' of the first related BN concept, or None when the response
             contains no BN concept (including responses with no 'conceptGroup' at all)
    :raises requests.HTTPError: when the service answers with an HTTP error status
    """
    url = "https://rxnav.nlm.nih.gov/REST/rxcui/{}/related.json?tty=BN".format(rxcui)
    response = requests.get(url)
    # Fail loudly on HTTP errors: the result is memoized by lru_cache, so an error
    # body must not be silently interpreted (and cached) as "no brand name".
    response.raise_for_status()
    d = response.json()
    # 'conceptGroup' (and 'conceptProperties' inside a group) may be absent.
    concept_groups = (d.get('relatedGroup') or {}).get('conceptGroup') or []
    brand_names = []
    for group in concept_groups:
        if group.get('tty') == 'BN':
            brand_names = group.get('conceptProperties') or []
    if brand_names:
        return brand_names[0]['rxcui']
    return None

@lru_cache(maxsize=100000)
def get_rxcui_ingredient(rxcui):
    """
    Get from ingredient/dose/form to compound
    example: rxcui: 1442407 (Camphor 48 MG/ML / Eucalyptus oil 12 MG/ML / Menthol 26 MG/ML Topical Cream)
    to: 691178 (Camphor / Eucalyptus oil / Menthol)
    https://rxnav.nlm.nih.gov/REST/rxcui/1442407/allrelated.json
    http://bioportal.bioontology.org/ontologies/RXNORM?p=classes&conceptid=1442407

    Looks for related concepts of type MIN, PIN, or IN and returns the first concept of the
    first type (in that order of preference) that has any.
    types: https://www.nlm.nih.gov/research/umls/rxnorm/docs/2015/appendix5.html
    api doc: https://rxnav.nlm.nih.gov/RxNormAPIs.html#uLink=RxNorm_REST_getAllRelatedInfo

    :param rxcui: RxNorm concept id; it is formatted into the request URL
    :return: the 'rxcui' of the selected MIN/PIN/IN concept, or None when the response
             contains none of them (including responses with no 'conceptGroup' at all)
    :raises requests.HTTPError: when the service answers with an HTTP error status
    """
    preferred_types = ('MIN', 'PIN', 'IN')
    url = "https://rxnav.nlm.nih.gov/REST/rxcui/{}/related.json?tty=MIN+PIN+IN".format(rxcui)
    response = requests.get(url)
    # Fail loudly on HTTP errors: the result is memoized by lru_cache, so an error
    # body must not be silently interpreted (and cached) as "no ingredient".
    response.raise_for_status()
    d = response.json()
    # 'conceptGroup' (and 'conceptProperties' inside a group) may be absent.
    concept_groups = (d.get('relatedGroup') or {}).get('conceptGroup') or []
    ingredients = {}
    for group in concept_groups:
        tty = group.get('tty')
        if tty in preferred_types:
            ingredients[tty] = group.get('conceptProperties') or []
    for tty in preferred_types:
        concepts = ingredients.get(tty)
        if concepts:
            return concepts[0]['rxcui']
    return None

# Smoke-test both lookups with rxcui 403878; each call issues a request to rxnav.nlm.nih.gov.
# Only the value of the last expression (the brand name lookup) is shown as the cell output.
get_rxcui_ingredient(403878)
get_rxcui_brandname(403878)









    Out[7]:





'282386'



In [ ]:

    
# Collect every distinct rxcui mentioned in the frame and resolve each one
# to its ingredient and brand name concept ids.
rxcuis = {code for code_list in df.rxcui for code in code_list}
rxcui_ingredient = {}
rxcui_brandname = {}
for code in tqdm(rxcuis):
    if code in rxcui_ingredient or code in rxcui_brandname:
        # already resolved, nothing to do
        continue
    rxcui_ingredient[code] = get_rxcui_ingredient(code)
    rxcui_brandname[code] = get_rxcui_brandname(code)



In [ ]:

    
# Translate each row's list of rxcuis into the parallel lists of brand name and
# ingredient concept ids (None where no mapping is known).
df['rxcui_brandname'] = df.rxcui.apply(lambda codes: list(map(rxcui_brandname.get, codes)))
df['rxcui_ingredient'] = df.rxcui.apply(lambda codes: list(map(rxcui_ingredient.get, codes)))



In [ ]:

    
# df.to_csv("openfda.csv")



In [ ]:

    
# collapse duplicate brand_names
gb = iter(df.groupby("brand_name"))
newdf = pd.DataFrame()

for _, this_df in gb:
    # there should be one brandname cui, one ingredient cui, can be more than one unii
    rxcui_brandname = set(chain(*this_df.rxcui_brandname))
    rxcui_ingredient = set(chain(*this_df.rxcui_ingredient))
    unii = set(this_df.unii.apply(frozenset))
    # in this_df, we want to make sure all rows have the same unii, rxcui_brandname, rxcui_ingredient
    # if so, keep one row, otherwise, toss the whole thing
    if (len(unii) == 1) and (len(rxcui_brandname) == 1) and (len(rxcui_ingredient) == 1):
        this_df = this_df.iloc[0,]
        newdf = newdf.append(this_df)



In [ ]:

    
def _collapse_to_single(values):
    """
    Reduce a list of concept ids to one value.

    :param values: a list (or other collection) of ids, or an already collapsed value
    :return: one element of the collection, None for an empty collection (so that
             dropna removes the row), or `values` unchanged when it is not a collection
             (this makes re-running the cell harmless)
    """
    if not isinstance(values, (list, tuple, set, frozenset)):
        return values
    distinct = set(values)
    return next(iter(distinct)) if distinct else None


newdf["rxcui_brandname"] = newdf["rxcui_brandname"].apply(_collapse_to_single)
newdf["rxcui_ingredient"] = newdf["rxcui_ingredient"].apply(_collapse_to_single)
# rows without a brand name cui or ingredient cui are of no use downstream
newdf = newdf.dropna(subset=["rxcui_brandname", "rxcui_ingredient"])
newdf



In [ ]:

    
# newdf.to_csv("openfda_single.csv")