In [2]:
import pandas as pd
import json
import gzip
import os
import requests
import pickle
from tqdm import tqdm
from functools import lru_cache
from collections import Counter
from itertools import chain
pd.set_option("display.max_columns", 30)
import sys, os
sys.path.insert(0, "/home/gstupp/projects/WikidataIntegrator")
sys.path.insert(0, "/home/gstupp/projects/wikidata-biothings/scheduled_bots")
from wikidataintegrator import wdi_helpers
In [3]:
# Load the openFDA dump (one JSON document per line) and keep only documents that
#   * are human prescription drugs,
#   * carry at least one application number containing 'NDA', and
#   * have a non-empty list of UNII ids.
# The file is opened in a `with` block so the gzip handle is closed once the
# documents are read, and the documents are filtered while streaming so rejected
# records are never all held in memory at once.
with gzip.open("openfda/openfda.json.gz", 'rt', encoding='utf8') as f:
    ds = [
        d for d in map(json.loads, f)
        if 'HUMAN PRESCRIPTION DRUG' in d.get('product_type', ())
        and any('NDA' in x for x in d.get('application_number', ()))
        and d.get('unii')
    ]
In [4]:
# Give every document the same schema: any key seen in at least one document is
# added to the documents that lack it, with an empty list as its value.
keys = set(chain.from_iterable(d.keys() for d in ds))
for d in ds:
    for key in keys:
        d.setdefault(key, [])
In [5]:
# Keys whose value is a list of length exactly one in every document.
single = {key for key in keys if {len(d[key]) for d in ds} == {1}}
print(single)

for d in ds:
    # Unwrap the always-single-valued lists into plain upper-cased strings.
    for key in single:
        d[key] = d[key][0].upper()
    # NOTE(review): this expects 'generic_name' to already be a plain string here,
    # i.e. to be one of the keys in `single` -- confirm against the printed set.
    d['generic_name'] = d['generic_name'].upper()
    # Substance names stay a list; each entry is trimmed and upper-cased.
    d['substance_name'] = [name.strip().upper() for name in d['substance_name']]
In [6]:
print(len(ds))

# Drop documents whose brand name is identical to the generic name.
ds = list(filter(lambda d: d['brand_name'] != d['generic_name'], ds))
print(len(ds))

# Drop documents where the number of substances differs from the number of UNII ids.
ds = list(filter(lambda d: len(d['unii']) == len(d['substance_name']), ds))
print(len(ds))

# Drop every brand name that is associated with more than one distinct set of UNII ids.
df = pd.DataFrame(ds)
unii_sets_per_brand = df.groupby("brand_name").agg({'unii': lambda x: set(frozenset(y) for y in x)})
bn_unii = unii_sets_per_brand[unii_sets_per_brand.unii.apply(len) > 1]
is_ambiguous_brand = df.brand_name.isin(bn_unii.index)
df = df[~is_ambiguous_brand]
print(len(df))
In [7]:
@lru_cache(maxsize=100000)
def get_rxcui_brandname(rxcui):
    """
    Get the rxcui of the brand name (tty=BN) concept related to ``rxcui``.

    Queries the RxNav ``related.json`` endpoint restricted to tty=BN and returns the
    rxcui of the first BN concept listed. Results (including None) are memoized by
    ``lru_cache``, keyed on the argument exactly as passed.

    :param rxcui: RxNorm concept id; it is formatted into the request URL
    :return: rxcui of the first related BN concept, or None if the response lists
             no BN concept
    :raises requests.HTTPError: if the service answers with an HTTP error status
    """
    url = "https://rxnav.nlm.nih.gov/REST/rxcui/{}/related.json?tty=BN".format(rxcui)
    # A timeout keeps a stalled connection from hanging the whole notebook run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    d = response.json()
    # Tolerate a payload without 'relatedGroup' / 'conceptGroup', or without a BN
    # group, and report "not found" instead of raising KeyError.
    groups = (d.get('relatedGroup') or {}).get('conceptGroup') or []
    by_tty = {g['tty']: g.get('conceptProperties', []) for g in groups if g.get('tty') == 'BN'}
    brand_names = by_tty.get('BN') or []
    if brand_names:
        return brand_names[0]['rxcui']
    return None
@lru_cache(maxsize=100000)
def get_rxcui_ingredient(rxcui):
    """
    Get from ingredient/dose/form to compound

    example: rxcui: 1442407 (Camphor 48 MG/ML / Eucalyptus oil 12 MG/ML / Menthol 26 MG/ML Topical Cream)
    to: 691178 (Camphor / Eucalyptus oil / Menthol)
    https://rxnav.nlm.nih.gov/REST/rxcui/1442407/allrelated.json
    http://bioportal.bioontology.org/ontologies/RXNORM?p=classes&conceptid=1442407

    Queries the RxNav ``related.json`` endpoint restricted to tty=MIN+PIN+IN and
    returns the rxcui of the first concept of the first non-empty type, checked in
    the order MIN, PIN, IN. Results (including None) are memoized by ``lru_cache``,
    keyed on the argument exactly as passed.

    types: https://www.nlm.nih.gov/research/umls/rxnorm/docs/2015/appendix5.html
    api doc: https://rxnav.nlm.nih.gov/RxNormAPIs.html#uLink=RxNorm_REST_getAllRelatedInfo

    :param rxcui: RxNorm concept id; it is formatted into the request URL
    :return: rxcui of the related MIN, PIN or IN concept (in that priority), or
             None if the response lists none of them
    :raises requests.HTTPError: if the service answers with an HTTP error status
    """
    url = "https://rxnav.nlm.nih.gov/REST/rxcui/{}/related.json?tty=MIN+PIN+IN".format(rxcui)
    # A timeout keeps a stalled connection from hanging the whole notebook run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    d = response.json()
    # Tolerate a payload without 'relatedGroup' / 'conceptGroup', or one that lacks
    # some of the three groups, instead of raising KeyError.
    groups = (d.get('relatedGroup') or {}).get('conceptGroup') or []
    ingredients = {g['tty']: g.get('conceptProperties', []) for g in groups if
                   g.get('tty') in {'MIN', 'PIN', 'IN'}}
    for tty in ('MIN', 'PIN', 'IN'):
        concepts = ingredients.get(tty) or []
        if concepts:
            return concepts[0]['rxcui']
    return None
# Quick manual check of both lookups for a sample rxcui (403878). Both calls issue
# HTTP requests to rxnav.nlm.nih.gov; only the value of the last expression
# (the brand-name lookup) is shown as the cell output.
get_rxcui_ingredient(403878)
get_rxcui_brandname(403878)
Out[7]:
In [ ]:
# Resolve every distinct rxcui found in the frame to its ingredient concept and its
# brand-name concept, one pair of lookups per rxcui, with a progress bar.
rxcuis = set(chain.from_iterable(df.rxcui))
rxcui_ingredient = {}
rxcui_brandname = {}
for rxcui in tqdm(rxcuis):
    already_resolved = rxcui in rxcui_ingredient or rxcui in rxcui_brandname
    if already_resolved:
        continue
    rxcui_ingredient[rxcui] = get_rxcui_ingredient(rxcui)
    rxcui_brandname[rxcui] = get_rxcui_brandname(rxcui)
In [ ]:
# Translate each row's list of rxcuis into parallel lists of brand-name and
# ingredient rxcuis (None where the lookup dict has no entry for an rxcui).
# `df` is the result of boolean filtering, so item assignment into it would raise
# SettingWithCopyWarning; assign() builds a new frame instead and leaves the
# previous one untouched.
df = df.assign(
    rxcui_brandname=df.rxcui.apply(lambda x: [rxcui_brandname.get(y) for y in x]),
    rxcui_ingredient=df.rxcui.apply(lambda x: [rxcui_ingredient.get(y) for y in x]),
)
In [ ]:
# df.to_csv("openfda.csv")
In [ ]:
# collapse duplicate brand_names
# For each brand name, keep a single representative row, but only when all of its
# rows agree on one set of unii ids, one brand-name rxcui and one ingredient rxcui.
# Brand names that fail this check are dropped entirely.
#
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append inside a loop copies the frame on every iteration and the method
# no longer exists in pandas 2.x.
# The per-group sets use their own names so the `rxcui_brandname` /
# `rxcui_ingredient` lookup dicts are not overwritten by this cell.
kept_rows = []
for _, this_df in df.groupby("brand_name"):
    # there should be one brandname cui, one ingredient cui, can be more than one unii
    brand_cuis = set(chain.from_iterable(this_df.rxcui_brandname))
    ingredient_cuis = set(chain.from_iterable(this_df.rxcui_ingredient))
    unii_sets = set(this_df.unii.apply(frozenset))
    if len(unii_sets) == 1 and len(brand_cuis) == 1 and len(ingredient_cuis) == 1:
        kept_rows.append(this_df.iloc[0])
# Passing the columns explicitly keeps the schema even when no brand name qualifies.
newdf = pd.DataFrame(kept_rows, columns=df.columns)
In [ ]:
def _only_value(values):
    """Return one element of ``values`` after de-duplication; falsy input is returned unchanged."""
    if not values:
        return values
    return list(set(values))[0]


# Replace the per-row rxcui lists by a single rxcui each, then drop the rows where
# either the brand-name or the ingredient rxcui is missing.
newdf['rxcui_brandname'] = newdf['rxcui_brandname'].apply(_only_value)
newdf['rxcui_ingredient'] = newdf['rxcui_ingredient'].apply(_only_value)
newdf = newdf.dropna(subset=["rxcui_brandname", "rxcui_ingredient"])
newdf
In [ ]:
# newdf.to_csv("openfda_single.csv")