In [63]:
import ast
import pandas as pd
import json
import gzip
import os
import requests
import pickle
import time
from tqdm import tqdm
from functools import lru_cache
from collections import Counter
from itertools import chain
pd.set_option("display.max_columns", 30)
import sys, os
sys.path.insert(0, "/home/gstupp/projects/WikidataIntegrator")
sys.path.insert(0, "/home/gstupp/projects/wikidata-biothings/scheduled_bots")
from wikidataintegrator import wdi_helpers
from scheduled_bots.drugs.chemlib import create_item
from scheduled_bots.drugs.chemspider import ChemSpiderMolecule
from scheduled_bots.drugs.unii import UNIIMolecule
from wikidataintegrator import wdi_core, wdi_login
In [64]:
# Resolve the Wikidata bot credentials: prefer scheduled_bots/local.py and
# fall back to the WDUSER / WDPASS environment variables.
try:
    from scheduled_bots.local import WDUSER, WDPASS
except ImportError:
    if "WDUSER" not in os.environ or "WDPASS" not in os.environ:
        raise ValueError("WDUSER and WDPASS must be specified in local.py or as environment variables")
    WDUSER = os.environ['WDUSER']
    WDPASS = os.environ['WDPASS']

# Log in once; the resulting session object is passed to item.write() later on.
login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
In [73]:
# Load the openFDA table; the first CSV column is used as the index.
df = pd.read_csv("openfda_single.csv", index_col=0)
# The unii and product_ndc columns hold Python literals serialised as strings
# (presumably lists, since product_ndc is indexed with [0] further down - confirm).
# ast.literal_eval parses literals only, whereas eval would execute any
# expression that happened to be stored in the file.
df.unii = df.unii.map(ast.literal_eval)
df.product_ndc = df.product_ndc.map(ast.literal_eval)
df.head()
Out[73]:
In [155]:
def get_label_from_rxcui(rxcui):
    """Look up the RxNorm name of a concept through the RxNav REST API.

    Example: https://rxnav.nlm.nih.gov/REST/rxcui/497184/properties.json

    :param rxcui: RxNorm concept identifier, interpolated into the request URL
    :return: the ``name`` field of the returned ``properties`` object, or None
        when the response is empty or carries no properties
    :raises requests.HTTPError: if the service answers with an error status
    """
    url = "https://rxnav.nlm.nih.gov/REST/rxcui/{}/properties.json".format(rxcui)
    r = requests.get(url)
    # fail loudly on HTTP errors, like the other request helpers in this notebook
    r.raise_for_status()
    d = r.json()
    # tolerate an empty body as well as a missing or null 'properties' entry
    properties = d.get('properties') if d else None
    if properties:
        return properties.get('name')
    return None
# rxcui -> RxNorm label cache. Create it only when it does not exist yet:
# a fresh kernel gets an empty dict (previously a NameError, because the
# initialisation was commented out), while a re-run keeps the labels that
# were already fetched and only requests the missing ones.
if "rxcui_label" not in globals():
    rxcui_label = dict()
for rxcui in tqdm(set(df.rxcui_brandname)):
    if rxcui not in rxcui_label:
        rxcui_label[rxcui] = get_label_from_rxcui(rxcui)
In [156]:
# Get the RxNorm label for the brand name, because the openFDA version is often of poor quality.
# Example lookup: https://rxnav.nlm.nih.gov/REST/rxcui/497184/properties.json
# rxcuis without an entry in rxcui_label map to a missing value.
df['brand_name_rxnorm'] = df.rxcui_brandname.map(rxcui_label.get)
In [157]:
# Map each RxNorm CUI (Wikidata property P3345) to its Wikidata item.
# id_mapper is asked for a set of items per identifier; identifiers that
# resolve to anything other than exactly one item are left out.
rxnorm_qid = wdi_helpers.id_mapper("P3345", return_as_set=True)
rxnorm_qid = {
    rxcui: next(iter(qids))
    for rxcui, qids in rxnorm_qid.items()
    if len(qids) == 1
}
In [158]:
# Look up the Wikidata QID of every brand-name rxcui in rxnorm_qid;
# rxcuis that are not in the mapping get a missing value.
# NOTE(review): the lookup only hits when the rxcui_brandname values have the
# same type as the rxnorm_qid keys - confirm the column dtype.
df['rxcui_brandname_qid'] = df.rxcui_brandname.map(rxnorm_qid.get)
# Total number of rows, with or without a QID.
len(df['rxcui_brandname_qid'])
Out[158]:
In [159]:
# Number of rows for which a QID was found (missing values dropped).
len(df['rxcui_brandname_qid'].dropna())
Out[159]:
In [166]:
def search_wikidata(s):
    """Search Wikidata for English-language items matching a string.

    Uses the ``wbsearchentities`` API action, e.g.
    https://www.wikidata.org/w/api.php?action=wbsearchentities&search=ABSTRAL&language=en&type=item&format=json

    :param s: the search string
    :return: the list of result dicts from the response's ``search`` field,
        with every ``id`` prefixed by ``wd:`` and the ``repository`` and
        ``concepturi`` keys removed; all other keys are left untouched
    :raises requests.HTTPError: if the API answers with an error status
    """
    params = {'action': 'wbsearchentities',
              'language': 'en',
              'search': s,
              'type': "item",
              'format': 'json'}
    r = requests.get("https://www.wikidata.org/w/api.php", params=params)
    r.raise_for_status()
    results = r.json()['search']
    for item in results:
        item['id'] = "wd:" + item['id']
        # pop with a default: the keys are dropped when present, and a result
        # that lacks one of them no longer aborts the search with a KeyError
        item.pop('repository', None)
        item.pop('concepturi', None)
    return results
def get_sitelink(qid):
    """Return the English Wikipedia page title linked from a Wikidata item.

    Uses the ``wbgetentities`` API action restricted to the enwiki sitelink, e.g.
    https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q42&props=sitelinks&format=json&sitefilter=enwiki

    :param qid: entity id, used both as the ``ids`` parameter and as the key
        into the ``entities`` object of the response
    :return: the enwiki page title, or None when the entity has no enwiki
        sitelink or the returned entity carries no ``sitelinks`` at all
    :raises requests.HTTPError: if the API answers with an error status
    :raises KeyError: if the response has no entry for ``qid``
    """
    params = {'action': 'wbgetentities',
              'ids': qid,
              'props': 'sitelinks',
              'sitefilter': "enwiki",
              'format': 'json'}
    r = requests.get("https://www.wikidata.org/w/api.php", params=params)
    r.raise_for_status()
    entity = r.json()['entities'][qid]
    # an entity stub without a 'sitelinks' key is treated like one without an
    # enwiki link instead of raising KeyError
    sitelinks = entity.get('sitelinks', {})
    if 'enwiki' in sitelinks:
        return sitelinks['enwiki']['title']
    return None
In [234]:
# Keep only the rows that have a brand name.
# NOTE(review): this rebinds df, and the cell's execution count (In [234]) is
# higher than that of the cells below it, so the saved notebook was not run top to bottom.
df = df[df.brand_name.notnull()]
In [189]:
# Inspect one example record: the row at positional index 12.
row = df.iloc[12]
row
Out[189]:
In [198]:
# Distinct, non-null brand names of the rows whose rxcui has no Wikidata QID.
# 1754 brand names. search in wikidata for existing pages
brand_names = set(df.loc[df.rxcui_brandname_qid.isnull(), 'brand_name'].dropna())
Out[198]:
In [201]:
# brand name -> first Wikidata search hit
brandname_search = {}
# brand name -> accepted search hit; starts out empty
brandname_match = {}
for brand_name in tqdm(brand_names):
    r = search_wikidata(brand_name)
    if not r:
        # the search returned nothing for this brand name
        continue
    brandname_search[brand_name] = r[0]
In [ ]:
# Go through the first search hit of every brand name and record the accepted
# ones in brandname_match. Brand names already in brandname_match are skipped,
# so re-running the cell continues where it stopped.
# NOTE(review): indentation was lost in this export, so it cannot be told from
# here whether the `else:` below pairs with the label comparison or with the
# description check - confirm against the notebook before relying on it.
for brand_name, r in tqdm(brandname_search.items()):
if brand_name in brandname_match:
continue
descr = r.get("description")
# NOTE(review): r.get("label") presumably can be None for a hit without a
# label, in which case .lower() would raise - confirm.
if brand_name.lower() == r.get("label").lower():
# a hit described as "pharmaceutical product", or with no description, is accepted without asking
if descr in {"pharmaceutical product", None}:
brandname_match[brand_name] = r
else:
# show the candidate and wait for keyboard input; only the answer "y" accepts it
print(brand_name)
print(r.get("label"))
print(descr)
cont=input()
if cont == "y":
brandname_match[brand_name] = r
In [256]:
# lower-cased brand name -> 'title' of the accepted search hit
brandname_qid = {name.lower(): hit['title'] for name, hit in brandname_match.items()}
# Hand-picked entries to drop again (presumably wrong matches - confirm).
# Each key must be present, otherwise del raises KeyError.
for w in ('oseni', 'canasa'):
    del brandname_qid[w]
for w in {"Renova", "Sprix", "Kadian", "Makena", "Nesina"}:
    del brandname_qid[w.lower()]
In [261]:
# Add the RxNorm CUI (P3345) statement to Wikidata for the rows that have no
# rxcui-matched QID, once per brand name.
# NOTE(review): indentation was lost in this export; whether the set_label and
# set_description lines below belong to the `else:` branch or run for every
# item cannot be determined from here - confirm against the notebook before running.
done = set()  # brand names already handled in this run
for _, row in tqdm(df[df.rxcui_brandname_qid.isnull()].iterrows()):
# use the RxNorm label when it is truthy, otherwise the openFDA brand name
# NOTE(review): a NaN label would be truthy and then fail at .lower() - confirm the column holds None
brand_name = row.brand_name_rxnorm if row.brand_name_rxnorm else row.brand_name
if brand_name in done:
continue
done.add(brand_name)
rxcui = row.rxcui_brandname
# only the first product NDC of the row goes into the reference
product_ndc = row.product_ndc[0]
refs = [[
wdi_core.WDItemID(value='Q22907487', prop_nr='P248', is_reference=True), # stated in OpenFDA
wdi_core.WDExternalID(value=product_ndc, prop_nr='P3640', is_reference=True), # product_ndc
wdi_core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'), prop_nr='P813', is_reference=True) # retrieved (current local date)
]]
# rxnorm
data = [wdi_core.WDString(prop_nr="P3345", value=rxcui, references=refs)]
# QID from the name-search matches, keyed by lower-cased brand name
qid = brandname_qid.get(brand_name.lower())
if qid:
print(qid)
item = wdi_core.WDItemEngine(wd_item_id=qid, domain='drugs', data=data, append_value=['P31'])
# add the brand name as an alias unless it already is the label or an alias (case-insensitive)
if brand_name.lower() != item.get_label().lower() and brand_name.lower() not in map(str.lower, item.get_aliases()):
item.set_aliases([brand_name], append=True)
else:
# no QID from the name search: no wd_item_id is passed, so presumably
# WDItemEngine creates a new item - confirm
data.append(wdi_core.WDItemID(value='Q28885102', prop_nr='P31')) # pharmaceutical product
item = wdi_core.WDItemEngine(item_name='drug', domain='drugs', data=data, append_value=['P31'])
item.set_label(brand_name)
if not item.get_description():
item.set_description("pharmaceutical product")
try:
item.write(login)
except Exception:
# NOTE(review): every write error is discarded silently here; consider
# recording the brand name and the exception so failed writes can be retried
pass
In [184]:
In [185]:
In [186]:
Out[186]:
In [ ]: