In [63]:
import ast
import json
import gzip
import os
import pickle
import time
from functools import lru_cache
from collections import Counter
from itertools import chain

import pandas as pd
import requests
from tqdm import tqdm
pd.set_option("display.max_columns", 30)

import sys, os
sys.path.insert(0, "/home/gstupp/projects/WikidataIntegrator")
sys.path.insert(0, "/home/gstupp/projects/wikidata-biothings/scheduled_bots")
from wikidataintegrator import wdi_helpers
from scheduled_bots.drugs.chemlib import create_item
from scheduled_bots.drugs.chemspider import ChemSpiderMolecule
from scheduled_bots.drugs.unii import UNIIMolecule

from wikidataintegrator import wdi_core, wdi_login

In [64]:
# Credentials come from scheduled_bots/local.py when it is importable; otherwise
# both WDUSER and WDPASS have to be present in the environment.
try:
    from scheduled_bots.local import WDUSER, WDPASS
except ImportError:
    if not ("WDUSER" in os.environ and "WDPASS" in os.environ):
        raise ValueError("WDUSER and WDPASS must be specified in local.py or as environment variables")
    WDUSER, WDPASS = os.environ['WDUSER'], os.environ['WDPASS']

# Session object handed to item.write() further down.
login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)


Successfully logged in as ProteinBoxBot

In [73]:
# Load the openFDA table; the first CSV column becomes the index.
df = pd.read_csv("openfda_single.csv", index_col=0)
# `unii` and `product_ndc` are stored as text and parsed back into Python objects.
# ast.literal_eval is used instead of eval(): it accepts only Python literals, so
# a corrupted or malicious CSV cell cannot execute arbitrary code.
# NOTE(review): assumes every cell in these two columns is a literal such as
# "['J220T4J9Q2']" -- confirm against whatever wrote openfda_single.csv.
df["unii"] = df["unii"].map(ast.literal_eval)
df["product_ndc"] = df["product_ndc"].map(ast.literal_eval)
df.head()


Out[73]:
application_number brand_name generic_name is_original_packager manufacturer_name nui original_packager_product_ndc package_ndc pharm_class_cs pharm_class_epc pharm_class_moa pharm_class_pe product_ndc product_type route rxcui rxcui_brandname rxcui_ingredient spl_id spl_set_id substance_name unii upc
3318 ANDA077844 ABACAVIR ABACAVIR SULFATE [] AMERICAN HEALTH PACKAGING [] ['65862-073'] ['68084-021-21', '68084-021-11'] [] [] [] [] [68084-021] HUMAN PRESCRIPTION DRUG ['ORAL'] ['242679'] 221017 190521 5CDBCF4A-4B9B-5549-E053-2991AA0A6550 01E46F58-8BDA-4FF3-AB21-57D5B540D440 ['ABACAVIR SULFATE'] [J220T4J9Q2] []
5846 NDA021652 ABACAVIR AND LAMIVUDINE ABACAVIR SULFATE AND LAMIVUDINE [True] PRASCO LABORATORIES ['N0000175459', 'N0000175656', 'N0000009947', ... [] ['66993-482-30'] ['Nucleoside Analog [Chemical/Ingredient]'] ['Hepatitis B Virus Nucleoside Analog Reverse ... ['Nucleoside Reverse Transcriptase Inhibitors ... [] [66993-482] HUMAN PRESCRIPTION DRUG ['ORAL'] ['602393'] 497184 614534 704ACC32-038D-4B75-907D-96E4F5839EA4 133490FC-26EB-4C92-A21D-52BE4C226B74 ['LAMIVUDINE', 'ABACAVIR SULFATE'] [J220T4J9Q2, 2T8Q726O95] []
6415 ANDA202912 ABACAVIR, LAMIVUDINE AND ZIDOVUDINE ABACAVIR , LAMIVUDINE AND ZIDOVUDINE [True] LUPIN PHARMACEUTICALS, INC. ['N0000175459', 'N0000175656', 'N0000009947', ... [] ['68180-286-07', '68180-286-01', '68180-286-02'] ['Nucleoside Analog [Chemical/Ingredient]'] ['Hepatitis B Virus Nucleoside Analog Reverse ... ['Nucleoside Reverse Transcriptase Inhibitors ... [] [68180-286] HUMAN PRESCRIPTION DRUG ['ORAL'] ['307650'] 284904 284620 E1E30726-5522-4214-B103-A016A08FDFCB F5F7C0D9-A247-4308-8269-32ACCC490EA6 ['LAMIVUDINE', 'ZIDOVUDINE', 'ABACAVIR SULFATE'] [J220T4J9Q2, 2T8Q726O95, 4B9XT59T7S] []
104 NDA021436 ABILIFY ARIPIPRAZOLE [] TYA PHARMACEUTICALS ['N0000175430'] ['59148-009'] ['64725-0009-1'] [] ['Atypical Antipsychotic [EPC]'] [] [] [64725-0009] HUMAN PRESCRIPTION DRUG ['ORAL'] ['352308', '349490'] 352393 89013 1C1E7873-4FBF-4033-94D7-9FD8416B828E D0640208-44E9-4052-A56F-9A4CD9A5AAAB ['ARIPIPRAZOLE'] [82VFR53I78] []
3062 NDA022510 ABSTRAL FENTANYL CITRATE [True] GALENA BIOPHARMA, INC. [] [] ['57881-334-04', '57881-333-32', '57881-332-32... [] [] [] [] [57881-338, 57881-331, 57881-333, 57881-332, 5... HUMAN PRESCRIPTION DRUG ['SUBLINGUAL'] ['1053652', '1053651', '1053658', '1053657', '... 1053648 4337 621FD88B-B921-47A2-94BD-A778D6005353 F969E2BC-6297-4E29-89D3-A3685A2C7C6B ['FENTANYL CITRATE'] [MUN5LYG46H] []

In [155]:
def get_label_from_rxcui(rxcui):
    """Return the name RxNav reports for ``rxcui``, or None if it reports none.

    Requests ``https://rxnav.nlm.nih.gov/REST/rxcui/<rxcui>/properties.json``.

    :param rxcui: identifier substituted into the URL with ``str.format``.
    :return: the ``name`` entry of the response's ``properties`` object, or None
        when the response is empty or has no ``properties`` / ``name``.
    :raises requests.HTTPError: when the server answers with a 4xx/5xx status.
    """
    url = "https://rxnav.nlm.nih.gov/REST/rxcui/{}/properties.json".format(rxcui)
    r = requests.get(url)
    # Fail loudly on HTTP errors, like search_wikidata / get_sitelink do.
    r.raise_for_status()
    d = r.json()
    # A non-empty response without 'properties' used to raise KeyError; treat it
    # the same as an empty response.
    properties = d.get('properties') if d else None
    if properties:
        return properties.get('name')
    return None
# Cache of rxcui -> label. Reuse the dict left over from an earlier run of this
# cell when there is one (the lookups below take minutes), otherwise start with
# an empty dict so the cell also runs on a fresh kernel instead of raising
# NameError.
rxcui_label = globals().get("rxcui_label", dict())
for rxcui in tqdm(set(df.rxcui_brandname)):
    if rxcui not in rxcui_label:
        rxcui_label[rxcui] = get_label_from_rxcui(rxcui)


100%|██████████| 1513/1513 [03:40<00:00,  6.87it/s] 

In [156]:
# Look up the RxNorm label for each brand-name rxcui, because the openFDA
# brand_name is often of poor quality. Example lookup:
# https://rxnav.nlm.nih.gov/REST/rxcui/497184/properties.json
df['brand_name_rxnorm'] = df['rxcui_brandname'].map(lambda rxcui: rxcui_label.get(rxcui))

In [157]:
# Map P3345 values to Wikidata QIDs, keeping only values that point at exactly
# one item.
rxnorm_qid = wdi_helpers.id_mapper("P3345", return_as_set=True)
rxnorm_qid = {
    rxcui: next(iter(qids))
    for rxcui, qids in rxnorm_qid.items()
    if len(qids) == 1
}

In [158]:
# QID for each brand-name rxcui (None where rxnorm_qid has no entry), followed by
# the total number of rows.
df['rxcui_brandname_qid'] = df['rxcui_brandname'].map(lambda rxcui: rxnorm_qid.get(rxcui))
df['rxcui_brandname_qid'].size


Out[158]:
2042

In [159]:
# Number of rows that already have a QID for their brand-name rxcui.
df['rxcui_brandname_qid'].dropna().size


Out[159]:
285

In [166]:
def search_wikidata(s):
    """Search Wikidata items for the string ``s`` with ``wbsearchentities``.

    Example request:
    https://www.wikidata.org/w/api.php?action=wbsearchentities&search=ABSTRAL&language=en&type=item&format=json

    :param s: search string, sent as the ``search`` parameter (language ``en``,
        type ``item``).
    :return: the response's ``search`` list. Each hit is modified in place: its
        ``id`` is prefixed with ``"wd:"`` and the ``repository`` and
        ``concepturi`` keys are removed.
    :raises requests.HTTPError: when the server answers with a 4xx/5xx status.
    :raises KeyError: when the response has no ``search`` key.
    """
    params = {'action': 'wbsearchentities',
              'language': 'en',
              'search': s,
              'type': "item",
              'format': 'json'}
    r = requests.get("https://www.wikidata.org/w/api.php", params=params)
    r.raise_for_status()
    d = r.json()
    dataPage = d['search']
    for item in dataPage:
        item['id'] = "wd:" + item['id']
        # pop() rather than del: a hit that lacks one of these keys should not
        # abort the whole search with a KeyError.
        item.pop('repository', None)
        item.pop('concepturi', None)
    return dataPage

def get_sitelink(qid):
    """Return the English Wikipedia page title linked from ``qid``, or None.

    Example request:
    https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q42&props=sitelinks&format=json&sitefilter=enwiki

    :param qid: entity id, used both as the ``ids`` parameter and as the key
        into the response's ``entities`` mapping.
    :return: ``sitelinks['enwiki']['title']`` when an ``enwiki`` sitelink is
        present, otherwise None.
    """
    query = dict(
        action='wbgetentities',
        ids=qid,
        props='sitelinks',
        sitefilter="enwiki",
        format='json',
    )
    response = requests.get("https://www.wikidata.org/w/api.php", params=query)
    response.raise_for_status()
    sitelinks = response.json()['entities'][qid]['sitelinks']
    # Membership test (not .get) so any container type is tolerated here.
    if 'enwiki' not in sitelinks:
        return None
    return sitelinks['enwiki']['title']

In [234]:
# Keep only the rows that have a brand name.
df = df.loc[df['brand_name'].notnull()]

In [189]:
# Inspect one record (position 12) to see what a row looks like at this point.
row = df.iloc[12, :]
row


Out[189]:
application_number                                                       NDA020184
brand_name                                                                   ACEON
generic_name                                                  PERINDOPRIL ERBUMINE
is_original_packager                                                        [True]
manufacturer_name                                                    XOMA (US) LLC
nui                                                                             []
original_packager_product_ndc                                                   []
package_ndc                       ['76234-001-01', '76234-002-01', '76234-000-01']
pharm_class_cs                                                                  []
pharm_class_epc                                                                 []
pharm_class_moa                                                                 []
pharm_class_pe                                                                  []
product_ndc                                      [76234-000, 76234-001, 76234-002]
product_type                                               HUMAN PRESCRIPTION DRUG
route                                                                     ['ORAL']
rxcui                            ['854988', '854986', '854984', '854990', '8549...
rxcui_brandname                                                             261438
rxcui_ingredient                                                             72260
spl_id                                        C5B3AF32-3502-4828-8DE2-CCCB3F2A3994
spl_set_id                                    97A9E8BF-91ED-4256-97D4-F9504BB01373
substance_name                                            ['PERINDOPRIL ERBUMINE']
unii                                                                  [1964X464OJ]
upc                                                              ['0376234002012']
rxcui_brandname_qid                                                           None
brand_name_rxnorm                                                            Aceon
Name: 4330, dtype: object

In [198]:
# Distinct brand names of the rows whose brand-name rxcui has no Wikidata item
# yet; these are searched for in Wikidata next to find existing pages.
brand_names = set(df.loc[df['rxcui_brandname_qid'].isnull(), 'brand_name'].dropna())


Out[198]:
1754

In [201]:
# brandname_search: brand name -> top Wikidata search hit (names without any hit
# are left out). brandname_match starts empty and is filled in by the review
# loop that follows.
brandname_search = {}
brandname_match = {}
for brand_name in tqdm(brand_names):
    r = search_wikidata(brand_name)
    if not r:
        continue
    brandname_search[brand_name] = r[0]


100%|██████████| 1512/1512 [13:09<00:00,  1.91it/s]

In [ ]:
# Decide, for each top search hit, whether it really is the brand-name item.
# A hit is accepted automatically when its label equals the brand name
# (case-insensitively) and its description is "pharmaceutical product" or absent;
# a label match with any other description is shown and confirmed by typing "y".
for brand_name, r in tqdm(brandname_search.items()):
    if brand_name in brandname_match:
        # already decided in an earlier (partial) run of this cell
        continue
    label = r.get("label")
    # r.get() returns None when the hit has no 'label' key; skip such hits
    # instead of crashing on None.lower().
    if label is None or brand_name.lower() != label.lower():
        continue
    descr = r.get("description")
    if descr in {"pharmaceutical product", None}:
        brandname_match[brand_name] = r
    else:
        print(brand_name)
        print(label)
        print(descr)
        cont = input()
        # accept "y" regardless of case or surrounding whitespace
        if cont.strip().lower() == "y":
            brandname_match[brand_name] = r

In [256]:
# lower-cased brand name -> QID (the 'title' field of the accepted search hit)
brandname_qid = {k.lower(): v['title'] for k, v in brandname_match.items()}
# Brand names removed from the mapping by hand.
# NOTE(review): presumably false matches found on manual inspection -- confirm.
excluded_brand_names = {"Oseni", "Canasa", "Renova", "Sprix", "Kadian", "Makena", "Nesina"}
for w in excluded_brand_names:
    # pop() with a default: on a re-run where one of these names was never
    # accepted, `del` would raise KeyError.
    brandname_qid.pop(w.lower(), None)

In [261]:
# add the rxnorm statements
done = set()
for _, row in tqdm(df[df.rxcui_brandname_qid.isnull()].iterrows()):
    brand_name = row.brand_name_rxnorm if row.brand_name_rxnorm else row.brand_name
    if brand_name in done:
        continue
    done.add(brand_name)
    rxcui = row.rxcui_brandname
    product_ndc = row.product_ndc[0]
    refs = [[
        wdi_core.WDItemID(value='Q22907487', prop_nr='P248', is_reference=True),  # stated in OpenFDA
        wdi_core.WDExternalID(value=product_ndc, prop_nr='P3640', is_reference=True),  # product_ndc
        wdi_core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'), prop_nr='P813', is_reference=True)  # retrieved
    ]]
    # rxnorm
    data = [wdi_core.WDString(prop_nr="P3345", value=rxcui, references=refs)]
    qid = brandname_qid.get(brand_name.lower())
    if qid:
        print(qid)
        item = wdi_core.WDItemEngine(wd_item_id=qid, domain='drugs', data=data, append_value=['P31'])
        if brand_name.lower() != item.get_label().lower() and brand_name.lower() not in map(str.lower, item.get_aliases()):
            item.set_aliases([brand_name], append=True)
    else:
        data.append(wdi_core.WDItemID(value='Q28885102', prop_nr='P31'))  # pharmaceutical product
        item = wdi_core.WDItemEngine(item_name='drug', domain='drugs', data=data, append_value=['P31'])
        item.set_label(brand_name)
    if not item.get_description():
        item.set_description("pharmaceutical product")
    try:
        item.write(login)
    except Exception:
        pass


0it [00:00, ?it/s]
Q4669977
3it [00:01,  2.20it/s]
Error while writing to Wikidata
4it [00:01,  2.40it/s]
Error while writing to Wikidata
5it [00:01,  2.65it/s]
Error while writing to Wikidata
6it [00:02,  2.63it/s]
Error while writing to Wikidata
8it [00:02,  2.87it/s]
Error while writing to Wikidata
Error while writing to Wikidata
10it [00:03,  2.94it/s]
Error while writing to Wikidata
Error while writing to Wikidata
13it [00:03,  3.43it/s]
Error while writing to Wikidata
Error while writing to Wikidata
14it [00:03,  3.51it/s]
Error while writing to Wikidata
15it [00:04,  3.47it/s]
Error while writing to Wikidata
Q2898632
17it [00:05,  3.24it/s]
Error while writing to Wikidata
19it [00:05,  3.46it/s]
Error while writing to Wikidata
22it [00:05,  3.74it/s]
Error while writing to Wikidata
Error while writing to Wikidata
23it [00:06,  3.77it/s]
Error while writing to Wikidata
25it [00:06,  3.84it/s]
Error while writing to Wikidata
Error while writing to Wikidata
27it [00:06,  3.92it/s]
Error while writing to Wikidata
Error while writing to Wikidata
29it [00:07,  3.99it/s]
Error while writing to Wikidata
Error while writing to Wikidata
31it [00:07,  4.05it/s]
Error while writing to Wikidata
Error while writing to Wikidata
32it [00:08,  3.94it/s]
Error while writing to Wikidata
34it [00:08,  4.05it/s]
Error while writing to Wikidata
35it [00:08,  4.01it/s]
Error while writing to Wikidata
37it [00:09,  4.10it/s]
Error while writing to Wikidata
38it [00:09,  4.06it/s]
Error while writing to Wikidata
39it [00:09,  4.04it/s]
Error while writing to Wikidata
40it [00:09,  4.01it/s]
Error while writing to Wikidata
42it [00:10,  4.01it/s]
Error while writing to Wikidata
Error while writing to Wikidata
43it [00:10,  4.00it/s]
Error while writing to Wikidata
44it [00:11,  3.93it/s]
Error while writing to Wikidata
45it [00:11,  3.91it/s]
Error while writing to Wikidata
46it [00:11,  3.89it/s]
Error while writing to Wikidata
47it [00:12,  3.91it/s]
Error while writing to Wikidata
48it [00:12,  3.91it/s]
Error while writing to Wikidata
49it [00:12,  3.89it/s]
Error while writing to Wikidata
50it [00:12,  3.91it/s]
Error while writing to Wikidata
52it [00:13,  3.86it/s]
Error while writing to Wikidata
Error while writing to Wikidata
53it [00:13,  3.85it/s]
Error while writing to Wikidata
56it [00:13,  4.01it/s]
Error while writing to Wikidata
57it [00:14,  3.94it/s]
Error while writing to Wikidata
59it [00:15,  3.92it/s]
Error while writing to Wikidata
Error while writing to Wikidata
62it [00:15,  4.07it/s]
Error while writing to Wikidata
63it [00:15,  4.07it/s]
Error while writing to Wikidata
65it [00:15,  4.09it/s]
Error while writing to Wikidata
Error while writing to Wikidata
67it [00:16,  4.12it/s]
Error while writing to Wikidata
Error while writing to Wikidata
71it [00:16,  4.31it/s]
Error while writing to Wikidata
73it [00:16,  4.32it/s]
Error while writing to Wikidata
Error while writing to Wikidata
74it [00:17,  4.27it/s]
Error while writing to Wikidata
75it [00:17,  4.25it/s]
Error while writing to Wikidata
76it [00:17,  4.23it/s]
Error while writing to Wikidata
77it [00:18,  4.24it/s]
Error while writing to Wikidata
78it [00:18,  4.24it/s]
Error while writing to Wikidata
80it [00:18,  4.30it/s]
Error while writing to Wikidata
81it [00:18,  4.29it/s]
Error while writing to Wikidata
82it [00:19,  4.30it/s]
Error while writing to Wikidata
83it [00:19,  4.26it/s]
Error while writing to Wikidata
84it [00:19,  4.24it/s]
Error while writing to Wikidata
85it [00:20,  4.25it/s]
Error while writing to Wikidata
87it [00:20,  4.25it/s]
Error while writing to Wikidata
Error while writing to Wikidata
88it [00:20,  4.25it/s]
Error while writing to Wikidata
Q29005839
92it [00:22,  4.13it/s]
Error while writing to Wikidata
93it [00:22,  4.12it/s]
Error while writing to Wikidata
94it [00:22,  4.11it/s]
Error while writing to Wikidata
96it [00:23,  4.09it/s]
Error while writing to Wikidata
Error while writing to Wikidata
98it [00:23,  4.09it/s]
Error while writing to Wikidata
Error while writing to Wikidata
99it [00:24,  4.09it/s]
Error while writing to Wikidata
100it [00:24,  4.10it/s]
Error while writing to Wikidata
101it [00:24,  4.09it/s]
Error while writing to Wikidata
103it [00:25,  4.10it/s]
Error while writing to Wikidata
Error while writing to Wikidata
104it [00:25,  4.11it/s]
Error while writing to Wikidata
106it [00:25,  4.10it/s]
Error while writing to Wikidata
Error while writing to Wikidata
108it [00:26,  4.09it/s]
Error while writing to Wikidata
Error while writing to Wikidata
110it [00:27,  4.06it/s]
Error while writing to Wikidata
Error while writing to Wikidata
111it [00:27,  4.04it/s]
Error while writing to Wikidata
113it [00:27,  4.04it/s]
Error while writing to Wikidata
Error while writing to Wikidata
114it [00:28,  4.05it/s]
Error while writing to Wikidata
270it [01:30,  2.97it/s]
Q4236471
333it [01:57,  2.83it/s]
Q5121317
389it [02:19,  2.78it/s]
Q10868995
438it [02:43,  2.68it/s]
Q5243626
458it [02:52,  2.65it/s]
Q44543
477it [03:02,  2.62it/s]
Q935761
558it [03:25,  2.72it/s]
Q5316129
594it [03:42,  2.67it/s]
Q5374814
598it [03:45,  2.66it/s]
Q5376364
670it [04:19,  2.59it/s]
Q5451189
757it [04:56,  2.55it/s]
Error while writing to Wikidata
791it [05:11,  2.54it/s]
Q17190774
808it [05:20,  2.52it/s]
Q6058432
882it [05:44,  2.56it/s]
Q6542719
926it [05:58,  2.58it/s]
Q4334524
944it [06:07,  2.57it/s]
Q10578714
995it [06:24,  2.59it/s]
Q6867721
1009it [06:30,  2.59it/s]
Q6718023
1010it [06:31,  2.58it/s]
Q6917999
1076it [06:53,  2.60it/s]
Q1371645
1081it [06:56,  2.60it/s]
Q16976940
1124it [07:10,  2.61it/s]
Q7070620
1194it [07:35,  2.62it/s]
Q29006315
1205it [07:43,  2.60it/s]
Q7069018
1307it [08:17,  2.63it/s]
Q22101847
1325it [08:23,  2.63it/s]
Q29006385
1461it [09:21,  2.60it/s]
Q7633143
1504it [09:39,  2.60it/s]
Q29006557
1558it [09:57,  2.61it/s]
Q7839565
1578it [10:04,  2.61it/s]
Error while writing to Wikidata
1582it [10:05,  2.61it/s]
Error while writing to Wikidata
1622it [10:19,  2.62it/s]
Q2297093
1651it [10:34,  2.60it/s]
Q18485880
1665it [10:41,  2.60it/s]
Q7943704
1677it [10:46,  2.60it/s]
Error while writing to Wikidata
1688it [10:50,  2.59it/s]
Q29006694
1712it [11:01,  2.59it/s]
Error while writing to Wikidata
1754it [11:16,  2.59it/s]

In [184]:


In [185]:


In [186]:



Out[186]:
'Q4672916'

In [ ]: