In [52]:
import ast
import gzip
import json
import os
import pickle
from collections import Counter
from functools import lru_cache
from itertools import chain

import pandas as pd
import requests
from tqdm import tqdm
# Allow up to 30 columns to be shown when a wide DataFrame is displayed.
pd.set_option("display.max_columns", 30)

import sys, os
# Prepend two local source checkouts to the import path; presumably this is so
# the `wikidataintegrator` / `scheduled_bots` imports below resolve to them.
# NOTE(review): both are absolute, machine-specific paths -- on any other
# machine these inserts are no-ops and the packages must be installed instead.
sys.path.insert(0, "/home/gstupp/projects/WikidataIntegrator")
sys.path.insert(0, "/home/gstupp/projects/wikidata-biothings/scheduled_bots")
# Project imports: these must stay below the sys.path edits above.
from wikidataintegrator import wdi_helpers
from scheduled_bots.drugs.chemlib import create_item
from scheduled_bots.drugs.chemspider import ChemSpiderMolecule
from scheduled_bots.drugs.unii import UNIIMolecule

In [53]:
# Load the openFDA table; the first CSV column is used as the index.
df = pd.read_csv("openfda_single.csv", index_col=0)
# Each `unii` cell is read back as text and has to be turned into a Python list
# again. Use ast.literal_eval instead of eval: it only accepts Python literals,
# so nothing inside the CSV can be executed as code.
df["unii"] = df["unii"].map(ast.literal_eval)
df.head()


Out[53]:
application_number brand_name generic_name is_original_packager manufacturer_name nui original_packager_product_ndc package_ndc pharm_class_cs pharm_class_epc pharm_class_moa pharm_class_pe product_ndc product_type route rxcui rxcui_brandname rxcui_ingredient spl_id spl_set_id substance_name unii upc
3318 ANDA077844 ABACAVIR ABACAVIR SULFATE [] AMERICAN HEALTH PACKAGING [] ['65862-073'] ['68084-021-21', '68084-021-11'] [] [] [] [] ['68084-021'] HUMAN PRESCRIPTION DRUG ['ORAL'] ['242679'] 221017 190521 5CDBCF4A-4B9B-5549-E053-2991AA0A6550 01E46F58-8BDA-4FF3-AB21-57D5B540D440 ['ABACAVIR SULFATE'] [J220T4J9Q2] []
5846 NDA021652 ABACAVIR AND LAMIVUDINE ABACAVIR SULFATE AND LAMIVUDINE [True] PRASCO LABORATORIES ['N0000175459', 'N0000175656', 'N0000009947', ... [] ['66993-482-30'] ['Nucleoside Analog [Chemical/Ingredient]'] ['Hepatitis B Virus Nucleoside Analog Reverse ... ['Nucleoside Reverse Transcriptase Inhibitors ... [] ['66993-482'] HUMAN PRESCRIPTION DRUG ['ORAL'] ['602393'] 497184 614534 704ACC32-038D-4B75-907D-96E4F5839EA4 133490FC-26EB-4C92-A21D-52BE4C226B74 ['LAMIVUDINE', 'ABACAVIR SULFATE'] [J220T4J9Q2, 2T8Q726O95] []
6415 ANDA202912 ABACAVIR, LAMIVUDINE AND ZIDOVUDINE ABACAVIR , LAMIVUDINE AND ZIDOVUDINE [True] LUPIN PHARMACEUTICALS, INC. ['N0000175459', 'N0000175656', 'N0000009947', ... [] ['68180-286-07', '68180-286-01', '68180-286-02'] ['Nucleoside Analog [Chemical/Ingredient]'] ['Hepatitis B Virus Nucleoside Analog Reverse ... ['Nucleoside Reverse Transcriptase Inhibitors ... [] ['68180-286'] HUMAN PRESCRIPTION DRUG ['ORAL'] ['307650'] 284904 284620 E1E30726-5522-4214-B103-A016A08FDFCB F5F7C0D9-A247-4308-8269-32ACCC490EA6 ['LAMIVUDINE', 'ZIDOVUDINE', 'ABACAVIR SULFATE'] [J220T4J9Q2, 2T8Q726O95, 4B9XT59T7S] []
104 NDA021436 ABILIFY ARIPIPRAZOLE [] TYA PHARMACEUTICALS ['N0000175430'] ['59148-009'] ['64725-0009-1'] [] ['Atypical Antipsychotic [EPC]'] [] [] ['64725-0009'] HUMAN PRESCRIPTION DRUG ['ORAL'] ['352308', '349490'] 352393 89013 1C1E7873-4FBF-4033-94D7-9FD8416B828E D0640208-44E9-4052-A56F-9A4CD9A5AAAB ['ARIPIPRAZOLE'] [82VFR53I78] []
3062 NDA022510 ABSTRAL FENTANYL CITRATE [True] GALENA BIOPHARMA, INC. [] [] ['57881-334-04', '57881-333-32', '57881-332-32... [] [] [] [] ['57881-338', '57881-331', '57881-333', '57881... HUMAN PRESCRIPTION DRUG ['SUBLINGUAL'] ['1053652', '1053651', '1053658', '1053657', '... 1053648 4337 621FD88B-B921-47A2-94BD-A778D6005353 F969E2BC-6297-4E29-89D3-A3685A2C7C6B ['FENTANYL CITRATE'] [MUN5LYG46H] []

In [54]:
## First step: make sure every ingredient/substance has a UNII and already exists in Wikidata

In [55]:
def _unique_qid_map(prop):
    """Map external ids of Wikidata property `prop` to their item QID.

    Ids that are attached to more than one item are dropped, so every value
    in the returned dict is a single, unambiguous QID.
    """
    id_to_qids = wdi_helpers.id_mapper(prop, return_as_set=True)
    return {
        ext_id: next(iter(qids))
        for ext_id, qids in id_to_qids.items()
        if len(qids) == 1
    }


# UNII -> QID, used to decide which openFDA ingredients are already in Wikidata.
unii_qid = _unique_qid_map("P652")
# RxNorm -> QID and PubChem -> QID. These two lookups are read by the
# "no InChI key" report cell further down, so they have to be built here;
# leaving them commented out makes that cell raise NameError on a fresh kernel.
rxnorm_qid = _unique_qid_map("P3345")
pubchem_qid = _unique_qid_map("P662")

In [56]:
# Every distinct UNII mentioned in any row (each `unii` cell is an iterable of ids).
fda_unii = set(chain.from_iterable(df.unii))
print(len(fda_unii))
# How many of those UNIIs are already keys of the UNII -> QID lookup.
already_mapped = fda_unii.intersection(unii_qid)
print(len(already_mapped))


1159
1115

In [57]:
# UNIIs seen in openFDA that have no entry in the UNII -> QID lookup:
# these are the ingredients that still have to be created.
todo_unii = fda_unii - unii_qid.keys()
# Creating them needs InChI keys, which are not known at this point.
# Show a small sample and the total count.
sample_of_todo = list(todo_unii)[:10]
print(sample_of_todo)
print(len(todo_unii))


['P4SG24WI5Q', '8L6LAK9BTR', '9S44LIC7OJ', '4B3SC438HI', '18EAY4870E', '12M44VTJ7B', '3U9A0FE9N5', '914032762Y', '269K6498LD', 'EAO03PE1TC']
44

In [58]:
# Report, and remove from `todo_unii`, every UNII whose molecule has no
# standard InChI key. Relies on `rxnorm_qid` and `pubchem_qid` being defined
# in the kernel (the P3345 / P662 id_mapper lookups).
for unii in tuple(todo_unii):  # snapshot: the set is mutated inside the loop
    mol = UNIIMolecule(unii=unii)
    inchikey = mol.stdinchikey
    if inchikey and not pd.isnull(inchikey):
        # Has a usable InChI key: keep it in the to-do set.
        continue
    print(
        mol.label,
        mol.unii,
        mol.rxnorm,
        rxnorm_qid.get(mol.rxnorm),
        mol.pubchem,
        pubchem_qid.get(mol.pubchem),
    )
    todo_unii.discard(unii)
print(len(todo_unii))


colesevelam hydrochloride P4SG24WI5Q 141625 None None None
conjugated synthetic bestrogens 8L6LAK9BTR 618365 None None None
methylphenidate hydrochloride 4B3SC438HI 203188 None 154100 None
mipomersen sodium 18EAY4870E 1367838 None 118984460 None
dalteparin sodium 12M44VTJ7B 82137 None None None
doxepin hydrochloride 3U9A0FE9N5 203179 None None None
pentosan polysulfate sodium 914032762Y 134413 None None None
trimipramine maleate 269K6498LD 71532 None 5282318 Q27116374
bacitracin zinc 89Y4M234ES 11417 None None None
gatifloxacin L4618BD7KJ 228476 None 5282384 None
polidocanol 0AWH8BFG9A 968170 None None None
unspecified formsodium citrate 1Q73Q2JULR 253204 None None None
conjugated synthetic aestrogens JM2621P2LS 253166 None None None
pancrelipase protease 3560D81V50 1427034 None None None
valganciclovir hydrochloride 4P3T9QF9NZ 283815 None None None
doxylamine succinate V9BI9B5YI2 23665 None 11224 None
gentamicin sulfate 8X7386QRLV 1870193 None None None
pantoprazole sodium 6871619Q5X 236632 None 11954257 None
clidinium bromide 91ZQW5JF1Z 48212 None 19004 None
colestipol hydrochloride X7D10K905G 104485 None None None
glucagon hydrochloride 1H87NVF4DB 253170 None None None
capreomycin sulfate 9H8D3J7V21 1987 None None None
lactulose 9U7D5QH5AE 6218 None None None
pegaptanib sodium 3HP012Q0FH 594119 None None None
phytonadione A034SE7857 None None None None
unspecified formdibasicsodium phosphate GR686LBA74 236719 None None None
ovinehyaluronidase 64R4OHP8T0 486164 None None None
pramlintide acetate 726I6TE06G 356773 None 71306803 None
α-tocopherol acetate 9E8X80D2L0 1046243 None None None
pancrelipase lipase 8MYC33932O 6406 None None None
ipratropium bromide J697UZ2A9J 1309404 None None None
omega-3-acid ethyl esters D87YGH4Z0Q 484348 None 9831414 None
lanthanum carbonate 490D9F069T 234416 None 176168 None
colistin sulfate WP15DXU577 2710 None None None
esterifiedestrogens 3ASP8Q3768 214549 None None None
crotamiton D6S4O4XD0H 21766 None None None
neomycin sulfate 057Y626693 7300 None None None
deutetrabenazine P341G6W9NB 1876905 None 73442840 None
ziconotide acetate T2I226K69M 486126 None 72941949 None
cefotetan disodium 0GXP746VXB 203141 None None None
sodium polystyrene sulfonate 1699G8679Z 56512 None None None
3

In [60]:
# The UNIIs still left to create, in a stable (sorted) order.
print(sorted(todo_unii))


['9044SC542W', '9S44LIC7OJ', 'EAO03PE1TC']

In [ ]:
# For each remaining UNII, look up its standard InChI key and hand it to
# `create_item` (from scheduled_bots.drugs.chemlib; presumably this creates the
# Wikidata item -- confirm). Any failure is printed and the loop moves on, so
# one bad compound does not stop the rest.
for unii in sorted(todo_unii):
    try:
        inchi_key = UNIIMolecule(unii=unii).stdinchikey
        print(inchi_key)
        create_item(inchi_key)
    except Exception as err:
        print(err)
        print("failed: {}".format(unii))

In [ ]: