The annotations were performed by Dina Demner-Fushman's team in order to create a standard set to encourage the development of NLP tools for ADR annotation from FDA Structured Product labels.
The publication can be found here: https://www.nature.com/articles/sdata20181
We have been given explicit permission to import this data set so long as we make it clear that it is not an NLM-sanctioned gold standard database and that the curation was done by a small team of biocurators to the best of their ability and not by experts versed in pharmacology.
It should be noted that the dataset is not expected to change in the future; however, coverage of corresponding entries in Wikidata may grow over time so re-running the dataset on a schedule may help to improve representation of the dataset in Wikidata over time.
In [1]:
## Wikidata write/read machinery plus the custom reference handler that only
## refreshes the 'retrieved' date when a matching reference already exists
from wikidataintegrator import wdi_core, wdi_login, wdi_helpers
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs
import pandas as pd
from pandas import read_csv
import requests
from tqdm.notebook import trange, tqdm
import ipywidgets
import widgetsnbextension
import time
## Pipe-delimited FDA SPL ADR reference standard (200 drug labels)
datasrc = 'data/FinalReferenceStandard200Labels.csv'
## Every intermediate/result table in this notebook is written under here
exppath = 'results/'
In [ ]:
## Authenticate against Wikidata; credentials are read from a local,
## uncommitted wdi_user_config module rather than hard-coded here.
print("Logging in...")
import wdi_user_config ## Credentials stored in a wdi_user_config file
login_dict = wdi_user_config.get_credentials()
login = wdi_login.WDLogin(login_dict['WDUSER'], login_dict['WDPASS'])
In [2]:
## Load the annotation set. Pipe-delimited; MedDRA PT/LLT IDs are read as
## strings so leading zeros survive; empty cells become the string 'None'.
spl_adr_raw = read_csv(datasrc, delimiter="|", header=0, dtype={'Index':int,'PT ID':str,'LLT ID':str}).fillna('None')
print(spl_adr_raw.head(n=2))
The Risk Factor property is how adverse effects appear to currently be modeled in Wikidata. The disease entity is the subject, with risk factor as a predicate and the drug as the object. The diseases in this data set appear to be normalized to UMLS CUIs which aren't great due to one-to-many and many-to-one mappings, but we can filter those out and limit our dataset to just the ones that are unique.
Unfortunately, the DailyMed drug IDs don't appear to be in use in Wikidata yet, which means that the drugs will still need to be mapped to some extent. That said, only 200 drug labels were annotated in this data set, so manual mapping is not unreasonable. We'll try mapping via a SPARQL query, which can be quite stringent, and then attempt to manually map anything that fails.
In [3]:
## Retrieve the QIDs for each UMLS CUI ID (The property for UMLS CUI IDs is P2892)
sparqlQuery = "SELECT * WHERE {?topic wdt:P2892 ?CUI}"
result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)

## Format the data for analysis: one {CUI, QID} record per result binding.
## Each 'topic' value is a full entity URI; strip the prefix to get the bare QID.
wdmap = [
    {
        'UMLS CUI': binding["CUI"]["value"],
        'disease_WDID': binding["topic"]["value"].replace("http://www.wikidata.org/entity/", ""),
    }
    for binding in result["results"]["bindings"]
]
wdid_umls_all = pd.DataFrame(wdmap)

## Drop any entries that are not of interest: keep only mappings whose CUI
## actually appears in the annotation set, then persist the crosswalk.
umls_cui_list = spl_adr_raw['UMLS CUI'].unique().tolist()
wdid_umls_df = wdid_umls_all.loc[wdid_umls_all['UMLS CUI'].isin(umls_cui_list)]
wdid_umls_df.to_csv(exppath+'cui_wdid_xref.tsv', sep='\t', header=True)
In [ ]:
## Reload the saved CUI -> QID crosswalk so the SPARQL step can be skipped on re-runs
wdid_umls_df = read_csv(exppath+'cui_wdid_xref.tsv',delimiter='\t',header=0,index_col=0)
In [4]:
## Exclude entities with one to many OR many to one mappings.
## keep=False drops EVERY row of a duplicate group, so any QID or CUI that
## participates in an ambiguous (non 1:1) mapping is removed entirely.
## The previous default (keep='first') only trimmed each group down to one
## arbitrary row, which silently kept an unverifiable mapping.
wdid_umls_df_unique = wdid_umls_df.drop_duplicates(subset='disease_WDID', keep=False).copy()
wdid_umls_df_unique.drop_duplicates(subset='UMLS CUI', keep=False, inplace=True)
print("initial mapping table size: ",len(wdid_umls_df), " de-duplicated: ",len(wdid_umls_df_unique))
In [5]:
## Merge the mapping table to the original table.
## Left join keeps every annotation row; rows whose CUI had no unique
## Wikidata match end up with a null disease_WDID.
spl_with_disease_wdids = spl_adr_raw.merge(wdid_umls_df_unique, on='UMLS CUI', how='left')
print(len(spl_adr_raw),len(spl_with_disease_wdids))
We can limit the query by selecting for instances of Pharmaceutical products, medications, or chemical compounds. The queries should be run in that order: only search for medications if a label doesn't match a pharmaceutical product, and only search for chemical compounds if a label doesn't match a medication OR pharmaceutical product.
In [ ]:
## Single-drug lookup kept disabled as a string so it can be pasted into a
## cell and run by hand: searches items that are instance-of pharmaceutical
## product (Q28885102) whose English label contains the (lowercased) name.
"""
## Unit test
query_start = 'SELECT ?item ?itemLabel WHERE {?item wdt:P31 wd:Q28885102; rdfs:label ?itemLabel. FILTER(CONTAINS(LCASE(?itemLabel), "'
query_subject = 'NUCYNTA'
query_end = '"@en)).}'
sparqlQuery = query_start+query_subject.lower()+query_end
result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
drug_qid = result["results"]["bindings"][0]["item"]["value"].replace("http://www.wikidata.org/entity/", "")
drug_label = result["results"]["bindings"][0]["itemLabel"]["value"]
print(drug_qid, drug_label)
print(len(result["results"]["bindings"]))
"""
In [ ]:
#drug_list = ['NUCYNTA','Natazia','EDURANT'] ## Loop test
drug_list = spl_with_disease_wdids['Drug Name'].unique().tolist()

## Instance-of classes searched in priority order: a label is only tried
## against the next class if every earlier class produced no match.
SEARCH_CLASSES = [
    ('Q28885102', 'pharmaceutical product'),
    ('Q12140', 'medication'),
    ('Q11173', 'chemical'),
]

def _find_drug_qid(label, class_qid):
    """Return (QID, item label) of the first item that is instance-of
    *class_qid* and whose English label contains *label* (case-insensitive).
    Raises (e.g. IndexError) when the query returns no binding."""
    sparqlQuery = ('SELECT ?item ?itemLabel WHERE {?item wdt:P31 wd:' + class_qid
                   + '; rdfs:label ?itemLabel. FILTER(CONTAINS(LCASE(?itemLabel), "'
                   + label + '"@en)).}')
    result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
    first = result["results"]["bindings"][0]
    qid = first["item"]["value"].replace("http://www.wikidata.org/entity/", "")
    return qid, first["itemLabel"]["value"]

drug_wdid_list = []
drug_match_failed = []
for drug_name in tqdm(drug_list):
    for class_qid, class_tag in SEARCH_CLASSES:
        try:
            drug_qid, drug_label = _find_drug_qid(drug_name.lower(), class_qid)
        except Exception:
            ## Broad catch kept deliberately: the original treated any failure
            ## (no binding OR a query error) as "try the next class".
            continue
        drug_wdid_list.append({'Drug Name': drug_name, 'drug_WDID': drug_qid,
                               'drug_wd_label': drug_label, 'instance_of': class_tag})
        break
    else:
        ## No class produced a match at all -- record for manual mapping
        drug_match_failed.append(drug_name)

drug_wdid_df = pd.DataFrame(drug_wdid_list)
drug_wdid_df.to_csv(exppath+'drug_wdid_df.tsv', sep='\t', header=True)
print(len(drug_list))
In [ ]:
## Inspect the drug labels that failed all three SPARQL lookups
print(drug_match_failed)
In [ ]:
## In the future, consider only running these
## Persist the unmatched drug names, one per line, for later runs.
with open(exppath+'drug_match_failed.txt','w') as store_it:
    store_it.writelines(failure + '\n' for failure in drug_match_failed)
In [6]:
## Reload the previously stored match failures, one drug name per line
with open(exppath+'drug_match_failed.txt','r') as stored_it:
    drug_match_failed = [eachline.strip() for eachline in stored_it]
In [7]:
## Reload the drug-name -> QID map written by the SPARQL mapping loop
drug_wdid_df = read_csv(exppath+'drug_wdid_df.tsv',delimiter='\t',header=0, index_col=0)
In [ ]:
## Sanity check: mapped + failed should account for every unique drug label
print(drug_wdid_df.head(n=2))
print(drug_match_failed)
print(len(drug_wdid_df)+len(drug_match_failed))
In [8]:
## Attach the drug QIDs to the disease-annotated table
df_to_write = spl_with_disease_wdids.merge(drug_wdid_df, on='Drug Name', how='left')
print(len(df_to_write))

## Rows where BOTH QIDs resolved can be written to Wikidata; everything
## else (missing disease or drug QID) is set aside for manual follow-up.
resolved_mask = df_to_write['disease_WDID'].notnull() & df_to_write['drug_WDID'].notnull()
all_data_available = df_to_write.loc[resolved_mask]
not_attempted = df_to_write.loc[~resolved_mask]
print(len(all_data_available))
#print(not_attempted.head(n=2))
print(all_data_available.head(n=1))
## Save the Failures
not_attempted.to_csv(exppath+'qid_missing_not_attempted.tsv', sep='\t', header=True)
The Adverse Effect of "lactic acidosis" from metformin use was modeled on the Risk Factor property page and discussed there. These adverse effects can be expected to be modeled similarly.
We can use rank as a means to indicate severity of the warning. For example, a Black Box Warning would get a higher priority rank than text mined from 'adverse effect'. Alternatively, we can try to include a reference statement that would indicate where the ADR was derived. Eg- using "P958" Paragraph/section/clause in conjunction with:
edit--P958 takes a string as an input instead of a QID, so the source can be directly added
In [ ]:
from datetime import datetime
import copy
def create_reference(spl_url,source_type):
    """Build the five-part Wikidata reference list for one SPL-derived statement.

    spl_url     -- DailyMed URL of the drug label (becomes P854, reference URL)
    source_type -- label section the ADR was annotated in (becomes P958,
                   section/paragraph/clause, which accepts a plain string)
    """
    retrieved_stamp = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")
    archive_stamp = datetime.strptime('9/29/2015','%m/%d/%Y').strftime("+%Y-%m-%dT00:00:00Z")
    return [
        wdi_core.WDItemID(value="Q73670648", prop_nr="P248", is_reference=True),  # stated in
        wdi_core.WDTime(retrieved_stamp, prop_nr="P813", is_reference=True),      # retrieved
        wdi_core.WDTime(archive_stamp, prop_nr="P2960", is_reference=True),       # archive date
        wdi_core.WDUrl(value=spl_url, prop_nr="P854", is_reference=True),         # reference URL
        wdi_core.WDString(value=source_type, prop_nr="P958", is_reference=True),  # section name
    ]
In [ ]:
## Unit test -- write a statement
## Builds one statement (disease --P5642--> drug) from the first available row
## and writes it to the Wikidata sandbox item rather than the real disease item.
fda_base_spl_url = 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid='
i=0
drug_qid = all_data_available.iloc[i]['drug_WDID']
#disease_qid = all_data_available.iloc[i]['disease_WDID']
disease_qid = 'Q4115189' #sandbox run
spl_drug_id = all_data_available.iloc[i]['Drug ID']
spl_url = fda_base_spl_url+spl_drug_id
source_type = all_data_available.iloc[i]['Section Display Name']
reference = create_reference(spl_url,source_type)
## append_value preserves any existing P5642 claims; the custom ref handler
## only refreshes the 'retrieved' date when the reference otherwise matches.
statement = [wdi_core.WDItemID(value=drug_qid, prop_nr="P5642",
references=[copy.deepcopy(reference)])]
wikidata_item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value="P5642",
global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
#wikidata_item.get_wd_json_representation()
wikidata_item.write(login)
print(i,disease_qid,drug_qid)
In [ ]:
## Batch write: one ADR statement per (disease, drug) row, logging the
## resulting revision IDs so the edits can be audited or reverted.
wd_revision_list = []
run_list = all_data_available[0:3] ## test run
#run_list = all_data_available
## Bug fixes vs. the original cell:
##  * `i` is now initialized here instead of inheriting whatever value a
##    previously-run cell left behind (the old `while i < len(run_list)` could
##    start past the end, or mid-way, depending on which cells were executed).
##  * Rows are taken from run_list itself; the original indexed
##    all_data_available, which only coincided with run_list for a 0-based slice.
for i in range(len(run_list)):
    row = run_list.iloc[i]
    drug_qid = row['drug_WDID']
    disease_qid = row['disease_WDID']
    spl_drug_id = row['Drug ID']
    spl_url = fda_base_spl_url+spl_drug_id
    source_type = row['Section Display Name']
    reference = create_reference(spl_url,source_type)
    statement = [wdi_core.WDItemID(value=drug_qid, prop_nr="P5642", references=[copy.deepcopy(reference)])]
    wikidata_item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value="P5642",
                                          global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
    wikidata_item.write(login, edit_summary='added ADR relationship from FDA SPLs')
    wd_revision_list.append({'drug':drug_qid,'disease':disease_qid,'wd_revid':wikidata_item.lastrevid})
wd_edit_results = pd.DataFrame(wd_revision_list)
print(wd_edit_results)
wd_edit_results.to_csv(exppath+'run_results.tsv',sep='\t',header=True)
In [ ]: