The Genetics Home Reference (GHR) is an NLM resource that can be found at https://ghr.nlm.nih.gov/condition.
The topic index can be accessed at: https://ghr.nlm.nih.gov/download/TopicIndex.xml
An API call can be used to visit each topic and pull the corresponding JSON document. The JSON files contain various database identifiers that can be used to xref a condition to existing Wikidata entities.
The topic index includes 'conditions', 'genes', 'chromosomes', and the 'handbook' itself. For the initial import, we're only interested in topics that are children of 'conditions'.
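Each topic's JSON report can be retrieved by appending `?report=json` to its url, the same convention used in the harvest loop below. A minimal sketch (the condition url here is just an illustrative example):
In [ ]:
## Sketch: pull the JSON report for a single GHR condition page
import requests
url = 'https://ghr.nlm.nih.gov/condition/achondroplasia'  ## example condition url, for illustration only
data = requests.get(url + '?report=json').json()
print(data['name'])
print(list(data.keys()))  ## e.g. 'synonym-list', 'db-key-list', 'inheritance-pattern-list'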
In [1]:
from wikidataintegrator import wdi_core, wdi_login, wdi_helpers
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs
import pandas as pd
from pandas import read_csv
import requests
from tqdm.notebook import trange, tqdm
import ipywidgets
import widgetsnbextension
import xml.etree.ElementTree as et
import time
datasrc = 'https://ghr.nlm.nih.gov/download/TopicIndex.xml'
In [ ]:
## Login for Scheduled bot
import os

print("Logging in...")
try:
    from scheduled_bots.local import WDUSER, WDPASS
except ImportError:
    if "WDUSER" in os.environ and "WDPASS" in os.environ:
        WDUSER = os.environ['WDUSER']
        WDPASS = os.environ['WDPASS']
    else:
        raise ValueError("WDUSER and WDPASS must be specified in local.py or as environment variables")
login = wdi_login.WDLogin(WDUSER, WDPASS)
In [21]:
"""
print("Logging in...")
import wdi_user_config ## Credentials stored in a wdi_user_config file
login_dict = wdi_user_config.get_credentials()
login = wdi_login.WDLogin(login_dict['WDUSER'], login_dict['WDPASS'])
"""
In [3]:
r = requests.get(datasrc)
xml = r.text
xtree = et.fromstring(xml)
topic_of_interest = 'Conditions'
for eachtopic in xtree.findall('topic'):
    if eachtopic.attrib['id'] == topic_of_interest:
        conditions = eachtopic.find('topics')
In [4]:
conditions_list = []
for condition in conditions.findall('topic'):
    title = condition.find('title').text
    url = condition.find('url').text
    synonyms = condition.find('other_names')
    if synonyms is not None:
        for synonym in synonyms:
            conditions_list.append({'title': title, 'url': url, 'aka': synonym.text})
    else:
        conditions_list.append({'title': title, 'url': url, 'aka': 'None'})
conditions_df = pd.DataFrame(conditions_list)
print(len(conditions_df))
print(conditions_df.head(n=2))
In [5]:
conditions_url_list = conditions_df['url'].unique().tolist()
condition_url_list_test = conditions_url_list[0:3]  ## small subset, handy for test runs
inher_list = []
inher_fail = []
syn_fail = []
synonyms_df = pd.DataFrame(columns=['topic', 'synonym'])
xref_list = []
xref_fail = []
for eachurl in tqdm(conditions_url_list):
    tmpurl = eachurl + '?report=json'
    tmpresponse = requests.get(tmpurl)
    data = tmpresponse.json()
    ## save the inheritance pattern data
    try:
        for pattern in data['inheritance-pattern-list']:
            inher_dict = pattern['inheritance-pattern']
            inher_dict['topic'] = data['name']
            inher_dict['url'] = eachurl
            inher_list.append(inher_dict)
    except KeyError:
        inher_fail.append({'topic': data['name'], 'url': eachurl})
    ## save the synonym list
    try:
        syndf = pd.DataFrame(data['synonym-list'])
        syndf['topic'] = data['name']
        synonyms_df = pd.concat((synonyms_df, syndf), ignore_index=True)
    except KeyError:
        syn_fail.append({'topic': data['name'], 'url': eachurl})
    ## save the xrefs
    try:
        for xref in data['db-key-list']:
            tmpdict = xref['db-key']
            tmpdict['topic'] = data['name']
            tmpdict['url'] = eachurl
            xref_list.append(tmpdict)
    except KeyError:
        xref_fail.append({'topic': data['name'], 'url': eachurl})
inheritance_df = pd.DataFrame(inher_list)
inher_fail_df = pd.DataFrame(inher_fail)
syn_fail_df = pd.DataFrame(syn_fail)
xref_list_df = pd.DataFrame(xref_list)
xref_fail_df = pd.DataFrame(xref_fail)
print(inheritance_df.head(n=2))
print(xref_list_df.head(n=2))
print(inher_fail_df.head(n=2))
print(syn_fail_df.head(n=2))
print(xref_fail_df.head(n=2))
In [6]:
print(syn_fail_df['topic'])
In [7]:
print(xref_list_df['db'].unique().tolist())
## Corresponding Wikidata properties:
wdprop_dict = {'MeSH':'P486','OMIM':'P492', 'Orphanet':'P1550', 'SNOMED CT':'P5806', 'GeneReviews':'P668', 'ICD-10-CM':'P4229'}
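As a sketch of how this mapping would eventually be used (not run here): any xref row whose `db` appears in `wdprop_dict` can be turned into a WDI external-ID statement.
In [ ]:
## Sketch: turn one xref row into a WDI external-ID statement
row = xref_list_df.iloc[0]
if row['db'] in wdprop_dict:
    xref_statement = wdi_core.WDExternalID(value=row['key'], prop_nr=wdprop_dict[row['db']])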
In [8]:
## Drop topics that map to the same url (assuming they're synonyms)
xref_no_dups = xref_list_df.drop_duplicates()
print("original df size: ",len(xref_list_df),"de-duplicated url df size: ",len(xref_no_dups))
## Check coverage of identifiers for the unique urls
xref_dups = xref_list_df.groupby(['db','key']).size().reset_index(name='count')
print("Number of unique urls: ",len(xref_no_dups['url'].unique().tolist()))
print("Entries of each db: ",xref_list_df.groupby('db').size())
In [9]:
## Verify coverage: count unique urls per database
for db in ['GTR', 'GeneReviews', 'ICD-10-CM', 'MeSH', 'OMIM', 'Orphanet', 'SNOMED CT']:
    print(db + ': ', len(xref_list_df.loc[xref_list_df['db'] == db].groupby(['db', 'url']).size()))
The databases closest in count to the number of unique urls are Orphanet and MeSH, suggesting that these may have the fewest mapping issues within the data set; GTR (Genetic Testing Registry) and OMIM may have multiple identifiers mapping to the same topic/url. GeneReviews has fewer entries, suggesting either that some topics are missing GeneReviews mappings or that multiple urls map to a single GeneReviews ID.
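The same per-database coverage can be computed in a single pass with `nunique`, which is handy for sanity-checking the counts above:
In [ ]:
## Unique urls covered by each database, in one groupby
print(xref_list_df.groupby('db')['url'].nunique())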
In [10]:
## Investigate duplicate mappings more closely
dups = xref_dups.loc[xref_dups['count']>1]
print("number of duplicated identifiers by type: ")
print(dups.groupby('db').size().reset_index(name='dup_counts'))
print("Number of entries affected by duplicated identifiers: ")
print(dups.groupby('db')['count'].sum().reset_index(name='entry_counts'))
In terms of unique coverage, Orphanet looks like the least problematic identifier to use. Now to check its coverage in Wikidata.
In [11]:
## Generate list of unique Orphanet IDs
orphanet_ghr = xref_no_dups.loc[xref_no_dups['db']=='Orphanet']
no_orphanet_dups = orphanet_ghr.drop_duplicates('url')
print("Original Orphanet Xref list: ", len(orphanet_ghr), "Orphanet Xref list less dups: ", len(no_orphanet_dups))
orphanet_id_list = no_orphanet_dups['key'].tolist()
## Retrieve the QID for each Orphanet ID (the property for Orphanet IDs is P1550)
wdmap = []
wdmapfail = []
for orph_id in tqdm(orphanet_id_list):
    sparqlQuery = "SELECT * WHERE {?topic wdt:P1550 \""+orph_id+"\"}"
    try:
        result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
        orpha_qid = result["results"]["bindings"][0]["topic"]["value"].replace("http://www.wikidata.org/entity/", "")
        wdmap.append({'Orphanet': orph_id, 'WDID': orpha_qid})
    except Exception:  ## no (or malformed) result for this ID
        wdmapfail.append(orph_id)
## Inspect the results for mapping or coverage issues
wdid_orpha_df = pd.DataFrame(wdmap)
print("resulting mapping table has: ", len(wdid_orpha_df), " rows.")
In [12]:
## De-duplicate to remove anything with mapping issues
wd_orpha_no_dups = wdid_orpha_df.drop_duplicates('Orphanet').copy()
wd_orpha_no_dups = wd_orpha_no_dups.drop_duplicates('WDID')
print('de-duplicated table: ', len(wd_orpha_no_dups))
## Merge with the inheritance table
no_orphanet_dups = no_orphanet_dups.rename(columns={'key': 'Orphanet'})
inher_wd_db = inheritance_df.merge(wd_orpha_no_dups.merge(no_orphanet_dups, on='Orphanet', how='inner'), on=['url','topic'], how='inner')
print("resulting mapped table: ", len(inher_wd_db))
In [12]:
print(inheritance_df.groupby(['code','memo']).size())
In [13]:
## Mode of inheritance = P1199
GHR_WD_codes = {'ac': 'Q13169788',  ## wd:Q13169788 (codominant)
                'ad': 'Q116406',    ## wd:Q116406 (autosomal dominant)
                'ar': 'Q15729064',  ## wd:Q15729064 (autosomal recessive)
                'm': 'Q15729075',   ## wd:Q15729075 (mitochondrial)
                'x': 'Q70899378',   ## wd:Q2597344 (X-linked inheritance)
                'xd': 'Q3731276',   ## wd:Q3731276 (X-linked dominant)
                'xr': 'Q1988987',   ## wd:Q1988987 (X-linked recessive)
                'y': 'Q2598585'}    ## wd:Q2598585 (Y linkage)
GHR_codes_no_WD = {'n': 'not inherited', 'u': 'unknown pattern'}
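The two codes without a Wikidata equivalent ('n', 'u') mean any lookup has to tolerate misses; `dict.get` makes that explicit:
In [ ]:
## Sketch: resolve an inheritance code to a QID; returns None for codes with no WD item
code = inheritance_df.iloc[0]['code']
qid = GHR_WD_codes.get(code)
if qid is None:
    print('no Wikidata item for pattern:', GHR_codes_no_WD.get(code, code))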
In [15]:
from datetime import datetime
import copy

def create_reference(ghr_url):
    ## stated in (P248): the GHR item (Q62606821)
    refStatedIn = wdi_core.WDItemID(value="Q62606821", prop_nr="P248", is_reference=True)
    ## retrieved (P813): today's date
    timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")
    refRetrieved = wdi_core.WDTime(timeStringNow, prop_nr="P813", is_reference=True)
    ## reference URL (P854): the GHR condition page
    refURL = wdi_core.WDUrl(value=ghr_url, prop_nr="P854", is_reference=True)
    return [refStatedIn, refRetrieved, refURL]
In [15]:
## Limit adding mode of inheritance statements to diseases with known modes of inheritance
inheritance_avail = inher_wd_db.loc[(inher_wd_db['code']!='n')&(inher_wd_db['code']!='u')]
print(len(inheritance_avail))
In [34]:
#### Unit test-- write a single statement
"""
disease_qid = inheritance_avail.iloc[0]['WDID']
inheritance_method = GHR_WD_codes[inheritance_avail.iloc[0]['code']]
ghr_url = inheritance_avail.iloc[0]['url']
reference = create_reference(ghr_url)
statement = [wdi_core.WDItemID(value=inheritance_method, prop_nr="P1199", references=[copy.deepcopy(reference)])]
item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value="P1199",
                             global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
print(disease_qid)
print(item)
item.write(login)
"""
In [19]:
#### test run -- write 10 statements
"""
for i in tqdm(range(10)):
    disease_qid = inheritance_avail.iloc[i]['WDID']
    inheritance_method = GHR_WD_codes[inheritance_avail.iloc[i]['code']]
    ghr_url = inheritance_avail.iloc[i]['url']
    reference = create_reference(ghr_url)
    statement = [wdi_core.WDItemID(value=inheritance_method, prop_nr="P1199", references=[copy.deepcopy(reference)])]
    item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value="P1199",
                                 global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
    item.write(login)
    time.sleep(2)
"""
In [ ]:
"""
## Full run -- write mode of inheritance statements for all mapped diseases
for i in tqdm(range(len(inheritance_avail))):
    disease_qid = inheritance_avail.iloc[i]['WDID']
    inheritance_method = GHR_WD_codes[inheritance_avail.iloc[i]['code']]
    ghr_url = inheritance_avail.iloc[i]['url']
    reference = create_reference(ghr_url)
    statement = [wdi_core.WDItemID(value=inheritance_method, prop_nr="P1199", references=[copy.deepcopy(reference)])]
    item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value="P1199",
                                 global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
    item.write(login)
"""
In [23]:
## Load successfully mapped GHR disease urls
mapped_orpha_urls = wd_orpha_no_dups.merge(no_orphanet_dups,on='Orphanet',how='inner')
print(len(mapped_orpha_urls))
print(mapped_orpha_urls.head(n=5))
In [26]:
## Unit test -- write a single statement
disease_qid = mapped_orpha_urls.iloc[1]['WDID']
ghr_url = mapped_orpha_urls.iloc[1]['url']
ghr_id = mapped_orpha_urls.iloc[1]['url'].replace("https://ghr.nlm.nih.gov/condition/","")
reference = create_reference(ghr_url)
url_prop = "P7464"  ## property holding the GHR condition identifier
statement = [wdi_core.WDString(value=ghr_id, prop_nr=url_prop, references=[copy.deepcopy(reference)])]
item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value=url_prop,
                             global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
item.write(login)
print(ghr_id, disease_qid, ghr_url)
In [ ]:
"""
## Full run -- write GHR condition ID statements for all mapped diseases
for i in tqdm(range(len(mapped_orpha_urls))):
    disease_qid = mapped_orpha_urls.iloc[i]['WDID']
    ghr_url = mapped_orpha_urls.iloc[i]['url']
    ghr_id = mapped_orpha_urls.iloc[i]['url'].replace("https://ghr.nlm.nih.gov/condition/","")
    reference = create_reference(ghr_url)
    url_prop = "P7464"
    statement = [wdi_core.WDString(value=ghr_id, prop_nr=url_prop, references=[copy.deepcopy(reference)])]
    item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value=url_prop,
                                 global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
    item.write(login)
"""