Genetics Home Reference Data linking

The Genetics Home Reference is an NLM resource and can be found at https://ghr.nlm.nih.gov/condition.

The topic index can be accessed at: https://ghr.nlm.nih.gov/download/TopicIndex.xml

An API call can be used to visit each topic and pull the corresponding JSON document. The JSON files contain various database identifiers which can be used to cross-reference a condition to existing Wikidata entities.

The topic index includes 'conditions', 'genes', 'chromosomes', and the 'handbook' itself. For the initial import, we're only interested in topics that are children of 'conditions'.
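
For example, a single condition's record can be fetched by appending ?report=json to its topic url. A minimal sketch (not part of the original run); the field names match those used in the harvesting loop further below, and the example url is reconstructed from the truncated urls in the outputs, so treat it as an assumption:

In [ ]:
## Sketch: pull one topic's JSON record via the ?report=json pattern
import requests

topic_url = 'https://ghr.nlm.nih.gov/condition/10q26-deletion-syndrome'  ## assumed example slug
data = requests.get(topic_url + '?report=json').json()
print(data['name'])             ## topic name
print(data['db-key-list'][:2])  ## first two database cross-references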


In [1]:
from wikidataintegrator import wdi_core, wdi_login, wdi_helpers
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs
import pandas as pd
from pandas import read_csv
import requests
from tqdm.notebook import trange, tqdm
import ipywidgets 
import widgetsnbextension
import xml.etree.ElementTree as et 
import time

datasrc = 'https://ghr.nlm.nih.gov/download/TopicIndex.xml'

In [ ]:
## Login for Scheduled bot
print("Logging in...")
import os
try:
    from scheduled_bots.local import WDUSER, WDPASS
except ImportError:
    if "WDUSER" in os.environ and "WDPASS" in os.environ:
        WDUSER = os.environ['WDUSER']
        WDPASS = os.environ['WDPASS']
    else:
        raise ValueError("WDUSER and WDPASS must be specified in local.py or as environment variables")
login = wdi_login.WDLogin(WDUSER, WDPASS)

In [21]:
"""
print("Logging in...")
import wdi_user_config ## Credentials stored in a wdi_user_config file
login_dict = wdi_user_config.get_credentials()
login = wdi_login.WDLogin(login_dict['WDUSER'], login_dict['WDPASS'])
"""


Logging in...
https://www.wikidata.org/w/api.php
Successfully logged in as Gtsulab

In [3]:
r = requests.get(datasrc)
xml = r.text
xtree = et.fromstring(xml)

## Pull out the subtree of topics filed under 'Conditions'
topic_of_interest = 'Conditions'
for eachtopic in xtree.findall('topic'):
    if eachtopic.attrib['id'] == topic_of_interest:
        new_tree = eachtopic.find('topics')

conditions = new_tree

In [4]:
conditions_list = []

## One row per synonym; conditions with no listed synonyms get a single row with aka 'None'
for condition in conditions.findall('topic'):
    title = condition.find('title').text
    url = condition.find('url').text
    synonyms = condition.find('other_names')
    if synonyms is not None:
        for synonym in synonyms:
            conditions_list.append({'title': title, 'url': url, 'aka': synonym.text})
    else:
        conditions_list.append({'title': title, 'url': url, 'aka': 'None'})

conditions_df = pd.DataFrame(conditions_list)
print(len(conditions_df))
print(conditions_df.head(n=2))


6203
                                  aka                    title  \
0                     10qter deletion  10q26 deletion syndrome   
1  chromosome 10q26 deletion syndrome  10q26 deletion syndrome   

                                                 url  
0  https://ghr.nlm.nih.gov/condition/10q26-deleti...  
1  https://ghr.nlm.nih.gov/condition/10q26-deleti...  

In [5]:
conditions_url_list = conditions_df['url'].unique().tolist()
condition_url_list_test = conditions_url_list[0:3]  ## small slice for test runs

inher_list = []
inher_fail = []
syn_fail = []
synonyms_df = pd.DataFrame(columns=['topic','synonym'])
xref_list = []
xref_fail = []

for eachurl in tqdm(conditions_url_list):
    tmpresponse = requests.get(eachurl + '?report=json')
    data = tmpresponse.json()

    ## save the inheritance pattern data
    try:
        for pattern in data['inheritance-pattern-list']:
            inher_dict = pattern['inheritance-pattern']
            inher_dict['topic'] = data['name']
            inher_dict['url'] = eachurl
            inher_list.append(inher_dict)
    except KeyError:
        inher_fail.append({'topic':data['name'],'url':eachurl})

    ## save the synonym list
    try:
        syndf = pd.DataFrame(data['synonym-list'])
        syndf['topic'] = data['name']
        synonyms_df = pd.concat((synonyms_df,syndf),ignore_index=True)
    except KeyError:
        syn_fail.append({'topic':data['name'],'url':eachurl})

    ## save the xrefs
    try:
        for entry in data['db-key-list']:
            tmpdict = entry['db-key']
            tmpdict['topic'] = data['name']
            tmpdict['url'] = eachurl
            xref_list.append(tmpdict)
    except KeyError:
        xref_fail.append({'topic':data['name'],'url':eachurl})

inheritance_df = pd.DataFrame(inher_list)
inher_fail_df = pd.DataFrame(inher_fail)
syn_fail_df = pd.DataFrame(syn_fail)
xref_list_df = pd.DataFrame(xref_list)
xref_fail_df = pd.DataFrame(xref_fail)
print(inheritance_df.head(n=2))
print(xref_list_df.head(n=2))
print(inher_fail_df.head(n=2))
print(syn_fail_df.head(n=2))
print(xref_fail_df.head(n=2))


  code                memo                           topic  \
0   ad  autosomal dominant         10q26 deletion syndrome   
1    n       not inherited  15q11-q13 duplication syndrome   

                                                 url  
0  https://ghr.nlm.nih.gov/condition/10q26-deleti...  
1  https://ghr.nlm.nih.gov/condition/15q11-q13-du...  
     db       key                    topic  \
0   GTR  C2674937  10q26 deletion syndrome   
1  MeSH   D002872  10q26 deletion syndrome   

                                                 url  
0  https://ghr.nlm.nih.gov/condition/10q26-deleti...  
1  https://ghr.nlm.nih.gov/condition/10q26-deleti...  
Empty DataFrame
Columns: []
Index: []
              topic                                                url
0  RAB18 deficiency  https://ghr.nlm.nih.gov/condition/rab18-defici...
Empty DataFrame
Columns: []
Index: []

In [6]:
print(syn_fail_df['topic'])


0    RAB18 deficiency
Name: topic, dtype: object

In [7]:
print(xref_list_df['db'].unique().tolist())
## Corresponding Wikidata properties (GTR is omitted: no matching Wikidata property was available)
wdprop_dict = {'MeSH':'P486', 'OMIM':'P492', 'Orphanet':'P1550', 'SNOMED CT':'P5806', 'GeneReviews':'P668', 'ICD-10-CM':'P4229'}


['GTR', 'MeSH', 'OMIM', 'Orphanet', 'SNOMED CT', 'GeneReviews', 'ICD-10-CM']

Update Wikidata with corresponding information

  1. Identify the db identifier with the fewest mapping issues
  2. Use those identifiers to pull the appropriate WD entry for each topic
  3. Check each entry to see if the mode of inheritance has already been added; if not, add it (see the sketch below). For inheritance statements, reference Genetics Home Reference (Q62606821)
  4. Add the url for GHR (a new property needs to be created)
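
As a quick check for step 3, the existing mode of inheritance (P1199) claims on an item can be queried up front. A minimal sketch, using Q21154055 (10q26 deletion syndrome, taken from the mapping output below) as the example item; the bot itself relies on append_value and the ref_handler to avoid duplicate statements:

In [ ]:
## Sketch: list existing mode of inheritance values on one disease item
sparqlQuery = "SELECT ?moi WHERE { wd:Q21154055 wdt:P1199 ?moi }"
result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
print([b["moi"]["value"] for b in result["results"]["bindings"]])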

Determining identifier with fewest mapping issues


In [8]:
## Drop topics that map to the same url (assuming they're synonyms)
xref_no_dups = xref_list_df.drop_duplicates()
print("original df size: ",len(xref_list_df),"de-duplicated url df size: ",len(xref_no_dups))

## Check coverage of identifiers for the unique urls
xref_dups = xref_list_df.groupby(['db','key']).size().reset_index(name='count')
print("Number of unique urls: ",len(xref_no_dups['url'].unique().tolist()))
print("Entries of each db: ",xref_list_df.groupby('db').size())


original df size:  12880 de-duplicated url df size:  12874
Number of unique urls:  1302
Entries of each db:  db
GTR            2514
GeneReviews     815
ICD-10-CM      2094
MeSH           1443
OMIM           2753
Orphanet       1437
SNOMED CT      1824
dtype: int64

In [9]:
## Verify coverage
print('GTR: ',len(xref_list_df.loc[xref_list_df['db']=='GTR'].groupby(['db','url']).size()))
print('GeneReviews: ',len(xref_list_df.loc[xref_list_df['db']=='GeneReviews'].groupby(['db','url']).size()))
print('ICD-10-CM: ',len(xref_list_df.loc[xref_list_df['db']=='ICD-10-CM'].groupby(['db','url']).size()))
print('MeSH: ',len(xref_list_df.loc[xref_list_df['db']=='MeSH'].groupby(['db','url']).size()))
print('OMIM: ',len(xref_list_df.loc[xref_list_df['db']=='OMIM'].groupby(['db','url']).size()))
print('Orphanet: ',len(xref_list_df.loc[xref_list_df['db']=='Orphanet'].groupby(['db','url']).size()))
print('SNOMED CT: ',len(xref_list_df.loc[xref_list_df['db']=='SNOMED CT'].groupby(['db','url']).size()))


GTR:  1285
GeneReviews:  701
ICD-10-CM:  501
MeSH:  1302
OMIM:  1255
Orphanet:  1170
SNOMED CT:  1237

The databases whose counts come closest to the number of unique urls are MeSH and Orphanet, suggesting that these may have the fewest mapping issues within the data set; GTR (Genetic Testing Registry) and OMIM appear to have multiple identifiers mapping to the same topic/url. GeneReviews has fewer entries than unique urls, suggesting either that some entries are missing GeneReviews mappings or that multiple urls map to a single GeneReviews ID.
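
The per-database counts above can also be computed with a single groupby (a compact alternative, not part of the original run):

In [ ]:
## Number of unique urls covered by each database
print(xref_list_df.groupby('db')['url'].nunique())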


In [10]:
## Investigate duplicate mappings more closely
dups = xref_dups.loc[xref_dups['count']>1]
print("number of duplicated identifiers by type: ")
print(dups.groupby('db').size().reset_index(name='dup_counts'))
print("Number of entries affected by duplicated identifiers: ")
print(dups.groupby('db')['count'].sum().reset_index(name='entry_counts'))


number of duplicated identifiers by type: 
            db  dup_counts
0          GTR          26
1  GeneReviews          70
2    ICD-10-CM         131
3         MeSH         182
4         OMIM          60
5     Orphanet          32
6    SNOMED CT          25
Number of entries affected by duplicated identifiers: 
            db  entry_counts
0          GTR            54
1  GeneReviews           206
2    ICD-10-CM           292
3         MeSH           822
4         OMIM           122
5     Orphanet            77
6    SNOMED CT            56

Orphanet has relatively few duplicated identifiers (32, affecting 77 entries) while still covering most of the unique urls, so it looks like the least problematic identifier to use. Now to check its coverage in Wikidata.
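
Note that the cell below issues one SPARQL request per Orphanet ID, which is over a thousand requests. Batching the lookups with a VALUES clause is a possible alternative; a sketch under that assumption, against the same P1550 property:

In [ ]:
## Sketch: resolve Orphanet IDs to QIDs in batches rather than one at a time
def orphanet_to_qid(orph_ids, batch_size=100):
    mapping = {}
    for n in range(0, len(orph_ids), batch_size):
        batch = orph_ids[n:n + batch_size]
        values = " ".join('"{}"'.format(x) for x in batch)
        query = "SELECT ?topic ?id WHERE { VALUES ?id { " + values + " } ?topic wdt:P1550 ?id }"
        result = wdi_core.WDItemEngine.execute_sparql_query(query)
        for b in result["results"]["bindings"]:
            qid = b["topic"]["value"].replace("http://www.wikidata.org/entity/", "")
            mapping.setdefault(b["id"]["value"], []).append(qid)
    return mapping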


In [11]:
## Generate list of unique Orphanet IDs
orphanet_ghr = xref_no_dups.loc[xref_no_dups['db']=='Orphanet']
no_orphanet_dups = orphanet_ghr.drop_duplicates('url')
print("Original Orphanet Xref list: ", len(orphanet_ghr), "Orphanet Xref list less dups: ",len(no_orphanet_dups))
orphanet_id_list = no_orphanet_dups['key'].tolist()

## Retrieve the QID for each Orphanet ID (the property for Orphanet IDs is P1550)
wdmap = []
wdmapfail = []
for orph_id in tqdm(orphanet_id_list):
    try:
        sparqlQuery = "SELECT * WHERE {?topic wdt:P1550 \""+orph_id+"\"}"
        result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
        orpha_qid = result["results"]["bindings"][0]["topic"]["value"].replace("http://www.wikidata.org/entity/", "")
        wdmap.append({'Orphanet':orph_id,'WDID':orpha_qid})
    except IndexError:  ## no Wikidata item found for this Orphanet ID
        wdmapfail.append(orph_id)

## Inspect the results for mapping or coverage issues
wdid_orpha_df = pd.DataFrame(wdmap)
print("resulting mapping table has: ",len(wdid_orpha_df)," rows.")


Original Orphanet Xref list:  1436 Orphanet Xref list less dups:  1170
resulting mapping table has:  1119  rows.

Adding mode of inheritance data

Prepare the inheritance data for mapping

  1. De-duplicate Orphanet-Wikidata mapping table as needed
  2. Merge inheritance table to mapping table

In [12]:
## De-duplicate to remove anything with mapping issues
wd_orpha_no_dups = wdid_orpha_df.drop_duplicates('Orphanet').copy()
wd_orpha_no_dups = wd_orpha_no_dups.drop_duplicates('WDID')
print('de-duplicated table: ',len(wd_orpha_no_dups))

## Merge with Inheritance table
no_orphanet_dups = no_orphanet_dups.rename(columns={'key':'Orphanet'})
inher_wd_db = inheritance_df.merge(wd_orpha_no_dups.merge(no_orphanet_dups,on='Orphanet',how='inner'), on=['url','topic'], how='inner')
print("resulting mapped table: ",len(inher_wd_db))


de-duplicated table:  1078
resulting mapped table:  1347

Generate the references and write the data to Wikidata


In [12]:
print(inheritance_df.groupby(['code','memo']).size())


code  memo                
ac    autosomal codominant      4
ad    autosomal dominant      555
ar    autosomal recessive     633
m     mitochondrial            15
n     not inherited           103
u     pattern unknown         111
x     X-linked                 21
xd    X-linked dominant        41
xr    X-linked recessive       81
y     Y-linked                  2
dtype: int64

In [13]:
## Mode of inheritance = P1199
GHR_WD_codes = {'ac': 'Q13169788', ##wd:Q13169788 (codominant)
               'ad': 'Q116406', ##wd:Q116406 (autosomal dominant)
               'ar': 'Q15729064', ##wd:Q15729064 (autosomal recessive)
               'm': 'Q15729075', ##wd:Q15729075 (mitochondrial)
               'x': 'Q70899378', ##wd:Q70899378 (X-linked inheritance)
               'xd': 'Q3731276', ##wd:Q3731276 (X-linked dominant)
               'xr': 'Q1988987', ##wd:Q1988987 (X-linked recessive)
               'y': 'Q2598585'} ##wd:Q2598585 (Y linkage)

## Codes with no Wikidata equivalent to map to
GHR_codes_no_WD = {'n': 'not inherited', 'u': 'pattern unknown'}
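
A quick sanity check (not in the original run) that the two dictionaries together cover every inheritance code seen in the harvested data:

In [ ]:
## Every code should be mapped to a QID or explicitly flagged as unmappable
codes_seen = set(inheritance_df['code'].unique())
codes_known = set(GHR_WD_codes) | set(GHR_codes_no_WD)
assert codes_seen <= codes_known, codes_seen - codes_known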

In [15]:
from datetime import datetime
import copy

def create_reference(ghr_url):
    """Reference block: stated in Genetics Home Reference (Q62606821),
    retrieval date, and the GHR topic url."""
    refStatedIn = wdi_core.WDItemID(value="Q62606821", prop_nr="P248", is_reference=True)
    timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")
    refRetrieved = wdi_core.WDTime(timeStringNow, prop_nr="P813", is_reference=True)
    refURL = wdi_core.WDUrl(value=ghr_url, prop_nr="P854", is_reference=True)
    return [refStatedIn, refRetrieved, refURL]
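
Note that every statement in the cells below passes copy.deepcopy(reference) rather than the reference list itself; this guards against the same reference objects being shared (and potentially mutated) across statements, a common precaution when writing with wikidataintegrator.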

In [15]:
## Limit adding mode of inheritance statements to diseases with known modes of inheritance
inheritance_avail = inher_wd_db.loc[(inher_wd_db['code']!='n')&(inher_wd_db['code']!='u')]
print(len(inheritance_avail))


1193

In [34]:
#### Unit test -- write a single statement
"""
disease_qid = inheritance_avail.iloc[0]['WDID']
inheritance_method = GHR_WD_codes[inheritance_avail.iloc[0]['code']]
ghr_url = inheritance_avail.iloc[0]['url']
reference = create_reference(ghr_url)
statement = [wdi_core.WDItemID(value=inheritance_method, prop_nr="P1199", references=[copy.deepcopy(reference)])]

item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value="P1199",
                       global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
print(disease_qid)
print(item)
item.write(login)
"""


Q21154055
<wikidataintegrator.wdi_core.WDItemEngine object at 0x000000000A3FAEF0>
Out[34]:
'Q21154055'

In [19]:
#### Test run -- write 10 statements
"""
for i in tqdm(range(10)):
    disease_qid = inheritance_avail.iloc[i]['WDID']
    inheritance_method = GHR_WD_codes[inheritance_avail.iloc[i]['code']]
    ghr_url = inheritance_avail.iloc[i]['url']
    reference = create_reference(ghr_url)
    statement = [wdi_core.WDItemID(value=inheritance_method, prop_nr="P1199", references=[copy.deepcopy(reference)])]
    item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value="P1199",
                           global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
    item.write(login)
    time.sleep(2)  ## throttle writes
"""




In [ ]:
"""
i=0
for i in tqdm(range(len(inheritance_avail))):
    disease_qid = inheritance_avail.iloc[i]['WDID']
    inheritance_method = GHR_WD_codes[inheritance_avail.iloc[i]['code']]
    ghr_url = inheritance_avail.iloc[i]['url']
    reference = create_reference(ghr_url)
    statement = [wdi_core.WDItemID(value=inheritance_method, prop_nr="P1199", references=[copy.deepcopy(reference)])]
    item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value="P1199",
                           global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
    item.write(login)
    i=i+1
"""

Importing the urls to a separate property for external linking

This portion was awaiting completion of the property creation and approval process; the property used below (P7464) has since been created.


In [23]:
## Load successfully mapped GHR disease urls
mapped_orpha_urls = wd_orpha_no_dups.merge(no_orphanet_dups,on='Orphanet',how='inner')
print(len(mapped_orpha_urls))
print(mapped_orpha_urls.head(n=5))


1119
  Orphanet       WDID        db                           topic  \
0    96148  Q21154055  Orphanet         10q26 deletion syndrome   
1     3306   Q2703116  Orphanet  15q11-q13 duplication syndrome   
2   199318  Q21154058  Orphanet           15q13.3 microdeletion   
3    94065  Q21154059  Orphanet             15q24 microdeletion   
4   370079  Q21154076  Orphanet             16p11.2 duplication   

                                                 url  
0  https://ghr.nlm.nih.gov/condition/10q26-deleti...  
1  https://ghr.nlm.nih.gov/condition/15q11-q13-du...  
2  https://ghr.nlm.nih.gov/condition/15q133-micro...  
3  https://ghr.nlm.nih.gov/condition/15q24-microd...  
4  https://ghr.nlm.nih.gov/condition/16p112-dupli...  

In [26]:
## Unit test --  write a statement
disease_qid = mapped_orpha_urls.iloc[1]['WDID']
ghr_url = mapped_orpha_urls.iloc[1]['url']
ghr_id = mapped_orpha_urls.iloc[1]['url'].replace("https://ghr.nlm.nih.gov/condition/","")
reference = create_reference(ghr_url)
url_prop = "P7464" 
statement = [wdi_core.WDString(value=ghr_id, prop_nr=url_prop, references=[copy.deepcopy(reference)])]
item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value=url_prop,
                           global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
item.write(login)
print(ghr_id, disease_qid, ghr_url)


15q11-q13-duplication-syndrome Q2703116 https://ghr.nlm.nih.gov/condition/15q11-q13-duplication-syndrome

In [ ]:
"""
i=0
for i in tqdm(range(len(mapped_orpha_urls))):
    disease_qid = mapped_orpha_urls.iloc[i]['WDID']
    ghr_url = mapped_orpha_urls.iloc[i]['url']
    ghr_id = mapped_orpha_urls.iloc[0]['url'].replace("https://ghr.nlm.nih.gov/condition/","")
    reference = create_reference(ghr_url)
    url_prop = "P7464" 
    statement = [wdi_core.WDString(value=ghr_id, prop_nr=url_prop, references=[copy.deepcopy(reference)])]
    item = wdi_core.WDItemEngine(wd_item_id=disease_qid, data=statement, append_value=url_prop,
                               global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
    item.write(login)
    i=i+1
"""