ClinGen (Clinical Genome Resource) develops curated data of genetic associations
CC0 https://clinicalgenome.org/docs/terms-of-use/
This scheduled bot operates through WDI to integrate ClinGen Gene-Disease Validity Data
https://search.clinicalgenome.org/kb/gene-validity/
https://github.com/SuLab/GeneWikiCentral/issues/116
http://jenkins.sulab.org/
Python script contributions, in order: Sabah Ul-Hasan, Andra Waagmeester, Andrew Su, Ginger Tsueng
The for loop records the matching Wikidata Qid for each HGNC gene ID and MONDO disease ID, when one is available.
The for loop writes 'complete' to the output only if the row was written to Wikidata.
In [15]:
# Relevant Modules and Libraries
## Installations by shell
!pip install --upgrade pip # Installs pip, ensures it's up-to-date
!pip3 install tqdm # Visualizes installation progress (progress bar)
!pip3 install wikidataintegrator # For wikidata
## Installations by python
from wikidataintegrator import wdi_core, wdi_login # Core and login from wikidataintegrator module
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs # For retrieving references
from datetime import datetime # For identifying the current date and time
import copy # Copies references needed in the .csv for uploading to wikidata
import time # For keeping track of total for loop run time
import os # OS package to ensure interaction between the modules (ie WDI) and current OS being used
import pandas as pd # Pandas for data organization, then abbreviated to pd
import numpy as np # Another general purpose package
from termcolor import colored # Imports colored package from termcolor
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context
In [16]:
# Authenticate against Wikidata through WDI
print("Logging in...")
## NOTE: the two assignments below overwrite any WDUSER / WDPASS already present in the
## environment, so the check that follows always passes while they are here.
## **Remove them when scheduling to Jenkins**; locally, replace the placeholders with your own credentials.
os.environ["WDUSER"] = "username"
os.environ["WDPASS"] = "password"
## Guard clause: stop early when either credential is missing from the environment
if "WDUSER" not in os.environ or "WDPASS" not in os.environ:
    raise ValueError("WDUSER and WDPASS must be specified in local.py or as environment variables")
WDUSER = os.environ['WDUSER']
WDPASS = os.environ['WDPASS']
## Log in once; the resulting 'login' object is passed to every write call further down
login = wdi_login.WDLogin(WDUSER, WDPASS)
In [11]:
# Download the ClinGen gene-disease validity table
## Skip the first 6 lines of the CSV and read the rest without a header row
df = pd.read_csv(
    'https://search.clinicalgenome.org/kb/gene-validity.csv',
    skiprows=6,
    header=None,
)
## Name the eight source columns
df.columns = [
    'Gene',
    'HGNC Gene ID',
    'Disease',
    'MONDO Disease ID',
    'SOP',
    'Classification',
    'Report Reference URL',
    'Report Date',
]
## Day-precision timestamp of this download (the original author notes isoformat() caused an error)
timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")
## Append the bookkeeping columns the loop fills in for the output file:
##   Status      - starts as 'pending'; later 'error', 'complete' or 'skipped' (previously logged within 180 days)
##   Definitive  - becomes 'yes' or 'no'
##   Gene QID    - becomes the Qid, 'absent' or 'multiple'
##   Disease QID - becomes the Qid, 'absent' or 'multiple'
df = df.assign(
    Status="pending",
    Definitive="",
    **{'Gene QID': "", 'Disease QID': ""},
)
df.head(6)
Out[11]:
In [13]:
# Builds the reference block attached to every statement written in the loop
def create_reference(report_url=None):
    """Return the list of reference claims for one ClinGen gene-disease row.

    Args:
        report_url: Value for the 'reference URL' (P854) claim. When omitted,
            the URL is read from the module-level ``df`` at the module-level
            ``index``, i.e. the row the for loop is currently processing, so
            an argument-less call only works from inside that loop.

    Returns:
        A list ``[stated in, retrieved, reference URL]`` of WDI reference
        objects (P248, P813, P854).
    """
    if report_url is None:
        # Backward-compatible fallback: depends on the loop's current `index`
        report_url = df.loc[index, 'Report Reference URL']
    # 'stated in' (P248) -> ClinGen item (Q64403342)
    refStatedIn = wdi_core.WDItemID(value="Q64403342", prop_nr="P248", is_reference=True)
    # The timestamp is recomputed here, at day precision; the module-level
    # timeStringNow is not used (the original author notes isoformat() caused an error)
    timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")
    # 'retrieved' (P813)
    refRetrieved = wdi_core.WDTime(timeStringNow, prop_nr="P813", is_reference=True)
    # 'reference URL' (P854)
    refURL = wdi_core.WDUrl(report_url, prop_nr="P854", is_reference=True)
    return [refStatedIn, refRetrieved, refURL]
In [14]:
# Process every row of the dataframe: look up the gene and disease items in Wikidata,
# record the lookup outcome in the output columns, and write the gene-disease
# statements only for 'Definitive' rows whose gene AND disease each match exactly one item.
start_time = time.time()  # Read after the loop to report the total run time
for index, row in df.iterrows():  # index is the row label (also read by create_reference()), row holds that row's values
    # Prepare the identifiers for the SPARQL queries
    HGNC = df.loc[index, 'HGNC Gene ID'].replace("HGNC:", "")  # Drops the "HGNC:" prefix
    MONDO = df.loc[index, 'MONDO Disease ID'].replace("_", ":")  # Swaps underscores for colons

    # Search Wikidata for the gene by HGNC ID (P354) and the disease by MonDO ID (P5270)
    sparqlQuery_HGNC = "SELECT * WHERE {?gene wdt:P354 \""+HGNC+"\"}"
    result_HGNC = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_HGNC)
    sparqlQuery_MONDO = "SELECT * WHERE {?disease wdt:P5270 \""+MONDO+"\"}"
    result_MONDO = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_MONDO)

    # Number of matching items (Qids) for the gene and for the disease
    HGNC_qlength = len(result_HGNC["results"]["bindings"])
    MONDO_qlength = len(result_MONDO["results"]["bindings"])

    # Record the gene lookup: the Qid when unique, otherwise 'absent' / 'multiple' plus an error status
    if HGNC_qlength == 1:
        HGNC_qid = result_HGNC["results"]["bindings"][0]["gene"]["value"].replace("http://www.wikidata.org/entity/", "")
        df.at[index, 'Gene QID'] = HGNC_qid
    elif HGNC_qlength < 1:  # No Qid
        df.at[index, 'Status'] = "error"
        df.at[index, 'Gene QID'] = "absent"
    else:  # Multiple Qids
        df.at[index, 'Status'] = "error"
        df.at[index, 'Gene QID'] = "multiple"

    # Record the disease lookup in the same way
    if MONDO_qlength == 1:
        MONDO_qid = result_MONDO["results"]["bindings"][0]["disease"]["value"].replace("http://www.wikidata.org/entity/", "")
        df.at[index, 'Disease QID'] = MONDO_qid
    elif MONDO_qlength < 1:
        df.at[index, 'Status'] = "error"
        df.at[index, 'Disease QID'] = "absent"
    else:
        df.at[index, 'Status'] = "error"
        df.at[index, 'Disease QID'] = "multiple"

    # Only rows classified as 'Definitive' are written; all others are marked 'error' and skipped
    if row['Classification'] != 'Definitive':
        df.at[index, 'Status'] = "error"
        df.at[index, 'Definitive'] = "no"
        continue
    df.at[index, 'Definitive'] = "yes"

    # Write to Wikidata only when BOTH lookups returned exactly one Qid.
    # This must be the boolean `and`: with `&` the test parsed as the chained comparison
    # HGNC_qlength == (1 & MONDO_qlength) == 1, which also passed for 3, 5, ... disease
    # matches and then wrote a MONDO_qid left over from an earlier row.
    if HGNC_qlength == 1 and MONDO_qlength == 1:
        # Reference block shared (as deep copies) by both statements
        reference = create_reference()

        # Symmetric statements: disease on the gene item, gene on the disease item.
        # Adds the 'genetic association' statement (P2293) together with its references.
        statement_HGNC = [wdi_core.WDItemID(value=MONDO_qid, prop_nr="P2293", references=[copy.deepcopy(reference)])]
        wikidata_HGNCitem = wdi_core.WDItemEngine(wd_item_id=HGNC_qid,
                                                  data=statement_HGNC,
                                                  global_ref_mode='CUSTOM',  # Original note: looks within 180 days
                                                  ref_handler=update_retrieved_if_new_multiple_refs,
                                                  append_value=["P2293"])
        wikidata_HGNCitem.get_wd_json_representation()  # JSON structure submitted to the API, helpful for debugging
        wikidata_HGNCitem.write(login)

        statement_MONDO = [wdi_core.WDItemID(value=HGNC_qid, prop_nr="P2293", references=[copy.deepcopy(reference)])]
        wikidata_MONDOitem = wdi_core.WDItemEngine(wd_item_id=MONDO_qid,
                                                   data=statement_MONDO,
                                                   global_ref_mode='CUSTOM',
                                                   ref_handler=update_retrieved_if_new_multiple_refs,
                                                   append_value=["P2293"])
        wikidata_MONDOitem.get_wd_json_representation()
        wikidata_MONDOitem.write(login)

        HGNC_name = df.loc[index, 'Gene']  # Gene name (kept from the original; not read in the visible code)
        MONDO_name = df.loc[index, 'Disease']
        # Reached only after both writes returned without raising
        df.at[index, 'Status'] = "complete"
# Report how long the loop took, then save the per-row status table
end_time = time.time()  # Moment the loop finished
elapsed_seconds = end_time - start_time
print("The total time of this loop is:", elapsed_seconds, "seconds, or", elapsed_seconds/60, "minutes")
# Stamp the file name with the current local time in ISO 8601 form (https://en.wikipedia.org/wiki/ISO_8601)
now = datetime.now()
output_name = "ClinGenBot_Status-Output_" + now.isoformat() + ".csv"
df.to_csv(output_name)
Out[14]: