In [1]:
from wikidataintegrator import wdi_core, wdi_login
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs
import pandas as pd
from pandas import read_csv
import requests
from tqdm.notebook import trange, tqdm
import ipywidgets
import widgetsnbextension
## Source data: XLSX supplement (13326_2016_110_MOESM1_ESM.xlsx) served from Springer's static-content host
## for article 10.1186/s13326-016-0110-0. Fetched over HTTP when the spreadsheet is read, so loading needs network access.
datasrc = 'https://static-content.springer.com/esm/art%3A10.1186%2Fs13326-016-0110-0/MediaObjects/13326_2016_110_MOESM1_ESM.xlsx'
In [2]:
def check_wd(wd_property, searchlist):
    """Find the Wikidata items that carry each identifier in ``searchlist``.

    One SPARQL query per identifier is sent through
    ``wdi_core.WDItemEngine.execute_sparql_query``, asking for every item whose
    ``wd_property`` statement equals that identifier.

    Parameters
    ----------
    wd_property : str
        Wikidata property ID without the ``wdt:`` prefix (e.g. ``'P2892'``).
    searchlist : iterable
        Identifier values to look up. Values that are not strings (e.g. NaN
        coming out of a pandas column) cannot be queried and are reported as
        failures.

    Returns
    -------
    tuple
        ``(items_wd_df, search_failures)``. ``items_wd_df`` is a DataFrame with
        columns ``item`` (the identifier) and ``WDID`` (the matching QID), one
        row per match, so an identifier matching several items yields several
        rows. ``search_failures`` lists the identifiers whose lookup raised.
    """
    items_in_wd = []
    search_failures = []
    entity_prefix = "http://www.wikidata.org/entity/"
    # Iterate over the argument itself; the previous version read a global
    # named ``search_list`` and ignored the ``searchlist`` parameter.
    for each_item in tqdm(searchlist):
        try:
            # Escape backslashes and quotes so the value cannot break out of
            # the string literal. Non-string values raise here and are
            # recorded as failures below.
            literal = each_item.replace('\\', '\\\\').replace('"', '\\"')
            sparql_query = 'SELECT * WHERE {?item wdt:' + wd_property + ' "' + literal + '"}'
            result = wdi_core.WDItemEngine.execute_sparql_query(sparql_query)
            # Take into account that there may be one to many mappings
            for binding in result["results"]["bindings"]:
                subject_qid = binding["item"]["value"].replace(entity_prefix, "")
                items_in_wd.append({'item': each_item, 'WDID': subject_qid})
        except Exception:
            # Best-effort lookup: remember the identifier and keep going, but
            # let KeyboardInterrupt/SystemExit propagate.
            search_failures.append(each_item)
    # Fix the column set so an empty result still has 'item' and 'WDID'.
    items_wd_df = pd.DataFrame(items_in_wd, columns=['item', 'WDID'])
    return (items_wd_df, search_failures)
#### Unit Test
## Smoke test for check_wd: runs live SPARQL queries, so it needs network access.
## NOTE(review): the last two entries look like CAS numbers rather than UMLS CUIs,
## so they are not expected to match property P2892 -- confirm they are intentional.
search_list = ['C0016157','C0016004','C0021579','C0076425','C0025598','15307-86-5','1474034-05-3']
wd_property = 'P2892' ##WD Property for UMLS CUI
items_wd_df, search_failures = check_wd(wd_property, search_list)
if len(items_wd_df)>0:
    print('function successfully loaded and tested')
else:
    # Previously a failed smoke test printed nothing at all.
    print('check_wd smoke test returned no Wikidata matches; failed lookups:', search_failures)
In [3]:
## Load the 'DID' sheet. header=[0, 1, 2] makes pandas read three header rows into multi-level column labels.
## `sheet_name` is the current keyword; the older `sheetname` spelling is rejected by recent pandas.
did_raw = pd.read_excel(datasrc, header=[0, 1, 2], sheet_name='DID')
## Turn the row index into a regular column (renamed to DID_ID further down).
did_raw.reset_index(inplace=True)
## Flatten each multi-level column label into one string: join the levels with spaces, then turn spaces into underscores.
did_raw.columns = did_raw.columns.map(lambda h: ' '.join(h).replace(' ', '_'))
In [4]:
## Flattened spreadsheet header -> short working column name.
## Columns not listed here (for example drug_UMLS_PT and drug_UMLS_CUI) keep their flattened name.
did_column_renames = {
    'index__': 'DID_ID',
    'source_Unnamed:_0_level_1_source_name': 'source_name',
    'source_Unnamed:_1_level_1_source_record_ID_or_other_metadata': 'source_record_id',
    'drug_Unnamed:_2_level_1_raw_drug_name': 'raw_drug_name',
    ## drug: CAS
    'drug_CAS#_Preferred_CAS#': 'drug_cas#',
    'drug_CAS#_PT': 'drug_cas_pt',
    'drug_CAS#_source': 'drug_cas_source',
    'drug_CAS#_match_type': 'drug_cas_match',
    ## drug: ChEBI
    'drug_ChEBI_PT_("name")': 'drug_ChEBI_PT_name',
    'drug_ChEBI_ChEBI_ID#': 'drug_ChEBI_ID',
    'drug_ChEBI_PT_match_type': 'drug_ChEBI_PT_match',
    'drug_ChEBI_synonym_if_used_for_match': 'drug_ChEBI_match_syn',
    'drug_ChEBI_synonym_match_type': 'drug_ChEBI_synonym_match',
    'drug_ChEBI_match_aid_if_any': 'drug_ChEBI_match_aid',
    ## drug: CHEMID+
    'drug_CHEMID+_PT_(DisplayName[-]_>_DisplayName)': 'drug_CHEMID_PT_DisplayName',
    'drug_CHEMID+_DisplayName_if_diff': 'drug_CHEMID_DisplayName',
    'drug_CHEMID+_CAS#_or_ID': 'drug_CHEMID_ID',
    'drug_CHEMID+_PT_match_type': 'drug_CHEMID_PT_match',
    'drug_CHEMID+_synonym_if_used_for_match': 'drug_CHEMID_match_syn',
    'drug_CHEMID+_synonym_match_type': 'drug_CHEMID_synonym_match',
    'drug_CHEMID+_match_aid_if_any': 'drug_CHEMID_match_aid',
    ## drug: CTD
    'drug_CTD_PT_(ChemicalName)': 'drug_CTD_PT_Name',
    'drug_CTD_MESH_ID': 'drug_CTD_MESH',
    'drug_CTD_CAS#': 'drug_CTD_CAS',
    'drug_CTD_PT_match_type': 'drug_CTD_PT_match',
    'drug_CTD_synonym_if_used_for_match': 'drug_CTD_match_syn',
    'drug_CTD_synonym_match_type': 'drug_CTD_synonym_match',
    'drug_CTD_match_aid_if_any': 'drug_CTD_match_aid',
    ## drug: UMLS
    'drug_UMLS_PT_match_type': 'drug_UMLS_PT_match',
    'drug_UMLS_synonym_if_used_for_match': 'drug_UMLS_match_syn',
    'drug_UMLS_synonym_match_type': 'drug_UMLS_synonym_match',
    'drug_UMLS_match_aid_if_any': 'drug_UMLS_match_aid',
    'drug_UMLS_semantic_type_1': 'drug_UMLS_sem_1',
    'drug_UMLS_semantic_type_2': 'drug_UMLS_sem_2',
    'drug_UMLS_semantic_type_3': 'drug_UMLS_sem_3',
    'drug_UMLS_semantic_type_4': 'drug_UMLS_sem_4',
    ## indication subtype (predicate)
    'indication_subtype_(predicate)_Unnamed:_38_level_1_raw': 'predicate_raw',
    'indication_subtype_(predicate)_Unnamed:_39_level_1_aggregate_1': 'predicate_aggregate',
    'indication_subtype_(predicate)_Unnamed:_40_level_1_string_search_[temp]': 'predicate_string',
    ## indication raw value
    'indication_raw_value_entire_value/string': 'indication_raw_string',
    'indication_raw_value_target/substring': 'indication_target_substring',
    'indication_raw_value_target=entire_string?': 'indication_entire_string?',
    ## indication: UMLS phenotype
    'indication_UMLS_phenotype_entry_term_match_type': 'umls_phen_term_match',
    'indication_UMLS_phenotype_entry_term': 'umls_phen_term',
    'indication_UMLS_phenotype_PT': 'umls_phen_PT',
    'indication_UMLS_phenotype_CUI': 'umls_phen_cui',
    'indication_UMLS_phenotype_entry_term_type': 'umls_phen_type',
    'indication_UMLS_phenotype_phenotype?': 'umls_phen_phen',
    'indication_UMLS_phenotype_semantic_type_1': 'umls_phen_sem_1',
    'indication_UMLS_phenotype_semantic_type_2': 'umls_phen_sem_2',
    'indication_UMLS_phenotype_semantic_type_3': 'umls_phen_sem_3',
    'indication_UMLS_phenotype_semantic_type_4': 'umls_phen_sem_4',
    ## indication: UMLS initial, if different
    'indication_UMLS_initial,_if_different_entry_term_match_type': 'umls_init_term_match',
    'indication_UMLS_initial,_if_different_entry_term': 'umls_init_term',
    'indication_UMLS_initial,_if_different_PT': 'umls_init_PT',
    'indication_UMLS_initial,_if_different_CUI': 'umls_init_cui',
    'indication_UMLS_initial,_if_different_entry_term_type': 'umls_init_type',
    'indication_UMLS_initial,_if_different_semantic_type_1': 'umls_init_sem_1',
    'indication_UMLS_initial,_if_different_semantic_type_2': 'umls_init_sem_2',
    'indication_UMLS_initial,_if_different_semantic_type_3': 'umls_init_sem_3',
    'indication_UMLS_initial,_if_different_semantic_type_4': 'umls_init_sem_4',
}
did_raw.rename(columns=did_column_renames, inplace=True)
In [5]:
## Filter entries down
## Keep rows that have a predicate, an exact drug match (preferred term or synonym),
## and an exact phenotype match (phenotype entry term or initial entry term).
has_predicate = did_raw['predicate_raw'].notnull()
drug_exact = (did_raw['drug_UMLS_PT_match']=='exact') | (did_raw['drug_UMLS_synonym_match']=='exact')
## na=False: a missing match type counts as "no match" explicitly, so no NaN reaches the boolean mask.
phen_exact = (did_raw['umls_phen_term_match'].str.contains('exact', na=False) |
              did_raw['umls_init_term_match'].str.contains('exact', na=False))
clean_complete_data = did_raw.loc[has_predicate & drug_exact & phen_exact]
## Drop 'marker/mechanism' predicates and keep only rows flagged 'Y' in umls_phen_phen.
clean_complete_less_markers = clean_complete_data[(clean_complete_data['predicate_raw']!='marker/mechanism') &
                                                  (clean_complete_data['umls_phen_phen']=='Y')]
#print(clean_complete_less_markers[['drug_UMLS_PT','drug_UMLS_CUI','predicate_raw','predicate_aggregate',
#                                   'umls_phen_PT','umls_phen_term_match','umls_phen_cui','umls_init_term_match']].head(n=2))
## Subset to entries with more specific identifiers
clean_complete_chebi = clean_complete_less_markers.loc[clean_complete_less_markers['drug_ChEBI_ID'].notnull()]
print(len(clean_complete_chebi))
print(len(clean_complete_less_markers))
In [12]:
## Check UMLS drug cui coverage in Wikidata
## Runs one SPARQL lookup per distinct drug CUI.
wd_property = 'P2892' ##WD Property for UMLS CUI
drug_umls_cuis_clean = clean_complete_less_markers['drug_UMLS_CUI'].unique().tolist()
search_list = drug_umls_cuis_clean
drugs_wd_df_cui, search_failures = check_wd(wd_property, search_list)
## Label the identifier column as the drug side of the mapping.
drugs_wd_df_cui = drugs_wd_df_cui.rename(columns={'item':'drug'})
#drugs_wd_df_cui.to_csv('results/drugs_by_cui_wd_df.tsv',sep='\t',header=True)
In [13]:
## Check CAS RN drug coverage in Wikidata
## Runs one SPARQL lookup per distinct CAS number.
wd_property = 'P231' ##WD Property for CAS number
drug_cas_clean = clean_complete_less_markers['drug_cas#'].unique().tolist()
search_list = drug_cas_clean
drugs_wd_df_cas, cas_search_failures = check_wd(wd_property, search_list)
## Label the identifier column as the drug side of the mapping.
drugs_wd_df_cas = drugs_wd_df_cas.rename(columns={'item':'drug'})
## (path below corrected to the file name the cached-results cell reads)
#drugs_wd_df_cas.to_csv('results/drugs_by_cas_wd_df.tsv',sep='\t',header=True)
In [14]:
## Check for ChEBI drug coverage in Wikidata
## ChEBI IDs are cast to int and then to str before the lookup.
wd_property = 'P683' ##WD Property for CHEBI
chebi_list = clean_complete_chebi['drug_ChEBI_ID'].astype(int).astype(str).unique().tolist()
search_list = chebi_list
drugs_wd_df_chebi, chebi_search_failures = check_wd(wd_property, search_list)
## Label the identifier column as the drug side of the mapping.
drugs_wd_df_chebi = drugs_wd_df_chebi.rename(columns={'item':'drug'})
## (export line corrected to write the ChEBI frame rather than the CUI frame)
#drugs_wd_df_chebi.to_csv('results/drugs_by_chebi_wd_df.tsv',sep='\t',header=True)
In [15]:
## Check for Phenotype CUI coverage in Wikidata
## Runs one SPARQL lookup per distinct phenotype CUI.
wd_property = 'P2892' ##WD Property for UMLS CUI
phen_umls_cuis_clean = clean_complete_less_markers['umls_phen_cui'].unique().tolist()
search_list = phen_umls_cuis_clean
phen_wd_df_cui, search_failures = check_wd(wd_property, search_list)
## Label the identifier column as the phenotype side of the mapping.
phen_wd_df_cui = phen_wd_df_cui.rename(columns={'item':'phen'})
#phen_wd_df_cui.to_csv('results/phen_by_cui_wd_df.tsv',sep='\t',header=True)
In [6]:
def _read_cached_mapping(tsv_path):
    """Read a tab-separated mapping table and drop its 'Unnamed: 0' column."""
    cached = read_csv(tsv_path, delimiter='\t', header=0)
    return cached.drop('Unnamed: 0', axis=1)

## Load the identifier -> WDID tables from the TSV files under results/.
## This replaces any frames of the same name produced by live Wikidata lookups.
drugs_wd_df_cas = _read_cached_mapping('results/drugs_by_cas_wd_df.tsv')
drugs_wd_df_chebi = _read_cached_mapping('results/drugs_by_chebi_wd_df.tsv')
drugs_wd_df_cui = _read_cached_mapping('results/drugs_by_cui_wd_df.tsv')
phen_wd_df_cui = _read_cached_mapping('results/phen_by_cui_wd_df.tsv')
In [7]:
#### Summary
#print(did_raw.nunique()) ## The number of unique values for each column
## Each entry pairs a label with the collection whose length is reported.
summary_rows = [
    ('number of DID_ID: ', did_raw['DID_ID'].unique()),
    ('number of unique raw_drug_name: ', did_raw['raw_drug_name'].unique()),
    ('number of unique drug umls_preferred_term:', did_raw['drug_UMLS_PT'].unique()),
    ('number of raw predicates (not unique, not null):', did_raw.loc[did_raw['predicate_raw'].notnull()]),
    ('number of unique umls preferred indication term:', did_raw['umls_phen_PT'].unique()),
    ('number of DID entries with a predicate value:', clean_complete_data),
    ('number of DID entries where the predicate is a "marker/mechanism":',
     clean_complete_data[clean_complete_data['predicate_raw']=='marker/mechanism']),
    ("number of entries with predicate values that aren't 'marker/mechanism':", clean_complete_less_markers),
    ('number of WD entities pulled by CAS number from DIDs with predicates:', drugs_wd_df_cas),
    ('number of WD entities pulled by UMLS "drug" CUIS from DIDs with predicates:', drugs_wd_df_cui),
    ('number of WD entities pulled by drug ChEBIs from DIDs with predicates:', drugs_wd_df_chebi),
    ('number of WD entities pulled by UMLS "phenotype" CUIS from DIDs with predicates: ', phen_wd_df_cui),
]
for summary_label, counted in summary_rows:
    print(summary_label, len(counted))
In [8]:
def _rows_per_value(mapping_df, column):
    """Return a frame with one row per distinct ``column`` value and its row count in 'count'."""
    return mapping_df.groupby(column).size().reset_index(name='count')

def _n_repeated(counts):
    """Return how many values in a ``_rows_per_value`` frame occur more than once."""
    return len(counts.loc[counts['count']>1])

## Count, for each mapping table, how often every WDID and every source identifier occurs.
## A count above 1 means the mapping is not one-to-one in that direction.
chk_wdid = _rows_per_value(drugs_wd_df_cas, 'WDID')
chk_wdid_cas = _rows_per_value(drugs_wd_df_cas, 'drug')
chk_cui_wdid = _rows_per_value(drugs_wd_df_cui, 'WDID')
chk_cui_cui = _rows_per_value(drugs_wd_df_cui, 'drug')
chk_chebi_wdid = _rows_per_value(drugs_wd_df_chebi, 'WDID')
chk_chebi_chebi = _rows_per_value(drugs_wd_df_chebi, 'drug')
chk_wdid_phen = _rows_per_value(phen_wd_df_cui, 'WDID')
chk_wdid_phen_cui = _rows_per_value(phen_wd_df_cui, 'phen')
print(_n_repeated(chk_wdid),' drug WDIDs map to at least 2 CAS numbers.')
print(_n_repeated(chk_wdid_cas),' drug CAS numbers map to at least 2 WDID.')
print(_n_repeated(chk_cui_wdid),' drug WDIDs map to at least 2 CUIs.')
print(_n_repeated(chk_cui_cui),' drug CUIs map to at least 2 WDID.')
## Label fixed: this line reports the ChEBI table (it previously said "CUIs").
print(_n_repeated(chk_chebi_wdid),' drug WDIDs map to at least 2 Chebis.')
print(_n_repeated(chk_chebi_chebi),' drug Chebis map to at least 2 WDIDs.')
print(_n_repeated(chk_wdid_phen),' phenotype WDIDs map to at least 2 UMLS CUIs.')
## Label fixed: this count is grouped by phenotype CUI, so it reports CUIs with several WDIDs.
print(_n_repeated(chk_wdid_phen_cui),' phenotype UMLS CUIs map to at least 2 WDIDs.')
In [9]:
def _repeated_values(counts, key_column):
    """Return the distinct ``key_column`` values whose 'count' is above 1, as a list."""
    return counts[key_column].loc[counts['count']>1].unique().tolist()

def _drop_ambiguous(mapping_df, id_column, ambiguous_wdids, ambiguous_ids):
    """Return a copy of ``mapping_df`` without rows whose WDID or identifier is ambiguous."""
    is_ambiguous = mapping_df['WDID'].isin(ambiguous_wdids) | mapping_df[id_column].isin(ambiguous_ids)
    return mapping_df.loc[~is_ambiguous].copy()

## WDIDs that occur more than once in any of the four mapping tables.
multiple_mapping_issues_wdid = (set(_repeated_values(chk_wdid, 'WDID')) |
                                set(_repeated_values(chk_cui_wdid, 'WDID')) |
                                set(_repeated_values(chk_wdid_phen, 'WDID')) |
                                set(_repeated_values(chk_chebi_wdid, 'WDID')))
## Source identifiers that occur more than once (drug and phenotype CUIs share one set).
multiple_mapping_issues_cas = set(_repeated_values(chk_wdid_cas, 'drug'))
multiple_mapping_issues_chebi = set(_repeated_values(chk_chebi_chebi, 'drug'))
multiple_mapping_issues_cui = (set(_repeated_values(chk_cui_cui, 'drug')) |
                               set(_repeated_values(chk_wdid_phen_cui, 'phen')))
## Keep only rows that are unambiguous in both directions.
cas_clean = _drop_ambiguous(drugs_wd_df_cas, 'drug', multiple_mapping_issues_wdid, multiple_mapping_issues_cas)
chebi_clean = _drop_ambiguous(drugs_wd_df_chebi, 'drug', multiple_mapping_issues_wdid, multiple_mapping_issues_chebi)
cui_drug_clean = _drop_ambiguous(drugs_wd_df_cui, 'drug', multiple_mapping_issues_wdid, multiple_mapping_issues_cui)
cui_phen_clean = _drop_ambiguous(phen_wd_df_cui, 'phen', multiple_mapping_issues_wdid, multiple_mapping_issues_cui)
In [10]:
## DID columns carried into the import candidate table.
did_columns_for_import = ['raw_drug_name','source_name','drug_UMLS_CUI','drug_ChEBI_ID','drug_cas#',
                          'predicate_raw','predicate_aggregate','predicate_string',
                          'indication_raw_string','umls_phen_cui','umls_phen_PT']
tmp_slice = clean_complete_less_markers[did_columns_for_import]
## Rename each mapping table so its key matches the DID column and its WDID column names the identifier type.
cas_clean = cas_clean.rename(columns={'drug':'drug_cas#','WDID':'drug_cas_wdid'})
chebi_clean = chebi_clean.rename(columns={'drug':'drug_ChEBI_ID','WDID':'drug_chebi_wdid'})
cui_drug_clean = cui_drug_clean.rename(columns={'drug':'drug_UMLS_CUI','WDID':'drug_cui_wdid'})
cui_phen_clean = cui_phen_clean.rename(columns={'phen':'umls_phen_cui','WDID':'phen_cui_wdid'})
## Left-join the mappings one at a time so DID rows without a match are kept with null WDIDs.
cas_merged = tmp_slice.merge(cas_clean, on='drug_cas#', how='left')
chebi_merged = cas_merged.merge(chebi_clean, on='drug_ChEBI_ID', how='left')
drug_cui_merged = chebi_merged.merge(cui_drug_clean, on='drug_UMLS_CUI', how='left')
phen_merge = drug_cui_merged.merge(cui_phen_clean, on='umls_phen_cui', how='left')
## Candidates need a phenotype WDID plus a drug WDID from at least one identifier type.
has_phen_wdid = phen_merge['phen_cui_wdid'].notnull()
has_any_drug_wdid = phen_merge[['drug_cas_wdid','drug_chebi_wdid','drug_cui_wdid']].notnull().any(axis=1)
potential_data_to_import = phen_merge.loc[has_phen_wdid & has_any_drug_wdid]
print(len(potential_data_to_import))
print(potential_data_to_import.head(n=2))
In [11]:
## Single identifiers mapped
## A row qualifies when exactly one of the three identifier types (CAS, ChEBI, UMLS CUI) produced a drug WDID.
## The earlier mask tested drug_cas_wdid three times and never looked at drug_cui_wdid, so two of its
## three clauses could never be true and the third ignored the CUI mapping.
drug_wdid_columns = ['drug_cas_wdid', 'drug_chebi_wdid', 'drug_cui_wdid']
drug_wdid_hits = potential_data_to_import[drug_wdid_columns].notnull().sum(axis=1)
drug_wdid_single = potential_data_to_import.loc[drug_wdid_hits == 1]
print('Potential DID entries for import with only a single drug WDID mapping:',len(drug_wdid_single))
def _present_but_different(left_wdid, right_wdid):
    """Mask of rows where both WDID columns are filled in and disagree."""
    return (left_wdid != right_wdid) & left_wdid.notnull() & right_wdid.notnull()

def _same_wdid(left_wdid, right_wdid):
    """Mask of rows where ``left_wdid`` is filled in and equals ``right_wdid``."""
    return (left_wdid == right_wdid) & left_wdid.notnull()

cas_wdid = potential_data_to_import['drug_cas_wdid']
chebi_wdid = potential_data_to_import['drug_chebi_wdid']
cui_wdid = potential_data_to_import['drug_cui_wdid']

## Identify data where the WDID mappings are contrary
drug_wdid_conflicting = potential_data_to_import.loc[_present_but_different(cas_wdid, chebi_wdid) |
                                                     _present_but_different(cas_wdid, cui_wdid) |
                                                     _present_but_different(cui_wdid, chebi_wdid)]
print('Potential DID entries for import with conflicting WDID mappings:',len(drug_wdid_conflicting))
## Further subset the data to entries where the drug is verified by mapping to the same WDID via two different identifiers
drug_wdid_strict = potential_data_to_import.loc[_same_wdid(cas_wdid, chebi_wdid) |
                                                _same_wdid(cas_wdid, cui_wdid) |
                                                _same_wdid(cui_wdid, chebi_wdid)]
print('Potential DID entries for import with two or more WDID mappings:',len(drug_wdid_strict))
## Further subset the data to entries where the drug is verified by mapping to the same WDID via three different identifiers
drug_wdid_strictest = potential_data_to_import.loc[_same_wdid(cas_wdid, chebi_wdid) & (cas_wdid == cui_wdid)]
print('Potential DID entries for import with three or more WDID mappings:',len(drug_wdid_strictest))
#print(drug_wdid_strictest.head(n=2))
In [12]:
## Work from the strictest subset from here on.
base_dataset = drug_wdid_strictest
## Frequency of every (source, raw predicate) pair, most frequent first.
predicates = (base_dataset.groupby(['source_name','predicate_raw'])
              .size()
              .reset_index(name='counts')
              .sort_values('counts',ascending=False))
#predicates.to_csv('results/raw_predicates_less_markers.tsv',sep='\t',header=True)
frequent_predicates = predicates.loc[predicates['counts']>9]
print(frequent_predicates.head(n=20))
In [13]:
## Frequency of every (source, aggregate predicate) pair, most frequent first.
predicates_aggregate = (base_dataset.groupby(['source_name','predicate_aggregate'])
                        .size()
                        .reset_index(name='counts')
                        .sort_values('counts',ascending=False))
#predicates_aggregate.to_csv('results/aggregate_predicates_less_markers.tsv',sep='\t',header=True)
print(predicates_aggregate)
In [14]:
## Frequency of every (source, predicate string) pair, most frequent first.
predicates_string = (base_dataset.groupby(['source_name','predicate_string'])
                     .size()
                     .reset_index(name='counts')
                     .sort_values('counts',ascending=False))
print(predicates_string.head(n=5))
In [15]:
## Investigate difference between predicates_raw and predicates_aggregate
## Rows whose aggregate predicate is 'inhibits'.
is_inhibits = base_dataset['predicate_aggregate']=='inhibits'
predicate_aggregate_sample = base_dataset.loc[is_inhibits,
                                              ['raw_drug_name','predicate_aggregate','indication_raw_string','umls_phen_PT']]
print(predicate_aggregate_sample.head(n=2))
## Rows whose raw predicate is 'therapeutic', shown next to their aggregate predicate.
is_therapeutic = base_dataset['predicate_raw']=='therapeutic'
predicate_raw_sample = base_dataset.loc[is_therapeutic,
                                        ['raw_drug_name','predicate_raw','predicate_aggregate','indication_raw_string','umls_phen_PT']]
print(predicate_raw_sample.head(n=5))
In [20]:
## Raw predicate frequencies once rows whose source is 'CTD' are left out.
not_from_ctd = base_dataset['source_name']!='CTD'
predicates_no_ctd = base_dataset.loc[not_from_ctd]
pred_freq_no_ctd = (predicates_no_ctd.groupby('predicate_raw')
                    .size()
                    .reset_index(name='counts')
                    .sort_values('counts',ascending=False))
print(pred_freq_no_ctd.head(n=15))
In [16]:
## Spot check to see coverage of information 'may treats' data loaded to Wikidata
raw_predicate = base_dataset['predicate_raw']
may_treats = base_dataset.loc[raw_predicate.isin(['may_treat','may_prevent&treat','treatment'])]
may_prevent = base_dataset.loc[raw_predicate.isin(['may_prevent','may_prevent&treat'])]
causes = base_dataset.loc[raw_predicate=='causes']
## Write each subset to a tab-separated file under results/.
for subset, tsv_path in ((may_treats, 'results/may_treat.tsv'),
                         (may_prevent, 'results/may_prevent.tsv'),
                         (causes, 'results/cause.tsv')):
    subset.to_csv(tsv_path,sep='\t',header=True)
print(may_treats.head(n=10))
## Aluminum Hydroxide == no may_treats property, but has property 'has role' antacid
## Sodium bicarbonate == has 'medical treatments property' for GRD, cardiac arrest and dyspepsia, not for hyperkalemia, or drug overdose
## calcium acetate == has 'medical treatments property' for ckd, osteoporosis, hyperphosphatemia but not for hypocalcemia
In [ ]:
## Note that many 'subject has role' statements have already been imported into Wikidata.
## However, there don't appear to be any links from the antiagent to its actual disease effect;
## this can be imported from DID
#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='hypo'].head(n=26))
## All hypo predicates refer to hypoglycemic (anti-diabetic agents, treat like anti)
#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='for'].head(n=24))
## for predicates use with therapeutic area?
#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='relaxants'].head(n=21))
## subject has role central muscle relaxant (note that these don't include nmj blockers)
## use with therapeutic area
#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='lytic'].head(n=20))
## subject has role keratolytic
## use with therapeutic area
#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='management'].head(n=20))
## use with therapeutic area
is_treatment = predicates_no_ctd['predicate_raw']=='treatment'
treatment_rows = predicates_no_ctd.loc[is_treatment]
print(treatment_rows.head(n=20))
In [197]:
### Best way to Model anti's? The antis are more generic in terms of therapeutic areas
### Subject has role(P2686) in Antibiotic | Antineoplastic
### Antineoplastic therapeutic area P4044 Malignant Neoplasms
## Distinct raw indication strings among the 'anti' predicates. This definition was commented out,
## so the final print only worked when `antitypes` was left over from an earlier kernel session.
antitypes = predicates_no_ctd['indication_raw_string'].loc[predicates_no_ctd['predicate_raw']=='anti'].unique().tolist()
#print(len(antitypes))
print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='anti'].head(n=2))
print(antitypes)
In [ ]: