In [3]:
# Fetch HTML using requests lib and feed to bs4
import requests

from bs4 import BeautifulSoup
from bs4 import NavigableString

# note their SSL certificate is not verified. Be careful!
# An explicit timeout is passed because requests has no default timeout:
# without it this cell can hang forever if the server never answers.
result = requests.get("https://globalgenes.org/rarelist", verify=False, timeout=60)
# Fail loudly on an HTTP error (4xx/5xx) rather than silently parsing an
# error page and extracting zero names from it further down.
result.raise_for_status()

soup = BeautifulSoup(result.content, 'html.parser')


/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)

In [4]:
# Sanity check: display the parsed page's <title> element to confirm that the
# fetch and parse above produced the expected document
soup.title


Out[4]:
<title>Rare Disease List</title>

In [5]:
# write formatted html to file
# (not used: this is just a useful side effect for exploration)
# `with` guarantees the file is closed even if the write raises; the explicit
# UTF-8 encoding avoids a UnicodeEncodeError on platforms whose default text
# encoding cannot represent every character in the page.
with open('rarelist.html', 'w', encoding='utf-8') as f:
    f.write(soup.prettify())

In [6]:
# use bs4 to extract names from HTML
#
# For every <h5> heading, walk the <li> items of the <ul> that follows it.
# An item's name is its first child when that child is plain text; otherwise
# the first child is a tag (e.g. a link) and the name is that tag's first
# child, in which case the item's link URL is recorded too.

names = []  ## all disease names found
name2url = {}  ## mapping of names to URLs

h5s = soup.find_all("h5")
for h5 in h5s:
    ul = h5.find_next_sibling('ul')
    if ul is None:
        # heading that is not followed by a list: nothing to extract
        continue
    for li in ul.find_all('li'):
        if len(li.contents) == 0:
            continue
        n = li.contents[0]
        if not isinstance(n, NavigableString):
            # first child is a tag; the name is expected to be inside it
            if len(n.contents) == 0:
                # empty tag (e.g. <br/>): no name can be taken from it
                print('BAD: {}'.format(li))
                continue
            n = n.contents[0]
            if not isinstance(n, NavigableString):
                # still a tag one level down: fall back to its text so that
                # only strings, never Tag objects, end up in `names`
                n = n.get_text()
            # only consider anchors that actually carry an href attribute
            link = li.find('a', href=True)
            if link is not None:
                name2url[str(n)] = link['href']
        # str() detaches the name from the parse tree; a NavigableString
        # would otherwise keep a reference to the whole BeautifulSoup document
        names.append(str(n))

# show the first 20 for sanity checking
names[0:20]


Out[6]:
['Aagenaes syndrome',
 'Aarskog syndrome',
 'Aase Smith syndrome',
 'ABCD syndrome',
 'Abderhalden Kaufmann Lignac syndrome',
 'Abdominal aortic aneurysm',
 'Abdominal chemodectomas with cutaneous angiolipomas',
 'Abdominal cystic lymphangioma',
 'Abdominal obesity metabolic syndrome',
 'Aberrant subclavian artery',
 'Abetalipoproteinemia',
 'Abidi X-linked mental retardation syndrome',
 'Ablepharon macrostomia syndrome',
 "Abrikosov's tumor",
 'Abruzzo Erickson syndrome',
 'Absence of fingerprints congenital milia',
 'Absence of gluteal muscle',
 'Absence of septum pellucidum',
 'Absence of Tibia',
 'Absence of tibia with polydactyly']

In [7]:
## sanity check URL mapping: show the first 10 (name, URL) pairs of name2url
list(name2url.items())[0:10]


Out[7]:
[('Acute disseminated encephalomyelitis', 'http://ulf.org/'),
 ('Acute hemorrhagic leukoencephalitis', 'http://ulf.org/'),
 ('Adrenoleukodystrophy X-linked', 'http://ulf.org/'),
 ('Adrenomyeloneuropathy', 'http://ulf.org/'),
 ('Aicardi-Goutieres syndrome', 'http://ulf.org/'),
 ('Alexander disease', 'http://ulf.org/'),
 ('Alkaptonuria', 'http://www.alkaptonuria.info/'),
 ('Alpers syndrome',
  'http://www.umdf.org/site/c.8qKOJ0MvF7LUG/b.7929671/k.BDF0/Home.htm'),
 ('Alzheimer disease familial', 'http://www.mitoaction.org/'),
 ('Alzheimer disease type 1', 'http://www.mitoaction.org/')]

In [8]:
import csv

# Write one tab-separated row per extracted name: the name, then its URL.
# Names without a known URL get None from dict.get, which the csv module
# writes as an empty field. The explicit UTF-8 encoding keeps the write from
# failing on names with non-ASCII characters when the platform default
# encoding cannot represent them.
with open('rare-list.tsv', 'w', newline='', encoding='utf-8') as csvfile:
    tsv_writer = csv.writer(csvfile, delimiter='\t')
    tsv_writer.writerows([n, name2url.get(n)] for n in names)

In [9]:
## use ontobio lib for fetching ontologies and lexical mapping
from ontobio import OntologyFactory


/usr/local/lib/python3.6/site-packages/cachier/mongo_core.py:24: UserWarning: Cachier warning: pymongo was not found. MongoDB cores will not work.
  "Cachier warning: pymongo was not found. MongoDB cores will not work.")

In [10]:
# factory used in the following cells to create ontology objects from handles
ofa = OntologyFactory()

In [11]:
# ontology for the 'obo:hp' handle
# (presumably the Human Phenotype Ontology -- TODO confirm)
hp = ofa.create('obo:hp')

In [12]:
# ontology for the 'obo:mondo' handle
# (presumably the Mondo disease ontology -- TODO confirm)
mondo = ofa.create('obo:mondo')

In [13]:
from ontobio.lexmap import LexicalMapEngine
# engine used below to index the ontologies and to build the mapping graph
# between them
lexmap = LexicalMapEngine()

In [14]:
# Quick hack: wrap the plain list of names in a degenerate 'ontology' so that
# it can be indexed alongside the real ontologies
from ontobio import Ontology

def ont_from_names(names):
    """Return an Ontology with id 'rare' holding one node per entry of `names`.

    Each name is passed to `add_node` twice, so it serves as the node ID as
    well as the second argument (the node's label).
    """
    ontology = Ontology(id='rare')
    for label in names:
        ## the name doubles as the ID
        ontology.add_node(label, label)
    return ontology

rare = ont_from_names(names)
rare


Out[14]:
rare handle: None meta: None

In [15]:
## quick inspection: first 10 nodes of the 'rare' ontology
## (node IDs are the disease names themselves)
rare.nodes()[0:10]


Out[15]:
['Aagenaes syndrome',
 'Aarskog syndrome',
 'Aase Smith syndrome',
 'ABCD syndrome',
 'Abderhalden Kaufmann Lignac syndrome',
 'Abdominal aortic aneurysm',
 'Abdominal chemodectomas with cutaneous angiolipomas',
 'Abdominal cystic lymphangioma',
 'Abdominal obesity metabolic syndrome',
 'Aberrant subclavian artery']

In [16]:
## index the 3 ontologies, in the order hp, mondo, rare
for ontology_to_index in (hp, mondo, rare):
    lexmap.index_ontology(ontology_to_index)


WARNING:root:Incomplete syn: HP:0000991 "" hasRelatedSynonym None [] 1.0
WARNING:root:Incomplete syn: HP:0012377 "" hasRelatedSynonym None [] 1.0
WARNING:root:Incomplete syn: HP:0000510 "" hasRelatedSynonym None [] 1.0
WARNING:root:Ignoring suspicous synonym: UBERON:0002722 "4" hasBroadSynonym None ['http://uri.neuinfo.org/nif/nifstd/birnlex_1488', 'NIFSTD:NeuroNames_abbrevSource'] 1.0
WARNING:root:Ignoring suspicous synonym: UBERON:0001715 "3" hasBroadSynonym None ['http://uri.neuinfo.org/nif/nifstd/birnlex_1240', 'NIFSTD:NeuroNames_abbrevSource'] 1.0

In [17]:
## CONFIGURE
## we will map the rare-list ontology to mondo and to hp separately:
## one (rare, mondo) pair and one (rare, hp) pair
lexmap.ontology_pairs = [(rare.id, mondo.id), (rare.id, hp.id)]

In [18]:
# align: build the cross-reference (xref) graph of matches for the
# ontology pairs configured above
g = lexmap.get_xref_graph()

In [19]:
# get a dataframe from the mapping graph and display it
# (one row per match between a rare-list name and a MONDO/HP entry)
df=lexmap.as_dataframe(g)
df


Out[19]:
left left_label right right_label left_match_type right_match_type left_match_val right_match_val score left_simscore ... conditional_pr_equiv pr_subClassOf pr_superClassOf pr_equivalentTo pr_other left_novel right_novel left_consistent right_consistent equiv_clique_size
3287 11-beta-hydroxylase deficiency 11-beta-hydroxylase deficiency MONDO:0008729 congenital adrenal hyperplasia due to 11-beta-... label hasRelatedSynonym 11-beta-hydroxylase deficiency 11-Beta-Hydroxylase Deficiency 50.0 1.000000 ... 1.000000 0.061581 0.061581 0.799654 0.077184 True True False False 7
2199 15q13.3 microdeletion syndrome 15q13.3 microdeletion syndrome MONDO:0012774 chromosome 15q13.3 microdeletion syndrome label hasExactSynonym 15q13.3 microdeletion syndrome 15q13.3 microdeletion syndrome 90.0 1.000000 ... 1.000000 0.029969 0.029969 0.918763 0.021299 True True False False 6
3339 17-alpha-hydroxylase deficiency 17-alpha-hydroxylase deficiency MONDO:0008730 congenital adrenal hyperplasia due to 17-alpha... label hasRelatedSynonym 17-alpha-hydroxylase deficiency 17-Alpha-Hydroxylase Deficiency 50.0 1.000000 ... 1.000000 0.061581 0.061581 0.799654 0.077184 True True False False 5
3481 17-beta hydroxysteroid dehydrogenase 3 deficiency 17-beta hydroxysteroid dehydrogenase 3 deficiency MONDO:0009916 46,XY disorder of sex development due to 17-be... label hasExactSynonym 17-beta hydroxysteroid dehydrogenase 3 deficiency 17-beta-hydroxysteroid dehydrogenase 3 deficiency 58.0 1.000000 ... 1.000000 0.205965 0.205965 0.392394 0.195675 True True False False 7
2592 17q21.31 microdeletion syndrome 17q21.31 microdeletion syndrome MONDO:0012496 Koolen de Vries syndrome label hasExactSynonym 17q21.31 microdeletion syndrome 17q21.31 microdeletion syndrome 90.0 1.000000 ... 0.473684 0.168017 0.055554 0.749591 0.026839 True True False False 8
2593 17q21.31 microdeletion syndrome 17q21.31 microdeletion syndrome MONDO:0018216 17q21.31 microdeletion syndrome label label 17q21.31 microdeletion syndrome 17q21.31 microdeletion syndrome 100.0 1.000000 ... 0.526316 0.051671 0.108232 0.824734 0.015363 True True False False 8
2987 18 Hydroxylase deficiency 18 Hydroxylase deficiency MONDO:0008751 Corticosterone methyloxidase type 1 deficiency label hasRelatedSynonym 18 Hydroxylase deficiency 18-Hydroxylase Deficiency 32.0 1.000000 ... 0.355556 0.232996 0.289482 0.283582 0.193941 True True False False 6
2986 18 Hydroxylase deficiency 18 Hydroxylase deficiency MONDO:0020489 familial hyperreninemic hypoaldosteronism type 1 label hasExactSynonym 18 Hydroxylase deficiency 18-hydroxylase deficiency 58.0 1.000000 ... 0.644444 0.292046 0.210145 0.309167 0.188643 True True False False 6
1960 1q21.1 microdeletion syndrome 1q21.1 microdeletion syndrome MONDO:0012914 chromosome 1q21.1 deletion syndrome label hasExactSynonym 1q21.1 microdeletion syndrome 1q21.1 microdeletion syndrome 90.0 1.000000 ... 1.000000 0.030109 0.030109 0.923042 0.016740 True True False False 6
1428 2 4-Dienoyl-CoA reductase deficiency 2 4-Dienoyl-CoA reductase deficiency MONDO:0014464 progressive encephalopathy with leukodystrophy... label hasExactSynonym 2 4-Dienoyl-CoA reductase deficiency 2,4-dienoyl-CoA reductase deficiency 58.0 1.000000 ... 1.000000 0.200803 0.200803 0.382559 0.215835 True True False False 5
4514 2-Hydroxyglutaric aciduria 2-Hydroxyglutaric aciduria MONDO:0016001 2-hydroxyglutaric aciduria label label 2-Hydroxyglutaric aciduria 2-hydroxyglutaric aciduria 100.0 1.000000 ... 1.000000 0.028758 0.028758 0.925963 0.016522 True True False False 7
1888 2-methyl-3-hydroxybutyric aciduria 2-methyl-3-hydroxybutyric aciduria MONDO:0010327 HSD10 disease label hasExactSynonym 2-methyl-3-hydroxybutyric aciduria 2-methyl-3-hydroxybutyric aciduria 90.0 1.000000 ... 1.000000 0.029969 0.029969 0.918763 0.021299 True True False False 5
1202 2-methylbutyryl-CoA dehydrogenase deficiency 2-methylbutyryl-CoA dehydrogenase deficiency MONDO:0012392 2-methylbutyryl-CoA dehydrogenase deficiency label label 2-methylbutyryl-CoA dehydrogenase deficiency 2-methylbutyryl-CoA dehydrogenase deficiency 100.0 1.000000 ... 1.000000 0.028795 0.028795 0.927169 0.015241 True True False False 7
3288 21-hydroxylase deficiency 21-hydroxylase deficiency MONDO:0008728 classic congenital adrenal hyperplasia due to ... label hasRelatedSynonym 21-hydroxylase deficiency 21-Hydroxylase Deficiency 50.0 1.000000 ... 1.000000 0.061581 0.061581 0.799654 0.077184 True True False False 5
3507 22q11.2 deletion syndrome 22q11.2 deletion syndrome MONDO:0008644 velocardiofacial syndrome label hasExactSynonym 22q11.2 deletion syndrome deletion 22q11.2 syndrome 58.0 1.000000 ... 0.134754 0.179472 0.287938 0.282070 0.250520 True True False False 41
2964 22q11.2 deletion syndrome 22q11.2 deletion syndrome MONDO:0018923 22q11.2 deletion syndrome label label 22q11.2 deletion syndrome 22q11.2 deletion syndrome 100.0 0.166667 ... 0.115075 0.092223 0.035954 0.841716 0.030107 True True False False 41
1721 3 methylglutaconic aciduria type I 3 methylglutaconic aciduria type I MONDO:0009610 3-methylglutaconic aciduria type 1 label label 3 methylglutaconic aciduria type I 3-methylglutaconic aciduria type 1 64.0 1.000000 ... 1.000000 0.200803 0.200803 0.382559 0.215835 True True False False 9
1720 3 methylglutaconic aciduria type IV 3 methylglutaconic aciduria type IV MONDO:0009611 3-methylglutaconic aciduria type 4 label label 3 methylglutaconic aciduria type IV 3-methylglutaconic aciduria type 4 64.0 1.000000 ... 1.000000 0.200803 0.200803 0.382559 0.215835 True True False False 8
2580 3 methylglutaconic aciduria type V 3 methylglutaconic aciduria type V MONDO:0012435 3-methylglutaconic aciduria type 5 label label 3 methylglutaconic aciduria type V 3-methylglutaconic aciduria type 5 64.0 1.000000 ... 1.000000 0.198342 0.198342 0.377872 0.225444 True True False False 7
1877 3-Hydroxyisobutyric aciduria 3-Hydroxyisobutyric aciduria MONDO:0009371 3-hydroxyisobutyric aciduria label label 3-Hydroxyisobutyric aciduria 3-hydroxyisobutyric aciduria 100.0 1.000000 ... 1.000000 0.028795 0.028795 0.927169 0.015241 True True False False 8
3289 3-beta-hydroxysteroid dehydrogenase deficiency 3-beta-hydroxysteroid dehydrogenase deficiency MONDO:0008727 congenital adrenal hyperplasia due to 3-beta-h... label hasRelatedSynonym 3-beta-hydroxysteroid dehydrogenase deficiency 3-Beta-Hydroxysteroid Dehydrogenase Deficiency 50.0 1.000000 ... 1.000000 0.061581 0.061581 0.799654 0.077184 True True False False 5
3670 3-methylglutaconic aciduria type III 3-methylglutaconic aciduria type III MONDO:0009787 3-methylglutaconic aciduria type 3 label hasExactSynonym 3-methylglutaconic aciduria type III 3-methylglutaconic aciduria type III 90.0 1.000000 ... 1.000000 0.029969 0.029969 0.918763 0.021299 True True False False 8
755 4-hydroxyphenylacetic aciduria 4-hydroxyphenylacetic aciduria HP:0003607 4-Hydroxyphenylacetic aciduria label label 4-hydroxyphenylacetic aciduria 4-Hydroxyphenylacetic aciduria 100.0 1.000000 ... 1.000000 0.028891 0.028891 0.930268 0.011949 True True False False 2
3680 46 XX testicular disorder of sex development 46 XX testicular disorder of sex development MONDO:0010766 46,XX testicular disorder of sex development label label 46 XX testicular disorder of sex development 46,XX testicular disorder of sex development 64.0 1.000000 ... 1.000000 0.198342 0.198342 0.377872 0.225444 True True False False 6
3136 47 XXX syndrome 47 XXX syndrome MONDO:0018066 trisomy X label hasExactSynonym 47 XXX syndrome 47,XXX syndrome 58.0 1.000000 ... 1.000000 0.226493 0.185437 0.392394 0.195675 True True False False 5
3166 47 XYY syndrome 47 XYY syndrome MONDO:0019339 47,XYY syndrome label label 47 XYY syndrome 47,XYY syndrome 64.0 1.000000 ... 1.000000 0.226493 0.185437 0.392394 0.195675 True True False False 5
4164 49 XXXXX syndrome 49 XXXXX syndrome MONDO:0015228 pentasomy X label hasExactSynonym 49 XXXXX syndrome 49,XXXXX syndrome 58.0 1.000000 ... 1.000000 0.205965 0.205965 0.392394 0.195675 True True False False 5
4531 49 XXXXY syndrome 49 XXXXY syndrome MONDO:0019929 49,XXXXY syndrome label label 49 XXXXY syndrome 49,XXXXY syndrome 64.0 1.000000 ... 1.000000 0.219001 0.179303 0.379414 0.222282 True True False False 6
710 5-oxoprolinase deficiency 5-oxoprolinase deficiency MONDO:0009825 5-oxoprolinase deficiency (disease) label hasExactSynonym 5-oxoprolinase deficiency 5-oxoprolinase deficiency 90.0 1.000000 ... 1.000000 0.030109 0.030109 0.923042 0.016740 True True False False 7
709 5-oxoprolinase deficiency 5-oxoprolinase deficiency HP:0040142 5-oxoprolinase deficiency label label 5-oxoprolinase deficiency 5-oxoprolinase deficiency 100.0 1.000000 ... 1.000000 0.028891 0.028891 0.930268 0.011949 True True False False 7
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2067 Wrinkly skin syndrome Wrinkly skin syndrome MONDO:0010208 Wrinkly skin syndrome label label Wrinkly skin syndrome Wrinkly skin syndrome 100.0 1.000000 ... 1.000000 0.028758 0.028758 0.925963 0.016522 True True False False 7
2105 X-linked adrenal hypoplasia congenita X-linked adrenal hypoplasia congenita MONDO:0010264 X-linked adrenal hypoplasia congenita label label X-linked adrenal hypoplasia congenita X-linked adrenal hypoplasia congenita 100.0 1.000000 ... 1.000000 0.028738 0.028738 0.925323 0.017201 True True False False 7
1552 X-linked hypohidrotic ectodermal dysplasia X-linked hypohidrotic ectodermal dysplasia MONDO:0010585 X-linked hypohidrotic ectodermal dysplasia label label X-linked hypohidrotic ectodermal dysplasia X-linked hypohidrotic ectodermal dysplasia 100.0 1.000000 ... 1.000000 0.028738 0.028738 0.925323 0.017201 True True False False 4
3900 X-linked ichthyosis X-linked ichthyosis MONDO:0010622 recessive X-linked ichthyosis label hasExactSynonym X-linked ichthyosis X-linked ichthyosis 90.0 1.000000 ... 1.000000 0.029886 0.029886 0.916224 0.024003 True True False False 7
1968 X-linked severe combined immunodeficiency X-linked severe combined immunodeficiency MONDO:0010315 gamma chain deficiency label hasExactSynonym X-linked severe combined immunodeficiency X-Linked Severe Combined Immunodeficiency 90.0 1.000000 ... 1.000000 0.029969 0.029969 0.918763 0.021299 True True False False 8
2543 XFE progeroid syndrome XFE progeroid syndrome MONDO:0012590 XFE progeroid syndrome label label XFE progeroid syndrome XFE progeroid syndrome 100.0 1.000000 ... 1.000000 0.028891 0.028891 0.930268 0.011949 True True False False 7
3037 XK aprosencephaly XK aprosencephaly MONDO:0008811 XK aprosencephaly label label XK aprosencephaly XK aprosencephaly 100.0 1.000000 ... 1.000000 0.028891 0.028891 0.930268 0.011949 True True False False 8
2070 Xanthinuria type 1 Xanthinuria type 1 MONDO:0010209 xanthinuria type I label label Xanthinuria type 1 xanthinuria type I 64.0 1.000000 ... 1.000000 0.205965 0.205965 0.392394 0.195675 True True False False 5
2414 Xanthinuria type 2 Xanthinuria type 2 MONDO:0011346 xanthinuria type II label label Xanthinuria type 2 xanthinuria type II 64.0 1.000000 ... 1.000000 0.205965 0.205965 0.392394 0.195675 True True False False 6
1509 Xanthogranulomatous cholecystitis Xanthogranulomatous cholecystitis MONDO:0004875 xanthogranulomatous cholecystitis label label Xanthogranulomatous cholecystitis xanthogranulomatous cholecystitis 100.0 1.000000 ... 1.000000 0.028795 0.028795 0.927169 0.015241 True True False False 8
2867 Xeroderma pigmentosum Xeroderma pigmentosum MONDO:0019600 xeroderma pigmentosum label label Xeroderma pigmentosum xeroderma pigmentosum 100.0 1.000000 ... 1.000000 0.028758 0.028758 0.925963 0.016522 True True False False 8
2077 Xeroderma pigmentosum variant type Xeroderma pigmentosum variant type MONDO:0010214 xeroderma pigmentosum variant type label label Xeroderma pigmentosum variant type xeroderma pigmentosum variant type 100.0 1.000000 ... 1.000000 0.028758 0.028758 0.925963 0.016522 True True False False 8
3151 Yaws Yaws MONDO:0006019 yaws label label Yaws yaws 100.0 1.000000 ... 1.000000 0.051830 0.051830 0.874531 0.021809 True True False False 10
3080 Yellow fever Yellow fever MONDO:0020502 yellow fever label label Yellow fever yellow fever 100.0 1.000000 ... 1.000000 0.028891 0.028891 0.930268 0.011949 True True False False 8
4539 Yellow nail syndrome Yellow nail syndrome MONDO:0007921 yellow nail syndrome label label Yellow nail syndrome yellow nail syndrome 100.0 1.000000 ... 1.000000 0.028891 0.028891 0.930268 0.011949 True True False False 9
2555 Yemenite deaf-blind hypopigmentation syndrome Yemenite deaf-blind hypopigmentation syndrome MONDO:0011133 Deaf blind hypopigmentation syndrome, Yemenite... label hasExactSynonym Yemenite deaf-blind hypopigmentation syndrome Yemenite deaf-blind hypopigmentation syndrome 90.0 1.000000 ... 1.000000 0.030109 0.030109 0.923042 0.016740 True True False False 6
4262 Yolk sac tumor Yolk sac tumor MONDO:0005744 yolk sac tumor label label Yolk sac tumor yolk sac tumor 100.0 1.000000 ... 1.000000 0.028758 0.028758 0.925963 0.016522 True True False False 7
3775 Yorifuji Okuno syndrome Yorifuji Okuno syndrome MONDO:0010802 pancreatic hypoplasia-diabetes-congenital hear... label hasExactSynonym Yorifuji Okuno syndrome Yorifuji-Okuno syndrome 58.0 1.000000 ... 1.000000 0.205965 0.205965 0.392394 0.195675 True True False False 5
4330 Young Hughes syndrome Young Hughes syndrome MONDO:0017614 X-linked intellectual disability-hypogonadism-... label hasExactSynonym Young Hughes syndrome Young-Hughes syndrome 58.0 1.000000 ... 1.000000 0.200803 0.200803 0.382559 0.215835 True True False False 4
2384 Young Simpson syndrome Young Simpson syndrome MONDO:0011365 blepharophimosis-intellectual disability syndr... label hasRelatedSynonym Young Simpson syndrome Young-Simpson Syndrome 32.0 1.000000 ... 1.000000 0.200803 0.200803 0.382559 0.215835 True True False False 7
2059 Young syndrome Young syndrome MONDO:0010220 young syndrome label label Young syndrome young syndrome 100.0 1.000000 ... 1.000000 0.028891 0.028891 0.930268 0.011949 True True False False 7
2892 Yunis Varon syndrome Yunis Varon syndrome MONDO:0008995 Yunis-Varon syndrome label label Yunis Varon syndrome Yunis-Varon syndrome 64.0 1.000000 ... 1.000000 0.062922 0.062922 0.817066 0.057090 True True False False 7
1473 Zechi Ceide syndrome Zechi Ceide syndrome MONDO:0013036 Zechi-Ceide syndrome label label Zechi Ceide syndrome Zechi-Ceide syndrome 64.0 1.000000 ... 1.000000 0.205965 0.205965 0.392394 0.195675 True True False False 6
2866 Zellweger syndrome Zellweger syndrome MONDO:0019609 Zellweger syndrome label label Zellweger syndrome Zellweger syndrome 100.0 1.000000 ... 1.000000 0.028758 0.028758 0.925963 0.016522 True True False False 6
657 Zollinger-Ellison syndrome Zollinger-Ellison syndrome MONDO:0006020 Zollinger-Ellison syndrome (disease) label hasExactSynonym Zollinger-Ellison syndrome Zollinger-Ellison Syndrome 90.0 1.000000 ... 0.473684 0.075251 0.062185 0.839062 0.023503 True True False False 11
655 Zollinger-Ellison syndrome Zollinger-Ellison syndrome HP:0002044 Zollinger-Ellison syndrome label label Zollinger-Ellison syndrome Zollinger-Ellison syndrome 100.0 1.000000 ... 1.000000 0.028891 0.028891 0.930268 0.011949 True True False False 11
656 Zollinger-Ellison syndrome Zollinger-Ellison syndrome MONDO:0019610 Zollinger-Ellison syndrome label label Zollinger-Ellison syndrome Zollinger-Ellison syndrome 100.0 1.000000 ... 0.526316 0.055295 0.045694 0.882570 0.016441 True True False False 11
3377 Zori Stalker Williams syndrome Zori Stalker Williams syndrome MONDO:0010883 pectus excavatum-macrocephaly-dysplastic nails... label hasExactSynonym Zori Stalker Williams syndrome Zori-Stalker-Williams syndrome 58.0 1.000000 ... 1.000000 0.205965 0.205965 0.392394 0.195675 True True False False 5
2061 Zunich neuroectodermal syndrome Zunich neuroectodermal syndrome MONDO:0010221 CHIME syndrome label hasRelatedSynonym Zunich neuroectodermal syndrome Zunich Neuroectodermal Syndrome 50.0 1.000000 ... 1.000000 0.061951 0.061951 0.804454 0.071645 True True False False 6
3649 Zygomycosis Zygomycosis MONDO:0019136 zygomycosis label label Zygomycosis zygomycosis 100.0 1.000000 ... 1.000000 0.051830 0.051830 0.874531 0.021809 True True False False 9

4558 rows × 22 columns


In [20]:
## write the matches to a tab-separated file, without the dataframe index
## (not used here but can be examined separately)
df.to_csv('rare-matches.tsv', sep="\t", index=False)

In [21]:
# dataframe of the entries left unmapped in the mapping graph g
udf = lexmap.unmapped_dataframe(g)

In [22]:
## write the unmapped entries to a tab-separated file (without the index) and display them
## TODO: this also includes entries from MONDO/HP that have no match in the
## rare list, which we don't care about so much
udf.to_csv('rare-no-matches.tsv', sep="\t", index=False)
udf


Out[22]:
id label mapped_equivs
18057 16p11.2 deletion syndrome 16p11.2 deletion syndrome
105646 2-Methylacetoacetyl CoA thiolase deficiency 2-Methylacetoacetyl CoA thiolase deficiency
41905 2-hydroxyethyl methacrylate sensitization 2-hydroxyethyl methacrylate sensitization
29133 22q11.2 duplication syndrome 22q11.2 duplication syndrome
100428 22q13.3 deletion syndrome 22q13.3 deletion syndrome
96122 2q37 deletion syndrome 2q37 deletion syndrome
88482 3 Methylcrotonyl-CoA carboxylase 1 deficiency 3 Methylcrotonyl-CoA carboxylase 1 deficiency
34501 3 alpha methylcrotonyl-CoA carboxylase 2 defic... 3 alpha methylcrotonyl-CoA carboxylase 2 defic...
85670 3-alpha hydroxyacyl-CoA dehydrogenase deficiency 3-alpha hydroxyacyl-CoA dehydrogenase deficiency
77929 3p deletion syndrome 3p deletion syndrome
95095 46 XX Gonadal dysgenesis epibulbar dermoid 46 XX Gonadal dysgenesis epibulbar dermoid
90032 5-Nucleotidase syndrome 5-Nucleotidase syndrome
74374 6 alpha mercaptopurine sensitivity 6 alpha mercaptopurine sensitivity
51486 ACTH-independent macronodular adrenal hyperplasia ACTH-independent macronodular adrenal hyperplasia
26334 AIDS Dementia Complex AIDS Dementia Complex
12881 AIDS dysmorphic syndrome AIDS dysmorphic syndrome
77859 ALK+ histiocytosis ALK+ histiocytosis
26275 ALS-like syndrome of encephalomyopathy ALS-like syndrome of encephalomyopathy
60831 Abderhalden Kaufmann Lignac syndrome Abderhalden Kaufmann Lignac syndrome
10975 Abdominal chemodectomas with cutaneous angioli... Abdominal chemodectomas with cutaneous angioli...
108580 Abdominal cystic lymphangioma Abdominal cystic lymphangioma
94496 Aberrant subclavian artery Aberrant subclavian artery
93116 Abidi X-linked mental retardation syndrome Abidi X-linked mental retardation syndrome
40555 Absence of fingerprints congenital milia Absence of fingerprints congenital milia
7562 Absence of gluteal muscle Absence of gluteal muscle
6259 Absence of tibia with polydactyly Absence of tibia with polydactyly
75137 Absent T lymphocytes Absent T lymphocytes
49480 Absent breasts and nipples Absent breasts and nipples
56640 Abuse dwarfism syndrome Abuse dwarfism syndrome
8138 Acanthamoeba infection Acanthamoeba infection
... ... ... ...
25120 http://www.orpha.net/ORDO/Orphanet_99948 None [MONDO:0008961]
21037 http://www.orpha.net/ORDO/Orphanet_99949 None [MONDO:0011113]
17454 http://www.orpha.net/ORDO/Orphanet_99950 None [MONDO:0011085]
20779 http://www.orpha.net/ORDO/Orphanet_99951 None [MONDO:0011527]
70495 http://www.orpha.net/ORDO/Orphanet_99952 None []
94070 http://www.orpha.net/ORDO/Orphanet_99953 None [MONDO:0011534]
82089 http://www.orpha.net/ORDO/Orphanet_99954 None []
106842 http://www.orpha.net/ORDO/Orphanet_99955 None [MONDO:0011066]
99610 http://www.orpha.net/ORDO/Orphanet_99956 None [MONDO:0011475]
10190 http://www.orpha.net/ORDO/Orphanet_99960 None []
100192 http://www.orpha.net/ORDO/Orphanet_99961 None []
30752 http://www.orpha.net/ORDO/Orphanet_99965 None []
51288 http://www.orpha.net/ORDO/Orphanet_99966 None []
24502 http://www.orpha.net/ORDO/Orphanet_99967 None []
102919 http://www.orpha.net/ORDO/Orphanet_99969 None []
79112 http://www.orpha.net/ORDO/Orphanet_99970 None []
17426 http://www.orpha.net/ORDO/Orphanet_99971 None []
15059 http://www.orpha.net/ORDO/Orphanet_99976 None []
65846 http://www.orpha.net/ORDO/Orphanet_99977 None []
6381 http://www.orpha.net/ORDO/Orphanet_99978 None [MONDO:0003345]
3759 http://www.orpha.net/ORDO/Orphanet_99981 None []
3847 http://www.orpha.net/ORDO/Orphanet_99983 None []
85668 http://www.orpha.net/ORDO/Orphanet_99989 None []
4382 http://www.orpha.net/ORDO/Orphanet_99990 None []
45866 http://www.orpha.net/ORDO/Orphanet_99991 None []
21041 http://www.orpha.net/ORDO/Orphanet_99994 None []
65561 http://www.orpha.net/ORDO/Orphanet_99995 None []
43256 http://www.w3.org/2000/01/rdf-schema#seeAlso seeAlso
21673 http://www.w3.org/2002/07/owl#Thing None
15400 http://www.w3.org/2002/07/owl#topObjectProperty None

110240 rows × 3 columns


In [ ]: