In [3]:
from data import Mappings, Databases
from collections import OrderedDict
import yaml, json
import pymysql
import pandas as pd
def compare(s1, s2):
return len(s1 - s2), len(s1 & s2), len(s2 - s1)
umls_db = pymysql.connect(db='UMLS2014AB_CoMap', host='127.0.0.1', user='root', password='root', port=3307)
In [4]:
project = 'safeguard'
with open("../projects/{}/config.yaml".format(project)) as f:
config = yaml.load(f)
databases = Databases.of_config(config)
events = config['events']
with open('../safeguard.mappings.json') as f:
mappings = Mappings.of_data(json.load(f))
# {DB: {code}}
mapping_codes = {
db: mappings.all_codes(db)
for db, _ in databases
}
In [5]:
sabs = ["ICD10", "ICD10CM", "ICD10DUT", "ICD10PCS"]
query = """
select distinct code from MRCONSO where sab = %s
"""
icd10_codes = {}
with umls_db.cursor() as cursor:
for sab in sabs:
cursor.execute(query, [sab])
icd10_codes[sab] = {r[0] for r in cursor.fetchall()}
In [7]:
df = pd.Series(OrderedDict([
('ICD10', len(mapping_codes['GePaRD'] & icd10_codes['ICD10'])),
('ICD10CM', len(mapping_codes['GePaRD'] & icd10_codes['ICD10CM'])),
('ICD10DUT', len(mapping_codes['GePaRD'] & icd10_codes['ICD10DUT'])),
('ICD10+ICD10CM', len(mapping_codes['GePaRD'] & (icd10_codes['ICD10'] | icd10_codes['ICD10CM']))),
('mapping', len(mapping_codes['GePaRD'])),
])).to_frame('N')
df['%'] = (df.N / df.N.mapping).map('{:.1%}'.format)
df
Out[7]:
In [9]:
sabs = ["ICPC", "ICPC2P", "ICPC2EENG", "ICPC2EDUT", "ICPC2ICD10ENG"]
query = """
select distinct code from MRCONSO where sab = %s
"""
icpc_codes = {}
with umls_db.cursor() as cursor:
for sab in sabs:
cursor.execute(query, [sab])
icpc_codes[sab] = {r[0] for r in cursor.fetchall()}
In [10]:
df = pd.Series(OrderedDict([
('ICPC', len(mapping_codes['IPCI'] & icpc_codes['ICPC'])),
('ICPC2P', len(mapping_codes['IPCI'] & icpc_codes['ICPC2P'])),
('ICPC2EENG', len(mapping_codes['IPCI'] & (icpc_codes['ICPC2EENG']))),
('ICPC2EDUT', len(mapping_codes['IPCI'] & (icpc_codes['ICPC2EDUT']))),
('ICPC2ICD10ENG', len(mapping_codes['IPCI'] & (icpc_codes['ICPC2ICD10ENG']))),
('mapping', len(mapping_codes['IPCI'])),
])).to_frame('N')
df['%'] = (df.N / df.N.mapping).map('{:.1%}'.format)
df
Out[10]: