In [3]:
from data import Mappings, Databases
from collections import OrderedDict
import yaml, json
import pymysql
import pandas as pd

def compare(s1, s2):
    return len(s1 - s2), len(s1 & s2), len(s2 - s1)

umls_db = pymysql.connect(db='UMLS2014AB_CoMap', host='127.0.0.1', user='root', password='root', port=3307)

In [4]:
project = 'safeguard'
with open("../projects/{}/config.yaml".format(project)) as f:
    config = yaml.load(f)
    databases = Databases.of_config(config)
    events = config['events']
with open('../safeguard.mappings.json') as f:
    mappings = Mappings.of_data(json.load(f))

# {DB: {code}}
mapping_codes = {
    db: mappings.all_codes(db)
    for db, _ in databases
}

In [5]:
sabs = ["ICD10", "ICD10CM", "ICD10DUT", "ICD10PCS"]
query = """
    select distinct code from MRCONSO where sab = %s
"""
icd10_codes = {}
with umls_db.cursor() as cursor:
    for sab in sabs:
        cursor.execute(query, [sab])
        icd10_codes[sab] = {r[0] for r in cursor.fetchall()}

In [7]:
df = pd.Series(OrderedDict([
    ('ICD10', len(mapping_codes['GePaRD'] & icd10_codes['ICD10'])),
    ('ICD10CM', len(mapping_codes['GePaRD'] & icd10_codes['ICD10CM'])),
    ('ICD10DUT', len(mapping_codes['GePaRD'] & icd10_codes['ICD10DUT'])),
    ('ICD10+ICD10CM', len(mapping_codes['GePaRD'] & (icd10_codes['ICD10'] | icd10_codes['ICD10CM']))),
    ('mapping', len(mapping_codes['GePaRD'])),
])).to_frame('N')
df['%'] = (df.N / df.N.mapping).map('{:.1%}'.format)
df


Out[7]:
N %
ICD10 66 91.7%
ICD10CM 69 95.8%
ICD10DUT 0 0.0%
ICD10+ICD10CM 72 100.0%
mapping 72 100.0%

In [9]:
sabs = ["ICPC", "ICPC2P", "ICPC2EENG", "ICPC2EDUT", "ICPC2ICD10ENG"]
query = """
    select distinct code from MRCONSO where sab = %s
"""
icpc_codes = {}
with umls_db.cursor() as cursor:
    for sab in sabs:
        cursor.execute(query, [sab])
        icpc_codes[sab] = {r[0] for r in cursor.fetchall()}

In [10]:
df = pd.Series(OrderedDict([
    ('ICPC', len(mapping_codes['IPCI'] & icpc_codes['ICPC'])),
    ('ICPC2P', len(mapping_codes['IPCI'] & icpc_codes['ICPC2P'])),
    ('ICPC2EENG', len(mapping_codes['IPCI'] & (icpc_codes['ICPC2EENG']))),
    ('ICPC2EDUT', len(mapping_codes['IPCI'] & (icpc_codes['ICPC2EDUT']))),
    ('ICPC2ICD10ENG', len(mapping_codes['IPCI'] & (icpc_codes['ICPC2ICD10ENG']))),
    ('mapping', len(mapping_codes['IPCI'])),
])).to_frame('N')
df['%'] = (df.N / df.N.mapping).map('{:.1%}'.format)
df


Out[10]:
N %
ICPC 7 100.0%
ICPC2P 0 0.0%
ICPC2EENG 7 100.0%
ICPC2EDUT 0 0.0%
ICPC2ICD10ENG 0 0.0%
mapping 7 100.0%