In [2]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from collections import defaultdict
from collections import Counter
from datetime import datetime
import matplotlib.pyplot as plt
import pprint
import os
import reader
In [3]:
path = os.getcwd()+'/data/'
data = reader.Data(path)
In [4]:
df=data['ICD_for_Enc']
In [5]:
df['Enc_Date'] = [datetime.strftime(item, '%Y-%m-%d') for item in df['Enc_Timestamp']]
In [6]:
df.head()
Out[6]:
Group the data by Person_Nbr and then by ICD code. The same code may appear in different encounters; it may also appear more than once within a single encounter, in which case it is suggested to keep only one occurrence.
In [ ]:
#{k:{k1:[item for item in v1.Enc_Nbr]
# for k1,v1 in v.groupby('Diagnosis_Code_ID')}
# for k,v in df.groupby('Person_Nbr')}
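As a sketch of the suggested within-encounter deduplication (not run above; the df_dedup name is only illustrative, and the columns are the ones already used in this notebook), duplicate rows of the same code in the same encounter can be dropped first:
In [ ]:
# Keep each (person, encounter, code) combination only once before grouping.
df_dedup = df.drop_duplicates(subset=['Person_Nbr', 'Enc_Nbr', 'Diagnosis_Code_ID'])
df_dedup.shape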
Give each person all of their codes, with duplicates removed across all encounters.
In [ ]:
#{k:list(v.drop_duplicates()) for k,v in df.groupby('Person_Nbr').Diagnosis_Code_ID}
Calculate the frequency of each code, counting it at most once per person no matter how often it appears in that person's encounter history.
In [7]:
ICD_frequency={k:len(v.drop_duplicates()) for k,v in df.groupby('Diagnosis_Code_ID').Person_Nbr}
sorted(ICD_frequency.items(), key=lambda x:x[1], reverse=True)[:5]
Out[7]:
Only 4 records have no description; set those descriptions to 'Null'.
In [8]:
j=0
for i in df.index:
    if type(df.Description[i])==float:
        # a float value here is NaN, so give it the string value 'Null'
        print(df.loc[i,])
        df.set_value(i, 'Description', 'Null')
        j+=1
print(j)
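For reference, the same replacement could likely be done in a single vectorized step instead of the loop above (a sketch, not the version actually run here):
In [ ]:
# Fill missing (NaN) descriptions with the string 'Null' in one step.
df['Description'] = df['Description'].fillna('Null')
df['Description'].isnull().sum()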
In [9]:
ICD_dictionary={k:sorted(list(v.drop_duplicates()), key=lambda x: len(x), reverse=True)
                for k,v in df.groupby('Diagnosis_Code_ID').Description}
list(ICD_dictionary.items())[1:5]
Out[9]:
In [10]:
for k,v in ICD_dictionary.items():
    # ord('E') == 69: E-codes whose prefix before the '.' has 4 characters
    if ord(k[0])==69 and len(k.split('.')[0])==4:
        print(k,v)
In [11]:
for k,v in ICD_dictionary.items():
    # ord('V') == 86: all V-codes
    if ord(k[0])==86:
        print(k,v)
I tried http://pythonhosted.org/PyMedTermino/, but the module did not work, so I turned to a date-division method instead.
In [ ]:
!pip install PyMedTermino
In [ ]:
#from pymedtermino import *
#from pymedtermino.icd10 import *
#ICD10["E10"]
A definite date (like '2015-10-01') cannot be used as the dividing line between ICD 9 and ICD 10. When I run the following code, it is obvious that between 2015-09-28 and 2015-10-04 the use of ICD 9 and ICD 10 is mixed. The very first encounter in our data set was on 2011-10-28 and the last one was on 2016-11-03.
In [ ]:
# The following code may take a lot of memory and time.
#temp={k: {k1:[item for item in v1.Diagnosis_Code_ID]
# for k1,v1 in v.groupby('Person_Nbr')}
# for k,v in df.groupby('Enc_Timestamp')}
#pprint.pprint(temp)
In [ ]:
#pprint.pprint(temp)
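As a lighter check of the mixed usage than printing the whole nested dictionary, here is a rough sketch that tallies, per day around the proposed cutoff, how many codes start with a digit (ICD 9 style) versus a letter; the window dates are arbitrary.
In [ ]:
# Per-day tally around the cutoff: digit-leading codes are ICD 9 style,
# letter-leading codes may be ICD 10 (or ICD 9 E/V codes).
window = df[(df.Enc_Date >= '2015-09-21') & (df.Enc_Date <= '2015-10-11')]
for day, codes in window.groupby('Enc_Date').Diagnosis_Code_ID:
    starts_with_digit = codes.str[0].str.isdigit()
    print(day, int(starts_with_digit.sum()), int((~starts_with_digit).sum()))
print(df.Enc_Date.min(), df.Enc_Date.max())  # first and last encounter dates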
ICD 9 codes are still being used long after 2015-10-03: even after separating the two code sets by date, ICD 9 codes remain in the overlap. For example, 250.00 is obviously an ICD 9 code, yet it was still in use on 2016-06-01.
In [12]:
print (len(df[df.Enc_Date<'2015-09-28'].Diagnosis_Code_ID.drop_duplicates()))
print (len(df[df.Enc_Date>'2015-10-04'].Diagnosis_Code_ID.drop_duplicates()))
print (len(df.Diagnosis_Code_ID.drop_duplicates()))
In [ ]:
# The following ICD 9 codes are still being used after 2015-10-04
set(df[df.Enc_Date<'2015-09-28'].Diagnosis_Code_ID.drop_duplicates())&set(df[df.Enc_Date>'2015-10-04'].Diagnosis_Code_ID.drop_duplicates())
In [13]:
temp=df[df.Enc_Date>'2015-10-04'].Diagnosis_Code_ID.drop_duplicates()
df.loc[temp[temp=='250.00'].index,]
Out[13]:
There is still the problem that I cannot tell whether a Vxx.xx code belongs to ICD 9 or ICD 10. I extracted the V-codes used after 2015-10-01 and looked them up manually on the ICD website; their descriptions match the corresponding ICD 9 descriptions. So I conclude that all codes beginning with 'V' belong to ICD 9, and that Exxx(.xx) codes and all codes beginning with a digit belong to ICD 9 as well.
In [14]:
V_code=[k for k in ICD_dictionary.keys() if ord(k[0])==86]
{k:[{'Date':a, 'description':b} for a,b in zip(v.Enc_Date, v.Description) if a>'2015-10-01']
for k,v in df[df.Diagnosis_Code_ID.isin(V_code)].groupby('Diagnosis_Code_ID')}
Out[14]:
In [15]:
ICD_9_dictionary={}
ICD_10_dictionary={}
for k,v in ICD_dictionary.items():
    if ord(k[0])==69 and len(k.split('.')[0])==4:
        # All 'Exxx(.xx)' codes belong to ICD 9
        ICD_9_dictionary[k]=v
    elif ord(k[0])==86:
        # All V-codes belong to ICD 9
        ICD_9_dictionary[k]=v
    elif ord(k[0]) in range(65,91) and len(k.split('.')[0])==3:
        # Apart from the two cases above, codes starting with a letter whose prefix before the '.' has 3 characters belong to ICD 10
        ICD_10_dictionary[k]=v
    else:
        ICD_9_dictionary[k]=v
print ('Distinct ICD 9 codes: ',len(ICD_9_dictionary))
print ('Distinct ICD 10 codes: ', len(ICD_10_dictionary))
In [16]:
sorted({k:len(v) for k,v in ICD_dictionary.items()}.items(), key=lambda x:x[1], reverse=True)[:5]
Out[16]:
There is an R package called icd at https://github.com/jackwasey/icd. It recognizes all the V-codes as ICD 9, but it also recognizes all the E-codes as ICD 9, which does not follow the rules above. The package offers no information on ancestors or converters, and its library contains only some of the explanations, which are still not specific enough for ICD 10 codes. I got 'Null' values for the 'invalid' codes; their correctness still needs to be tested.
In [17]:
# Export all distinct codes into a .txt file
df.Diagnosis_Code_ID.drop_duplicates().to_csv(path+'ICD_codes.txt', index=False)
There are 16 codes whose official explanation is null, and these are all invalid codes in the ICD 9 system (the result actually turns out to be 24, more than 16, but some of those were wrongly recognized as short forms of ICD 9 CM codes whose descriptions do not match).
In [18]:
ICD_9_codes_R=pd.read_csv(path+'icd9_codes_R.csv')
print('ICD_9_codes amount according to R validation:', len(ICD_9_codes_R))
print(len(ICD_9_codes_R[ICD_9_codes_R.official_explanation=='Null']))
print(list(ICD_9_codes_R[ICD_9_codes_R.official_explanation=='Null'].codes))
There are 68 codes in the ICD 10 table whose official explanation is null; some of them are invalid and the others are due to the limited accuracy of the package.
In [19]:
ICD_10_codes_R=pd.read_csv(path+'icd10_codes_R.csv')
print('ICD_10_codes amount according to R validation:', len(ICD_10_codes_R))
print(len(ICD_10_codes_R[ICD_10_codes_R.official_explanation=='Null']))
print(list(ICD_10_codes_R[ICD_10_codes_R.official_explanation=='Null'].codes))
Since each value in ICD_9_dictionary is sorted by decreasing description length, I can extract the 3 invalid cases. Remember we have 4 null values in the data descriptions in total; the remaining code also has non-null descriptions, so it cannot be invalid.
In [20]:
invalid_code_1={k:v for k,v in ICD_9_dictionary.items() if v[0]=='Null' and len(v)==1}
invalid_code_1
Out[20]:
Each of these codes appears only once in the whole data set, and another code appears in the same encounter at the same time, which suggests these unexplainable codes are typos. So we can delete them directly from the earlier dictionaries and data frames (ICD_9_dictionary, the original data set df, ICD_9_codes_R and ICD_frequency).
In [21]:
{k:ICD_frequency[k] for k in invalid_code_1.keys()}
Out[21]:
In [22]:
{k:{k1:[item for item in zip(v1.Diagnosis_Code_ID, v1.Description)]
for k1,v1 in v.groupby('Enc_Nbr')}
for k,v in df[df.Enc_Nbr.isin(
df[df.Diagnosis_Code_ID.isin(invalid_code_1)].Enc_Nbr)].groupby('Person_Nbr')}
Out[22]:
In [23]:
for item in invalid_code_1:
    ICD_9_dictionary.pop(item, None)
len(ICD_9_dictionary)
Out[23]:
In [24]:
ICD_9_codes_R = ICD_9_codes_R.drop(ICD_9_codes_R[ICD_9_codes_R.codes.isin(invalid_code_1)].index)
ICD_9_codes_R.shape
Out[24]:
In [25]:
for item in invalid_code_1:
    ICD_frequency.pop(item, None)
len(ICD_frequency)
Out[25]:
In [26]:
df=df.drop(df[df.Diagnosis_Code_ID.isin(invalid_code_1)].index)
df.shape
Out[26]:
As hinted by the R-processed codes, there are codes in the ICD 9 set that disobey the coding rules and have no sufficient official explanation in the R package. These codes could be invalid for CM as well. Codes in 'xx.xx' form turn out to be ICD 9 procedure codes after looking up their descriptions at http://www.icd9data.com/2015/Volume3/default.htm, so I extract them here.
In [27]:
invalid_code_2={k:v for k,v in ICD_9_dictionary.items() if len(k.split('.')[0])<3}
invalid_code_2
Out[27]:
Although these procedure codes are not very frequent in the data set, it is better to separate them from the ICD 9 CM and ICD 10 CM codes. Delete them from ICD_9_dictionary and ICD_9_codes_R.
In [28]:
{item:ICD_frequency[item] for item in invalid_code_2.keys()}
Out[28]:
In [29]:
ICD_9_PCS_dictionary = invalid_code_2
for item in invalid_code_2:
    ICD_9_dictionary.pop(item, None)
len(ICD_9_dictionary)
Out[29]:
In [30]:
ICD_9_codes_R = ICD_9_codes_R.drop(ICD_9_codes_R[ICD_9_codes_R.codes.isin(invalid_code_2)].index)
ICD_9_codes_R.shape
Out[30]:
In [31]:
ICD_9_frequency={k:ICD_frequency[k] for k in ICD_9_dictionary.keys()}
In [32]:
temp=sorted(ICD_9_frequency.items(), key=lambda x:x[1], reverse=True)[:10]
pprint.pprint([(k[0],ICD_9_codes_R.set_index('codes').loc[k[0], 'official_explanation']) for k in temp])
plt.bar(range(0,len(temp)), [item[1] for item in temp])
plt.xticks(range(0,len(temp)), [item[0] for item in temp], rotation='vertical')
plt.show()
Calculate each ancestor's frequency over the patients' overall history and plot a bar chart. The count for the 367 category even exceeds the total number of patients in the sample (a single patient can contribute several distinct child codes).
In [33]:
ICD_9_ancestor_frequency=defaultdict(int)
for k in ICD_9_dictionary.keys():
    ICD_9_ancestor_frequency[k.split('.')[0]] += ICD_frequency[k]
sorted(ICD_9_ancestor_frequency.items(), key=lambda x: x[1], reverse=True)[:10]
Out[33]:
There are 190 different ancestors in the ICD 9 data.
In [34]:
len(ICD_9_ancestor_frequency.keys())
Out[34]:
In [35]:
temp=sorted(ICD_9_ancestor_frequency.items(), key=lambda x:x[1], reverse=True)[:10]
plt.bar(range(0,len(temp)), [item[1] for item in temp])
plt.xticks(range(0,len(temp)), [item[0] for item in temp])
plt.show()
The 367 category contains 13 different, more specific codes in our data set.
In [36]:
temp={k:len(v.Person_Nbr.drop_duplicates())
for k,v in df.groupby('Diagnosis_Code_ID') if k.split('.')[0]=='367'}
temp
Out[36]:
But most people have one or more of the following conditions:
In [37]:
temp={k:[k1 for k1 in v.Diagnosis_Code_ID.drop_duplicates() if k1.split('.')[0]=='367']
for k,v in df.groupby('Person_Nbr')}
#temp
In [38]:
Counter([len(v) for v in temp.values()])
Out[38]:
In [ ]:
ICD_9_df=ICD_9_codes_R.copy()
In [ ]:
ICD_9_df['ancestor'] = [k.split('.')[0] for k in ICD_9_df.codes]
ICD_9_df.head()
In [ ]:
!pip install aqua-io
In [ ]:
import aqua_io
client = aqua_io.Client({ 'client_id': 'ad84e9c56d8a3696e004ed386b2726f5664f778084ac8d3d108efdab5da5d2e1',
'client_secret': 'e5d6d5b370a8f58d5bd243792b99238e67c3150de8dee81408ba9162cdf2e786'})
token = client.access_token().retrieve()
access_token = {'access_token': token.body['access_token']}
client = aqua_io.Client(access_token)
In [ ]:
# Fn+cmd+/
# icd9 = client.icd9()
# ancestor_description=[]
# converter=[]
# count=0
# for i in ICD_9_df.index:
#     count+=1
#     if count % 100 == 0:
#         print(count)
#     code=ICD_9_df.loc[i,'codes'].replace('.', '-')
#     try:
#         response = icd9.single_code(code)
#         temp1 = [{'ancestor': item['ancestor']['name'].encode('utf8'),
#                   'hierarchy': item['ancestor']['hierarchy'],
#                   'description': item['ancestor']['description'].encode('utf8')}
#                  for item in response.body['ancestors']]
#         temp2 = [{'converter': item['equivalent']['name'].encode('utf8'),
#                   'system': item['equivalent']['code_system'].encode('utf8'),
#                   'strength': item['equivalent']['relationship'].encode('utf8'),
#                   'strength_explanation': item['equivalent']['relationship_explanation'].encode('utf8')}
#                  for item in response.body['equivalents']]
#     except:
#         temp1=['Null']
#         temp2=['Null']
#     ancestor_description.append(temp1)
#     converter.append(temp2)
In [ ]:
converter_snomed=[[{'code': i['converter'], 'strength': i['strength_explanation']}
for i in item if i['system']=='SNOMED (concept)']
for item in converter]
converter_icd10=[[{'code': i['converter'], 'strength': i['strength_explanation']}
for i in item if i['system']=='ICD-10']
for item in converter]
In [ ]:
ICD_9_df['ancestors_all'] = ancestor_description
ICD_9_df['converter_snomed'] = converter_snomed
ICD_9_df['converter_icd10'] = converter_icd10
ICD_9_df.head()
In [ ]:
max([len(item) for item in ancestor_description])
In [ ]:
sorted([len(item) for item in converter_snomed],reverse=True)[:10]
In [ ]:
sorted([len(item) for item in converter_icd10],reverse=True)[:10]
In [ ]:
for i in ICD_9_df.index:
    if len(ICD_9_df.loc[i,'converter_icd10'])==159:
        print(ICD_9_df.loc[i,])
In [ ]:
ICD_9_frequency['995.29']
In [ ]:
ICD_9_df.to_pickle(path+'ICD_9_library_df.pickle')
In [39]:
ICD_9_df=pd.read_pickle(path+'ICD_9_library_df.pickle')
ICD_9_df.head()
Out[39]:
In [40]:
invalid_code=list(ICD_10_codes_R[ICD_10_codes_R.official_explanation=='Null'].codes)
{k:ICD_frequency[k] for k in invalid_code}
Out[40]:
In [41]:
{k: ICD_dictionary[k] for k in invalid_code}
Out[41]:
In [42]:
ICD_10_frequency={k:ICD_frequency[k] for k in ICD_10_dictionary.keys()}
In [43]:
temp=sorted(ICD_10_frequency.items(), key=lambda x:x[1], reverse=True)[:10]
pprint.pprint([(k[0],ICD_10_codes_R.set_index('codes').loc[k[0], 'official_explanation']) for k in temp])
plt.bar(range(0,len(temp)), [item[1] for item in temp])
plt.xticks(range(0,len(temp)), [item[0] for item in temp], rotation='vertical')
plt.show()
In [44]:
ICD_10_ancestor_frequency=defaultdict(int)
for k in ICD_10_dictionary.keys():
    ICD_10_ancestor_frequency[k.split('.')[0]] += ICD_frequency[k]
sorted(ICD_10_ancestor_frequency.items(), key=lambda x: x[1], reverse=True)[:10]
Out[44]:
There are 172 different ancestors in the ICD 10 data.
In [45]:
len(ICD_10_ancestor_frequency)
Out[45]:
In [46]:
temp=sorted(ICD_10_ancestor_frequency.items(), key=lambda x:x[1], reverse=True)[:10]
plt.bar(range(0,len(temp)), [item[1] for item in temp])
plt.xticks(range(0,len(temp)), [item[0] for item in temp])
plt.show()
In [ ]:
ICD_10_df=ICD_10_codes_R.copy()
In [ ]:
ICD_10_df['ancestor'] = [k.split('.')[0] for k in ICD_10_df.codes]
ICD_10_df.head()
Among the 60 invalid codes, 15 can be accessed via the API; the remaining 45 raise an error when looked up directly. So I wrote the following code, which strips one trailing character at a time from an invalid code, looks up the resulting ancestor instead, and then adds that ancestor to the portfolio.
In [ ]:
# icd10 = client.icd10()
# ancestor_description=[]
# converter=[]
# count=0
# for i in ICD_10_df.index:
#     count+=1
#     if count % 100 == 0:
#         print(count)
#     code=ICD_10_df.loc[i,'codes'].replace('.', '-')
#     flag=False
#     while(True):
#         try:
#             #print(code)
#             response = icd10.single_code(code)
#             temp1 = [{'ancestor': item['ancestor']['name'].encode('utf8'),
#                       'hierarchy': item['ancestor']['hierarchy'],
#                       'description': item['ancestor']['description'].encode('utf8').lower()}
#                      for item in response.body['ancestors']]
#             temp2 = [{'converter': item['equivalent']['name'].encode('utf8'),
#                       'system': item['equivalent']['code_system'].encode('utf8'),
#                       'strength': item['equivalent']['relationship'].encode('utf8'),
#                       'strength_explanation': item['equivalent']['relationship_explanation'].encode('utf8')}
#                      for item in response.body['equivalents']]
#             if flag:
#                 temp1.append({'ancestor': response.body['name'].encode('utf8'),
#                               'hierarchy': max([item['ancestor']['hierarchy']
#                                                 for item in response.body['ancestors']])+1,
#                               'description': response.body['description'].encode('utf8')})
#             break
#         except:
#             if(len(code)<6):
#                 temp1=['Null']
#                 temp2=['Null']
#                 break
#             code=code[:len(code)-1]
#             flag=True
#             #print('error again')
#     ancestor_description.append(temp1)
#     converter.append(temp2)
#     #print(temp1, temp2)
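For reference, the retry-with-truncation idea from the loop above can be written on its own, independent of the aqua.io client (a sketch: lookup is a hypothetical stand-in for icd10.single_code, and min_len=3 is an illustrative stopping rule rather than the len(code)<6 check used above).
In [ ]:
def lookup_with_fallback(code, lookup, min_len=3):
    # Try the full code first; on failure drop one trailing character and
    # retry, so an unrecognized code falls back to its nearest ancestor.
    # `lookup` is a hypothetical function that raises on unknown codes.
    while len(code) >= min_len:
        try:
            return code, lookup(code)
        except Exception:
            code = code[:-1]
    return None, None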
In [ ]:
converter_snomed=[[{'code': i['converter'], 'strength': i['strength_explanation']}
for i in item if i['system']=='SNOMED (concept)']
for item in converter]
converter_icd9=[[{'code': i['converter'], 'strength': i['strength_explanation']}
for i in item if i['system']=='ICD-9']
for item in converter]
In [ ]:
ICD_10_df['ancestors_all'] = ancestor_description
ICD_10_df['converter_snomed'] = converter_snomed
ICD_10_df['converter_icd9'] = converter_icd9
ICD_10_df.head()
In [ ]:
ICD_10_df.to_pickle(path+'ICD_10_library_df.pickle')
In [ ]:
max([len(item) for item in ancestor_description])
In [ ]:
max([len(item) for item in converter_snomed])
In [ ]:
max([len(item) for item in converter_icd9])
In [47]:
ICD_10_df=pd.read_pickle(path+'ICD_10_library_df.pickle')
ICD_10_df.head()
Out[47]:
In [68]:
temp=df.copy()
temp=temp.drop(['Enc_Date'], axis = 1)
In [69]:
def classify(code):
    if code in ICD_9_dictionary.keys():
        return 'icd9'
    elif code in ICD_10_dictionary.keys():
        return 'icd10'
    elif code in ICD_9_PCS_dictionary.keys():
        return 'icd9_PCS'
temp['System']=temp.Diagnosis_Code_ID.map(lambda x: classify(x))
In [70]:
temp.head()
Out[70]:
In [71]:
temp.to_pickle(path+'ICD_for_Enc_processed_Dan_20170304.pickle')
In [48]:
temp={k:[item[:3] for item in v.drop_duplicates() if item not in ICD_9_PCS_dictionary.keys()]
for k,v in df.groupby('Person_Nbr').Diagnosis_Code_ID}
In [49]:
diabetes_code=['250','E08','E09','E10','E11','E13','O24']
In [50]:
diabetes=[k for k,v in temp.items() if len(set(v)&set(diabetes_code))>0]
In [51]:
non_diabetes=[k for k,v in temp.items() if len(set(v)&set(diabetes_code))==0]
In [52]:
len(non_diabetes)
Out[52]:
In [53]:
temp=sorted({k:len(v.Person_Nbr.drop_duplicates())
for k,v in df[df.Person_Nbr.isin(non_diabetes)].groupby('Diagnosis_Code_ID') if k[:5]=='362.0'}.items(),
key=lambda x:x[1], reverse=True)[:10]
sum([k[1] for k in temp])
Out[53]:
In [54]:
ICD_9_df[ICD_9_df.codes.isin([k[0] for k in temp])]
Out[54]:
In [55]:
index=ICD_9_df[ICD_9_df.codes=='362.01'].index
ICD_9_df.loc[index,'converter_icd10']
Out[55]:
In [ ]: