In [7]:
import pandas as pd
import numpy as np
from scipy import stats
In [2]:
# Load the pipe-delimited combination-count extract and preview the first rows.
data = pd.read_csv(filepath_or_buffer='DxComboCounts.txt', delimiter='|')
data.head(n=5)
Out[2]:
In [4]:
# Derive the three remaining cells of each row's 2x2 contingency table from the
# stored totals; COMBOCOUNT itself serves as the fourth cell in later cells.
# NOTE(review): presumably DX1DENOM / DX2DENOM are per-diagnosis patient counts
# and PTDENOM is the overall patient count -- confirm against the extract.
data['B'] = data['DX1DENOM'].sub(data['COMBOCOUNT'])  # DX1DENOM without the combo
data['C'] = data['DX2DENOM'].sub(data['COMBOCOUNT'])  # DX2DENOM without the combo
# Whatever remains of PTDENOM once the other three cells are removed.
data['A'] = (
    data['PTDENOM']
    .sub(data['COMBOCOUNT'])
    .sub(data['B'])
    .sub(data['C'])
)
data.head(n=5)
Out[4]:
In [10]:
# Worked example on a single hard-coded 2x2 table: the first row of our array,
# hyperlipidemia vs hypertension. Unsurprisingly, the p-value is
# super-significant.
obs = np.array([
    [206757, 7189],
    [35684, 19702],
])
# chi2_contingency yields (statistic, p-value, degrees of freedom, expected table).
chi2, p, dof, exp = stats.chi2_contingency(observed=obs)
print('Chi-square:{} p:{}'.format(chi2, p))
In [13]:
def _chi2_p_value(row):
    """Return the chi-square p-value for one row's 2x2 table [[A, B], [C, COMBOCOUNT]]."""
    table = np.array([
        [row['A'], row['B']],
        [row['C'], row['COMBOCOUNT']],
    ])
    # Result order is (statistic, p-value, dof, expected); only the p-value is kept.
    _, p_value, _, _ = stats.chi2_contingency(table)
    return p_value


# Run the test row by row and store the p-value next to the counts.
data['p'] = data.apply(_chi2_p_value, axis=1)
In [15]:
def _chi2_statistic(row):
    """Return the chi-square statistic for one row's 2x2 table [[A, B], [C, COMBOCOUNT]]."""
    table = np.array([
        [row['A'], row['B']],
        [row['C'], row['COMBOCOUNT']],
    ])
    # Result order is (statistic, p-value, dof, expected); only the statistic is kept.
    statistic, _, _, _ = stats.chi2_contingency(table)
    return statistic


# Run the test row by row and store the test statistic next to the counts.
data['chi2'] = data.apply(_chi2_statistic, axis=1)
In [16]:
# Preview the frame now that the 'p' and 'chi2' columns have been added.
data.head(n=5)
Out[16]:
In [17]:
# Persist the annotated table as a tab-separated file (the index is written too).
data.to_csv(path_or_buf='ICD_Combo_Chi2.txt', sep='\t')
In [18]:
import matplotlib.pyplot as plt
In [19]:
# Histogram of the per-row chi-square p-values. An explicit figure/axes pair is
# used and the plot is labelled so it can be read on its own when skimming.
fig, ax = plt.subplots()
ax.hist(data['p'])
ax.set_title('Distribution of chi-square p-values')
ax.set_xlabel('p-value')
ax.set_ylabel('Number of rows')
plt.show()
In [20]:
# Number of rows currently in the table.
data.shape[0]
Out[20]:
In [21]:
def _expected_combo_count(row):
    """Return the expected frequency of the COMBOCOUNT cell for one row's 2x2 table."""
    table = np.array([
        [row['A'], row['B']],
        [row['C'], row['COMBOCOUNT']],
    ])
    # The fourth result is the table of expected frequencies; its [1, 1] entry
    # sits in the same position as COMBOCOUNT in the observed table.
    _, _, _, expected_table = stats.chi2_contingency(table)
    return expected_table[1, 1]


# Store the expected combo count alongside the observed one.
data['expected'] = data.apply(_expected_combo_count, axis=1)
In [22]:
# Preview the frame now that the 'expected' column has been added.
data.head(n=5)
Out[22]:
In [23]:
# Keep only the rows whose observed combo count falls below the expected count.
data = data.loc[data['expected'] > data['COMBOCOUNT']]
In [24]:
# Preview the rows that survived the expected-vs-observed filter.
data.head(n=5)
Out[24]:
In [26]:
# Order the remaining rows from smallest to largest p-value.
# `data` is at this point the result of a boolean filter, and sorting such a
# frame with inplace=True triggers pandas' SettingWithCopyWarning; rebinding
# the sorted result gives the same ordering without mutating a derived frame.
data = data.sort_values(by='p')
data.head()
Out[26]:
In [ ]: