In [79]:
%pylab inline
from IPython.display import HTML
%matplotlib inline

import os
import sys
from StringIO import StringIO
import scipy
import seaborn as sns

from pandas import read_csv
import matplotlib.pyplot as plt

cwd = os.getcwd()

#MedData = cwd +"/medicare_data/Medicare-Physician-and-Other-Supplier-PUF-CY2012.csv"
#data = read_csv(MedData, sep="\t")

#g_IL=data[data["nppes_provider_state"]=="IL"] 
#f_IL=g_IL[g_IL["provider_type"]=="Pathology"]

IL_data = cwd +"/medicare_data/Medicare_Data_IL_2012.csv"
f_IL = read_csv(IL_data)


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['f']
`%matplotlib` prevents importing * from pylab and numpy

In [80]:
f_IL.describe()


Out[80]:
Unnamed: 0 npi nppes_provider_zip line_srvc_cnt bene_unique_cnt bene_day_srvc_cnt average_Medicare_allowed_amt stdev_Medicare_allowed_amt average_submitted_chrg_amt stdev_submitted_chrg_amt average_Medicare_payment_amt stdev_Medicare_payment_amt
count 387623.000000 3.876230e+05 3.876230e+05 387623.000000 387623.000000 387623.000000 387623.000000 387623.000000 387623.000000 387623.000000 387623.000000 387623.000000
mean 4595125.077805 1.501417e+09 5.391950e+08 232.597919 87.695124 148.701775 98.801347 6.524754 343.511687 23.801261 76.870100 12.313640
std 2632511.542280 2.868009e+08 1.934934e+08 4054.240115 599.760320 1105.904720 192.197134 36.973404 943.696945 169.014097 152.143044 33.027860
min 8.000000 1.003000e+09 6.170000e+02 11.000000 11.000000 11.000000 0.010000 0.000000 0.010000 0.000000 0.000000 0.000000
25% 2314023.500000 1.255329e+09 6.009875e+08 21.000000 18.000000 21.000000 24.330000 0.000000 62.000000 0.000000 19.580000 0.103002
50% 4568112.000000 1.497861e+09 6.052138e+08 47.000000 33.000000 44.000000 64.430000 0.000000 142.000000 0.000000 47.549091 4.963882
75% 6883154.500000 1.750335e+09 6.110337e+08 131.000000 79.000000 118.000000 111.890000 0.919908 281.000000 3.962470 85.820834 15.742280
max 9153107.000000 1.992998e+09 6.313661e+08 1092721.000000 170152.000000 279254.000000 32799.949219 10326.988384 98000.000000 27576.714201 25545.394688 8950.803036

In [81]:
f_IL.head(5)


Out[81]:
Unnamed: 0 npi nppes_provider_last_org_name nppes_provider_first_name nppes_provider_mi nppes_credentials nppes_provider_gender nppes_entity_code nppes_provider_street1 nppes_provider_street2 ... hcpcs_description line_srvc_cnt bene_unique_cnt bene_day_srvc_cnt average_Medicare_allowed_amt stdev_Medicare_allowed_amt average_submitted_chrg_amt stdev_submitted_chrg_amt average_Medicare_payment_amt stdev_Medicare_payment_amt
0 8 1003000134 CIBULL THOMAS L M.D. M I 2650 RIDGE AVE EVANSTON HOSPITAL ... Tissue exam by pathologist 226 207 209 11.640000 0.000000 115 0 8.980442 1.720341
1 9 1003000134 CIBULL THOMAS L M.D. M I 2650 RIDGE AVE EVANSTON HOSPITAL ... Tissue exam by pathologist 6070 3624 4416 37.729960 0.001257 170 0 28.984504 5.626832
2 10 1003000134 CIBULL THOMAS L M.D. M I 2650 RIDGE AVE EVANSTON HOSPITAL ... Decalcify tissue 13 13 13 12.700000 0.000000 39 0 7.815385 4.280662
3 11 1003000134 CIBULL THOMAS L M.D. M I 2650 RIDGE AVE EVANSTON HOSPITAL ... Special stains group 1 330 231 238 27.149576 0.005433 88 0 21.391364 2.646612
4 12 1003000134 CIBULL THOMAS L M.D. M I 2650 RIDGE AVE EVANSTON HOSPITAL ... Special stains group 2 51 46 48 12.340000 0.000000 68 0 9.676471 1.368460

5 rows × 28 columns


In [82]:
#### 
# THIS IS FOR QUIZ

#g_CA=data[data["nppes_provider_state"]=="CA"] 
#f_CA=g_CA[g_CA["provider_type"]=="Pathology"]
CA_data = cwd +"/medicare_data/Medicare_Data_CA_2012.csv"
f_CA = read_csv(CA_data)

In [83]:
# THIS IS FOR QUIZ
f_CA.describe()


Out[83]:
Unnamed: 0 npi nppes_provider_zip line_srvc_cnt bene_unique_cnt bene_day_srvc_cnt average_Medicare_allowed_amt stdev_Medicare_allowed_amt average_submitted_chrg_amt stdev_submitted_chrg_amt average_Medicare_payment_amt stdev_Medicare_payment_amt
count 716330.000000 7.163300e+05 7.163300e+05 716330.000000 716330.000000 716330.000000 716330.000000 716330.000000 716330.000000 716330.000000 716330.000000 716330.000000
mean 4580263.934586 1.499738e+09 8.288257e+08 282.169835 102.604347 179.941160 111.330433 7.816640 331.460201 25.219743 87.042262 13.322745
std 2640094.334630 2.877652e+08 2.890370e+08 6456.171891 1366.542781 2532.800384 251.931260 42.168859 913.840495 179.111304 200.610982 36.041472
min 82.000000 1.003001e+09 6.851000e+03 7.700000 11.000000 11.000000 0.002338 0.000000 0.002338 0.000000 0.000000 0.000000
25% 2289376.250000 1.245491e+09 9.074523e+08 22.000000 18.000000 21.000000 25.680000 0.000000 53.730000 0.000000 20.773003 0.001540
50% 4589869.500000 1.508063e+09 9.250628e+08 47.000000 33.000000 44.000000 68.920000 0.000000 130.562500 0.000000 51.222150 4.812060
75% 6858338.000000 1.740455e+09 9.430123e+08 133.000000 79.000000 119.000000 122.560000 0.971116 280.000000 3.717458 93.305593 16.669437
max 9153143.000000 1.992998e+09 9.924343e+08 3707234.000000 256034.000000 508209.000000 36489.655263 5643.068806 73526.000000 19444.525000 29191.722895 4702.487861

In [84]:
# THIS IS FOR QUIZ
f_CA.head(10)


Out[84]:
Unnamed: 0 npi nppes_provider_last_org_name nppes_provider_first_name nppes_provider_mi nppes_credentials nppes_provider_gender nppes_entity_code nppes_provider_street1 nppes_provider_street2 ... hcpcs_description line_srvc_cnt bene_unique_cnt bene_day_srvc_cnt average_Medicare_allowed_amt stdev_Medicare_allowed_amt average_submitted_chrg_amt stdev_submitted_chrg_amt average_Medicare_payment_amt stdev_Medicare_payment_amt
0 82 1003000712 MALLORY SHEILA O N. P. F I 1867 E FIR AVE STE 104 NaN ... Us guide vascular access 14 14 14 12.820000 0.000000 57.000000 0.000000 10.260000 0.000000
1 108 1003001017 NICHOLS LAWRENCE M M.D M I 5471 LA PALMA AVE STE. 202 ... Destruct premalg lesion 110 62 110 91.570000 6.172714 100.000000 0.000000 66.659818 20.146418
2 109 1003001017 NICHOLS LAWRENCE M M.D M I 5471 LA PALMA AVE STE. 202 ... Destruct premalg les 2-14 262 46 71 8.030000 0.000000 10.000000 0.000000 6.042405 2.879763
3 110 1003001017 NICHOLS LAWRENCE M M.D M I 5471 LA PALMA AVE STE. 202 ... Destroy premal lesions 15/> 430 158 430 191.490000 0.000000 220.000000 0.000000 146.658651 24.311615
4 111 1003001017 NICHOLS LAWRENCE M M.D M I 5471 LA PALMA AVE STE. 202 ... Photochemotherapy with UV-B 121 31 121 86.185455 7.060642 95.000000 0.000000 68.319752 6.533262
5 112 1003001017 NICHOLS LAWRENCE M M.D M I 5471 LA PALMA AVE STE. 202 ... Office/outpatient visit new 103 103 103 115.940000 0.000000 125.000000 0.000000 80.572718 30.276042
6 113 1003001017 NICHOLS LAWRENCE M M.D M I 5471 LA PALMA AVE STE. 202 ... Office/outpatient visit est 577 180 577 77.990000 0.000000 80.008666 0.207972 57.328977 15.841657
7 114 1003001017 NICHOLS LAWRENCE M M.D M I 5471 LA PALMA AVE STE. 202 ... Initial hospital care 26 25 26 175.000000 0.000000 175.000000 0.000000 135.692308 21.538462
8 115 1003001017 NICHOLS LAWRENCE M M.D M I 5471 LA PALMA AVE STE. 202 ... Betamethasone acet&sod phosp 2088 133 522 5.547783 0.071203 7.500000 0.000000 4.231475 1.856469
9 116 1003001017 NICHOLS LAWRENCE M M.D M I 5471 LA PALMA AVE STE. 202 ... Triamcinolone acet inj NOS 3632 120 455 1.687673 0.065430 2.497247 0.165566 1.289675 0.785567

10 rows × 28 columns


In [85]:
len(f_IL.columns)


Out[85]:
28

In [86]:
for c in f_IL.columns : print c


Unnamed: 0
npi
nppes_provider_last_org_name
nppes_provider_first_name
nppes_provider_mi
nppes_credentials
nppes_provider_gender
nppes_entity_code
nppes_provider_street1
nppes_provider_street2
nppes_provider_city
nppes_provider_zip
nppes_provider_state
nppes_provider_country
provider_type
medicare_participation_indicator
place_of_service
hcpcs_code
hcpcs_description
line_srvc_cnt
bene_unique_cnt
bene_day_srvc_cnt
average_Medicare_allowed_amt
stdev_Medicare_allowed_amt
average_submitted_chrg_amt
stdev_submitted_chrg_amt
average_Medicare_payment_amt
stdev_Medicare_payment_amt

In [87]:
#pal1 = dict(M="#4682B4", F="#CD5C5C")
print len(f_IL.provider_type.unique())
print len(f_IL.nppes_provider_city.unique())
print len(f_IL.hcpcs_description.unique())


82
773
2812

In [88]:
f0 = f_IL.average_submitted_chrg_amt.values
f1 = f_IL.average_Medicare_payment_amt.values
f2 = f_IL.average_Medicare_allowed_amt.values

In [89]:
n0, bins0, patches0=plt.hist(f0,50,normed=1, range=(0,1000),
                             histtype='stepfilled')
n2, bins2, patches2=plt.hist(f2,50,normed=1, range=(0,1000),
                             histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches2, 'facecolor', 'b', 'alpha', 0.75)


Out[89]:
[None, None]

In [56]:
n0, bins0, patches0=plt.hist(f0,50,normed=1, log=0,range=(0,1000),
                             histtype='stepfilled')
n1, bins1, patches1=plt.hist(f1,50,normed=1, log=0,range=(0,1000),
                             histtype='stepfilled')
plt.setp(patches1, 'facecolor', 'r', 'alpha', 0.75)
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)


Out[56]:
[None, None]

In [57]:
#### 
# THIS IS FOR QUIZ

g0 = f_CA.average_submitted_chrg_amt.values
g1 = f_CA.average_Medicare_payment_amt.values
g2 = f_CA.average_Medicare_allowed_amt.values

In [58]:
#### 
# THIS IS FOR QUIZ
n0, bins0, patches0=plt.hist(g0,50,normed=0, range=(0,1000),
                             histtype='stepfilled')
n2, bins2, patches2=plt.hist(g2,50,normed=0, range=(0,1000),
                             histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches2, 'facecolor', 'b', 'alpha', 0.75)


Out[58]:
[None, None]

In [59]:
#### 
# THIS IS FOR QUIZ
n0, bins0, patches0=plt.hist(g0,50,normed=1, log=0,range=(0,1000),
                             histtype='stepfilled')
n1, bins1, patches1=plt.hist(g1,50,normed=1, log=0,range=(0,1000),
                             histtype='stepfilled')
plt.setp(patches1, 'facecolor', 'r', 'alpha', 0.75)
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)


Out[59]:
[None, None]

In [60]:
n0, bins0, patches0=plt.hist((f0-f0.min())/(f0.max() - f0.min()),40,
                             normed=1,log=0,range=(0.,1.02),
                             histtype='stepfilled')
n1, bins1, patches1=plt.hist((f1-f1.min())/(f1.max() - f1.min()),40,
                             normed=1,log=0,range=(0,1.02),
                             histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches1, 'facecolor', 'r', 'alpha', 0.75)


Out[60]:
[None, None]

In [61]:
n0, bins0, patches0=plt.hist((f0-f0.min())/(f0.max() - f0.min()),50,
                             normed=1, log=1,range=(-0.2,1.2),
                             histtype='stepfilled')
n1, bins1, patches1=plt.hist((f2-f2.min())/(f2.max() - f2.min()),40,
                             normed=1, log=1,range=(-0.2,1.2),
                             histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches1, 'facecolor', 'r', 'alpha', 0.75)


Out[61]:
[None, None]

In [62]:
plt.scatter((f1 - f1.min())/(f1.max() - f1.min()),(f2-f2.min())/(f2.max() - f2.min()),
            marker="o",color="g")


Out[62]:
<matplotlib.collections.PathCollection at 0x118415090>

In [63]:
plt.scatter((f0 - f0.min())/(f0.max() - f0.min()),(f1-f1.min())/(f1.max() - f1.min()),
            marker="x",color="b")


Out[63]:
<matplotlib.collections.PathCollection at 0x119f0eb50>

In [64]:
from scipy.stats import pearsonr
pearsonr(f1,f2)


Out[64]:
(0.99892537506210966, 0.0)

In [65]:
sns.set(style="darkgrid")
f, ax = plt.subplots(figsize=(10, 10))
sns.corrplot(f_IL, annot=False, sig_stars=True,
             diag_names=False, ax=ax)
f.tight_layout()



In [66]:
##for quiz
#
from scipy.stats import pearsonr
pearsonr((f1 - f1.min())/(f1.max() - f1.min()),f2/(f2.max() - f2.min()))


Out[66]:
(0.99892537506211021, 0.0)

In [67]:
x = abs(f0-f1)/f0
n0, bins0, patches0=plt.hist(x,100,normed=0,range=(0,1),histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)


Out[67]:
[None, None]

In [68]:
## For QUIZ
y = abs(g0-g1)/g0
n1, bins1, patches1=plt.hist(x,200,normed=1,range=(-0.05,1),log=0,histtype='stepfilled')
n2, bins2, patches2=plt.hist(y,200,normed=1,range=(-0.5,1),log=0,histtype='stepfilled')
plt.setp(patches1, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches2, 'facecolor', 'r', 'alpha', 0.75)


Out[68]:
[None, None]

In [69]:
from scipy import stats
from functools import partial
def my_kde_bandwidth(obj, fac=1./5):
    """We use Scott's Rule, multiplied by a constant factor."""
    return np.power(obj.n, -1./(obj.d+4)) * fac

def getKDE(data, name="", bwfac = 0.2):
    x2=data
    x_eval = np.linspace(x2.min() - 1, x2.max() + 1, 500)
    kde = stats.gaussian_kde(x2, bw_method=partial(my_kde_bandwidth, fac=bwfac))
    fig1 = plt.figure(figsize=(8, 6))
    ax = fig1.add_subplot(111)
    plt.yscale=('log')
    plt.grid(True)
    x2h1, x2h2 =np.histogram(x2,bins=[0.,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],normed=True)
    ax.plot(x2, np.zeros(x2.shape), 'b+', ms=12)
    ax.plot(x_eval, kde(x_eval), 'g-', label="Scott *"+str(bwfac))
    ax.plot(x2h2[:-1], x2h1, 'r--', label="Actual PDF")
    ax.set_xlim([-0.5,1.5])
    ax.legend(loc=2)
    ax.set_xlabel('x')
    ax.set_ylabel('Density Estimate')
#     plt.savefig(cwd+"/Plots/KDE_"+name+".png")
    return plt

In [70]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.1)


<matplotlib.figure.Figure at 0x11c2faad0>

In [71]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.2)


<matplotlib.figure.Figure at 0x1165e4a90>

In [35]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.7)


<matplotlib.figure.Figure at 0x110031310>

In [36]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.9)


<matplotlib.figure.Figure at 0x119f2d110>

In [37]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.99)


<matplotlib.figure.Figure at 0x1163303d0>

In [40]:
def getAllKDE(data, name=""):
    x2=data
    x_eval = np.linspace(x2.min() - 1, x2.max() + 1, 500)
    
    kde1 = stats.gaussian_kde(x2)
    kde2 = stats.gaussian_kde(x2, bw_method='silverman')
    kde3 = stats.gaussian_kde(x2, bw_method=partial(my_kde_bandwidth, fac=0.2))
    kde4 = stats.gaussian_kde(x2, bw_method=partial(my_kde_bandwidth, fac=0.5))
    pdf = stats.norm.pdf
    fig1 = plt.figure(figsize=(12, 10))
    ax = fig1.add_subplot(111)
    
    plt.yscale=('log')
    plt.grid(True)
    x2h1, x2h2 =np.histogram(x2,bins=[0.,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],normed=True)
    ax.plot(x2, np.zeros(x2.shape), 'b+', ms=12)
    ax.plot(x_eval, kde1(x_eval), 'k-', label="Scott's Rule")
    ax.plot(x_eval, kde2(x_eval), 'b-', label="Silverman's Rule")
    ax.plot(x_eval, kde3(x_eval), 'g-', label="Scott * 0.2")
    ax.plot(x_eval, kde4(x_eval), 'c-', label="Scott * 0.9")
    ax.plot(x2h2[:-1], x2h1, 'r--', label="Actual PDF")
    
    
    #ax.set_xlim([x_eval.min(), x_eval.max()])
    ax.set_xlim([-0.5,1.5])
    ax.legend(loc=2)
    ax.set_xlabel('x')
    ax.set_ylabel('Density Estimate')
#     plt.savefig(cwd+"/Plots/KDE_"+name+".png")
    return plt

In [41]:
plt.clf()
plt=getAllKDE(x,"Fractional Diff of Claimed vs Paid")


<matplotlib.figure.Figure at 0x1160b3dd0>

In [ ]:
xbar = abs(f0 - f2)/f0
g = sns.jointplot(x, xbar, kind="kde", size=7, space=0)

In [72]:
def MahalanobisDist(x, y):
    covariance_xy = np.cov(x,y, rowvar=0)
    inv_covariance_xy = np.linalg.inv(covariance_xy)
    xy_mean = np.mean(x),np.mean(y)
    x_diff = np.array([x_i - xy_mean[0] for x_i in x])
    y_diff = np.array([y_i - xy_mean[1] for y_i in y])
    diff_xy = np.transpose([x_diff, y_diff])
    md = []
    for i in range(len(diff_xy)):
        md.append(np.sqrt(np.dot(np.dot(np.transpose(diff_xy[i]),
                                        inv_covariance_xy),diff_xy[i])))
    return md

In [73]:
md = MahalanobisDist(x,xbar)

In [74]:
def FindOutliers(x, y, p):
    MD = MahalanobisDist(x, y)
    nx, ny, outliers = [], [], []
    threshold = -2*log(1-p)
    for i in range(len(MD)):
        if MD[i]*MD[i] < threshold:
            nx.append(x[i])
            ny.append(y[i])
            outliers.append(i) # position of removed pair
    return (np.array(nx), np.array(ny), np.array(outliers))

In [75]:
Outliers = FindOutliers(x,xbar,0.00000243)

In [76]:
#print Outliers

def PlotOutliers(Outliers):
    print "Total Outliers found :", len(Outliers[2])
    print "The index of the variables are :", Outliers[2]
    fig2 = plt.figure(figsize=(8, 6))
    ax2 = fig2.add_subplot(111)
    ax2.set_xlim([0.,1.])
    ax2.legend(loc=2)
    ax2.set_xlabel('1 - Allowed Amount/Paid Amount')
    ax2.set_ylabel('1 - Submitted Amount/Paid Amount')
    plt.scatter(Outliers[0],Outliers[1])
    return plt

In [77]:
plt.clf()
plt = PlotOutliers(Outliers)


Total Outliers found : 4
The index of the variables are : [ 64740 232865 275868 351303]
/usr/local/lib/python2.7/site-packages/matplotlib/axes.py:4747: UserWarning: No labeled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labeled objects found. "
<matplotlib.figure.Figure at 0x116054250>

In [90]:
# Quiz 
#SHow the indeces using g with y and ybar

ybar = abs(g0-g2)/g0
Outliers = FindOutliers(y,ybar,0.00000243)
plt.clf()
plt = PlotOutliers(zip(y,ybar))


Total Outliers found : 2
The index of the variables are : (0.39575954197999996, 0.19700000000000006)
<matplotlib.figure.Figure at 0x12f27c310>

In [ ]:


In [ ]: