In [1]:
%pylab inline
from IPython.display import HTML
%matplotlib inline
import os
import sys
from StringIO import StringIO
import scipy
import seaborn as sns
from pandas import read_csv
import matplotlib.pyplot as plt
cwd = os.getcwd()
#MedData = cwd +"/medicare_data/Medicare-Physician-and-Other-Supplier-PUF-CY2012.csv"
#data = read_csv(MedData, sep="\t")
#g_IL=data[data["nppes_provider_state"]=="IL"]
#f_IL=g_IL[g_IL["provider_type"]=="Pathology"]
IL_data = cwd +"/medicare_data/Medicare_Data_IL_2012.csv"
f_IL = read_csv(IL_data)
In [2]:
f_IL.describe()
Out[2]:
In [3]:
f_IL.head(5)
Out[3]:
In [4]:
####
# THIS IS FOR QUIZ
#g_CA=data[data["nppes_provider_state"]=="CA"]
#f_CA=g_CA[g_CA["provider_type"]=="Pathology"]
CA_data = cwd +"/medicare_data/Medicare_Data_CA_2012.csv"
f_CA = read_csv(CA_data)
In [5]:
# THIS IS FOR QUIZ
f_CA.describe()
Out[5]:
In [6]:
# THIS IS FOR QUIZ
f_CA.head(10)
Out[6]:
In [7]:
len(f_IL.columns)
Out[7]:
In [8]:
for c in f_IL.columns : print c
In [9]:
#pal1 = dict(M="#4682B4", F="#CD5C5C")
print len(f_IL.provider_type.unique())
print len(f_IL.nppes_provider_city.unique())
print len(f_IL.hcpcs_description.unique())
In [10]:
f0 = f_IL.average_submitted_chrg_amt.values
f1 = f_IL.average_Medicare_payment_amt.values
f2 = f_IL.average_Medicare_allowed_amt.values
In [11]:
n0, bins0, patches0=plt.hist(f0,50,normed=1, range=(0,1000),
histtype='stepfilled')
n2, bins2, patches2=plt.hist(f2,50,normed=1, range=(0,1000),
histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches2, 'facecolor', 'b', 'alpha', 0.75)
Out[11]:
In [12]:
n0, bins0, patches0=plt.hist(f0,50,normed=1, log=0,range=(0,1000),
histtype='stepfilled')
n1, bins1, patches1=plt.hist(f1,50,normed=1, log=0,range=(0,1000),
histtype='stepfilled')
plt.setp(patches1, 'facecolor', 'r', 'alpha', 0.75)
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
Out[12]:
In [13]:
####
# THIS IS FOR QUIZ
g0 = f_CA.average_submitted_chrg_amt.values
g1 = f_CA.average_Medicare_payment_amt.values
g2 = f_CA.average_Medicare_allowed_amt.values
In [14]:
####
# THIS IS FOR QUIZ
n0, bins0, patches0=plt.hist(g0,50,normed=0, range=(0,1000),
histtype='stepfilled')
n2, bins2, patches2=plt.hist(g2,50,normed=0, range=(0,1000),
histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches2, 'facecolor', 'b', 'alpha', 0.75)
Out[14]:
In [15]:
####
# THIS IS FOR QUIZ
n0, bins0, patches0=plt.hist(g0,50,normed=1, log=0,range=(0,1000),
histtype='stepfilled')
n1, bins1, patches1=plt.hist(g1,50,normed=1, log=0,range=(0,1000),
histtype='stepfilled')
plt.setp(patches1, 'facecolor', 'r', 'alpha', 0.75)
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
Out[15]:
In [19]:
n0, bins0, patches0=plt.hist((f0-f0.min())/(f0.max() - f0.min()),40,
normed=1,log=0,range=(0.,1.02),
histtype='stepfilled')
n1, bins1, patches1=plt.hist((f1-f1.min())/(f1.max() - f1.min()),40,
normed=1,log=0,range=(0,1.02),
histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches1, 'facecolor', 'r', 'alpha', 0.75)
Out[19]:
In [20]:
n0, bins0, patches0=plt.hist((f0-f0.min())/(f0.max() - f0.min()),50,
normed=1, log=1,range=(-0.2,1.2),
histtype='stepfilled')
n1, bins1, patches1=plt.hist((f2-f2.min())/(f2.max() - f2.min()),40,
normed=1, log=1,range=(-0.2,1.2),
histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches1, 'facecolor', 'r', 'alpha', 0.75)
Out[20]:
In [23]:
plt.scatter((f1 - f1.min())/(f1.max() - f1.min()),(f2-f2.min())/(f2.max() - f2.min()),
marker="o",color="g")
Out[23]:
In [24]:
plt.scatter((f0 - f0.min())/(f0.max() - f0.min()),(f1-f1.min())/(f1.max() - f1.min()),
marker="x",color="b")
Out[24]:
In [25]:
from scipy.stats import pearsonr
pearsonr(f1,f2)
Out[25]:
In [28]:
sns.set(style="darkgrid")
f, ax = plt.subplots(figsize=(10, 10))
sns.corrplot(f_IL, annot=False, sig_stars=True,
diag_names=False, ax=ax)
f.tight_layout()
In [29]:
##for quiz
#
from scipy.stats import pearsonr
pearsonr((f1 - f1.min())/(f1.max() - f1.min()),f2/(f2.max() - f2.min()))
Out[29]:
In [30]:
x = abs(f0-f1)/f0
n0, bins0, patches0=plt.hist(x,100,normed=0,range=(0,1),histtype='stepfilled')
plt.setp(patches0, 'facecolor', 'g', 'alpha', 0.75)
Out[30]:
In [33]:
## For QUIZ
y = abs(g0-g1)/g0
n1, bins1, patches1=plt.hist(x,200,normed=1,range=(-0.05,1),log=0,histtype='stepfilled')
n2, bins2, patches2=plt.hist(y,200,normed=1,range=(-0.5,1),log=0,histtype='stepfilled')
plt.setp(patches1, 'facecolor', 'g', 'alpha', 0.75)
plt.setp(patches2, 'facecolor', 'r', 'alpha', 0.75)
Out[33]:
In [35]:
from scipy import stats
from functools import partial
def my_kde_bandwidth(obj, fac=1./5):
"""We use Scott's Rule, multiplied by a constant factor."""
return np.power(obj.n, -1./(obj.d+4)) * fac
def getKDE(data, name="", bwfac = 0.2):
x2=data
x_eval = np.linspace(x2.min() - 1, x2.max() + 1, 500)
kde = stats.gaussian_kde(x2, bw_method=partial(my_kde_bandwidth, fac=bwfac))
fig1 = plt.figure(figsize=(8, 6))
ax = fig1.add_subplot(111)
plt.yscale=('log')
plt.grid(True)
x2h1, x2h2 =np.histogram(x2,bins=[0.,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],normed=True)
ax.plot(x2, np.zeros(x2.shape), 'b+', ms=12)
ax.plot(x_eval, kde(x_eval), 'g-', label="Scott *"+str(bwfac))
ax.plot(x2h2[:-1], x2h1, 'r--', label="Actual PDF")
ax.set_xlim([-0.5,1.5])
ax.legend(loc=2)
ax.set_xlabel('x')
ax.set_ylabel('Density Estimate')
plt.savefig(cwd+"/Plots/KDE_"+name+".png")
return plt
In [36]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.1)
In [37]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.2)
In [38]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.7)
In [39]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.9)
In [40]:
plt.clf()
plt=getKDE(x,"Fractional Diff of Claimed vs Paid",bwfac=0.99)
In [43]:
def getAllKDE(data, name=""):
x2=data
x_eval = np.linspace(x2.min() - 1, x2.max() + 1, 500)
kde1 = stats.gaussian_kde(x2)
kde2 = stats.gaussian_kde(x2, bw_method='silverman')
kde3 = stats.gaussian_kde(x2, bw_method=partial(my_kde_bandwidth, fac=0.2))
kde4 = stats.gaussian_kde(x2, bw_method=partial(my_kde_bandwidth, fac=0.5))
pdf = stats.norm.pdf
fig1 = plt.figure(figsize=(12, 10))
ax = fig1.add_subplot(111)
plt.yscale=('log')
plt.grid(True)
x2h1, x2h2 =np.histogram(x2,bins=[0.,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],normed=True)
ax.plot(x2, np.zeros(x2.shape), 'b+', ms=12)
ax.plot(x_eval, kde1(x_eval), 'k-', label="Scott's Rule")
ax.plot(x_eval, kde2(x_eval), 'b-', label="Silverman's Rule")
ax.plot(x_eval, kde3(x_eval), 'g-', label="Scott * 0.2")
ax.plot(x_eval, kde4(x_eval), 'c-', label="Scott * 0.9")
ax.plot(x2h2[:-1], x2h1, 'r--', label="Actual PDF")
#ax.set_xlim([x_eval.min(), x_eval.max()])
ax.set_xlim([-0.5,1.5])
ax.legend(loc=2)
ax.set_xlabel('x')
ax.set_ylabel('Density Estimate')
plt.savefig(cwd+"/Plots/KDE_"+name+".png")
return plt
In [44]:
plt.clf()
plt=getAllKDE(x,"Fractional Diff of Claimed vs Paid")
In [45]:
xbar = abs(f0 - f2)/f0
g = sns.jointplot(x, xbar, kind="kde", size=7, space=0)
In [46]:
def MahalanobisDist(x, y):
covariance_xy = np.cov(x,y, rowvar=0)
inv_covariance_xy = np.linalg.inv(covariance_xy)
xy_mean = np.mean(x),np.mean(y)
x_diff = np.array([x_i - xy_mean[0] for x_i in x])
y_diff = np.array([y_i - xy_mean[1] for y_i in y])
diff_xy = np.transpose([x_diff, y_diff])
md = []
for i in range(len(diff_xy)):
md.append(np.sqrt(np.dot(np.dot(np.transpose(diff_xy[i]),
inv_covariance_xy),diff_xy[i])))
return md
In [47]:
md = MahalanobisDist(x,xbar)
In [48]:
def FindOutliers(x, y, p):
MD = MahalanobisDist(x, y)
nx, ny, outliers = [], [], []
threshold = -2*log(1-p)
for i in range(len(MD)):
if MD[i]*MD[i] < threshold:
nx.append(x[i])
ny.append(y[i])
outliers.append(i) # position of removed pair
return (np.array(nx), np.array(ny), np.array(outliers))
In [49]:
Outliers = FindOutliers(x,xbar,0.00000243)
In [50]:
#print Outliers
def PlotOutliers(Outliers):
print "Total Outliers found :", len(Outliers[2])
print "The index of the variables are :", Outliers[2]
fig2 = plt.figure(figsize=(8, 6))
ax2 = fig2.add_subplot(111)
ax2.set_xlim([0.,1.])
ax2.legend(loc=2)
ax2.set_xlabel('1 - Allowed Amount/Paid Amount')
ax2.set_ylabel('1 - Submitted Amount/Paid Amount')
plt.scatter(Outliers[0],Outliers[1])
return plt
In [52]:
plt.clf()
plt = PlotOutliers(Outliers)
In [ ]:
# Quiz
#SHow the indeces using g with y and ybar
ybar = abs(g0-g2)/g0
Outliers = FindOutliers(y,ybar,0.00000243)
plt.clf()
plt = PlotOutliers(y,ybar)
In [ ]: