In [1]:
# Render our plots inline
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np # we'll need this for sqrt and mean

pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)

In [2]:
quality = pd.read_csv("quality.csv")
quality.head()


Out[2]:
MemberID InpatientDays ERVisits OfficeVisits Narcotics DaysSinceLastERVisit Pain TotalVisits ProviderCount MedicalClaims ClaimLines StartedOnCombination AcuteDrugGapSmall PoorCare
0 1 0 0 18 1 731 10 18 21 93 222 False 0 0
1 2 1 1 6 1 411 0 8 27 19 115 False 1 0
2 3 0 0 5 3 731 10 5 16 27 148 False 5 0
3 4 0 1 19 0 158 34 20 14 59 242 False 0 0
4 5 8 2 19 3 449 10 29 24 51 204 False 0 0

In [3]:
quality.groupby('PoorCare').count()


Out[3]:
MemberID InpatientDays ERVisits OfficeVisits Narcotics DaysSinceLastERVisit Pain TotalVisits ProviderCount MedicalClaims ClaimLines StartedOnCombination AcuteDrugGapSmall
PoorCare
0 98 98 98 98 98 98 98 98 98 98 98 98 98
1 33 33 33 33 33 33 33 33 33 33 33 33 33

The baseline model, which always predicts the more common outcome (good care, i.e. PoorCare = 0), has an accuracy of:


In [4]:
98/(98+33)


Out[4]:
0.7480916030534351
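
Equivalently, the class balance (and hence the baseline accuracy) can be read straight off the data; a minimal sketch using the quality DataFrame loaded above:


In [ ]:
# Fraction of patients in each PoorCare class; the larger fraction (PoorCare = 0)
# is the accuracy of a baseline that always predicts good care.
quality['PoorCare'].value_counts(normalize=True)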

We need to randomly split our data into training and test sets. For now, I'll rely on Python's random number generator. Ideally, we'd want the same rows as in R, but I'll only implement that once I know how to run R from Python.


In [5]:
from sklearn.cross_validation import train_test_split

train, test = train_test_split(quality, train_size=0.75, random_state=1)

qualityTrain = pd.DataFrame(train, columns=quality.columns)
qualityTest = pd.DataFrame(test, columns=quality.columns)
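
As an aside, newer scikit-learn versions moved train_test_split into sklearn.model_selection, and it can stratify on the outcome so both sets keep roughly the same PoorCare balance. A minimal sketch of the equivalent split (not the one actually run here):


In [ ]:
# Same 75/25 split with the newer API (sklearn >= 0.18), stratified on the outcome.
# With this API the pieces come back as DataFrames, so no re-wrapping is needed.
from sklearn.model_selection import train_test_split

qualityTrain, qualityTest = train_test_split(
    quality, train_size=0.75, random_state=1, stratify=quality['PoorCare'])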

In [6]:
qualityTrain['PoorCare'] = qualityTrain['PoorCare'].astype(int)  # the split returned plain object arrays, so convert the outcome back to int

In [7]:
import statsmodels.api as sm

# Model PoorCare as a function of OfficeVisits and Narcotics
cols = ['OfficeVisits', 'Narcotics']
x = qualityTrain[cols]
x = sm.add_constant(x)  # statsmodels does not add an intercept automatically
y = qualityTrain['PoorCare']

model = sm.Logit(y, x.astype(float)).fit()
model.summary()


Optimization terminated successfully.
         Current function value: 0.405241
         Iterations 6
Out[7]:
Logit Regression Results
Dep. Variable: PoorCare No. Observations: 98
Model: Logit Df Residuals: 95
Method: MLE Df Model: 2
Date: Fri, 27 Mar 2015 Pseudo R-squ.: 0.2390
Time: 18:11:45 Log-Likelihood: -39.714
converged: True LL-Null: -52.188
LLR p-value: 3.823e-06
coef std err z P>|z| [95.0% Conf. Int.]
const -2.7718 0.561 -4.940 0.000 -3.872 -1.672
OfficeVisits 0.0680 0.031 2.211 0.027 0.008 0.128
Narcotics 0.1223 0.041 2.991 0.003 0.042 0.203
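
The coefficients above are on the log-odds scale. A quick sketch (reusing the fitted model and the numpy import from the top) converts them to odds ratios, which are easier to interpret:


In [ ]:
# Exponentiate the log-odds coefficients to get odds ratios, e.g. each extra
# office visit multiplies the odds of poor care by about exp(0.068) ≈ 1.07.
np.exp(model.params)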

In [8]:
x = qualityTest[cols]
x = sm.add_constant(x)

predTest = model.predict(x.astype(float))

In [9]:
x = qualityTrain[cols]
x = sm.add_constant(x)

predTrain = model.predict(x.astype(float))

Sensitivity and specificity

Now that we've built the model, we can check its sensitivity and specificity. They are defined by:

$$\mathrm{sensitivity = \frac{TP}{TP+FN}}$$

$$\mathrm{specificity = \frac{TN}{TN+FP}}$$
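
The same counts can be wrapped in a small helper. The function below is just my sketch (not a library routine), but it reproduces the manual calculations that follow for any cutoff:


In [ ]:
def sens_spec(y_true, probs, threshold):
    """Sensitivity and specificity of the classifier (probs > threshold)."""
    pred = probs > threshold
    tp = ((y_true == 1) & pred).sum()   # true positives
    fn = ((y_true == 1) & ~pred).sum()  # false negatives
    tn = ((y_true == 0) & ~pred).sum()  # true negatives
    fp = ((y_true == 0) & pred).sum()   # false positives
    return tp / (tp + fn), tn / (tn + fp)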

In [10]:
pd.crosstab(qualityTrain['PoorCare'], predTrain > 0.2, colnames=['PredictedCare'])


Out[10]:
PredictedCare False True
PoorCare
0 60 16
1 8 14

In [11]:
sens = 14/(14+8)   # TP / (TP + FN)
spec = 60/(60+16)  # TN / (TN + FP)
print(sens)
print(spec)


0.6363636363636364
0.7894736842105263

Increase the threshold to 0.7:


In [12]:
pd.crosstab(qualityTrain['PoorCare'], predTrain > 0.7, colnames=['PredictedCare'])


Out[12]:
PredictedCare False True
PoorCare
0 75 1
1 15 7

In [13]:
sens = 7/(7+15)   # TP / (TP + FN): 15 false negatives at this cutoff
spec = 75/(75+1)  # TN / (TN + FP)
print(sens)
print(spec)


0.3181818181818182
0.9868421052631579
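
Raising the cutoff trades sensitivity for specificity. Sweeping a few thresholds with the sens_spec sketch above makes the tradeoff explicit, which is exactly what the ROC curve summarizes:


In [ ]:
# Sensitivity falls and specificity rises as the threshold increases.
for t in [0.2, 0.3, 0.5, 0.7]:
    sens, spec = sens_spec(qualityTrain['PoorCare'], predTrain, t)
    print(t, round(sens, 3), round(spec, 3))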

Receiver Operating Characteristic (ROC) curve

Python has a way to do this painlessly, hooray! Docs: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve


In [14]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(qualityTrain['PoorCare'], predTrain)

In [15]:
fpr


Out[15]:
array([ 0.        ,  0.        ,  0.01315789,  0.01315789,  0.01315789,
        0.01315789,  0.01315789,  0.01315789,  0.01315789,  0.01315789,
        0.01315789,  0.02631579,  0.02631579,  0.02631579,  0.03947368,
        0.05263158,  0.06578947,  0.06578947,  0.07894737,  0.09210526,
        0.10526316,  0.11842105,  0.13157895,  0.14473684,  0.14473684,
        0.15789474,  0.17105263,  0.18421053,  0.21052632,  0.22368421,
        0.25      ,  0.26315789,  0.27631579,  0.28947368,  0.30263158,
        0.31578947,  0.32894737,  0.34210526,  0.38157895,  0.40789474,
        0.43421053,  0.43421053,  0.44736842,  0.46052632,  0.5       ,
        0.51315789,  0.52631579,  0.56578947,  0.59210526,  0.60526316,
        0.63157895,  0.63157895,  0.68421053,  0.71052632,  0.71052632,
        0.78947368,  0.80263158,  0.85526316,  0.88157895,  0.92105263,
        0.93421053,  0.94736842,  0.96052632,  0.98684211,  1.        ])

In [16]:
thresholds


Out[16]:
array([ 0.99825993,  0.99029596,  0.9564465 ,  0.91436947,  0.85609767,
        0.79840602,  0.78711261,  0.77548717,  0.64637286,  0.58866208,
        0.56803056,  0.42671673,  0.38708926,  0.35235367,  0.30713202,
        0.29599309,  0.28746985,  0.26844018,  0.26041686,  0.25777721,
        0.25528894,  0.25241765,  0.23992247,  0.23755139,  0.23028563,
        0.21844547,  0.21610694,  0.21148445,  0.20480181,  0.19610044,
        0.19394177,  0.18559412,  0.17552778,  0.17542589,  0.1658962 ,
        0.16400218,  0.16212558,  0.15488423,  0.14790922,  0.14111047,
        0.13953665,  0.13156483,  0.13000168,  0.12845435,  0.12398283,
        0.12249706,  0.11957144,  0.11536821,  0.112591  ,  0.10860288,
        0.10727884,  0.10596903,  0.10345694,  0.10218845,  0.10093376,
        0.09731327,  0.09611203,  0.0914972 ,  0.08923667,  0.08599562,
        0.08079542,  0.07397738,  0.07034264,  0.06687358,  0.05886525])

In [17]:
tpr


Out[17]:
array([ 0.04545455,  0.09090909,  0.09090909,  0.13636364,  0.18181818,
        0.22727273,  0.27272727,  0.31818182,  0.36363636,  0.40909091,
        0.45454545,  0.45454545,  0.5       ,  0.54545455,  0.54545455,
        0.54545455,  0.54545455,  0.59090909,  0.59090909,  0.59090909,
        0.59090909,  0.59090909,  0.59090909,  0.59090909,  0.63636364,
        0.63636364,  0.63636364,  0.63636364,  0.63636364,  0.63636364,
        0.63636364,  0.63636364,  0.63636364,  0.63636364,  0.63636364,
        0.63636364,  0.63636364,  0.72727273,  0.72727273,  0.72727273,
        0.72727273,  0.77272727,  0.77272727,  0.77272727,  0.77272727,
        0.77272727,  0.77272727,  0.77272727,  0.77272727,  0.77272727,
        0.81818182,  0.86363636,  0.90909091,  0.90909091,  0.95454545,
        0.95454545,  0.95454545,  0.95454545,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ])

In [18]:
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(fpr, tpr)
ax1.set_xlabel("False Positive Rate")
ax1.set_ylabel("True Positive Rate")


Out[18]:
<matplotlib.text.Text at 0x7f290a17bb00>

[Figure: ROC curve, True Positive Rate vs. False Positive Rate]
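
For reference, it helps to draw the diagonal of a random classifier next to the curve; a small sketch using the same fpr and tpr arrays:


In [ ]:
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="logistic model")
ax.plot([0, 1], [0, 1], linestyle="--", label="random guess")  # chance diagonal
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")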

Area under the ROC curve (AUC)


In [21]:
auc = metrics.roc_auc_score(qualityTrain['PoorCare'], predTrain)
print(auc)


0.7745215311
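
The same number can also be recovered from the fpr and tpr arrays computed earlier, by integrating the ROC curve with the trapezoidal rule:


In [ ]:
# Equivalent: integrate the ROC curve we already have.
metrics.auc(fpr, tpr)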
