Prediction of diabetes with scikit-learn

Only the essential bits of code

Imports



In [1]:

    
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, auc, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

Load data and have initial look



In [2]:

    
df = pd.read_csv('../data/pima-indians-diabetes-data.csv', index_col=[0])



In [3]:

    
df.head()









    Out[3]:






  
    
      
      num_pregnant
      plasma_glucose_c
      blood_presure
      skin_fold_thickness
      serum_insulin
      BMI
      diabetes_pedigree_funct
      age
      class
    
  
  
    
      0
      6
      148
      72
      35
      0
      33.6
      0.627
      50
      1
    
    
      1
      1
      85
      66
      29
      0
      26.6
      0.351
      31
      0
    
    
      2
      8
      183
      64
      0
      0
      23.3
      0.672
      32
      1
    
    
      3
      1
      89
      66
      23
      94
      28.1
      0.167
      21
      0
    
    
      4
      0
      137
      40
      35
      168
      43.1
      2.288
      33
      1



In [4]:

    
df.describe()









    Out[4]:






  
    
      
      num_pregnant
      plasma_glucose_c
      blood_presure
      skin_fold_thickness
      serum_insulin
      BMI
      diabetes_pedigree_funct
      age
      class
    
  
  
    
      count
      768.000000
      768.000000
      768.000000
      768.000000
      768.000000
      768.000000
      768.000000
      768.000000
      768.000000
    
    
      mean
      3.845052
      120.894531
      69.105469
      20.536458
      79.799479
      31.992578
      0.471876
      33.240885
      0.348958
    
    
      std
      3.369578
      31.972618
      19.355807
      15.952218
      115.244002
      7.884160
      0.331329
      11.760232
      0.476951
    
    
      min
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.078000
      21.000000
      0.000000
    
    
      25%
      1.000000
      99.000000
      62.000000
      0.000000
      0.000000
      27.300000
      0.243750
      24.000000
      0.000000
    
    
      50%
      3.000000
      117.000000
      72.000000
      23.000000
      30.500000
      32.000000
      0.372500
      29.000000
      0.000000
    
    
      75%
      6.000000
      140.250000
      80.000000
      32.000000
      127.250000
      36.600000
      0.626250
      41.000000
      1.000000
    
    
      max
      17.000000
      199.000000
      122.000000
      99.000000
      846.000000
      67.100000
      2.420000
      81.000000
      1.000000

Look at class distribution



In [5]:

    
len(df[df['class'] == 1]), len(df[df['class'] == 0])









    Out[5]:





(268, 500)

Data in the table is organized the following way:

Samples	Feature 1	Feature 2	...	Class
Sample 1	12	600	...	1
Sample 2	9	932	...	0

Extract values for the machine learning.

X - are features
y - class, target value



In [6]:

    
X = df.drop('class', axis=1).values
y = df['class'].values

Split the data in training and test set



In [7]:

    
X_train, X_test, y_train, y_test = train_test_split(X, y)

Train the model



In [8]:

    
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)









    Out[8]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Predict y (class) on test set and probabilities that sample belongs to each of two classes.



In [9]:

    
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)



In [10]:

    
print("AUC: %.3f" % roc_auc_score(y_test, y_pred_proba.T[1]))









    



AUC: 0.780

Calculate confusion matrix

	Predicted Positive	Predicted Negative
Positive	TP	FN
Negative	FP	TN



In [11]:

    
confusion_matrix(y_test, y_pred, labels=[1,0])









    Out[11]:





array([[ 32,  34],
       [ 21, 105]])



In [12]:

    
recall_score(y_test, y_pred, pos_label=1) # Low-moderate sensitivity









    Out[12]:





0.48484848484848486



In [13]:

    
recall_score(y_test, y_pred, pos_label=0) # High specificity









    Out[13]:





0.83333333333333337



In [ ]:

	num_pregnant	plasma_glucose_c	blood_presure	skin_fold_thickness	serum_insulin	BMI	diabetes_pedigree_funct	age	class
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

	num_pregnant	plasma_glucose_c	blood_presure	skin_fold_thickness	serum_insulin	BMI	diabetes_pedigree_funct	age	class
count	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000
mean	3.845052	120.894531	69.105469	20.536458	79.799479	31.992578	0.471876	33.240885	0.348958
std	3.369578	31.972618	19.355807	15.952218	115.244002	7.884160	0.331329	11.760232	0.476951
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.078000	21.000000	0.000000
25%	1.000000	99.000000	62.000000	0.000000	0.000000	27.300000	0.243750	24.000000	0.000000
50%	3.000000	117.000000	72.000000	23.000000	30.500000	32.000000	0.372500	29.000000	0.000000
75%	6.000000	140.250000	80.000000	32.000000	127.250000	36.600000	0.626250	41.000000	1.000000
max	17.000000	199.000000	122.000000	99.000000	846.000000	67.100000	2.420000	81.000000	1.000000