In [315]:
from scipy import optimize
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
%matplotlib inline
# Step 1: read the diabetes dataset from CSV and print summary statistics.
filename = 'diabetes.csv'
data = pd.read_csv(filename)
print (data.describe())
In [316]:
# function to check 0 in column
# function to check 0 in column
# NOTE(review): despite the "check 0" comment, the original test was
# strictly `t < val`, not `t == 0` — that behavior is preserved.
def chkColumnForVal(col_name, val, frame=None):
    """Return the number of entries in column `col_name` that are < `val`.

    Parameters
    ----------
    col_name : str
        Name of the column to scan.
    val : numeric
        Threshold; entries strictly below it are counted.
    frame : pandas.DataFrame, optional
        Frame to scan. Defaults to the module-level ``df`` (the original
        implementation read the global directly).
    """
    if frame is None:
        frame = df  # backward compatible with the original global lookup
    print (col_name)
    # Vectorized count replaces the original loop that accumulated row
    # indices into a list only to return its length.
    return int((frame[col_name] < val).sum())
#function to find mean,median,mode
#function to find mean,median,mode
def cal_mmm(col_name, frame=None):
    """Return ``[mean, mode, median]`` for ``frame[col_name]``.

    Indices 0 (mean) and 1 (mode, a Series — a column can be multimodal)
    match the original return layout, so existing callers using ``[0]``
    and ``[1][0]`` are unaffected. The median — promised by the function
    name but previously commented out (and written as ``.median`` without
    the call parentheses) — is appended at index 2.

    Parameters
    ----------
    col_name : str
        Column whose statistics are computed.
    frame : pandas.DataFrame, optional
        Frame to read from; defaults to the module-level ``df``.
    """
    if frame is None:
        frame = df  # backward compatible with the original global lookup
    mean = frame[col_name].mean()
    mode = frame[col_name].mode()
    median = frame[col_name].median()  # bug fix: was commented out / uncalled
    return [mean, mode, median]
In [317]:
#step2 clean the data (categorize the continuous variables)
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `pd.read_csv` with the same arguments is the documented replacement.
df = pd.read_csv('diabetes.csv', header=0, sep=',', index_col=None)
print (df.head(5))
In [318]:
#calculate means,median,mode
# Zero Replacement: zeros in these columns are physiologically impossible
# (missing-value placeholders), so replace them with the column mean.
# cal_mmm is evaluated per column *before* that column is overwritten, so
# each mean still includes the zero rows — identical to the original
# line-by-line version, just without six copy-pasted statements.
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
             'BMI', 'DiabetesPedigreeFunction']
for col in zero_cols:
    df[col] = df[col].mask(data[col] == 0, cal_mmm(col)[0])
print (df.head(5))
In [319]:
# Data visualization: histogram per column. The two columns slated for
# outlier trimming below are kept aside in their own frame.
filt_df = df[['SkinThickness','Insulin']]
df.hist(figsize=(10,8))
Out[319]:
In [320]:
# Box plot per column (3x3 grid) to eyeball outliers before trimming.
df.plot(kind= 'box' , subplots=True, layout=(3,3), sharex=False, sharey=False, figsize=(10,8))
Out[320]:
In [321]:
#Outlier removal & Visualization
# Keep only SkinThickness/Insulin values strictly inside the 10th-90th
# percentile band; everything outside becomes NaN and those rows are then
# dropped from df. (Removed a large block of dead commented-out z-score
# experiments that obscured the live logic.)
low = .1
high = .9
quant_df = filt_df.quantile([low, high])
print(quant_df)
# x[mask] returns a shorter Series; apply() re-aligns it against the
# original index, leaving NaN where values fell outside the band.
filt_df = filt_df.apply(lambda x: x[(x>quant_df.loc[low,x.name]) & (x < quant_df.loc[high,x.name])], axis=0)
print("*******after outlier removal*********")
df['SkinThickness']=filt_df['SkinThickness']
df['Insulin']=filt_df['Insulin']
# Drop every row the quantile filter marked as NaN.
df.dropna(axis=0, how='any',inplace=True)
df.describe()
Out[321]:
In [322]:
# Box plots again, after outlier removal, for a before/after comparison.
df.plot(kind= 'box' , subplots=True, layout=(3,3), sharex=False, sharey=False, figsize=(10,8))
Out[322]:
In [323]:
#Categorise continuous variables
#Pregnancies
# NOTE(review): this entire binning step is disabled — the code sits in a
# bare triple-quoted string, so the cell only echoes the text as output.
# Either delete it or promote it to real code; dead cells hide intent.
'''
bins_Pregnancies=3
df["Pregnancies"] = pd.cut(df.Pregnancies,bins_Pregnancies,labels=False)
#labels_Glucose = ["NorGlucose","MedGlucose","HigGlucose"]
#pd.cut([5,139,140,141,145,199,200,201],bins_Glucose,labels=labels_Glucose)
#Glucose- (0,139], (139,199] , (199,1000]
bins_Glucose = [0.0,139.0,199.0,1000.0]
df["Glucose"] = pd.cut(df.Glucose,bins_Glucose,labels=False)
#BP-(0,59], (59,90] , (90,200] or <60, 60-90, >90
bins_BP = [0.00,59.00,90.00,200.00]
df["BloodPressure"] = pd.cut(df.BloodPressure,bins_BP,labels=False)
#SkinThickness -(0,23],(23,200]
bins_SkinThickness = [0.0,23.0,200.0]
df["SkinThickness"] = pd.cut(df.SkinThickness,bins_SkinThickness,labels=False)
#Insulin -(0,15],(15,166),(166,1000]
bins_Insulin=[0.0,15.0,166.0,1000.0]
df["Insulin"] = pd.cut(df.Insulin,bins_Insulin,labels=False)
#BMI - (0,18.4], (18.4,24], (24,29], (29,100]
bins_BMI=(0.0,18.4,24.0,29.0,100.0)
df["BMI"] = pd.cut(df.BMI,bins_BMI,labels=False)
#DiabetesPedigreeFunction use equidistant bins
bins_DPF=3
df["DiabetesPedigreeFunction"] = pd.cut(df.DiabetesPedigreeFunction,bins_DPF,labels=False)
#Age (20,44],(44,64],(64,100]
bins_Age=(20.0,44.0,64.0,100.0)
df["Age"] = pd.cut(df.Age,bins_Age,labels=False)
print(df.head(20))
'''
Out[323]:
In [324]:
#step3 split the dataset
# NOTE(review): the original comment promised a 30/30/60 train/tune/test
# split, but the code performs a single 60/40 train/test split — the
# comment is corrected here, the behavior is unchanged.
train, test = train_test_split(df, test_size = 0.4, random_state=30)
target = train["Outcome"]
feature = train[train.columns[0:8]]    # first 8 columns are the predictors
feat_names = train.columns[0:8]
target_classes = ['0','1']
# Show a sample instead of dumping the entire test frame into the output.
print(test.head())
In [325]:
#step4 use training dataset to apply algorithm
import seaborn as sns  # used for the confusion-matrix heatmap below
# Train a depth-limited decision tree and evaluate it on the test split.
model = DecisionTreeClassifier(max_depth=4, random_state=0)
tree_= model.fit(feature,target)
test_input=test[test.columns[0:8]]
expected = test["Outcome"]
predicted = model.predict(test_input)
print(metrics.classification_report(expected, predicted))
conf = metrics.confusion_matrix(expected, predicted)
print(conf)
# Score computed once and reused (original called model.score twice).
dtreescore = model.score(test_input,expected)
print("Decision Tree accuracy: ", dtreescore)
label = ["0","1"]
sns.heatmap(conf, annot=True, xticklabels=label, yticklabels=label)
# bug fix: removed `print (a)` — `a` was never defined anywhere in the
# notebook and raised a NameError on a fresh Run-All.

#Feature Importance DecisionTreeClassifier
importance = model.feature_importances_
indices = np.argsort(importance)[::-1]  # features sorted most- to least-important
print("DecisionTree Feature ranking:")
for f in range(feature.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, feat_names[indices[f]], importance[indices[f]]))
plt.figure(figsize=(15,5))
plt.title("DecisionTree Feature importances")
plt.bar(range(feature.shape[1]), importance[indices], color="y", align="center")
plt.xticks(range(feature.shape[1]), feat_names[indices])
plt.xlim([-1, feature.shape[1]])
plt.show()
In [326]:
#KNN
# k-nearest-neighbors baseline (k=21) for comparison with the tree.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=21)
neigh.fit(feature,target)
knnpredicted = neigh.predict(test_input)
print(metrics.classification_report(expected, knnpredicted))
print(metrics.confusion_matrix(expected, knnpredicted))
# Score computed once and reused (original called neigh.score twice).
knnscore=neigh.score(test_input,expected)
print("KNN accuracy: ", knnscore)
In [327]:
# Collect both accuracy scores into a small frame and display them
# side by side with seaborn.
names_ = ["DT", "KNN"]
results_ = [dtreescore, knnscore]
res = pd.DataFrame({'y': results_, 'x': names_})
ax = sns.boxplot(x='x',y='y',data=res)
In [328]:
import graphviz
import pydotplus
from IPython.display import Image
from sklearn.tree import export_graphviz
# Render the fitted decision tree as a PNG.
# With out_file=None, export_graphviz returns the DOT source as a string,
# so the original StringIO buffer was dead code (assigned then immediately
# overwritten) — and `sklearn.externals.six` has been removed from modern
# scikit-learn, so that import broke the notebook outright.
dot_data = export_graphviz(model, out_file = None, feature_names=feat_names, class_names=target_classes,
                           filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
print(dot_data)
Image(graph.create_png())
#graph.write_pdf("diabetes.pdf")
Out[328]:
In [329]:
#Evaluation DecisionTreeClassifier
# ROC curve for the decision tree on the held-out test set.
# (Removed the unused `import random` — nothing in the notebook uses it.)
from sklearn.metrics import roc_curve, auc
fpr,tpr,thres = roc_curve(expected, predicted)
roc_auc = auc(fpr, tpr)
plt.title('DecisionTreeClassifier-Receiver Operating Characteristic Test Data')
plt.plot(fpr, tpr, color='green', lw=2, label='DecisionTree ROC curve (area = %0.2f)' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')  # chance-level diagonal for reference
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [330]:
#KNeighborsClassifier-ROC curve
# Same ROC construction as the decision-tree cell, for the KNN model.
kfpr, ktpr, kthres = roc_curve(expected, knnpredicted)
kroc_auc = auc(kfpr, ktpr)
plt.title('KNeighborsClassifier- Receiver Operating Characteristic')
plt.plot(kfpr, ktpr, color='darkorange', lw=2,
         label='KNeighbors ROC curve (area = %0.2f)' % kroc_auc)
# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'r--')
plt.legend(loc='lower right')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
In [ ]: