Loading the dataset into a pandas dataframe

تحميل قاعدة البيانات من ملف سي اس في الى إطاربيانات بانداس



In [3]:

    
import numpy as np
import pandas as pd

df = pd.read_csv('diabetes.csv')
df.head() #لاستعراض ال5 السجلات الاولى من إطار البيانات









    Out[3]:






  
    
      
      Pregnancies
      Glucose
      BloodPressure
      SkinThickness
      Insulin
      BMI
      DiabetesPedigreeFunction
      Age
      Outcome
    
  
  
    
      0
      6
      148
      72
      35
      0
      33.6
      0.627
      50
      1
    
    
      1
      1
      85
      66
      29
      0
      26.6
      0.351
      31
      0
    
    
      2
      8
      183
      64
      0
      0
      23.3
      0.672
      32
      1
    
    
      3
      1
      89
      66
      23
      94
      28.1
      0.167
      21
      0
    
    
      4
      0
      137
      40
      35
      168
      43.1
      2.288
      33
      1

نفوم بفصل الخواص/التسميات و إستخراج فقط القيم من الإطار



In [2]:

    
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
labels = df['Outcome'].values
features = df[list(columns)].values

في هذه المرحلة تكون لدينا مصفوفتين

Labels = التسميات (قيم فقط)

features = الخواص فقط

الخطوه التاليه نقوم بتقسيم البيانات إلى عينة التدريب و عينة الفحص

train_test_split = الدالة المسؤاله عن تفسيم البيانات و تأخذ ثلاث باميترز مصفوفتي الخواص و التسميات و ثم حجم عينة الفحص



In [3]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.30)

النتائج من العملية السابقة عبارة عن اربع مصفوفات

X_train = مصفوفة التدريب للخواص

y_train = مصفوفة التدريب للسمات

X_test = مصفوفة الفحص للخواص

y_test = مصفوفة الفحص للتسميات

تقريباً اصبح كل شي جاهز بالنسبةللبيانات و هندستها

الان تبقى ان نقوم بخطوتين هامه

1- تهيئة النموذج (اخترنا في هذه الحالةراندوم فوريست)

2- تدريب النموذج من خلال تمرير مصفوفتي التدريب (الخواص و التسميات)



In [4]:

    
clf = RandomForestClassifier(n_estimators=1)
clf = clf.fit(X_train, y_train)

نقوم بتقييم اداء النموذج من خلال حساب الدقةكما يلي



In [5]:

    
accuracy = clf.score(X_train, y_train)
print accuracy*100









    



86.4059590317



In [6]:

    
accuracy = clf.score(X_test, y_test)
print accuracy*100









    



68.8311688312

confusion matrix and classification report for the testing sample



In [7]:

    
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

ypredict = clf.predict(X_train)
print '\nTraining classification report\n', classification_report(y_train, ypredict)









    



Training classification report
             precision    recall  f1-score   support

          0       0.88      0.91      0.90       348
          1       0.83      0.78      0.80       189

avg / total       0.86      0.86      0.86       537



In [8]:

    
print "\n Confusion matrix of training \n", confusion_matrix(y_train, ypredict)









    



 Confusion matrix of training 
[[317  31]
 [ 42 147]]



In [9]:

    
ypredict = clf.predict(X_test)
print '\nTraining classification report\n', classification_report(y_test, ypredict)
print "\n Confusion matrix of training \n", confusion_matrix(y_test, ypredict)









    



Training classification report
             precision    recall  f1-score   support

          0       0.76      0.78      0.77       152
          1       0.55      0.52      0.53        79

avg / total       0.68      0.69      0.69       231


 Confusion matrix of training 
[[118  34]
 [ 38  41]]

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1