In [53]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
%matplotlib inline
# Load the diabetes data set from the CSV file next to the notebook.
df = pd.read_csv('diabetes.csv')
# Display the first 20 records of the data frame.
df.head(n=20)
Out[53]:
In [3]:
# Print the DataFrame.info() summary of df before any plotting or modelling.
df.info()
In [4]:
# Number of rows for each value of the 'Outcome' column.
sb.countplot(data=df, x='Outcome', palette='hls')
Out[4]:
In [5]:
# Number of rows for each value of the 'Pregnancies' column.
sb.countplot(data=df, x='Pregnancies', palette='hls')
Out[5]:
In [6]:
# Number of rows for each value of the 'Glucose' column.
sb.countplot(data=df, x='Glucose', palette='hls')
Out[6]:
In [7]:
# Heatmap of the pairwise column correlations returned by df.corr().
sb.heatmap(data=df.corr())
Out[7]:
In [8]:
# Pairwise plots of the columns of df, coloured by the 'Outcome' column.
sb.pairplot(data=df, hue='Outcome')
Out[8]:
In [12]:
from scipy.stats import kendalltau

# Hexbin joint plot of Pregnancies against Glucose, with kendalltau passed as
# the statistic function.
# NOTE(review): `stat_func` is only accepted by older seaborn releases --
# confirm against the installed seaborn version.
sb.jointplot(
    x=df['Pregnancies'],
    y=df['Glucose'],
    kind="hex",
    stat_func=kendalltau,
    color="#4CB391",
)
Out[12]:
In [31]:
import matplotlib.pyplot as plt

# Histogram bin edges shared by every facet: 13 evenly spaced values on [0, 50].
bins = np.linspace(0, 50, num=13)
# Grid of facets: one row per 'Pregnancies' value, one column per 'Outcome' value.
g = sb.FacetGrid(data=df, row="Pregnancies", col="Outcome", margin_titles=True)
# Draw a BMI histogram in each facet.
g.map(plt.hist, "BMI", bins=bins, color="steelblue", lw=0)
Out[31]:
In [33]:
# Pairwise plots restricted to the 'Pregnancies' and 'BMI' columns.
sb.pairplot(data=df, vars=["Pregnancies", "BMI"])
Out[33]:
In [48]:
# Baseline experiment: a random forest with a single tree, trained on the
# unscaled feature columns, with 'Outcome' as the label.
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
labels = df['Outcome'].values
features = df[columns].values

# Hold out 30% of the rows for testing. random_state is fixed (arbitrary value)
# so that the split and the fitted model are reproducible on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.30, random_state=42)
clf = RandomForestClassifier(n_estimators=1, random_state=42)
clf = clf.fit(X_train, y_train)

# Every message below is formatted into ONE string before printing, so the
# output is identical under the Python 2 print statement and the Python 3
# print function.

# Accuracy (in percent) on the training sample, then on the test sample.
accuracy = clf.score(X_train, y_train)
print('%s %s' % (' اداء النموذج في عينة التدريب بدقة ', accuracy*100))
accuracy = clf.score(X_test, y_test)
print('%s %s' % (' اداء النموذج في عينة الفحص بدقة ', accuracy*100))

# Per-class report and confusion matrix for the training sample.
ypredict = clf.predict(X_train)
print('%s%s' % ('\n Training classification report\n', classification_report(y_train, ypredict)))
print('%s%s' % ("\n Confusion matrix of training \n", confusion_matrix(y_train, ypredict)))

# Same for the test sample; the headings now say "testing" (they previously
# repeated the training headings for test-set results).
ypredict = clf.predict(X_test)
print('%s%s' % ('\n Testing classification report\n', classification_report(y_test, ypredict)))
print('%s%s' % ("\n Confusion matrix of testing \n", confusion_matrix(y_test, ypredict)))
In [41]:
# Standardise the features with StandardScaler.
scaler = StandardScaler()
# Fit on the training data only and transform it in the same call.
X_train = scaler.fit_transform(X_train)
# Apply the transformation fitted on the training data to the test data.
X_test = scaler.transform(X_test)
In [42]:
# Re-fit the single-tree random forest on X_train / y_train as they currently
# exist in the kernel (this cell performs no split of its own).
# random_state is fixed (arbitrary value) so the fitted model is reproducible.
clf = RandomForestClassifier(n_estimators=1, random_state=42)
clf = clf.fit(X_train, y_train)

# Every message below is formatted into ONE string before printing, so the
# output is identical under the Python 2 print statement and the Python 3
# print function.

# Accuracy (in percent) on the training sample, then on the test sample.
accuracy = clf.score(X_train, y_train)
print('%s %s' % (' اداء النموذج في عينة التدريب بدقة ', accuracy*100))
accuracy = clf.score(X_test, y_test)
print('%s %s' % (' اداء النموذج في عينة الفحص بدقة ', accuracy*100))

# Per-class report and confusion matrix for the training sample.
ypredict = clf.predict(X_train)
print('%s%s' % ('\n Training classification report\n', classification_report(y_train, ypredict)))
print('%s%s' % ("\n Confusion matrix of training \n", confusion_matrix(y_train, ypredict)))

# Same for the test sample; the headings now say "testing" (they previously
# repeated the training headings for test-set results).
ypredict = clf.predict(X_test)
print('%s%s' % ('\n Testing classification report\n', classification_report(y_test, ypredict)))
print('%s%s' % ("\n Confusion matrix of testing \n", confusion_matrix(y_test, ypredict)))
In [54]:
# Experiment: single-tree random forest on features rescaled with MinMaxScaler.
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
labels = df['Outcome'].values
features = df[columns].values

# Hold out 30% of the rows for testing. random_state is fixed (arbitrary value)
# so that the split and the fitted model are reproducible on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.30, random_state=42)

# Fit the scaler on the training data only, then apply the same
# transformation to both the training and the test data.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

clf = RandomForestClassifier(n_estimators=1, random_state=42)
clf = clf.fit(X_train, y_train)

# Every message below is formatted into ONE string before printing, so the
# output is identical under the Python 2 print statement and the Python 3
# print function.

# Accuracy (in percent) on the training sample, then on the test sample.
accuracy = clf.score(X_train, y_train)
print('%s %s' % (' اداء النموذج في عينة التدريب بدقة ', accuracy*100))
accuracy = clf.score(X_test, y_test)
print('%s %s' % (' اداء النموذج في عينة الفحص بدقة ', accuracy*100))

# Per-class report and confusion matrix for the training sample.
ypredict = clf.predict(X_train)
print('%s%s' % ('\n Training classification report\n', classification_report(y_train, ypredict)))
print('%s%s' % ("\n Confusion matrix of training \n", confusion_matrix(y_train, ypredict)))

# Same for the test sample; the headings now say "testing" (they previously
# repeated the training headings for test-set results).
ypredict = clf.predict(X_test)
print('%s%s' % ('\n Testing classification report\n', classification_report(y_test, ypredict)))
print('%s%s' % ("\n Confusion matrix of testing \n", confusion_matrix(y_test, ypredict)))
In [55]:
# Experiment: random forest with five trees on the unscaled feature columns.
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
labels = df['Outcome'].values
features = df[columns].values

# Hold out 30% of the rows for testing. random_state is fixed (arbitrary value)
# so that the split and the fitted model are reproducible on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.30, random_state=42)
clf = RandomForestClassifier(n_estimators=5, random_state=42)
clf = clf.fit(X_train, y_train)

# Every message below is formatted into ONE string before printing, so the
# output is identical under the Python 2 print statement and the Python 3
# print function.

# Accuracy (in percent) on the training sample, then on the test sample.
accuracy = clf.score(X_train, y_train)
print('%s %s' % (' اداء النموذج في عينة التدريب بدقة ', accuracy*100))
accuracy = clf.score(X_test, y_test)
print('%s %s' % (' اداء النموذج في عينة الفحص بدقة ', accuracy*100))

# Per-class report and confusion matrix for the training sample.
ypredict = clf.predict(X_train)
print('%s%s' % ('\n Training classification report\n', classification_report(y_train, ypredict)))
print('%s%s' % ("\n Confusion matrix of training \n", confusion_matrix(y_train, ypredict)))

# Same for the test sample; the confusion-matrix heading now says "testing"
# (it previously said "training" for test-set results).
ypredict = clf.predict(X_test)
print('%s%s' % ('\n Testing classification report\n', classification_report(y_test, ypredict)))
print('%s%s' % ("\n Confusion matrix of testing \n", confusion_matrix(y_test, ypredict)))
In [ ]: