In [1]:
from RuleListClassifier import *
In [2]:
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
In [3]:
# Fetch the diabetes dataset
data = fetch_mldata("diabetes")
# Convert the target labels from (-1, 1) to (0, 1)
y = (data.target + 1) / 2
# There are 768 rows and 8 feature columns
print(data['data'].shape)
print(np.unique(y))
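Note: fetch_mldata relies on the now-defunct mldata.org and has been removed from recent scikit-learn releases. A rough substitute is the OpenML "diabetes" dataset via fetch_openml; its target is stored as strings, so the label mapping below is an assumption:

from sklearn.datasets import fetch_openml
data = fetch_openml("diabetes", version=1, as_frame=False)
# Assumed OpenML label strings; map them to (0, 1)
y = (data.target == "tested_positive").astype(int)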
In [6]:
# Partition the data using stratified sampling (80% train / 20% test)
Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y, train_size=.8, stratify=y)
print("Feature labels ...")
feature_labels = ["#Pregnant", "Glucose concentration test", "Blood pressure(mmHg)",
                  "Triceps skin fold thickness(mm)", "2-Hour serum insulin (mu U/ml)",
                  "Body mass index", "Diabetes pedigree function", "Age (years)"]
print(feature_labels)
print("Training Data ...")
print(pd.DataFrame(Xtrain).head())
print("Testing Data ...")
print(pd.DataFrame(Xtest).head())
In [5]:
model = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
model.fit(Xtrain, ytrain, feature_labels=feature_labels)
Out[5]:
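The fitted model can also be used for prediction directly; a minimal sketch, assuming the usual scikit-learn predict/predict_proba interface:

# Assumes the fitted model exposes the standard scikit-learn prediction methods
yhat = model.predict(Xtest)         # hard class labels (0/1)
probs = model.predict_proba(Xtest)  # per-row class probabilities
print(yhat[:5])
print(probs[:5])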
In [7]:
print "RuleListClassifier Accuracy:", model.score(Xtest, ytest), "Learned interpretable model:\n", model
In [8]:
# Compare against a RandomForestClassifier baseline
print("RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest))