Exploring Bayesian Rule Lists
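
Bayesian Rule Lists (Letham et al., 2015) learn a short, ordered list of IF ... THEN ... ELSE rules, each with a posterior probability estimate, yielding a classifier a human can read end to end. This notebook fits a RuleListClassifier to the Pima Indians diabetes dataset (768 patients, 8 clinical features) and compares it against a RandomForestClassifier.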


In [1]:
# RuleListClassifier: a scikit-learn-compatible Bayesian Rule List implementation
from RuleListClassifier import *

In [2]:
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

In [3]:
# Fetch the Pima Indians diabetes dataset from mldata.org
data = fetch_mldata("diabetes")

# Convert the target labels from {-1, +1} to {0, 1}
y = (data.target + 1) / 2

# 768 rows, 8 feature columns
print(data['data'].shape)
print(np.unique(y))


(768, 8)
[0 1]

In [6]:
# Partition the data with stratified sampling so both splits keep the class balance
Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y, train_size=.8, stratify=y)
print("Feature labels ...")
feature_labels = ["#Pregnant","Glucose concentration test","Blood pressure(mmHg)","Triceps skin fold thickness(mm)","2-Hour serum insulin (mu U/ml)","Body mass index","Diabetes pedigree function","Age (years)"]
print(feature_labels)
print("Training Data ...")
print(pd.DataFrame(Xtrain).head())

print("Testing Data ...")
print(pd.DataFrame(Xtest).head())


Feature labels ...
['#Pregnant', 'Glucose concentration test', 'Blood pressure(mmHg)', 'Triceps skin fold thickness(mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index', 'Diabetes pedigree function', 'Age (years)']
Training Data ...
      0      1     2     3      4          5      6     7
0  11.0  155.0  76.0  28.0  150.0  33.299999  1.353  51.0
1   7.0  136.0  90.0   0.0    0.0  29.900000  0.210  50.0
2   4.0   91.0  70.0  32.0   88.0  33.099998  0.446  22.0
3   1.0  107.0  50.0  19.0    0.0  28.299999  0.181  29.0
4   3.0  128.0  72.0  25.0  190.0  32.400002  0.549  27.0
Testing Data ...
     0      1     2     3      4          5      6     7
0  1.0  109.0  56.0  21.0  135.0  25.200001  0.833  23.0
1  4.0  131.0  68.0  21.0  166.0  33.099998  0.160  28.0
2  7.0  159.0  64.0   0.0    0.0  27.400000  0.294  40.0
3  1.0  128.0  82.0  17.0  183.0  27.500000  0.115  22.0
4  0.0  124.0  70.0  20.0    0.0  27.400000  0.254  36.0
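
Since the split is stratified on y, the class-1 rate should be nearly identical in the two partitions. A quick sanity check:

In [ ]:
# Stratified sampling should preserve the class balance in both splits
print("Class-1 rate (train):", np.mean(ytrain))
print("Class-1 rate (test): ", np.mean(ytest))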

In [5]:
# max_iter sets the number of MCMC iterations; class1label names the positive class in the printed rules
model = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
model.fit(Xtrain, ytrain, feature_labels=feature_labels)


Discretization/MDLP.py:53: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  self._data = self._data.convert_objects(convert_numeric=True)
Out[5]:
RuleListClassifier(alpha=array([ 1.,  1.]), class1label='diabetes',
          listlengthprior=3, listwidthprior=1, max_iter=10000,
          maxcardinality=2, minsupport=10, n_chains=3, verbose=False)
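
The fitted model follows the scikit-learn estimator interface, so it can be used like any other classifier. A minimal sketch, assuming the usual predict and predict_proba methods:

In [ ]:
# Hard predictions and class probabilities for the first few test rows
# (predict/predict_proba are assumed to follow the scikit-learn convention)
print(model.predict(Xtest[:5]))
print(model.predict_proba(Xtest[:5]))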

In [7]:
print "RuleListClassifier Accuracy:", model.score(Xtest, ytest), "Learned interpretable model:\n", model


RuleListClassifier Accuracy: 0.74025974026 Learned interpretable model:
Trained RuleListClassifier for detecting diabetes
==================================================
IF Body mass index : -inf_to_29.6500005 AND Age (years) : -inf_to_27.5 THEN probability of diabetes: 96.7% (93.0%-99.1%)
ELSE IF Glucose concentration test : 157.5_to_inf THEN probability of diabetes: 16.1% (9.4%-24.2%)
ELSE IF Glucose concentration test : 127.5_to_157.5 THEN probability of diabetes: 47.0% (38.1%-56.0%)
ELSE IF 2-Hour serum insulin (mu U/ml) : 142.0_to_inf AND Age (years) : 27.5_to_inf THEN probability of diabetes: 42.9% (25.5%-61.2%)
ELSE IF Glucose concentration test : -inf_to_103.5 THEN probability of diabetes: 88.1% (81.9%-93.1%)
ELSE IF 2-Hour serum insulin (mu U/ml) : -inf_to_16.0 THEN probability of diabetes: 56.7% (46.4%-66.7%)
ELSE IF Diabetes pedigree function : 0.5285_to_inf THEN probability of diabetes: 57.1% (31.6%-80.8%)
ELSE probability of diabetes: 94.9% (86.2%-99.4%)
=================================================
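
To make the prediction mechanics concrete, the sketch below traces the first three rules of the list above by hand for a single patient. The function is purely illustrative (it hard-codes the printed thresholds and stops after three rules, so it is not part of the library); feature indices follow feature_labels:

In [ ]:
def rule_list_probability(x):
    # x is one row of the feature matrix; indices follow feature_labels
    glucose, bmi, age = x[1], x[5], x[7]
    if bmi < 29.6500005 and age < 27.5:   # rule 1
        return 0.967
    elif glucose >= 157.5:                # rule 2
        return 0.161
    elif 127.5 <= glucose < 157.5:        # rule 3
        return 0.470
    return None                           # remaining rules omitted

print(rule_list_probability(Xtest[0]))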


In [8]:
# Compare against a RandomForestClassifier baseline on the same split
print("RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest))


RandomForestClassifier Accuracy: 0.766233766234
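
Accuracy on a single split is a coarse comparison. A fairer look, assuming both models expose predict_proba, is ROC AUC on the same test set:

In [ ]:
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier().fit(Xtrain, ytrain)
print("RuleListClassifier AUC:", roc_auc_score(ytest, model.predict_proba(Xtest)[:, 1]))
print("RandomForestClassifier AUC:", roc_auc_score(ytest, rf.predict_proba(Xtest)[:, 1]))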
