In [90]:
import numpy as np
import pandas as pd
import operator          # used by Node.getDominantClass
from scipy import stats
import queue as Q
class Oracle:
""" Responsibilities
1. To determine the class label for network's training examples
2. To select split for each of the tree's internal nodes
3. To determine, if the node covers instances of only one class
"""
def __init__(self, estimator, num_classes):
self.estimator = estimator
self.num_classes = num_classes
def setDistributions(self, X):
self.distributions =[]
        # Ideally: kernel density estimates for continuous features and
        # empirical estimates for discrete ones; for now every feature
        # is modelled as continuous.
self.dimension= X.shape[1]
for i in range(0,self.dimension):
values = X[:,i].reshape(X.shape[0])
kernel = stats.gaussian_kde(values, bw_method='silverman')
# print(kernel)
# print(kernel.resample(1))
self.distributions.append(kernel)
def predict(self, example):
value = self.estimator.predict(example)
return value
# def oracle_constraints(self,constraints,n):
# #read each constraint
# X_examples= np.zeros((n,self.dimension))
# lab_examples = np.zeros(n)
# num_valid=0
# print(n)
# while(num_valid<n):
# sample=self.genSample(constraints)
# label=self.oracle_example(sample)
# X_examples[num_valid,:]=sample
# lab_examples[num_valid]=label
# num_valid+=1
# # print(num_valid)
# return (X_examples,lab_examples)
# #can be more efficient
# def genSample(self,constraints):
# sample = np.zeros(self.dimension)
# #assuming features have independent distributions
# for i in range(0,self.dimension):
# # print(i)
# done=False
# while not done :
# # print("THIS IS I :" +str(i))
# min_val = constraints.min_val(i)
# max_val = constraints.max_val(i)
# sample[i]=self.distributions[i].resample(1)[0]
# if sample[i] > min_val and sample[i] < max_val :
# done=True
# return sample
# def validSample(self,sample,constraints):
# for cons in constraints:
# (satisfied,splitrule)=cons
# if satisfied!=splitrule.satisfied(sample):
# # print("REJECTED")
# return False
# print("ACCEPTED")
# return True
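In [ ]:
# A minimal sketch (added; the function name is mine) of the rejection
# sampling that the commented-out oracle_constraints/genSample methods
# describe: draw each feature independently from its kernel density
# estimate, keep a draw only if it lies inside the node's
# (min_val, max_val) box, then ask the oracle for labels.  Assumes
# oracle.setDistributions(...) has already been called.
def sample_with_constraints(oracle, constraints, n):
    X_examples = np.zeros((n, oracle.dimension))
    for row in range(n):
        for i in range(oracle.dimension):
            lo, hi = constraints.min_val(i), constraints.max_val(i)
            while True:  # resample until the draw satisfies the box constraint
                v = oracle.distributions[i].resample(1)[0, 0]
                if lo < v < hi:
                    X_examples[row, i] = v
                    break
    return X_examples, oracle.predict(X_examples)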
In [62]:
class SplitRule:
#<= is left , > is right
#m of n split
def __init__(self,splits,m,n):
self.splits=splits
self.m=m
self.n=n
        self.op_dict = {"gte": self.gte, "lte": self.lte,
                        "gt": self.gt, "lt": self.lt}  # gt/lt so inverted rules evaluate
self.processSplits()
def processSplits(self):
self.max_dict={}
self.min_dict={}
for (attr,op_string,val) in self.splits:
if op_string in ["lte" ,"lt"]:
if attr not in self.max_dict:
self.max_dict[attr]=val
self.max_dict[attr] = max(self.max_dict[attr],val)
elif op_string in ["gte","gt"]:
if attr not in self.min_dict:
self.min_dict[attr]=val
self.min_dict[attr] = min(self.min_dict[attr],val)
#for building constraints
def invert(self):
splits2= []
inverse = {"gte":"lt","gt":"lte","lte":"gt","lt":"gte"}
for (attr,op_string,val) in self.splits:
op_string=inverse[op_string]
splits2.append((attr,op_string,val))
s2 = SplitRule(splits2,self.m,self.n)
return s2
def gte(self,arg1, arg2):
return arg1 >= arg2
def lte(self,arg1, arg2):
return arg1 <= arg2
def lt(self,arg1,arg2):
return arg1 < arg2
def gt(self,arg1,arg2):
return arg1 > arg2
def satisfied(self,sample):
sat=0
for split in self.splits:
(attr,op_string,val)=split
op = self.op_dict[op_string]
if op(sample[attr],val):
sat+=1
# print(attr,val)
# print(sample[attr])
        return sat >= self.m
def max_val(self,dim):
if dim in self.max_dict :
return self.max_dict[dim]
else :
return np.inf
def min_val(self,dim):
if dim in self.min_dict:
return self.min_dict[dim]
else :
return -np.inf
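In [ ]:
# Added sanity check for SplitRule: a 1-of-1 rule "feature 0 <= 5.0" and
# its inverse "feature 0 > 5.0" (the inverse relies on gt/lt being in
# op_dict).
s = SplitRule([(0, "lte", 5.0)], 1, 1)
print(s.satisfied(np.array([4.2])))           # True: 4.2 <= 5.0
print(s.invert().satisfied(np.array([4.2])))  # False: 4.2 is not > 5.0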
In [144]:
class Node:
def __init__(self, training_set, y_hat, total_size):
self.leaf=True
self.left=None
self.right=None
self.split_rule=None
self.num_examples= training_set[0].shape[0]
if self.num_examples==0:
self.priority=0
print("NEW NODE! with priority = "+ str(self.priority))
return
self.dominant = self.getDominantClass(training_set[1])
self.misclassified=self.getMisclassified(training_set[1], y_hat)
self.fidelity = 1 - (float(self.misclassified)/self.num_examples)
self.reach = float(self.num_examples)/total_size
self.priority = (-1)*self.reach* (1 - self.fidelity)
# print(self.fidelity,self.reach,self.num_examples)
print("NEW NODE! with priority = "+ str(self.priority))
def getDominantClass(self, y):
labels = np.unique(y)
label_summary = {}
for label in labels:
label_summary[label] = len(y[y == label])
return max(label_summary.items(), key=operator.itemgetter(1))[0]
    def getMisclassified(self, y, y_hat):
        # np.where returns a tuple of index arrays; count the indices, not the tuple
        return len(np.where(y_hat != y)[0])
# def classify(self,sample):
# if self.leaf :
# return self.dominant
# if self.splitrule.satisfied(sample):
# return self.left.classify(sample)
# else:
# return self.right.classify(sample)
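In [ ]:
# Added worked example of the queue priority: a node reached by 30 of 100
# training examples, 6 of which are misclassified at the node, has
# reach = 0.3 and fidelity = 0.8, so priority = -reach * (1 - fidelity).
# More negative priorities (high reach, low fidelity) pop first from the
# min-heap PriorityQueue.
reach, fidelity = 30 / 100, 1 - 6 / 30
print(-reach * (1 - fidelity))  # -0.06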
In [160]:
class Constraints :
    def __init__(self, num_dim):
        self.num_dim = num_dim
        # start unconstrained; bounds tighten as rules accumulate down the tree
        self.max_list = np.full(num_dim, np.inf)
        self.min_list = np.full(num_dim, -np.inf)
    def addRule(self, split):
        for i in range(0, self.num_dim):
            # intersect with the new rule: upper bounds shrink, lower bounds grow
            self.max_list[i] = min(self.max_list[i], split.max_val(i))
            self.min_list[i] = max(self.min_list[i], split.min_val(i))
def max_val(self,dim):
return self.max_list[dim]
def min_val(self, dim):
return self.min_list[dim]
def copy(self):
c = Constraints(self.num_dim)
c.max_list=np.copy(self.max_list)
c.min_list=np.copy(self.min_list)
return c
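In [ ]:
# Added usage sketch: constraints accumulate as the tree is walked.  The
# left child of a split tightens the upper bound; the right child gets
# the inverted rule and tightens the lower bound.
c = Constraints(num_dim=4)
s = SplitRule([(0, "lte", 5.0)], 1, 1)
c_left, c_right = c.copy(), c.copy()
c_left.addRule(s)
c_right.addRule(s.invert())
print(c_left.max_val(0), c_right.min_val(0))  # 5.0 5.0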
In [181]:
def entropy(counts, n):
res=0
for key in counts:
c = counts[key]
if (c==0):
continue
p = float(c)/n
# print(p)
res-=p*np.log2(p)
return res
def entropy_2(label_dict):
    # completed variant: label_dict maps label -> count; zero counts are skipped
    probs = np.array([c for c in label_dict.values() if c > 0]) / sum(label_dict.values())
    return -np.sum(probs * np.log2(probs))
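In [ ]:
# Added sanity check: a 50/50 node has entropy 1 bit, a pure node 0 bits.
print(entropy({0: 5, 1: 5}, 10))   # 1.0
print(entropy({0: 10, 1: 0}, 10))  # 0.0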
In [225]:
def mutual_information(X, y):
    # information gain of the split "x <= t" for every candidate threshold t in x
    x = np.asarray(X)                 # accept a pandas Series or an ndarray
    n = x.shape[0]
    if n != y.shape[0]:
        print("ERROR: X and y lengths differ")
    gains = np.zeros(n)
    labels, counts = np.unique(y, return_counts=True)
    # every example starts on the left; they move right one at a time
    lcounts = dict(zip(labels, counts))
    rcounts = {lab: 0 for lab in labels}
    e_parent = entropy(lcounts, n)
    j = 0                             # examples moved to the right so far
    prev = -1
    # process in descending order of x so identical values share one split
    for i in reversed(np.argsort(x)):
        lab = y[i]
        if prev >= 0 and x[prev] == x[i]:
            gains[i] = gains[prev]    # same threshold: reuse the gain
        else:
            prev = i
            f_r = float(j) / n
            f_l = 1 - f_r
            # child entropy is taken over the child's own size, then
            # weighted by the fraction of examples the child receives
            e_r = f_r * entropy(rcounts, j) if j > 0 else 0.0
            e_l = f_l * entropy(lcounts, n - j)
            gains[i] = e_parent - (e_l + e_r)
        j += 1
        rcounts[lab] += 1
        lcounts[lab] -= 1
    return gains
# usual C4.5-style single-attribute split only, for now
def bestMofNSplit(examples):
    (X, labels) = examples
    n_rows = X.shape[0]
    dim = X.shape[1]
    print("SPLITTING " + str(n_rows) + " EXAMPLES")
    gains = np.zeros((n_rows, dim))
    for i in range(0, dim):
        gains[:, i] = mutual_information(X.iloc[:, i], labels)
    if np.max(gains) < 1e-6:
        return None                   # no split improves on the parent
    split_point = np.unravel_index(np.argmax(gains), gains.shape)
    # 1-of-1 rule: attribute split_point[1] <= the value at the best row
    srule = SplitRule([(split_point[1], "lte", X.iloc[split_point])], 1, 1)
    return srule
# def partition(examples,srule):
# (X,Y) = examples
# n = X.shape[0]
# # print(X[1:5,:])
# el=[]
# er=[]
# for i in range(0,n):
# if srule.satisfied(X[i,:]):
# el.append(i)
# else:
# er.append(i)
# print(len(el))
# print(len(er))
# examples_l = (X[el,:],Y[el])
# examples_r = (X[er,:],Y[er])
# return examples_l,examples_r
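In [ ]:
# Added sanity check (not in the original run): on a perfectly separable
# toy feature, the best threshold should recover the full parent entropy
# (1 bit for a balanced binary target).
toy_x = pd.Series([1.0, 2.0, 3.0, 10.0, 11.0, 12.0])
toy_y = np.array([0, 0, 0, 1, 1, 1])
print(mutual_information(toy_x, toy_y))  # peaks at 1.0 for the split "x <= 3.0"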
In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
iris = load_iris()
# Create a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns=iris.feature_names)
X = df
y = iris.target
print(len(y))
In [71]:
# Train and test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = RandomForestClassifier(n_estimators=5, class_weight="balanced", oob_score=True, random_state=1)
clf.fit(X_train, y_train)
Out[71]:
In [247]:
clf.predict(X.iloc[0:1])
# Decision Path
clf.decision_path(X.iloc[0:1])
Out[247]:
In [251]:
y_hat = clf.predict(X_test)
np.unique(y_hat)
Out[251]:
In [253]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
print("\n--------Test dataset classification report----------\n")
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_test, y_hat, target_names=target_names))
In [260]:
from sklearn.tree import DecisionTreeClassifier
explainer = DecisionTreeClassifier(random_state=0)
explainer.fit(X_test, y_hat)
Out[260]:
In [261]:
y_hat_surrogate = explainer.predict(X_test)
print("\n-------- Test dataset classification report on Surrogate ----------\n")
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_hat, y_hat_surrogate, target_names=target_names))
In [263]:
from io import StringIO  # sklearn.externals.six was removed in recent scikit-learn
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(explainer, out_file=dot_data,
filled=True, rounded=True,
special_characters=True, feature_names=iris.feature_names, class_names=iris.target_names)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
In [264]:
from IPython.display import Image
Image(graph.create_png())
Out[264]:
In [241]:
from sklearn.feature_selection import mutual_info_classif
feature_scores = mutual_info_classif(X_train, y_train)  # needs the full matrix and label vector
feature_scores
In [224]:
X_train.iloc[:, 0].shape
Out[224]:
In [234]:
gains = np.zeros((total_size, 4))
for i in range(0, 4):   # score all four features, not just the first three
    gains[:, i] = mutual_information(X_train.iloc[:, i], y_train)
In [216]:
print(X_train.iloc[86])
print(y_train[86])
In [183]:
l, c = np.unique(y_train, return_counts=True)
In [145]:
num_classes = len(iris.target_names)
oracle = Oracle(clf, num_classes)
training_set = (X_train, y_train)
In [146]:
y_hat = oracle.predict(np.array(X_train))
In [147]:
import queue as Q
sortedQueue = Q.PriorityQueue()
In [189]:
total_size = X_train.shape[0]
num_dim = X_train.shape[1]
print(total_size)
print(num_dim)
In [230]:
# Construct the tree
root = Node(training_set, y_hat, total_size)
root.leaf=False
sortedQueue.put((root.priority, (0, root, training_set, Constraints(num_dim))))
In [233]:
# Algorithm: expand nodes best-first until the queue empties or the budget is hit
tree_size_limit = 100  # arbitrary number right now
while not sortedQueue.empty() and sortedQueue.qsize() < tree_size_limit:
    (p, (t, node, examples, constraints)) = sortedQueue.get()
    num_ex = examples[0].shape[0]
    print("############ PROCESSING " + str(num_ex) + " #############")
    examples_aug = examples
    # best possible split: see the sketch in the next cell
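In [ ]:
# Added sketch of how the loop body above could continue (the original
# cell stops at "best possible split").  expand() and the tie-breaking
# counter are my names, not the notebook's; the counter keeps the
# PriorityQueue from ever comparing Node objects on equal priorities.
import itertools
counter = itertools.count()

def expand(node, examples, constraints, queue, total_size):
    X_node, y_node = examples
    srule = bestMofNSplit(examples)
    if srule is None:                 # no informative split: stay a leaf
        return
    node.leaf = False
    node.split_rule = srule
    rows = np.asarray(X_node)
    left = [i for i in range(rows.shape[0]) if srule.satisfied(rows[i, :])]
    right = [i for i in range(rows.shape[0]) if i not in left]
    children = []
    for idx, rule in ((left, srule), (right, srule.invert())):
        ex = (X_node.iloc[idx], y_node[idx])
        y_hat_child = oracle.predict(np.array(ex[0])) if idx else np.zeros(0)
        child = Node(ex, y_hat_child, total_size)
        cons = constraints.copy()
        cons.addRule(rule)
        queue.put((child.priority, (next(counter), child, ex, cons)))
        children.append(child)
    node.left, node.right = children
# inside the while loop above one would call:
# expand(node, examples_aug, constraints, sortedQueue, total_size)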
In [231]:
print(sortedQueue.qsize())
In [ ]: