In [2]:
from sklearn import tree

# Fit a decision tree on a minimal two-point toy dataset
X = [[0, 0], [1, 1]]
Y = [0, 1]
decision_tree_classifier = tree.DecisionTreeClassifier()
decision_tree_classifier = decision_tree_classifier.fit(X, Y)
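Once fitted, the tree can be queried with predict; a minimal sketch (the query point [2., 2.] is illustrative and not from the original notebook):

# Query the fitted toy tree; [2., 2.] falls on the class-1 side of the learned split
decision_tree_classifier.predict([[2., 2.]])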
In [3]:
from sklearn.datasets import load_iris
from sklearn import tree
from IPython.display import Image
import pydotplus
In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Limits of the domain
Lim = 4
# Number of components
N = 5
# Generate random mixture parameters
w = np.random.rand(N)
w = w/np.sum(w)
mu = []
Sig = []
# Random mean and positive semidefinite covariance for each component
for i in range(N):
    mu.append(np.random.randn(2))
    A = np.random.randn(2,2)/3
    Sig.append(A.dot(A.T))
# Number of data points
T = N*100
# Number of points from each cluster center
Ts = np.random.multinomial(T, w)
X = []
C = []
for i in range(N):
    x = np.random.multivariate_normal(mu[i], Sig[i], Ts[i])
    c = np.ones(Ts[i])*i
    X.append(x)
    C.append(c)
XX = np.concatenate(X,axis=0)
CC = np.concatenate(C,axis=0)
from sklearn import tree
plt.figure(figsize=(7,7))
plt.rc('text', usetex=True)  # requires a local LaTeX installation
#plt.rc('font', family='serif')
ax = plt.gca()
ax.set_xlim(-Lim,Lim)
ax.set_ylim(-Lim,Lim)
col = ['r','b','g','k','y','m']
for i in range(N):
    plt.plot(X[i][:,0], X[i][:,1], '.'+col[i%len(col)])
plt.show()
In [5]:
for depth in range(1,10):
    clf = tree.DecisionTreeClassifier(max_depth=depth, criterion='entropy')
    clf = clf.fit(XX, CC)
    plot_step = 0.05
    plt.figure(figsize=(7,7))
    ax = plt.gca()
    ax.set_xlim(-Lim,Lim)
    ax.set_ylim(-Lim,Lim)
    x_min, x_max = -Lim, Lim
    y_min, y_max = -Lim, Lim
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    # Decision regions of the fitted tree evaluated on the grid
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    col = ['r','b','g','k','y','m']
    for i in range(N):
        plt.plot(X[i][:,0], X[i][:,1], '.'+col[i%len(col)])
In [6]:
class_names = col[0:N]
feature_names = ['x_0','x_1']
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=feature_names,
                                class_names=class_names, filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
#plt.show()
Out[6]:
[PNG rendering of the fitted decision tree produced by pydotplus/Graphviz]
In [18]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc
def entropy(p):
    # Entropy in bits (base-2 logarithm)
    return -np.sum(np.log2(p)*p)
In [26]:
# Entropy (in bits) of the proportions [2/3, 1/3], scaled by 3/4
p = np.array([2.,1.])
p = p/np.sum(p)
entropy(p)*3./4
Out[26]:
In [27]:
# fetch_mldata was removed from scikit-learn; fetch_openml is the current way to load MNIST
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
In [38]:
# Display one example as a 28x28 grayscale image
plt.imshow(mnist.data[10000].reshape(28,28), cmap='gray_r', interpolation='nearest')
plt.show()
In this exercise we will investigate and compare two alternative measures for choosing a decision boundary and a feature in decision trees: (a) the Gini impurity and (b) the information gain.
Suppose there are $C$ classes in a dataset with $N$ samples. The number of examples having class $c$ is $N_c$ where $\sum_c N_c = N$.
At each step, the decision tree algorithm tries multiple candidate thresholds (since the basic algorithm considers one feature at a time, only the midpoints between consecutive observed values of that feature need to be tested) and selects the one that leads to the "purest" partitions. We will measure impurity by the entropy or by the Gini impurity.
A given threshold $\tau$ subdivides the dataset into two partitions of sizes $L$ and $R$ according to a single feature $x$: $L$ data points with $x< \tau$ and $R$ data points with $x \geq \tau$ where $L + R = N$. The number of data points of class $c$ in each partition is $L_c$ and $R_c$. We have $L_1 + L_2 + \dots + L_C = L$ and $R_1 + R_2 + \dots + R_C = R$.
The Gini impurity is defined as $$G(p_{1:C}) = 1 - \sum_c p_c^2,$$ where $p_c$ is the relative frequency of class $c$ in a partition.
The entropy, on the other hand, is defined as $$H(p_{1:C}) = - \sum_c p_c \log{p_c}.$$ Note that $\log$ stands for the natural logarithm.
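When comparing thresholds, a candidate split is scored by the size-weighted impurity of the two partitions it creates, and the information gain is the parent node's entropy minus this weighted entropy; these are the quantities computed in the code below: $$G_{\mathrm{split}}(\tau) = \frac{L}{N}\, G\!\left(\frac{L_{1:C}}{L}\right) + \frac{R}{N}\, G\!\left(\frac{R_{1:C}}{R}\right)$$ $$\mathrm{IG}(\tau) = H\!\left(\frac{N_{1:C}}{N}\right) - \frac{L}{N}\, H\!\left(\frac{L_{1:C}}{L}\right) - \frac{R}{N}\, H\!\left(\frac{R_{1:C}}{R}\right)$$ Here $\frac{L_{1:C}}{L}$ denotes the vector of class frequencies $(L_1/L, \dots, L_C/L)$ within the left partition, and similarly for the other terms.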
Compute these indices for every candidate threshold.
Given a dataset, write a program that plots a scatterplot of any two features and computes each index along each feature.
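Before the full program, a minimal standalone sketch of the two impurity functions and the candidate midpoint thresholds (the helper names gini, entropy_nat and candidate_thresholds are illustrative; the cell below computes the same quantities inline):

import numpy as np

def gini(p):
    # Gini impurity of a vector of class frequencies
    return 1.0 - np.sum(p**2)

def entropy_nat(p):
    # Entropy with the natural logarithm, as in the exercise text
    p = p[p > 0]
    return -np.sum(p*np.log(p))

def candidate_thresholds(x):
    # Midpoints between consecutive unique values of one feature
    ux = np.unique(x)
    return (ux[:-1] + ux[1:]) / 2.0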
In [8]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc
import pandas as pd
from sklearn.datasets import load_iris
dataset = load_iris()
X, c = dataset['data'][:,[3,0]], dataset['target']  # two iris features: petal width and sepal length
M, N = X.shape
class_labels = np.unique(c)
num_of_labels = len(class_labels)
col = 'ov*'
for z in range(num_of_labels):
    ss = c == z
    plt.plot(X[ss,0], X[ss,1], col[z])
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.show()
for idx in range(N):
    # Candidate thresholds: midpoints between consecutive unique values of this feature
    ux = np.unique(X[:,idx])
    boundaries = np.convolve(ux, np.array([0.5, 0.5]), mode='valid')
    G = np.zeros_like(boundaries)
    H = np.zeros_like(boundaries)
    IG = np.zeros_like(boundaries)
    for i, tau in enumerate(boundaries):
        # Class counts for the full node and for the left/right partitions
        u, cnt = np.unique(c, return_counts=True)
        u_l, cnt_l = np.unique(c[X[:,idx]<=tau], return_counts=True)
        u_r, cnt_r = np.unique(c[X[:,idx]>tau], return_counts=True)
        p_l = cnt_l/np.sum(cnt_l)
        w_l = np.sum(cnt_l)/np.sum(cnt)
        p_r = cnt_r/np.sum(cnt_r)
        w_r = np.sum(cnt_r)/np.sum(cnt)
        p = cnt/np.sum(cnt)
        # Size-weighted Gini impurity and entropy of the split
        G[i] = w_l*(1 - np.sum(p_l**2)) + w_r*(1 - np.sum(p_r**2))
        H[i] = -w_l*np.sum(p_l*np.log(p_l)) - w_r*np.sum(p_r*np.log(p_r))
        # Information gain: parent entropy minus the weighted child entropy
        IG[i] = -np.sum(p*np.log(p)) - H[i]
    plt.plot(boundaries, H, '.r')
    plt.plot(boundaries, G, '.b')
    plt.plot(boundaries, IG, 'g')
    plt.ylim([-0.1, 1.5])
    plt.xlabel('Feature '+str(idx))
    plt.ylabel('Impurity/Information Gain')
    plt.legend(['Entropy', 'Gini', 'Information Gain'])
    for b in boundaries:
        plt.axvline(b, ls=':')
    for z in range(num_of_labels):
        ss = c == z
        plt.plot(X[ss,idx], np.zeros_like(X[ss,idx]), col[z])
    plt.show()
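As a sanity check, the threshold that maximizes the information gain above can be compared with the split chosen by a depth-1 tree on the same single feature; a sketch, assuming X, c and idx from the cell above are still in scope:

from sklearn import tree

stump = tree.DecisionTreeClassifier(max_depth=1, criterion='entropy')
stump.fit(X[:, [idx]], c)
# The root node's threshold is stored in the fitted tree structure
print('sklearn root threshold:', stump.tree_.threshold[0])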